Apply clang-format
diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h
index 238edc8..a5e6499 100644
--- a/Eigen/src/Core/arch/AVX/Complex.h
+++ b/Eigen/src/Core/arch/AVX/Complex.h
@@ -18,16 +18,15 @@
namespace internal {
//---------- float ----------
-struct Packet4cf
-{
+struct Packet4cf {
EIGEN_STRONG_INLINE Packet4cf() {}
EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
- __m256 v;
+ __m256 v;
};
#ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet4cf type;
typedef Packet2cf half;
enum {
@@ -35,50 +34,58 @@
AlignedOnScalar = 1,
size = 4,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
#endif
-template<> struct unpacket_traits<Packet4cf> {
+template <>
+struct unpacket_traits<Packet4cf> {
typedef std::complex<float> type;
typedef Packet2cf half;
typedef Packet8f as_real;
enum {
- size=4,
- alignment=Aligned32,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ size = 4,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) {
return Packet4cf(pnegate(a.v));
}
-template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a)
-{
- const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet4cf(_mm256_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
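+  // Flip the sign bit of every odd (imaginary) float to conjugate each complex value.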
+ const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+ 0x80000000, 0x00000000, 0x80000000));
+ return Packet4cf(_mm256_xor_ps(a.v, mask));
}
-template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
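+  // Per complex lane, with a = (ar, ai) and b = (br, bi): moveldup duplicates the real parts, so
+  // tmp1 = (ar*br, ar*bi); movehdup duplicates the imaginary parts and the permute swaps b's components, so
+  // tmp2 = (ai*bi, ai*br); addsub then yields (ar*br - ai*bi, ar*bi + ai*br), the complex product.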
__m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v);
- __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+ __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
__m256 result = _mm256_addsub_ps(tmp1, tmp2);
return Packet4cf(result);
}
@@ -89,112 +96,135 @@
return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
}
-template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) {
+ return Packet4cf(ptrue(Packet8f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+ return Packet4cf(_mm256_andnot_ps(b.v, a.v));
+}
-template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from)));
+}
-
-template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) {
const float re = std::real(from);
const float im = std::imag(from);
return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
}
-template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
// FIXME The following might be optimized using _mm256_movedup_pd
Packet2cf a = ploaddup<Packet2cf>(from);
- Packet2cf b = ploaddup<Packet2cf>(from+1);
- return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+ Packet2cf b = ploaddup<Packet2cf>(from + 1);
+ return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride)
-{
- return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]),
- std::imag(from[2*stride]), std::real(from[2*stride]),
- std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from,
+ Index stride) {
+ return Packet4cf(_mm256_set_ps(std::imag(from[3 * stride]), std::real(from[3 * stride]), std::imag(from[2 * stride]),
+ std::real(from[2 * stride]), std::imag(from[1 * stride]), std::real(from[1 * stride]),
+ std::imag(from[0 * stride]), std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from,
+ Index stride) {
__m128 low = _mm256_extractf128_ps(from.v, 0);
- to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
- to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+ to[stride * 0] =
+ std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+ to[stride * 1] =
+ std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
__m128 high = _mm256_extractf128_ps(from.v, 1);
- to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
- to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
-
+ to[stride * 2] =
+ std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+ to[stride * 3] =
+ std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) {
return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
}
-template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
- __m128 low = _mm256_extractf128_ps(a.v, 0);
+template <>
+EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+ __m128 low = _mm256_extractf128_ps(a.v, 0);
__m128 high = _mm256_extractf128_ps(a.v, 1);
- __m128d lowd = _mm_castps_pd(low);
+ __m128d lowd = _mm_castps_pd(low);
__m128d highd = _mm_castps_pd(high);
- low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1));
- high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1));
+ low = _mm_castpd_ps(_mm_shuffle_pd(lowd, lowd, 0x1));
+ high = _mm_castpd_ps(_mm_shuffle_pd(highd, highd, 0x1));
__m256 result = _mm256_setzero_ps();
result = _mm256_insertf128_ps(result, low, 1);
result = _mm256_insertf128_ps(result, high, 0);
return Packet4cf(result);
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a)
-{
- return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)),
- Packet2cf(_mm256_extractf128_ps(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) {
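+  // Add the two 128-bit halves, then reduce the resulting Packet2cf.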
+ return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
-{
- return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
- Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) {
+ return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
-
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf, Packet8f)
-template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
-{
- return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) {
+ return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
}
//---------- double ----------
-struct Packet2cd
-{
+struct Packet2cd {
EIGEN_STRONG_INLINE Packet2cd() {}
EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
- __m256d v;
+ __m256d v;
};
#ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet2cd type;
typedef Packet1cd half;
enum {
@@ -202,50 +232,60 @@
AlignedOnScalar = 0,
size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
#endif
-template<> struct unpacket_traits<Packet2cd> {
+template <>
+struct unpacket_traits<Packet2cd> {
typedef std::complex<double> type;
typedef Packet1cd half;
typedef Packet4d as_real;
enum {
- size=2,
- alignment=Aligned32,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ size = 2,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a)
-{
- const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
- return Packet2cd(_mm256_xor_pd(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) {
+ return Packet2cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
+ const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+ return Packet2cd(_mm256_xor_pd(a.v, mask));
}
-template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
- __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0);
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
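+  // Same scheme as the float path: 0x0 duplicates the real parts, 0xF the imaginary parts, 0x5 swaps b's
+  // components, and addsub combines them into (ar*br - ai*bi, ar*bi + ai*br) per 128-bit lane.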
+ __m256d tmp1 = _mm256_shuffle_pd(a.v, a.v, 0x0);
__m256d even = _mm256_mul_pd(tmp1, b.v);
- __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF);
- __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5);
- __m256d odd = _mm256_mul_pd(tmp2, tmp3);
+ __m256d tmp2 = _mm256_shuffle_pd(a.v, a.v, 0xF);
+ __m256d tmp3 = _mm256_shuffle_pd(b.v, b.v, 0x5);
+ __m256d odd = _mm256_mul_pd(tmp2, tmp3);
return Packet2cd(_mm256_addsub_pd(even, odd));
}
@@ -255,82 +295,110 @@
return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
}
-template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) {
+ return Packet2cd(ptrue(Packet4d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+ return Packet2cd(_mm256_andnot_pd(b.v, a.v));
+}
-template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from));
+}
-template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) {
  // In case casting to a __m128d* is really not safe, we can still fall back to this (much slower) version:
-// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
- return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+ // return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+ return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
}
-template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride)
-{
- return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) {
+ return pset1<Packet2cd>(*from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from,
+ Index stride) {
+ return Packet2cd(_mm256_set_pd(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+ std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+ Index stride) {
__m128d low = _mm256_extractf128_pd(from.v, 0);
- to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+ to[stride * 0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
__m128d high = _mm256_extractf128_pd(from.v, 1);
- to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+ to[stride * 1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) {
__m128d low = _mm256_extractf128_pd(a.v, 0);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
- return std::complex<double>(res[0],res[1]);
+ return std::complex<double>(res[0], res[1]);
}
-template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
__m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
return Packet2cd(result);
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a)
-{
- return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)),
- Packet1cd(_mm256_extractf128_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) {
+ return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
-{
- return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
- Packet1cd(_mm256_extractf128_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) {
+ return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd, Packet4d)
-template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) {
return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cf,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cf, 4>& kernel) {
__m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
__m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
__m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
@@ -347,23 +415,24 @@
kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cd,2>& kernel) {
- __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4));
- kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4));
- kernel.packet[0].v = tmp;
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
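+  // permute2f128 with imm 0x20 (0 + (2 << 4)) concatenates the two low 128-bit halves, and imm 0x31
+  // (1 + (3 << 4)) the two high halves, i.e. the off-diagonal complex entries are exchanged.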
+ __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0 + (2 << 4));
+ kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1 + (3 << 4));
+ kernel.packet[0].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
return psqrt_complex<Packet2cd>(a);
}
-template<> EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
return psqrt_complex<Packet4cf>(a);
}
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX_AVX_H
+#endif // EIGEN_COMPLEX_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index 6e83cfc..b125d59 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -28,20 +28,19 @@
// iteration for square root. In particular, Skylake and Zen2 processors
// have approximately doubled throughput of the _mm_sqrt_ps instruction
// compared to their predecessors.
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f psqrt<Packet8f>(const Packet8f& _x) {
return _mm256_sqrt_ps(_x);
}
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d psqrt<Packet4d>(const Packet4d& _x) {
return _mm256_sqrt_pd(_x);
}
-
// Even on Skylake, using Newton iteration is a win for reciprocal square root.
#if EIGEN_FAST_MATH
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet8f prsqrt<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f prsqrt<Packet8f>(const Packet8f& a) {
// _mm256_rsqrt_ps returns -inf for negative denormals.
// _mm512_rsqrt**_ps returns -NaN for negative denormals. We may want
// consistency here.
@@ -51,7 +50,8 @@
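+  // generic_rsqrt_newton_step is assumed to apply one Newton-Raphson refinement,
+  // y = y * (1.5 - 0.5 * a * y * y), to the ~12-bit _mm256_rsqrt_ps estimate.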
return generic_rsqrt_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rsqrt_ps(a));
}
-template<> EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
return generic_reciprocal_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rcp_ps(a));
}
@@ -106,7 +106,6 @@
F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
-
} // end namespace internal
} // end namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6f37ba0..d752f06 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -31,7 +31,7 @@
#endif
#endif
-typedef __m256 Packet8f;
+typedef __m256 Packet8f;
typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
typedef __m256d Packet4d;
#ifndef EIGEN_VECTORIZE_AVX512FP16
@@ -46,31 +46,58 @@
typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
#endif
-template<> struct is_arithmetic<__m256> { enum { value = true }; };
-template<> struct is_arithmetic<__m256i> { enum { value = true }; };
-template<> struct is_arithmetic<__m256d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet8i> { enum { value = true }; };
+template <>
+struct is_arithmetic<__m256> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256i> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256d> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8i> {
+ enum { value = true };
+};
// Note that `Packet8ui` uses the underlying type `__m256i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet8ui> { enum { value = false }; };
+template <>
+struct is_arithmetic<Packet8ui> {
+ enum { value = false };
+};
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet8h> {
+ enum { value = true };
+};
#endif
-template<> struct is_arithmetic<Packet8bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet8bf> {
+ enum { value = true };
+};
#ifdef EIGEN_VECTORIZE_AVX2
-template<> struct is_arithmetic<Packet4l> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4l> {
+ enum { value = true };
+};
// Note that `Packet4ul` uses the underlying type `__m256i`, which is
// interpreted as a vector of _signed_ `int64`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet4ul> { enum { value = false }; };
+template <>
+struct is_arithmetic<Packet4ul> {
+ enum { value = false };
+};
#endif
// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
// to leverage AVX512 instructions.
#ifndef EIGEN_VECTORIZE_AVX512
-template<> struct packet_traits<float> : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
typedef Packet8f type;
typedef Packet4f half;
enum {
@@ -78,7 +105,7 @@
AlignedOnScalar = 1,
size = 8,
- HasCmp = 1,
+ HasCmp = 1,
HasDiv = 1,
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
@@ -104,19 +131,19 @@
HasRint = 1
};
};
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef Packet4d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=4,
+ size = 4,
- HasCmp = 1,
- HasDiv = 1,
- HasLog = 1,
- HasExp = 1,
+ HasCmp = 1,
+ HasDiv = 1,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasATan = 1,
@@ -138,35 +165,35 @@
AlignedOnScalar = 1,
size = 8,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 0,
- HasLog = 1,
- HasLog1p = 1,
- HasExpm1 = 1,
- HasExp = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH,
- HasBlend = 0,
- HasRound = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1,
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
+ HasExp = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1,
HasBessel = 1,
- HasNdtri = 1
+ HasNdtri = 1
};
};
@@ -189,15 +216,15 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 0,
HasLog = 1,
- HasLog1p = 1,
- HasExpm1 = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
@@ -209,24 +236,18 @@
HasCeil = 1,
HasRint = 1,
HasBessel = 1,
- HasNdtri = 1
+ HasNdtri = 1
};
};
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
typedef Packet8i type;
typedef Packet4i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- HasCmp = 1,
- HasDiv = 1,
- size=8
- };
+ enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 8 };
};
-template<> struct packet_traits<uint32_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
typedef Packet8ui type;
typedef Packet4ui half;
enum {
@@ -246,21 +267,16 @@
};
#ifdef EIGEN_VECTORIZE_AVX2
-template<> struct packet_traits<int64_t> : default_packet_traits
-{
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
typedef Packet4l type;
// There is no half-size packet for current Packet4l.
// TODO: support an SSE path.
typedef Packet4l half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- HasCmp = 1,
- size=4
- };
+ enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
};
-template<> struct packet_traits<uint64_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
typedef Packet4ul type;
// There is no half-size packet for current Packet4ul.
// TODO: support an SSE path.
@@ -285,51 +301,106 @@
#endif
-template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
-template<> struct scalar_div_cost<double,true> { enum { value = 16 }; };
+template <>
+struct scalar_div_cost<float, true> {
+ enum { value = 14 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+ enum { value = 16 };
+};
-template<> struct unpacket_traits<Packet8f> {
- typedef float type;
- typedef Packet4f half;
- typedef Packet8i integer_packet;
- typedef uint8_t mask_t;
- enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true
+template <>
+struct unpacket_traits<Packet8f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet8i integer_packet;
+ typedef uint8_t mask_t;
+ enum {
+ size = 8,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = true,
+ masked_store_available = true
#ifdef EIGEN_VECTORIZE_AVX512
- , masked_fpops_available=true
+ ,
+ masked_fpops_available = true
#endif
};
};
-template<> struct unpacket_traits<Packet4d> {
+template <>
+struct unpacket_traits<Packet4d> {
typedef double type;
typedef Packet2d half;
- enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 4,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet8i> {
- typedef int type;
+template <>
+struct unpacket_traits<Packet8i> {
+ typedef int type;
typedef Packet4i half;
- enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 8,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet8ui> {
+template <>
+struct unpacket_traits<Packet8ui> {
typedef uint32_t type;
typedef Packet4ui half;
- enum {size = 8, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
+ enum {
+ size = 8,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
#ifdef EIGEN_VECTORIZE_AVX2
-template<> struct unpacket_traits<Packet4l> {
- typedef int64_t type;
+template <>
+struct unpacket_traits<Packet4l> {
+ typedef int64_t type;
typedef Packet4l half;
- enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 4,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet4ul> {
+template <>
+struct unpacket_traits<Packet4ul> {
typedef uint64_t type;
typedef Packet4ul half;
- enum {size = 4, alignment = Aligned32, vectorizable = true, masked_load_available = false, masked_store_available = false};
+ enum {
+ size = 4,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
#endif
-template<> struct unpacket_traits<Packet8bf> {
+template <>
+struct unpacket_traits<Packet8bf> {
typedef bfloat16 type;
typedef Packet8bf half;
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
// Helper function for bit packing snippet of low precision comparison.
@@ -380,7 +451,7 @@
EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
return _mm256_add_epi64(a, b);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
}
@@ -468,31 +539,33 @@
}
#ifdef EIGEN_VECTORIZE_AVX512FP16
template <int N>
-EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) { return _mm256_srai_epi64(a, N); }
+EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) {
+ return _mm256_srai_epi64(a, N);
+}
#else
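+// Without the AVX-512 intrinsic there is no 64-bit arithmetic right shift, so it is emulated below with
+// 32-bit arithmetic shifts (for sign propagation) and 64-bit logical shifts, dispatched on N.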
template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
return a;
}
template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
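+  // srai_epi32 produces the correctly sign-extended upper dwords, srli_epi64 the correct lower dwords;
+  // the blend mask 0b01010101 picks even (lower) dwords from lo_word and odd (upper) dwords from hi_word.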
__m256i hi_word = _mm256_srai_epi32(a, N);
__m256i lo_word = _mm256_srli_epi64(a, N);
return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
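+  // For N >= 32 the upper dword is pure sign (srai by 31) and the lower dword is the original upper
+  // dword shifted by N - 32, moved into place by the shuffle.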
__m256i hi_word = _mm256_srai_epi32(a, 31);
__m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
}
template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
}
template <int N>
-EIGEN_STRONG_INLINE std::enable_if_t< (N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
- return parithmetic_shift_right<int(N&63)>(a);
+EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+ return parithmetic_shift_right<int(N & 63)>(a);
}
#endif
template <>
@@ -523,7 +596,7 @@
const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
}
-template<>
+template <>
EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
@@ -577,7 +650,7 @@
Packet4ul pa = pset1<Packet4ul>(a);
pstore(to, pa);
}
-template<>
+template <>
EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
return _mm_cvtsi128_si64(_mm256_castsi256_si128(a));
}
@@ -667,51 +740,102 @@
}
#endif
-template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) { return _mm256_set1_epi32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+ return _mm256_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) {
+ return _mm256_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) {
+ return _mm256_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) {
+ return _mm256_set1_epi32(from);
+}
-template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
-template<> EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) { return _mm256_castsi256_pd(_mm256_set1_epi64x(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) {
+ return _mm256_castsi256_ps(pset1<Packet8i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) {
+ return _mm256_castsi256_pd(_mm256_set1_epi64x(from));
+}
-template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
-template<> EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) { return _mm256_setzero_si256(); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) {
+ return _mm256_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) {
+ return _mm256_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) {
+ return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) {
+ return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) {
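+  // _mm256_set_epi32 lists elements from index 7 down to 0, so the -1 (all-ones) words land in the
+  // even-indexed elements.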
+ return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) {
+ return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) {
+ return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) {
+ return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
+}
-template<> EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) { return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)); }
-template<> EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) { return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) { return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) {
+ return _mm256_broadcast_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) {
+ return _mm256_broadcast_sd(from);
+}
-template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
-
-template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_add_ps(a, b);
+}
#ifdef EIGEN_VECTORIZE_AVX512
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
__mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
- return _mm512_castps512_ps256(_mm512_maskz_add_ps(
- mask,
- _mm512_castps256_ps512(a),
- _mm512_castps256_ps512(b)));
+ return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask, _mm512_castps256_ps512(a), _mm512_castps256_ps512(b)));
}
#endif
-template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_add_epi32(a,b);
+ return _mm256_add_epi32(a, b);
#else
__m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
__m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_add_epi32(a, b);
#else
@@ -721,24 +845,43 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); }
-template<> EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) { return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7,6,5,4,3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) { return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) {
+ return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) {
+ return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) {
+ return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) {
+ return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
-template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_sub_epi32(a,b);
+ return _mm256_sub_epi32(a, b);
#else
__m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
__m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_sub_epi32(a, b);
#else
@@ -748,38 +891,54 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
return _mm256_xor_ps(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
return _mm256_xor_pd(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) {
return psub(pzero(a), a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_mullo_epi32(a,b);
+ return _mm256_mullo_epi32(a, b);
#else
const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_mullo_epi32(a, b);
#else
@@ -789,11 +948,17 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_div_pd(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX512
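+  // Every int32 is exactly representable as a double (53-bit mantissa), so dividing in double and
+  // truncating reproduces exact integer division.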
return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
#else
@@ -845,20 +1010,48 @@
#endif
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) { return _mm256_cmp_ps(a,a,_CMP_UNORD_Q); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+ return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+ return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+ return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+ return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) {
+ return _mm256_cmp_ps(a, a, _CMP_UNORD_Q);
+}
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) {
+ return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) {
+ return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) {
+ return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) {
+ return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
+}
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_xor_si256(_mm256_cmpgt_epi32(a,b), _mm256_set1_epi32(-1));
+ return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
#else
__m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
@@ -867,25 +1060,28 @@
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_cmpgt_epi32(b,a);
+ return _mm256_cmpgt_epi32(b, a);
#else
__m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
__m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_cmpeq_epi32(a,b);
+ return _mm256_cmpeq_epi32(a, b);
#else
__m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
__m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_cmpeq_epi32(a, b);
#else
@@ -895,32 +1091,35 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may flip
// the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
Packet8f res;
- asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
return res;
#else
// Arguments are swapped to match NaN propagation behavior of std::min.
- return _mm256_min_ps(b,a);
+ return _mm256_min_ps(b, a);
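+  // (min_ps writes its second operand to the result when the inputs are unordered, so pmin(a, NaN)
+  // yields a, just like std::min.)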
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// See pmin above
Packet4d res;
- asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
return res;
#else
// Arguments are swapped to match NaN propagation behavior of std::min.
- return _mm256_min_pd(b,a);
+ return _mm256_min_pd(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_min_epi32(a, b);
#else
@@ -929,7 +1128,8 @@
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_min_epu32(a, b);
#else
@@ -939,29 +1139,32 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// See pmin above
Packet8f res;
- asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
return res;
#else
// Arguments are swapped to match NaN propagation behavior of std::max.
- return _mm256_max_ps(b,a);
+ return _mm256_max_ps(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// See pmin above
Packet4d res;
- asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
return res;
#else
// Arguments are swapped to match NaN propagation behavior of std::max.
- return _mm256_max_pd(b,a);
+ return _mm256_max_pd(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_max_epi32(a, b);
#else
@@ -970,7 +1173,8 @@
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_max_epu32(a, b);
#else
@@ -981,129 +1185,174 @@
}
#ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
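+  // _mm256_sign_epi32(1, a) maps each lane of a to 1, 0 or -1 according to its sign.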
return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
}
#endif
// Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
return pminmax_propagate_nan(a, b, pmin<Packet8f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
return pminmax_propagate_nan(a, b, pmin<Packet4d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
return pminmax_propagate_nan(a, b, pmax<Packet8f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
return pminmax_propagate_nan(a, b, pmax<Packet4d>);
}
-template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) {
+ return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) {
+ return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
-template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) {
+ return _mm256_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) {
+ return _mm256_ceil_pd(a);
+}
-template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) {
+ return _mm256_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) {
+ return _mm256_floor_pd(a);
+}
-
-template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
// vpcmpeqd has lower latency than the more general vcmpps
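+  // Comparing a register with itself for equality sets every integer lane to all ones.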
- return _mm256_cmpeq_epi32(a,a);
+ return _mm256_cmpeq_epi32(a, a);
#else
const __m256 b = _mm256_castsi256_ps(a);
- return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
+ return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
#ifdef EIGEN_VECTORIZE_AVX2
// vpcmpeqd has lower latency than the more general vcmpps
const __m256i b = _mm256_castps_si256(a);
- return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
+ return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
#else
- return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
+ return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
#ifdef EIGEN_VECTORIZE_AVX2
// vpcmpeqq has lower latency than the more general vcmppd
const __m256i b = _mm256_castpd_si256(a);
- return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
+ return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
#else
- return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
+ return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_and_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_and_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_and_si256(a,b);
+ return _mm256_and_si256(a, b);
#else
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_and_si256(a,b);
+ return _mm256_and_si256(a, b);
#else
- return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+ return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_or_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_or_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_or_si256(a,b);
+ return _mm256_or_si256(a, b);
#else
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_or_si256(a,b);
+ return _mm256_or_si256(a, b);
#else
- return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+ return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_xor_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_xor_si256(a,b);
+ return _mm256_xor_si256(a, b);
#else
- return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_xor_si256(a, b);
#else
@@ -1111,54 +1360,75 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) {
+ return _mm256_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) {
+ return _mm256_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_andnot_si256(b,a);
+ return _mm256_andnot_si256(b, a);
#else
- return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+ return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
#ifdef EIGEN_VECTORIZE_AVX2
- return _mm256_andnot_si256(b,a);
+ return _mm256_andnot_si256(b, a);
#else
- return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
+ return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
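+  // There is no packed unsigned compare, so use a < b  <=>  !(a == max(a, b)):
+  // compute the equality and invert it by XOR-ing with all ones.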
return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
}
-template<> EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
return pcmp_eq(a, pmin(a, b));
}
-template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) {
const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
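+  // Round half away from zero: copy the sign of a onto the largest float below
+  // 0.5 (0x3EFFFFFF ~ 0.49999997f), add it to a, then truncate toward zero.
+  // E.g. 2.5f + 0.49999997f rounds to 3.0f, so pround(2.5f) == 3.0f.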
return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
-template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) {
const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
-template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
-{ return _mm256_blendv_ps(b,a,mask); }
-template<> EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b)
-{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
-template<> EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b)
-{ return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask))); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+ return _mm256_blendv_ps(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b) {
+ return _mm256_castps_si256(
+ _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b) {
+ return _mm256_castps_si256(
+ _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
-template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
-{ return _mm256_blendv_pd(b,a,mask); }
+template <>
+EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b) {
+ return _mm256_blendv_pd(b, a, mask);
+}
-template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_srai_epi32(a, N);
#else
@@ -1168,7 +1438,8 @@
#endif
}
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_srli_epi32(a, N);
#else
@@ -1178,7 +1449,8 @@
#endif
}
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_slli_epi32(a, N);
#else
@@ -1188,33 +1460,62 @@
#endif
}
-template<int N> EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
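+  // For an unsigned type the arithmetic right shift coincides with the logical one.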
return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
}
-template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
}
-template<int N> EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
__mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
#else
Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
- const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+ const Packet8i bit_mask =
+ _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
mask = por<Packet8i>(mask, bit_mask);
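+  // set1_epi8 replicated umask into every byte, and bit_mask cleared exactly bit i
+  // in lane i; after the OR, lane i is all ones except that bit i holds bit i of
+  // umask, so the compare below yields all-ones in lane i iff that bit is set.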
mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
@@ -1222,41 +1523,44 @@
}
// Loads 4 floats from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
-template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
// TODO try to find a way to avoid the need for a temporary register
// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from));
-// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
-// return _mm256_unpacklo_ps(tmp,tmp);
+ // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+ // return _mm256_unpacklo_ps(tmp,tmp);
// _mm256_insertf128_ps is very slow on Haswell, thus:
Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "in-place" permutation of the lower 128 bits using a blend
- tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+ tmp = _mm256_blend_ps(
+ tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
// then we can perform a consistent permutation on the global register to get everything in shape:
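+  // tmp is now {a0,a1,a0,a1 | a0,a1,a2,a3}; picking elements (2,2,3,3) within
+  // each 128-bit lane yields {a0,a0,a1,a1 | a2,a2,a3,a3}.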
- return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2));
+ return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
}
// Loads 2 doubles from memory and returns the packet {a0, a0, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) {
Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
- return _mm256_permute_pd(tmp, 3<<2);
+ return _mm256_permute_pd(tmp, 3 << 2);
}
// Loads 4 integers from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
-template<> EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from) {
#ifdef EIGEN_VECTORIZE_AVX2
const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
#else
__m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "in-place" permutation of the lower 128 bits using a blend
- tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15);
+ tmp = _mm256_blend_ps(
+ tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
// then we can perform a consistent permutation on the global register to get everything in shape:
- return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)));
+ return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
#ifdef EIGEN_VECTORIZE_AVX2
const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
@@ -1272,43 +1576,72 @@
}
// Loads 2 floats from memory and returns the packet {a0, a0, a0, a0, a1, a1, a1, a1}
-template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
- return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1);
+ return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
}
-template<> EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from)
-{
- return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from+1)), 1);
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from) {
+ return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
}
-template<> EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from); }
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
#ifdef EIGEN_VECTORIZE_AVX512
__mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
#else
Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
- const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
+ const Packet8i bit_mask =
+ _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
mask = por<Packet8i>(mask, bit_mask);
mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
#if EIGEN_COMP_MSVC
// MSVC sometimes seems to use a bogus mask with maskstore.
const __m256i ifrom = _mm256_castps_si256(from);
- EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast<char*>(to));
- EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast<char*>(to + 4));
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0),
+ reinterpret_cast<char*>(to));
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1),
+ reinterpret_cast<char*>(to + 4));
#else
EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
#endif
@@ -1316,111 +1649,129 @@
}
// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
-// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
-template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
-{
- return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
- from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride),
+// 4);
+template <>
+EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
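+  // Reads from[0], from[stride], ..., from[7 * stride] and packs them in order;
+  // e.g. stride == 2 gathers every other float.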
+ return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+ from[2 * stride], from[1 * stride], from[0 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride)
-{
- return _mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
+ return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride)
-{
- return _mm256_set_epi32(from[7*stride], from[6*stride], from[5*stride], from[4*stride],
- from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
+ return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+ from[2 * stride], from[1 * stride], from[0 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
+template <>
+EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
__m128 low = _mm256_extractf128_ps(from, 0);
- to[stride*0] = _mm_cvtss_f32(low);
- to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
- to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
- to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+ to[stride * 0] = _mm_cvtss_f32(low);
+ to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+ to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+ to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
__m128 high = _mm256_extractf128_ps(from, 1);
- to[stride*4] = _mm_cvtss_f32(high);
- to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
- to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
- to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+ to[stride * 4] = _mm_cvtss_f32(high);
+ to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+ to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+ to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
__m128d low = _mm256_extractf128_pd(from, 0);
- to[stride*0] = _mm_cvtsd_f64(low);
- to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+ to[stride * 0] = _mm_cvtsd_f64(low);
+ to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
__m128d high = _mm256_extractf128_pd(from, 1);
- to[stride*2] = _mm_cvtsd_f64(high);
- to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+ to[stride * 2] = _mm_cvtsd_f64(high);
+ to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
__m128i low = _mm256_extractf128_si256(from, 0);
- to[stride*0] = _mm_extract_epi32(low, 0);
- to[stride*1] = _mm_extract_epi32(low, 1);
- to[stride*2] = _mm_extract_epi32(low, 2);
- to[stride*3] = _mm_extract_epi32(low, 3);
+ to[stride * 0] = _mm_extract_epi32(low, 0);
+ to[stride * 1] = _mm_extract_epi32(low, 1);
+ to[stride * 2] = _mm_extract_epi32(low, 2);
+ to[stride * 3] = _mm_extract_epi32(low, 3);
__m128i high = _mm256_extractf128_si256(from, 1);
- to[stride*4] = _mm_extract_epi32(high, 0);
- to[stride*5] = _mm_extract_epi32(high, 1);
- to[stride*6] = _mm_extract_epi32(high, 2);
- to[stride*7] = _mm_extract_epi32(high, 3);
+ to[stride * 4] = _mm_extract_epi32(high, 0);
+ to[stride * 5] = _mm_extract_epi32(high, 1);
+ to[stride * 6] = _mm_extract_epi32(high, 2);
+ to[stride * 7] = _mm_extract_epi32(high, 3);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
}
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) {
Packet8f pa = pset1<Packet8f>(a);
pstore(to, pa);
}
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) {
Packet4d pa = pset1<Packet4d>(a);
pstore(to, pa);
}
-template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) {
Packet8i pa = pset1<Packet8i>(a);
pstore(to, pa);
}
#ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
#endif
-template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
return _mm_cvtss_f32(_mm256_castps256_ps128(a));
}
-template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
}
-template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
}
-
-template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a)
-{
- __m256 tmp = _mm256_shuffle_ps(a,a,0x1b);
+template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
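+  // 0x1b reverses the four floats within each 128-bit lane; the permute then
+  // swaps the two lanes.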
+ __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
return _mm256_permute2f128_ps(tmp, tmp, 1);
}
-template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a)
-{
- __m256d tmp = _mm256_shuffle_pd(a,a,5);
+template <>
+EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
+ __m256d tmp = _mm256_shuffle_pd(a, a, 5);
return _mm256_permute2f128_pd(tmp, tmp, 1);
#if 0
// This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
@@ -1429,37 +1780,41 @@
return _mm256_permute_pd(swap_halves,5);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
}
#ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a)
- {
+template <>
+EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
}
#endif
// pabs should be ok
-template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a)
-{
- const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
- return _mm256_and_ps(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+ const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
+ 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+ return _mm256_and_ps(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
-{
- const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
- return _mm256_and_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
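+  // Little-endian: each double is the word pair {low, high}, so the pattern
+  // {0xFFFFFFFF, 0x7FFFFFFF} clears only the sign bit of every lane.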
+ const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
+ 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+ return _mm256_and_pd(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) {
#ifdef EIGEN_VECTORIZE_AVX2
return _mm256_abs_epi32(a);
#else
@@ -1468,26 +1823,47 @@
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) { return _mm_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return _mm_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) { return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a))); }
-template<> EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
+ return _mm_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
+ return _mm_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
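+  // Arithmetic shift by 31 smears the sign bit across each 32-bit lane: all ones
+  // for negative values (including -0.0f), zero otherwise.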
+ return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
+ return pzero(a);
+}
#ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) { return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a))); }
-template<> EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
+ return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
+ return pzero(a);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
- return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+ return pfrexp_generic(a, exponent);
}
// Extract the biased exponent without relying on the existence of Packet4l.
-template<>
-EIGEN_STRONG_INLINE
-Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
- const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+ const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
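+  // 0x7ff0000000000000 selects the 11 exponent bits (bits 52..62) of each double.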
__m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
#ifdef EIGEN_VECTORIZE_AVX2
a_expo = _mm256_srli_epi64(a_expo, 52);
@@ -1506,16 +1882,18 @@
return exponent;
}
-
-template<> EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
return pfrexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
return pldexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
// Clamp exponent to [-2099, 2099]
const Packet4d max_exponent = pset1<Packet4d>(2099.0);
const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
@@ -1537,74 +1915,76 @@
lo = _mm_slli_epi64(hi, 52);
hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
- out = pmul(out, c); // a * 2^e
+ out = pmul(out, c); // a * 2^e
return out;
}
-template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
-{
- return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1))));
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+ return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1))));
}
-template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
-{
- return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
+template <>
+EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) {
+ return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a), _mm256_extractf128_pd(a, 1))));
}
-template<> EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a)
-{
- return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1))));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet8i>(const Packet8i& a) {
+ return predux(Packet4i(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet8ui>(const Packet8ui& a) {
return predux(Packet4ui(_mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1))));
}
-template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
-{
- return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
+template <>
+EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
+ return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
}
-template<> EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a)
-{
- return _mm_add_epi32(_mm256_castsi256_si128(a),_mm256_extractf128_si256(a,1));
+template <>
+EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
+ return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
-template<> EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
}
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) {
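+  // Reduce by halving: multiply the two 128-bit lanes together, then fold pairs
+  // within the remaining lane.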
Packet8f tmp;
- tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+ tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a, a, 1));
+ tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+ return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) {
Packet4d tmp;
- tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1));
- return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1)));
+ tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a, a, 1));
+ return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) {
+ Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a, a, 1));
+ tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+ return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) {
+ Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a)
-{
- Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1));
- tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2)));
- return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1)));
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) {
+ Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a, a, 1));
+ tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, _MM_SHUFFLE(1, 0, 3, 2)));
+ return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
-{
- Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) {
+ Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a, a, 1));
return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
}
@@ -1614,22 +1994,21 @@
// return _mm256_movemask_ps(x)==0xFF;
// }
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) {
return _mm256_movemask_ps(x) != 0;
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8i& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& x) {
return _mm256_movemask_ps(_mm256_castsi256_ps(x)) != 0;
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
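+  // 8x8 transpose in three stages: interleave row pairs (unpacklo/hi), gather
+  // 4-element groups (shuffle), then exchange 128-bit halves (permute2f128).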
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
@@ -1638,14 +2017,14 @@
__m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
__m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
__m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
- __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
- __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0));
- __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2));
- __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0));
- __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2));
+ __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
@@ -1656,17 +2035,16 @@
kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
__m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
__m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
__m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
__m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
- __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2));
+ __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
@@ -1687,9 +2065,7 @@
#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
#endif
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8i,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 8>& kernel) {
__m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
__m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
__m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
@@ -1698,14 +2074,14 @@
__m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
__m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
__m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
- __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
- __m256i S4 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(1,0,1,0));
- __m256i S5 = MM256_SHUFFLE_EPI32(T4,T6,_MM_SHUFFLE(3,2,3,2));
- __m256i S6 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(1,0,1,0));
- __m256i S7 = MM256_SHUFFLE_EPI32(T5,T7,_MM_SHUFFLE(3,2,3,2));
+ __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
@@ -1719,17 +2095,16 @@
ptranspose((PacketBlock<Packet8i, 8>&)kernel);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 4>& kernel) {
__m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
__m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
__m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
__m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
- __m256i S0 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(1,0,1,0));
- __m256i S1 = MM256_SHUFFLE_EPI32(T0,T2,_MM_SHUFFLE(3,2,3,2));
- __m256i S2 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(1,0,1,0));
- __m256i S3 = MM256_SHUFFLE_EPI32(T1,T3,_MM_SHUFFLE(3,2,3,2));
+ __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+ __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+ __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
@@ -1740,8 +2115,7 @@
ptranspose((PacketBlock<Packet8i, 4>&)kernel);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4d,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
__m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
__m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
__m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
@@ -1753,24 +2127,32 @@
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
-template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
+ const Packet8f& elsePacket) {
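+  // Lanes where ifPacket.select[i] is nonzero take thenPacket; the rest take elsePacket.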
#ifdef EIGEN_VECTORIZE_AVX2
const __m256i zero = _mm256_setzero_si256();
- const __m256i select = _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ const __m256i select =
+ _mm256_set_epi32(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
+ ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256i false_mask = _mm256_cmpeq_epi32(zero, select);
return _mm256_blendv_ps(thenPacket, elsePacket, _mm256_castsi256_ps(false_mask));
#else
const __m256 zero = _mm256_setzero_ps();
- const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4],
+ ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
#endif
}
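// Reference semantics of pblend above (a scalar sketch, not Eigen code;
// Selector entries treated as booleans): lane i yields thenPacket[i] where
// the selector entry is non-zero and elsePacket[i] otherwise. Both paths just
// materialize that choice as a comparison mask for _mm256_blendv_ps (and
// _mm256_blendv_pd in the double variant below).
#include <cstddef>
inline void pblend_ref(const bool* select, const float* then_v, const float* else_v, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) out[i] = select[i] ? then_v[i] : else_v[i];
}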
-template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
+ const Packet4d& elsePacket) {
#ifdef EIGEN_VECTORIZE_AVX2
const __m256i zero = _mm256_setzero_si256();
- const __m256i select = _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
+ const __m256i select =
+ _mm256_set_epi64x(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256i false_mask = _mm256_cmpeq_epi64(select, zero);
return _mm256_blendv_pd(thenPacket, elsePacket, _mm256_castsi256_pd(false_mask));
#else
@@ -1783,35 +2165,52 @@
// Packet math for Eigen::half
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
+template <>
+struct unpacket_traits<Packet8h> {
+ typedef Eigen::half type;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet8h half;
+};
#endif
-template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
_mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
-template<> EIGEN_STRONG_INLINE Packet8h
-ploaddup<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
@@ -1819,14 +2218,15 @@
return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
-template<> EIGEN_STRONG_INLINE Packet8h
-ploadquad<Packet8h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
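// Lane patterns produced by the two loads above (a reference sketch, not
// Eigen code; lowest lane first, matching the _mm_set_epi16 argument order):
//   ploaddup:  {a,b,c,d} -> {a,a,b,b,c,c,d,d}
//   ploadquad: {a,b}     -> {a,a,a,a,b,b,b,b}
#include <cstdint>
inline void ploaddup_ref(const std::uint16_t* from, std::uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = from[i / 2];
}
inline void ploadquad_ref(const std::uint16_t* from, std::uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = from[i / 4];
}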
-template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
return _mm_cmpeq_epi32(a, a);
}
@@ -1840,8 +2240,8 @@
#ifdef EIGEN_HAS_FP16_C
return _mm256_cvtph_ps(a);
#else
- Eigen::internal::Packet8f pp = _mm256_castsi256_ps(_mm256_insertf128_si256(
- _mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
+ Eigen::internal::Packet8f pp = _mm256_castsi256_ps(
+ _mm256_insertf128_si256(_mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
return pp;
#endif
}
@@ -1852,19 +2252,17 @@
#else
__m128i lo = float2half(_mm256_extractf128_ps(a, 0));
__m128i hi = float2half(_mm256_extractf128_ps(a, 1));
- return _mm_packus_epi32(lo, hi);
+ return _mm_packus_epi32(lo, hi);
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a,
- const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a,
- const Packet8h& b) {
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
}
@@ -1873,87 +2271,108 @@
return float2half(plset<Packet8f>(static_cast<float>(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
// in some cases Packet4i is a wrapper around __m128i, so we either need to
// cast to Packet4i to directly call the intrinsics as below:
- return _mm_or_si128(a,b);
+ return _mm_or_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
- return _mm_xor_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+ return _mm_xor_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
- return _mm_and_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+ return _mm_and_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
- return _mm_andnot_si128(b,a);
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+ return _mm_andnot_si128(b, a);
}
-template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
return _mm_blendv_epi8(b, a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
return float2half(pround<Packet8f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
return float2half(print<Packet8f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
return float2half(pceil<Packet8f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
return float2half(pfloor<Packet8f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_eq(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_le(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a,const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
return Pack16To8(pcmp_lt_or_nan(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm_xor_si128(a, sign_mask);
}
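// pnegate above only flips the IEEE sign bit, which negates every half value
// including +/-0 and the infinities; a bit-level sketch (illustration only):
#include <cstdint>
inline std::uint16_t pnegate_half_ref(std::uint16_t bits) { return bits ^ 0x8000u; }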
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = padd(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = psub(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = pmul(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
Packet8f af = half2float(a);
Packet8f bf = half2float(b);
Packet8f rf = pdiv(af, bf);
@@ -1961,68 +2380,70 @@
}
#endif
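// The four Packet8h ops above share one widen/compute/narrow pattern: convert
// to Packet8f, run the float32 op, round back to half once at the end. A
// scalar sketch of the same idea (illustration only); float32 is wide enough
// that this single final rounding should match a correctly rounded half op
// for add/sub/mul/div:
inline Eigen::half padd_half_ref(const Eigen::half& a, const Eigen::half& b) {
  return Eigen::half(static_cast<float>(a) + static_cast<float>(b));
}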
-template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
-{
- const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
- const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
- const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
- const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
- const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
- const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
- const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
- const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+template <>
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+ const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+ const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+ const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+ const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+ const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+ const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+ const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+ const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
EIGEN_ALIGN32 Eigen::half aux[8];
pstore(aux, from);
- to[stride*0] = aux[0];
- to[stride*1] = aux[1];
- to[stride*2] = aux[2];
- to[stride*3] = aux[3];
- to[stride*4] = aux[4];
- to[stride*5] = aux[5];
- to[stride*6] = aux[6];
- to[stride*7] = aux[7];
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
}
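// Reference semantics of the strided accesses above (a sketch, not Eigen
// code): pgather packs every stride-th element into consecutive lanes and
// pscatter writes lanes back out, so a gather followed by a scatter with the
// same stride is the identity.
template <typename T>
inline void pgather_ref(const T* from, T* lanes, long stride, int n) {
  for (int i = 0; i < n; ++i) lanes[i] = from[i * stride];
}
template <typename T>
inline void pscatter_ref(T* to, const T* lanes, long stride, int n) {
  for (int i = 0; i < n; ++i) to[i * stride] = lanes[i];
}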
-
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux<Packet8f>(af);
return Eigen::half(reduced);
}
#endif
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_max<Packet8f>(af);
return Eigen::half(reduced);
}
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_min<Packet8f>(af);
return Eigen::half(reduced);
}
-template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
Packet8f af = half2float(a);
float reduced = predux_mul<Packet8f>(af);
return Eigen::half(reduced);
}
-template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
-{
- __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
- return _mm_shuffle_epi8(a,m);
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ return _mm_shuffle_epi8(a, m);
}
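// The _mm_shuffle_epi8 mask above moves byte pair (14,15) into lanes 0-1 and
// so on, i.e. it reverses the eight 16-bit lanes (a reference sketch, not
// Eigen code):
#include <cstdint>
inline void preverse_ref(const std::uint16_t in[8], std::uint16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = in[7 - i];
}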
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
__m128i a = kernel.packet[0];
__m128i b = kernel.packet[1];
__m128i c = kernel.packet[2];
@@ -2069,8 +2490,7 @@
kernel.packet[7] = a7b7c7d7e7f7g7h7;
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
EIGEN_ALIGN32 Eigen::half in[4][8];
pstore<Eigen::half>(in[0], kernel.packet[0]);
pstore<Eigen::half>(in[1], kernel.packet[1]);
@@ -2081,10 +2501,10 @@
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
- out[i][j] = in[j][2*i];
+ out[i][j] = in[j][2 * i];
}
for (int j = 0; j < 4; ++j) {
- out[i][j+4] = in[j][2*i+1];
+ out[i][j + 4] = in[j][2 * i + 1];
}
}
@@ -2111,7 +2531,6 @@
// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
-
__m256i input = _mm256_castps_si256(a);
#ifdef EIGEN_VECTORIZE_AVX2
@@ -2130,8 +2549,7 @@
__m256i nan = _mm256_set1_epi32(0x7fc0);
t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
// output = numext::bit_cast<uint16_t>(input);
- return _mm_packus_epi32(_mm256_extractf128_si256(t, 0),
- _mm256_extractf128_si256(t, 1));
+ return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
#else
// uint32_t lsb = (input >> 16);
__m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
@@ -2158,32 +2576,38 @@
#endif
}
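// Scalar sketch of the rounding performed above (illustration only): add
// 0x7FFF plus the lowest kept bit so exact ties round to even, truncate to
// the high 16 bits, and map NaN inputs to the quiet-NaN pattern 0x7FC0.
#include <cstdint>
#include <cstring>
#include <cmath>
inline std::uint16_t f32_to_bf16_ref(float f) {
  if (std::isnan(f)) return 0x7FC0;
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  std::uint32_t lsb = (bits >> 16) & 1;  // lowest bit that survives truncation
  bits += 0x7FFFu + lsb;                 // round to nearest, ties to even
  return static_cast<std::uint16_t>(bits >> 16);
}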
-template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
}
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
_mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
}
-template<> EIGEN_STRONG_INLINE Packet8bf
-ploaddup<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
@@ -2191,14 +2615,15 @@
return _mm_set_epi16(d, d, c, c, b, b, a, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf
-ploadquad<Packet8bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
return _mm_set_epi16(b, b, b, b, a, a, a, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
return _mm_cmpeq_epi32(a, a);
}
@@ -2209,14 +2634,12 @@
}
template <>
-EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a,
- const Packet8bf& b) {
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a,
- const Packet8bf& b) {
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
@@ -2225,131 +2648,153 @@
return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a,const Packet8bf& b) {
- return _mm_or_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) {
+ return _mm_or_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a,const Packet8bf& b) {
- return _mm_xor_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) {
+ return _mm_xor_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a,const Packet8bf& b) {
- return _mm_and_si128(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) {
+ return _mm_and_si128(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a,const Packet8bf& b) {
- return _mm_andnot_si128(b,a);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a, const Packet8bf& b) {
+ return _mm_andnot_si128(b, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
return _mm_blendv_epi8(b, a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a,const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
return _mm_xor_si128(a, sign_mask);
}
-template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-
-template<> EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
-{
- const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0*stride]);
- const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1*stride]);
- const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2*stride]);
- const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3*stride]);
- const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4*stride]);
- const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5*stride]);
- const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6*stride]);
- const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7*stride]);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
+ const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+ const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+ const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+ const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+ const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+ const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+ const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+ const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
-template<> EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
EIGEN_ALIGN32 bfloat16 aux[8];
pstore(aux, from);
- to[stride*0] = aux[0];
- to[stride*1] = aux[1];
- to[stride*2] = aux[2];
- to[stride*3] = aux[3];
- to[stride*4] = aux[4];
- to[stride*5] = aux[5];
- to[stride*6] = aux[6];
- to[stride*7] = aux[7];
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_min<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
-{
- __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
- return _mm_shuffle_epi8(a,m);
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ return _mm_shuffle_epi8(a, m);
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
__m128i a = kernel.packet[0];
__m128i b = kernel.packet[1];
__m128i c = kernel.packet[2];
@@ -2387,8 +2832,7 @@
kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
__m128i a = kernel.packet[0];
__m128i b = kernel.packet[1];
__m128i c = kernel.packet[2];
@@ -2405,8 +2849,8 @@
kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
}
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_AVX_H
+#endif // EIGEN_PACKET_MATH_AVX_H
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 49927b8..3688f8d 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -18,28 +18,39 @@
namespace internal {
#ifndef EIGEN_VECTORIZE_AVX512
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
-template<> struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
-template<> struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
-template<> struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
-template<> struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
#endif
template <>
-EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a,
- const Packet8f& b) {
+EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a, const Packet8f& b) {
__m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ);
__m256 nonzero_b = _mm256_cmp_ps(b, pzero(b), _CMP_NEQ_UQ);
constexpr char kFF = '\255';
@@ -54,11 +65,11 @@
__m128i b_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 0), shuffle_mask128_b_lo);
__m128i merged = _mm_or_si128(_mm_or_si128(b_lo, b_hi), _mm_or_si128(a_lo, a_hi));
return _mm_and_si128(merged, _mm_set1_epi8(1));
- #else
- __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF,
- kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
- __m256i b_shuffle_mask = _mm256_set_epi8( 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
- kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
+#else
+ __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF,
+ kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
+ __m256i b_shuffle_mask = _mm256_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
+ kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
__m256i a_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_a), a_shuffle_mask);
__m256i b_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_b), b_shuffle_mask);
__m256i a_or_b = _mm256_or_si256(a_shuff, b_shuff);
@@ -70,124 +81,147 @@
template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet16b, Packet8f>(const Packet16b& a) {
const __m256 cst_one = _mm256_set1_ps(1.0f);
- #ifdef EIGEN_VECTORIZE_AVX2
+#ifdef EIGEN_VECTORIZE_AVX2
__m256i a_extended = _mm256_cvtepi8_epi32(a);
__m256i abcd_efgh = _mm256_cmpeq_epi32(a_extended, _mm256_setzero_si256());
- #else
+#else
__m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
__m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
__m128i aaaa_bbbb_cccc_dddd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
__m128i eeee_ffff_gggg_hhhh = _mm_unpackhi_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
__m256i abcd_efgh = _mm256_setr_m128i(aaaa_bbbb_cccc_dddd, eeee_ffff_gggg_hhhh);
- #endif
+#endif
__m256 result = _mm256_andnot_ps(_mm256_castsi256_ps(abcd_efgh), cst_one);
return result;
}
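// Reference semantics of the two bool casts above (a sketch, not Eigen code):
// float -> bool stores 1 for every lane that is non-zero (NaN included, since
// _CMP_NEQ_UQ is an unordered compare), and bool -> float maps non-zero bytes
// to 1.0f and zero bytes to 0.0f.
inline void f32x16_to_bool_ref(const float* a, const float* b, char out[16]) {
  for (int i = 0; i < 8; ++i) out[i] = (a[i] != 0.0f) ? 1 : 0;  // NaN != 0 is true
  for (int i = 0; i < 8; ++i) out[8 + i] = (b[i] != 0.0f) ? 1 : 0;
}
inline void bool_to_f32x8_ref(const char in[16], float out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = in[i] ? 1.0f : 0.0f;
}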
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
return _mm256_cvttps_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
}
-template <> EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
return _mm256_cvttpd_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
return _mm256_cvtepi32_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
}
-template <> EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
return _mm256_cvtpd_ps(a);
}
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
return _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
}
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
return _mm256_cvtepi32_pd(a);
}
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}
-template <> EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
return _mm256_cvtps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8f>(const Packet8f& a) {
return _mm256_castps_si256(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet8i>(const Packet8i& a) {
return _mm256_castsi256_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
return Packet8ui(a);
}
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
return Packet8i(a);
}
// truncation operations
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
return _mm256_castps256_ps128(a);
}
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
return _mm256_castpd256_pd128(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
return _mm256_castsi256_si128(a);
}
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
return _mm256_castsi256_si128(a);
}
-
#ifdef EIGEN_VECTORIZE_AVX2
-template<> EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
return Packet4ul(a);
}
-template<> EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
return Packet4l(a);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
return half2float(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
return Bf16ToF32(a);
}
-template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
return float2half(a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
return F32ToBf16(a);
}
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_AVX_H
+#endif // EIGEN_TYPE_CASTING_AVX_H
diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h
index c484517..f2c8ce6 100644
--- a/Eigen/src/Core/arch/AVX512/Complex.h
+++ b/Eigen/src/Core/arch/AVX512/Complex.h
@@ -18,15 +18,14 @@
namespace internal {
//---------- float ----------
-struct Packet8cf
-{
+struct Packet8cf {
EIGEN_STRONG_INLINE Packet8cf() {}
EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
- __m512 v;
+ __m512 v;
};
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet8cf type;
typedef Packet4cf half;
enum {
@@ -34,58 +33,80 @@
AlignedOnScalar = 1,
size = 8,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet8cf> {
+template <>
+struct unpacket_traits<Packet8cf> {
typedef std::complex<float> type;
typedef Packet4cf half;
typedef Packet16f as_real;
enum {
size = 8,
- alignment=unpacket_traits<Packet16f>::alignment,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ alignment = unpacket_traits<Packet16f>::alignment,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) {
+ return Packet8cf(ptrue(Packet16f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(_mm512_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(_mm512_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) {
return Packet8cf(pnegate(a.v));
}
-template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) {
const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
- 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
- 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet8cf(pxor(a.v,mask));
+ 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+ 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000));
+ return Packet8cf(pxor(a.v, mask));
}
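// Conjugation above XORs 0x80000000 into every odd (imaginary) float lane;
// the scalar equivalent, i.e. std::conj (illustration only):
#include <complex>
inline std::complex<float> pconj_ref(const std::complex<float>& a) {
  return std::complex<float>(a.real(), -a.imag());
}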
-template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
- __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
+template <>
+EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
}
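// The multiply above is the usual duplicate/swap/fmaddsub scheme for
// interleaved complex data: moveldup broadcasts the real parts of a, movehdup
// its imaginary parts, the permute swaps each (re,im) pair of b, and fmaddsub
// subtracts in the even (real) lanes while adding in the odd (imaginary)
// ones. Scalar reference (a sketch, not Eigen code):
#include <complex>
inline std::complex<float> cmul_ref(const std::complex<float>& a, const std::complex<float>& b) {
  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real());
}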
-template<> EIGEN_STRONG_INLINE Packet8cf pand <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf por <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pxor <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet8cf pand<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf por<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pxor<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+ return Packet8cf(pandnot(a.v, b.v));
+}
template <>
EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
@@ -93,60 +114,71 @@
return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
}
-template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet8cf pload<Packet8cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from)));
+}
-
-template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
const float re = std::real(from);
const float im = std::imag(from);
return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
}
-template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
-{
- return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
+ return Packet8cf(_mm512_castpd_ps(ploaddup<Packet8d>((const double*)(const void*)from)));
}
-template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
-{
- return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
+ return Packet8cf(_mm512_castpd_ps(ploadquad<Packet8d>((const double*)(const void*)from)));
}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
-{
- return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from,
+ Index stride) {
+ return Packet8cf(_mm512_castpd_ps(pgather<double, Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from,
+ Index stride) {
pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a) {
return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
}
-template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
- return Packet8cf(_mm512_castsi512_ps(
- _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
- _mm512_castps_si512(a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+ return Packet8cf(_mm512_castsi512_ps(_mm512_permutexvar_epi64(
+ _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), _mm512_castps_si512(a.v))));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
-{
- return predux(padd(Packet4cf(extract256<0>(a.v)),
- Packet4cf(extract256<1>(a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a) {
+ return predux(padd(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
-{
- return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
- Packet4cf(extract256<1>(a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a) {
+ return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
}
template <>
@@ -157,28 +189,27 @@
return Packet4cf(res);
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f)
-template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
-{
- return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x) {
+ return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
}
//---------- double ----------
-struct Packet4cd
-{
+struct Packet4cd {
EIGEN_STRONG_INLINE Packet4cd() {}
EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
- __m512d v;
+ __m512d v;
};
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet4cd type;
typedef Packet2cd half;
enum {
@@ -186,58 +217,82 @@
AlignedOnScalar = 0,
size = 4,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet4cd> {
+template <>
+struct unpacket_traits<Packet4cd> {
typedef std::complex<double> type;
typedef Packet2cd half;
typedef Packet8d as_real;
enum {
size = 4,
alignment = unpacket_traits<Packet8d>::alignment,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
-{
- const __m512d mask = _mm512_castsi512_pd(
- _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
- 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
- return Packet4cd(pxor(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(_mm512_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(_mm512_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) {
+ return Packet4cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) {
+ const __m512d mask = _mm512_castsi512_pd(_mm512_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0,
+ 0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+ return Packet4cd(pxor(a.v, mask));
}
-template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
- __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
- __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
- __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
- __m512d odd = _mm512_mul_pd(tmp2, tmp3);
+template <>
+EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ __m512d tmp1 = _mm512_shuffle_pd(a.v, a.v, 0x0);
+ __m512d tmp2 = _mm512_shuffle_pd(a.v, a.v, 0xFF);
+ __m512d tmp3 = _mm512_shuffle_pd(b.v, b.v, 0x55);
+ __m512d odd = _mm512_mul_pd(tmp2, tmp3);
return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
}
-template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cd pand <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd por <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pxor <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) {
+ return Packet4cd(ptrue(Packet8d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pand<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd por<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pxor<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+ return Packet4cd(pandnot(a.v, b.v));
+}
template <>
EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
@@ -245,81 +300,95 @@
return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
}
-template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
-{
- return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
+template <>
+EIGEN_STRONG_INLINE Packet4cd pload<Packet4cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from));
}
-template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+template <>
+EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
+ return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4(_mm_castpd_ps(pset1<Packet1cd>(from).v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+ return Packet4cd(
+ _mm512_insertf64x4(_mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from + 1).v, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from,
+ Index stride) {
return Packet4cd(_mm512_insertf64x4(
- _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
+ _mm512_castpd256_pd512(_mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 0 * stride).v),
+ ploadu<Packet1cd>(from + 1 * stride).v, 1)),
+ _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 2 * stride).v),
+ ploadu<Packet1cd>(from + 3 * stride).v, 1),
+ 1));
}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
-{
- return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
- _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
- _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from,
+ Index stride) {
__m512i fromi = _mm512_castpd_si512(from.v);
double* tod = (double*)(void*)to;
- _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
- _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
- _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
- _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
+ _mm_storeu_pd(tod + 0 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 0)));
+ _mm_storeu_pd(tod + 2 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 1)));
+ _mm_storeu_pd(tod + 4 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 2)));
+ _mm_storeu_pd(tod + 6 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 3)));
}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a) {
__m128d low = extract128<0>(a.v);
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, low);
- return std::complex<double>(res[0],res[1]);
+ return std::complex<double>(res[0], res[1]);
}
-template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
- return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3,2,1,0>::mask)));
+template <>
+EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+ return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3, 2, 1, 0>::mask)));
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
-{
- return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
- Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a) {
+ return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
-{
- return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
- Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a) {
+ return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d)
-template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
-{
- return Packet4cd(_mm512_permute_pd(x.v,0x55));
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x) {
+ return Packet4cd(_mm512_permute_pd(x.v, 0x55));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,4>& kernel) {
- PacketBlock<Packet8d,4> pb;
-
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
+ PacketBlock<Packet8d, 4> pb;
+
pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
@@ -331,10 +400,9 @@
kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,8>& kernel) {
- PacketBlock<Packet8d,8> pb;
-
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
+ PacketBlock<Packet8d, 8> pb;
+
pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
@@ -354,28 +422,33 @@
kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cd,4>& kernel) {
- __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0,1,0,1>::mask)); // [a0 a1 b0 b1]
- __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2,3,2,3>::mask)); // [a2 a3 b2 b3]
- __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0,1,0,1>::mask)); // [c0 c1 d0 d1]
- __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2,3,2,3>::mask)); // [c2 c3 d2 d3]
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
+ __m512d T0 =
+ _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0, 1, 0, 1>::mask)); // [a0 a1 b0 b1]
+ __m512d T1 =
+ _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2, 3, 2, 3>::mask)); // [a2 a3 b2 b3]
+ __m512d T2 =
+ _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0, 1, 0, 1>::mask)); // [c0 c1 d0 d1]
+ __m512d T3 =
+ _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2, 3, 2, 3>::mask)); // [c2 c3 d2 d3]
- kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1,3,1,3>::mask))); // [a3 b3 c3 d3]
- kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0,2,0,2>::mask))); // [a2 b2 c2 d2]
- kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1,3,1,3>::mask))); // [a1 b1 c1 d1]
- kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
+ kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1, 3, 1, 3>::mask))); // [a3 b3 c3 d3]
+ kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0, 2, 0, 2>::mask))); // [a2 b2 c2 d2]
+ kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1, 3, 1, 3>::mask))); // [a1 b1 c1 d1]
+ kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0, 2, 0, 2>::mask))); // [a0 b0 c0 d0]
}
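// A scalar model of the 4x4 complex transpose above (illustrative sketch;
// transpose4cd_ref is a hypothetical name). Each element corresponds to one
// complex double, i.e. one 128-bit lane of the __m512d registers.
#include <complex>
#include <utility>
inline void transpose4cd_ref(std::complex<double> m[4][4]) {
  for (int r = 0; r < 4; ++r)
    for (int c = r + 1; c < 4; ++c) std::swap(m[r][c], m[c][r]);
}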
-template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
return psqrt_complex<Packet4cd>(a);
}
-template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
return psqrt_complex<Packet8cf>(a);
}
-} // end namespace internal
-} // end namespace Eigen
+} // end namespace internal
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX_AVX512_H
+#endif // EIGEN_COMPLEX_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/GemmKernel.h b/Eigen/src/Core/arch/AVX512/GemmKernel.h
index 2df1704..e06b83c 100644
--- a/Eigen/src/Core/arch/AVX512/GemmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -639,7 +639,8 @@
}
}
- template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch, bool no_a_preload = false>
+ template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
+ bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
@@ -696,7 +697,8 @@
* bo += b_unroll * kfactor;
*/
- template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch, bool no_a_preload = false>
+ template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
+ bool no_a_preload = false>
EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
int fetchA_idx = 0;
int fetchB_idx = 0;
@@ -705,20 +707,21 @@
const bool ktail = k_factor == 1;
static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
- static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1), "skipping a preload only allowed when k unroll is 1");
+ static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1),
+ "skipping a preload only allowed when k unroll is 1");
if (k_factor > 0)
- innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
- fetchB_idx);
+ innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+ aa, ao, bo, co2, fetchA_idx, fetchB_idx);
if (k_factor > 1)
- innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
- fetchB_idx);
+ innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+ aa, ao, bo, co2, fetchA_idx, fetchB_idx);
if (k_factor > 2)
- innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
- fetchB_idx);
+ innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+ aa, ao, bo, co2, fetchA_idx, fetchB_idx);
if (k_factor > 3)
- innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(aa, ao, bo, co2, fetchA_idx,
- fetchB_idx);
+ innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+ aa, ao, bo, co2, fetchA_idx, fetchB_idx);
// Advance A/B pointers after uk-loop.
ao += a_unroll * k_factor;
@@ -1201,10 +1204,9 @@
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
- EIGEN_ALWAYS_INLINE
- void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth,
- Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
- Index offsetB = 0);
+ EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
+ Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
};
template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
@@ -1233,7 +1235,7 @@
}
}
}
-#endif // EIGEN_USE_AVX512_GEMM_KERNELS
+#endif // EIGEN_USE_AVX512_GEMM_KERNELS
} // namespace internal
} // namespace Eigen
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 08e5fe8..0677248 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -47,14 +47,12 @@
#if EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
-psqrt<Packet16f>(const Packet16f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& _x) {
return generic_sqrt_newton_step<Packet16f>::run(_x, _mm512_rsqrt14_ps(_x));
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
-psqrt<Packet8d>(const Packet8d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& _x) {
#ifdef EIGEN_VECTORIZE_AVX512ER
return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
#else
@@ -82,26 +80,24 @@
#elif EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
-prsqrt<Packet16f>(const Packet16f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& _x) {
return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(_x, _mm512_rsqrt14_ps(_x));
}
#endif
-
// prsqrt for double.
#if EIGEN_FAST_MATH
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
-prsqrt<Packet8d>(const Packet8d& _x) {
- #ifdef EIGEN_VECTORIZE_AVX512ER
- return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
- #else
- return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
- #endif
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& _x) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+ return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(_x, _mm512_rsqrt28_pd(_x));
+#else
+ return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(_x, _mm512_rsqrt14_pd(_x));
+#endif
}
-template<> EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512ER
return _mm512_rcp28_ps(a);
#else
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index c6566a4..b6d2d98 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -53,7 +53,10 @@
};
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet16h> {
+ enum { value = true };
+};
template <>
struct packet_traits<half> : default_packet_traits {
@@ -65,41 +68,41 @@
AlignedOnScalar = 1,
size = 16,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasAbs = 1,
- HasAbs2 = 0,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasAbs = 1,
+ HasAbs2 = 0,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 0,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasLog = 1,
- HasLog1p = 1,
- HasExp = 1,
- HasExpm1 = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasLog = 1,
+ HasLog1p = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
HasBessel = 1,
- HasNdtri = 1,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH,
- HasBlend = 0,
- HasRound = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1
+ HasNdtri = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasTanh = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
+ HasBlend = 0,
+ HasRound = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1
};
};
#endif
-template<> struct packet_traits<float> : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
typedef Packet16f type;
typedef Packet8f half;
enum {
@@ -108,9 +111,9 @@
size = 16,
HasAbs = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasBlend = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
@@ -121,24 +124,24 @@
HasSqrt = 1,
HasRsqrt = 1,
HasLog = 1,
- HasLog1p = 1,
- HasExpm1 = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
HasNdtri = 1,
- HasBessel = 1,
+ HasBessel = 1,
HasExp = 1,
HasReciprocal = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
- HasCmp = 1,
+ HasCmp = 1,
HasDiv = 1,
HasRound = 1,
HasFloor = 1,
HasCeil = 1,
HasRint = 1
};
- };
-template<> struct packet_traits<double> : default_packet_traits
-{
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef Packet8d type;
typedef Packet4d half;
enum {
@@ -148,10 +151,10 @@
HasBlend = 1,
HasSqrt = 1,
HasRsqrt = 1,
- HasLog = 1,
+ HasLog = 1,
HasExp = 1,
HasATan = 1,
- HasCmp = 1,
+ HasCmp = 1,
HasDiv = 1,
HasRound = 1,
HasFloor = 1,
@@ -160,18 +163,11 @@
};
};
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
typedef Packet16i type;
typedef Packet8i half;
- enum {
- Vectorizable = 1,
- AlignedOnScalar = 1,
- HasBlend = 0,
- HasCmp = 1,
- HasDiv = 1,
- size=16
- };
+ enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
};
template <>
@@ -180,28 +176,54 @@
typedef Packet8f half;
typedef Packet16i integer_packet;
typedef uint16_t mask_t;
- enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true };
+ enum {
+ size = 16,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = true,
+ masked_store_available = true,
+ masked_fpops_available = true
+ };
};
template <>
struct unpacket_traits<Packet8d> {
typedef double type;
typedef Packet4d half;
typedef uint8_t mask_t;
- enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true, masked_fpops_available=true };
+ enum {
+ size = 8,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = true,
+ masked_store_available = true,
+ masked_fpops_available = true
+ };
};
template <>
struct unpacket_traits<Packet16i> {
typedef int type;
typedef Packet8i half;
- enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 16,
+ alignment = Aligned64,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<>
+template <>
struct unpacket_traits<Packet16h> {
typedef Eigen::half type;
typedef Packet8h half;
- enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
#endif
@@ -228,21 +250,30 @@
return _mm512_castsi512_pd(_mm512_set1_epi64(from));
}
-template<> EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) { return _mm512_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) { return _mm512_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) { return _mm512_setzero_si512(); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) {
+ return _mm512_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) {
+ return _mm512_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
+ return _mm512_setzero_si512();
+}
-template<> EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
- return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
- 0, -1, 0, -1, 0, -1, 0, -1));
+template <>
+EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+ return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
}
-template<> EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
- return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1,
- 0, -1, 0, -1, 0, -1, 0, -1);
+template <>
+EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+ return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
}
-template<> EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
- return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1,
- 0, 0, -1, -1, 0, 0, -1, -1));
+template <>
+EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+ return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
}
template <>
@@ -251,7 +282,7 @@
// Inline asm here helps reduce some register spilling in TRSM kernels.
// See note in unrolls::gemm::microKernel in TrsmKernel.h
Packet16f ret;
- __asm__ ("vbroadcastss %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from));
+ __asm__("vbroadcastss %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
return ret;
#else
return _mm512_broadcastss_ps(_mm_load_ps1(from));
@@ -261,7 +292,7 @@
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
Packet8d ret;
- __asm__ ("vbroadcastsd %[mem], %[dst]" : [dst] "=v" (ret) : [mem] "m" (*from));
+ __asm__("vbroadcastsd %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
return ret;
#else
return _mm512_set1_pd(*from);
@@ -270,67 +301,52 @@
template <>
EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
- return _mm512_add_ps(
- _mm512_set1_ps(a),
- _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f,
- 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+ return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
+ 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
}
template <>
EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
- return _mm512_add_pd(_mm512_set1_pd(a),
- _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+ return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
}
template <>
EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
- return _mm512_add_epi32(
- _mm512_set1_epi32(a),
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+ return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
}
template <>
-EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
return _mm512_add_ps(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b) {
return _mm512_add_pd(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_add_epi32(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a,
- const Packet16f& b,
- uint16_t umask) {
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
__mmask16 mask = static_cast<__mmask16>(umask);
return _mm512_maskz_add_ps(mask, a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
- const Packet8d& b,
- uint8_t umask) {
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b, uint8_t umask) {
__mmask8 mask = static_cast<__mmask8>(umask);
return _mm512_maskz_add_pd(mask, a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
return _mm512_sub_ps(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, const Packet8d& b) {
return _mm512_sub_pd(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_sub_epi32(a, b);
}
@@ -339,16 +355,16 @@
// NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
// The Intel docs give it a relatively high latency as well, so we're probably
// better off using _mm512_set_epi32 directly anyway.
- const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000,
- 0x80000000,0x80000000,0x80000000,0x80000000,
- 0x80000000,0x80000000,0x80000000,0x80000000,
- 0x80000000,0x80000000,0x80000000,0x80000000);
+ const __m512i mask =
+ _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
+ 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
}
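// The XOR above flips only the sign bit, which negates every lane bit-exactly,
// including NaNs and infinities. A scalar model (illustrative sketch;
// pnegate_ref is a hypothetical name):
#include <cstdint>
#include <cstring>
inline float pnegate_ref(float a) {
  std::uint32_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  bits ^= 0x80000000u;  // toggle the IEEE-754 sign bit
  std::memcpy(&a, &bits, sizeof(a));
  return a;
}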
template <>
EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
- const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
- 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
+ const __m512i mask =
+ _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
+ 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
}
template <>
@@ -370,202 +386,186 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
return _mm512_mul_ps(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, const Packet8d& b) {
return _mm512_mul_pd(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_mullo_epi32(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
return _mm512_div_ps(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, const Packet8d& b) {
return _mm512_div_pd(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
- Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b,0));
+EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
}
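// Why the split above: there is no AVX-512 packed 32-bit integer divide
// instruction, so the 512-bit operands are divided as two 256-bit halves via
// the AVX2 pdiv<Packet8i> and reassembled. A scalar model (illustrative
// sketch; div16i_ref is a hypothetical name):
inline void div16i_ref(const int a[16], const int b[16], int out[16]) {
  for (int i = 0; i < 16; ++i) out[i] = a[i] / b[i];  // per-lane quotient
}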
#ifdef EIGEN_VECTORIZE_FMA
template <>
-EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
- const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
return _mm512_fmadd_ps(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
- const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
return _mm512_fmadd_pd(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b,
- const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
return _mm512_fmsub_ps(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b,
- const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
return _mm512_fmsub_pd(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b,
- const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
return _mm512_fnmadd_ps(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b,
- const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
return _mm512_fnmadd_pd(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b,
- const Packet16f& c) {
+EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
return _mm512_fnmsub_ps(a, b, c);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b,
- const Packet8d& c) {
+EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
return _mm512_fnmsub_pd(a, b, c);
}
#endif
template <>
-EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
- const Packet16f& a,
- const Packet16f& b) {
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
__mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
return _mm512_mask_blend_ps(mask16, a, b);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask,
- const Packet16i& a,
- const Packet16i& b) {
+EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16i& a, const Packet16i& b) {
__mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
return _mm512_mask_blend_epi32(mask16, a, b);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
- const Packet8d& a,
- const Packet8d& b) {
- __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
- _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
+ __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
return _mm512_mask_blend_pd(mask8, a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
// Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_ps(b, a);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, const Packet8d& b) {
// Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_pd(b, a);
}
template <>
-EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_min_epi32(b, a);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
// Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_ps(b, a);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, const Packet8d& b) {
// Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_pd(b, a);
}
template <>
-EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
+EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_max_epi32(b, a);
}
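// A scalar model of the reversed-argument min above (illustrative sketch;
// pmin_ref is a hypothetical name). Hardware MINPS/MAXPS return their second
// operand when either input is NaN, so calling _mm512_min_ps(b, a) yields a
// on NaN, matching std::min(a, b):
inline float pmin_ref(float a, float b) {
  return (b < a) ? b : a;  // comparisons with NaN are false, so a is returned
}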
// Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
return pminmax_propagate_nan(a, b, pmin<Packet16f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
return pminmax_propagate_nan(a, b, pmin<Packet8d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
return pminmax_propagate_nan(a, b, pmax<Packet16f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
return pminmax_propagate_nan(a, b, pmax<Packet8d>);
}
-
#ifdef EIGEN_VECTORIZE_AVX512DQ
-template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
-template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
-EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
-EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) { return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1); }
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+ return _mm512_extractf32x8_ps(x, I_);
+}
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+ return _mm512_extractf64x2_pd(x, I_);
+}
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+ return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
+}
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
+ return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
+}
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
-template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
- return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+ return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
}
// AVX512F does not define _mm512_extractf64x2_pd to extract __m128d from __m512d
-template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
- return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+ return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
}
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
- return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
- _mm256_castps_si256(b),1));
+ return _mm512_castsi512_ps(
+ _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
}
EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
@@ -584,10 +584,8 @@
// dst[255:240] := Saturate16(rf[255:224])
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
- __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
- _mm256_extractf128_si256(lo, 1));
- __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
- _mm256_extractf128_si256(hi, 1));
+ __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
+ __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
}
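// _mm_packs_epi32 narrows with signed saturation but operates within 128-bit
// halves, hence the extract/insert shuffling above to keep element order.
// Per-element behavior (illustrative sketch; sat16_ref is a hypothetical name):
#include <cstdint>
inline std::int16_t sat16_ref(std::int32_t x) {
  if (x > 32767) return 32767;
  if (x < -32768) return -32768;
  return static_cast<std::int16_t>(x);
}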
@@ -600,36 +598,38 @@
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
- return _mm512_castsi512_ps(
- _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
}
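// The pattern above exists because AVX-512 comparisons produce a bit mask
// (__mmask16) rather than a vector; _mm512_mask_set1_epi32 re-expands it into
// the all-zeros/all-ones lanes that pand/pselect expect. Per-lane model
// (illustrative sketch; cmp_eq_lane_ref is a hypothetical name):
#include <cstdint>
inline std::uint32_t cmp_eq_lane_ref(float a, float b) {
  return (a == b) ? 0xffffffffu : 0u;  // ordered compare: false if either is NaN
}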
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
- return _mm512_castsi512_ps(
- _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
}
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
- return _mm512_castsi512_ps(
- _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
}
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
- return _mm512_castsi512_ps(
- _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
+ return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu));
}
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
}
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
}
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
__mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, 0xffffffffu);
}
@@ -637,36 +637,50 @@
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
- return _mm512_castsi512_pd(
- _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
}
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
- return _mm512_castsi512_pd(
- _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
}
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
- return _mm512_castsi512_pd(
- _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
}
template <>
EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
__mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
- return _mm512_castsi512_pd(
- _mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+ return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
}
-template<> EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) {
+ return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) {
+ return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
-template<> EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF); }
-template<> EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) {
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) {
+ return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
+}
-template<> EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) { return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF); }
-template<> EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) { return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF); }
+template <>
+EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) {
+ return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
+ return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
+}
template <>
EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
@@ -684,23 +698,20 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
- const Packet16i& b) {
- return _mm512_and_si512(a,b);
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
+ return _mm512_and_si512(a, b);
}
template <>
-EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
- const Packet16f& b) {
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_ps(a, b);
#else
- return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+ return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_pd(a, b);
#else
@@ -725,17 +736,16 @@
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
- return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+ return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
- const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
- return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+ return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
}
@@ -749,7 +759,7 @@
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
- return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+ return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
}
@@ -758,7 +768,7 @@
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
- return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+ return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
}
@@ -772,42 +782,45 @@
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_ps(b, a);
#else
- return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
+ return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
#endif
}
template <>
-EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_pd(b, a);
#else
- return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
+ return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a) {
// Work-around for default std::round rounding mode.
const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
-template<> EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a) {
// Work-around for default std::round rounding mode.
const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
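// A scalar model of the pround work-around above (illustrative sketch;
// pround_ref is a hypothetical name). SIMD rounding is round-to-nearest-even,
// while std::round rounds halfway cases away from zero; adding the largest
// float below 0.5 with a's sign and then truncating reproduces std::round:
#include <cmath>
inline float pround_ref(float a) {
  const float prev0dot5 = 0.49999997f;  // bit pattern 0x3EFFFFFF
  return std::trunc(a + std::copysign(prev0dot5, a));
}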
-template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
return _mm512_srai_epi32(a, N);
}
-template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
return _mm512_srli_epi32(a, N);
}
-template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
return _mm512_slli_epi32(a, N);
}
@@ -821,8 +834,7 @@
}
template <>
EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
- EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(
- reinterpret_cast<const __m512i*>(from));
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512(reinterpret_cast<const __m512i*>(from));
}
template <>
@@ -835,8 +847,7 @@
}
template <>
EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
- EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(
- reinterpret_cast<const __m512i*>(from));
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512(reinterpret_cast<const __m512i*>(from));
}
template <>
@@ -868,7 +879,7 @@
// a3}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
- __m512d x = _mm512_setzero_pd();
+ __m512d x = _mm512_setzero_pd();
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
@@ -879,10 +890,10 @@
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
__m512d x = _mm512_setzero_pd();
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
- x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3 << 0, _mm_load_sd(from + 0));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3 << 2, _mm_load_sd(from + 1));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3 << 4, _mm_load_sd(from + 2));
+ x = _mm512_mask_broadcastsd_pd(x, 0x3 << 6, _mm_load_sd(from + 3));
return x;
}
#endif
@@ -902,7 +913,7 @@
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
- const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
return _mm512_permutexvar_ps(scatter_mask, tmp);
}
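// The permute above replicates each of the four loaded floats across four
// consecutive lanes. A scalar model (illustrative sketch; loadquad_ref is a
// hypothetical name):
inline void loadquad_ref(const float* from, float out[16]) {
  for (int i = 0; i < 16; ++i) out[i] = from[i / 4];  // {a0 x4, a1 x4, a2 x4, a3 x4}
}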
@@ -911,7 +922,7 @@
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
__m256d lane0 = _mm256_set1_pd(*from);
- __m256d lane1 = _mm256_set1_pd(*(from+1));
+ __m256d lane1 = _mm256_set1_pd(*(from + 1));
__m512d tmp = _mm512_undefined_pd();
tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1);
@@ -922,7 +933,7 @@
template <>
EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
- const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
+ const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
return _mm512_permutexvar_epi32(scatter_mask, tmp);
}
@@ -936,8 +947,7 @@
}
template <>
EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
- EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to),
- from);
+ EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
}
template <>
@@ -950,8 +960,7 @@
}
template <>
EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
- EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
- reinterpret_cast<__m512i*>(to), from);
+ EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), from);
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
@@ -965,25 +974,20 @@
}
template <typename Scalar, typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from,
- Index stride, typename unpacket_traits<Packet>::mask_t umask);
+EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, Index stride,
+ typename unpacket_traits<Packet>::mask_t umask);
template <>
-EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src,
- const float* from,
- Index stride,
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src, const float* from, Index stride,
uint16_t umask) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
__mmask16 mask = static_cast<__mmask16>(umask);
return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
}
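// A scalar model of the masked strided gather above (illustrative sketch;
// gather16f_ref is a hypothetical name). Lane i loads from[i * stride] only
// when bit i of umask is set; masked-off lanes keep the src value:
#include <cstdint>
inline void gather16f_ref(const float src[16], const float* from, long stride, std::uint16_t umask, float out[16]) {
  for (int i = 0; i < 16; ++i) out[i] = ((umask >> i) & 1) ? from[i * stride] : src[i];
}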
template <>
-EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src,
- const double* from,
- Index stride,
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src, const double* from, Index stride,
uint8_t umask) {
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
@@ -994,18 +998,15 @@
}
template <>
-EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
return _mm512_i32gather_ps(indices, from, 4);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@@ -1013,34 +1014,27 @@
return _mm512_i32gather_pd(indices, from, 8);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
return _mm512_i32gather_epi32(indices, from, 4);
}
template <typename Scalar, typename Packet>
-EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from,
- Index stride, typename unpacket_traits<Packet>::mask_t umask);
+EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
+ typename unpacket_traits<Packet>::mask_t umask);
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
- const Packet16f& from,
- Index stride,
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride,
uint16_t umask) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
__mmask16 mask = static_cast<__mmask16>(umask);
_mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
- const Packet8d& from,
- Index stride,
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride,
uint8_t umask) {
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
@@ -1050,31 +1044,23 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
- const Packet16f& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_ps(to, indices, from, 4);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
- const Packet8d& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to,
- const Packet16i& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
- Packet16i stride_multiplier =
- _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_epi32(to, indices, from, 4);
}
@@ -1095,9 +1081,18 @@
pstore(to, pa);
}
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
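
A typical (hypothetical) use of these specializations is software prefetching a fixed distance ahead in a streaming loop; the distance below is an assumed tuning parameter, not a value from this patch:

#include <immintrin.h>

float sum_with_prefetch(const float* x, int n) {
  const int dist = 64;  // assumed prefetch distance, in elements
  float s = 0.0f;
  for (int i = 0; i < n; ++i) {
    if (i + dist < n) _mm_prefetch(reinterpret_cast<const char*>(x + i + dist), _MM_HINT_T0);
    s += x[i];
  }
  return s;
}
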
template <>
EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
@@ -1112,69 +1107,81 @@
return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0);
}
-template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
}
-template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
}
-template<> EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
}
-template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
// _mm512_abs_ps intrinsic not found, so hack around it
return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
}
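
The AND with 0x7fffffff is the usual clear-the-sign-bit trick; a scalar model (assuming IEEE-754 float):

#include <cstdint>
#include <cstring>

float fabs_bits(float x) {
  std::uint32_t u;
  std::memcpy(&u, &x, sizeof u);  // type-pun without undefined behavior
  u &= 0x7fffffffu;               // clear the sign bit, same mask as above
  std::memcpy(&x, &u, sizeof x);
  return x;
}
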
template <>
EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
  // _mm512_abs_pd intrinsic not found, so hack around it
- return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a),
- _mm512_set1_epi64(0x7fffffffffffffff)));
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
}
-template<> EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
return _mm512_abs_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) { return _mm256_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) { return _mm256_srai_epi16(a, 15); }
-template<> EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) { return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31)); }
-template<> EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) { return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63)); }
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
+ return _mm256_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
+ return _mm256_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) {
+ return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) {
+ return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
+}
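
psignbit works by arithmetic-right-shifting the lane so the sign bit is replicated across all bits; in scalar form (a sketch — the result is an all-ones/all-zeros bit mask, not a bool):

#include <cstdint>

// All bits set if the sign bit is 1, all bits clear otherwise. Right shift of
// a negative signed value is arithmetic on all mainstream compilers (and
// guaranteed since C++20).
std::uint32_t signbit_mask(std::int32_t float_bits) {
  return static_cast<std::uint32_t>(float_bits >> 31);
}
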
-template<>
-EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent){
+template <>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
return pfrexp_generic(a, exponent);
}
// Extract the biased exponent without relying on the existence of Packet8l.
-template<>
-EIGEN_STRONG_INLINE
-Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
- const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
- #ifdef EIGEN_VECTORIZE_AVX512DQ
+template <>
+EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+ const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
- #else
+#else
return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
- #endif
+#endif
}
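
Both branches compute the same per-lane quantity: mask out the 11 exponent bits of the binary64 representation and shift them down. Scalar sketch:

#include <cstdint>
#include <cstring>

int biased_exponent(double a) {
  std::uint64_t bits;
  std::memcpy(&bits, &a, sizeof bits);
  return static_cast<int>((bits & 0x7ff0000000000000ull) >> 52);  // 0..2047
}
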
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
return pfrexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
return pldexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
// Clamp exponent to [-2099, 2099]
const Packet8d max_exponent = pset1<Packet8d>(2099.0);
const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
@@ -1203,30 +1210,26 @@
#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
-#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
- __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
+ __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
// AVX512F does not define _mm512_extracti32x8_epi32 to extract __m256i from __m512i
-#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
- __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
+ __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
__m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
#else
-#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
- __m256 OUTPUT##_0 = _mm256_insertf128_ps( \
- _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
- _mm512_extractf32x4_ps(INPUT, 1), 1); \
- __m256 OUTPUT##_1 = _mm256_insertf128_ps( \
- _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
- _mm512_extractf32x4_ps(INPUT, 3), 1)
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
+ __m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+ _mm512_extractf32x4_ps(INPUT, 1), 1); \
+ __m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+ _mm512_extractf32x4_ps(INPUT, 3), 1)
-#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
- __m256i OUTPUT##_0 = _mm256_insertf128_si256( \
- _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
- _mm512_extracti32x4_epi32(INPUT, 1), 1); \
- __m256i OUTPUT##_1 = _mm256_insertf128_si256( \
- _mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
- _mm512_extracti32x4_epi32(INPUT, 3), 1)
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT) \
+ __m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
+ _mm512_extracti32x4_epi32(INPUT, 1), 1); \
+ __m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
+ _mm512_extracti32x4_epi32(INPUT, 3), 1)
#endif
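
Either definition satisfies the same contract: the macro declares OUTPUT##_0 and OUTPUT##_1 holding elements 0..7 and 8..15 of the input. A hypothetical use site (assuming the macro above is in scope):

__m256 sum_halves(__m512 x) {
  EIGEN_EXTRACT_8f_FROM_16f(x, lanes);     // declares lanes_0 and lanes_1
  return _mm256_add_ps(lanes_0, lanes_1);  // result element i is x[i] + x[i + 8]
}
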
#ifdef EIGEN_VECTORIZE_AVX512DQ
@@ -1243,7 +1246,7 @@
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
-#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
+#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_epi32(); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
@@ -1337,7 +1340,7 @@
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
-//#ifdef EIGEN_VECTORIZE_AVX512DQ
+// #ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
@@ -1403,17 +1406,17 @@
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) {
Packet16i xi = _mm512_castps_si512(x);
- __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
- return !_mm512_kortestz(tmp,tmp);
+ __mmask16 tmp = _mm512_test_epi32_mask(xi, xi);
+ return !_mm512_kortestz(tmp, tmp);
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x)
-{
- __mmask16 tmp = _mm512_test_epi32_mask(x,x);
- return !_mm512_kortestz(tmp,tmp);
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16i& x) {
+ __mmask16 tmp = _mm512_test_epi32_mask(x, x);
+ return !_mm512_kortestz(tmp, tmp);
}
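
predux_any collapses a comparison packet into one branchable bool; a usage sketch against this file's internal API (hypothetical helper name):

// True iff at least one lane of v equals zero. pcmp_eq produces all-ones
// lanes on a match, which _mm512_test_epi32_mask then detects.
bool has_zero_lane(const Packet16f& v) {
  return predux_any(pcmp_eq(v, pzero(v)));
}
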
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1530,28 +1533,27 @@
PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
}
-#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
- EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \
- INPUT[2 * INDEX + STRIDE]);
+#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
+ EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
- __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0],kernel.packet[1]);
- __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0],kernel.packet[1]);
- __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2],kernel.packet[3]);
- __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2],kernel.packet[3]);
- __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4],kernel.packet[5]);
- __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4],kernel.packet[5]);
- __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6],kernel.packet[7]);
- __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6],kernel.packet[7]);
+ __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+ __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+ __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+ __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+ __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+ __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+ __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+ __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
- kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2)));
- kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0),_mm512_castps_pd(T2)));
- kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3)));
- kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1),_mm512_castps_pd(T3)));
- kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6)));
- kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4),_mm512_castps_pd(T6)));
- kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
- kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5),_mm512_castps_pd(T7)));
+ kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+ kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+ kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+ kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+ kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+ kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+ kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+ kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
@@ -1612,8 +1614,7 @@
#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \
OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
- OUTPUT[INDEX] = \
- _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+ OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
__m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
@@ -1623,23 +1624,15 @@
PacketBlock<Packet4d, 8> tmp;
- tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
- _mm512_extractf64x4_pd(T2, 0), 0x20);
- tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
- _mm512_extractf64x4_pd(T3, 0), 0x20);
- tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0),
- _mm512_extractf64x4_pd(T2, 0), 0x31);
- tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0),
- _mm512_extractf64x4_pd(T3, 0), 0x31);
+ tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
+ tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
+ tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
+ tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);
- tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
- _mm512_extractf64x4_pd(T2, 1), 0x20);
- tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
- _mm512_extractf64x4_pd(T3, 1), 0x20);
- tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1),
- _mm512_extractf64x4_pd(T2, 1), 0x31);
- tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1),
- _mm512_extractf64x4_pd(T3, 1), 0x31);
+ tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
+ tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
+ tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
+ tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);
PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
@@ -1648,64 +1641,66 @@
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
- __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0],kernel.packet[1]);
- __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0],kernel.packet[1]);
- __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2],kernel.packet[3]);
- __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2],kernel.packet[3]);
- __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4],kernel.packet[5]);
- __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4],kernel.packet[5]);
- __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6],kernel.packet[7]);
- __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6],kernel.packet[7]);
+ __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+ __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+ __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
+ __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
+ __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
+ __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
+ __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
+ __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
- kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
- kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
- kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
- kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
- kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
- kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
- kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
- kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
- kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
- kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
- kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
- kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
- kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
- kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
- kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
- kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
+ kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
+ kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
+ kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
+ kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
+ kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
+ kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
+ kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
+ kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
+ kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
+ kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
+ kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
+ kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
+ kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
+ kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
+ kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
+ kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
- T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
- T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
- T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
- T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
- T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
- T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
- T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
- T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
- T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
- T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
- T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
- T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
- T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
- T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
- T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
- T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
+ T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
+ T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
+ T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
+ T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
+ T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
+ T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
+ T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
+ T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
+ T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
+ T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
+ T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
+ T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
+ T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
+ T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
+ T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
+ T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
- kernel.packet[0] = T0; kernel.packet[1] = T1;
- kernel.packet[2] = T2; kernel.packet[3] = T3;
- kernel.packet[4] = T4; kernel.packet[5] = T5;
- kernel.packet[6] = T6; kernel.packet[7] = T7;
+ kernel.packet[0] = T0;
+ kernel.packet[1] = T1;
+ kernel.packet[2] = T2;
+ kernel.packet[3] = T3;
+ kernel.packet[4] = T4;
+ kernel.packet[5] = T5;
+ kernel.packet[6] = T6;
+ kernel.packet[7] = T7;
}
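
The three stages above (64-bit unpacks, 128-bit permute+blend, 256-bit lane shuffle+blend) together form a full 8x8 transpose. A test sketch of the intended post-condition, using the internal API (hypothetical checker, not part of this patch):

void check_transpose8x8(PacketBlock<Packet8d, 8>& kernel) {
  EIGEN_ALIGN64 double before[8][8], after[8][8];
  for (int i = 0; i < 8; ++i) pstore(before[i], kernel.packet[i]);
  ptranspose(kernel);
  for (int i = 0; i < 8; ++i) pstore(after[i], kernel.packet[i]);
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j) eigen_assert(after[i][j] == before[j][i]);
}
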
#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
-#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
- EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], \
- INPUT[2 * INDEX + STRIDE]);
+#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
+ EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
-#define SHUFFLE_EPI32(A, B, M) \
- _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
+#define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
__m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
@@ -1854,8 +1849,7 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket,
- const Packet16f& thenPacket,
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
const Packet16f& elsePacket) {
__mmask16 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
(ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
@@ -1866,51 +1860,51 @@
return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
}
template <>
-EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
- const Packet8d& thenPacket,
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
const Packet8d& elsePacket) {
- __mmask8 m = (ifPacket.select[0] )
- | (ifPacket.select[1]<<1)
- | (ifPacket.select[2]<<2)
- | (ifPacket.select[3]<<3)
- | (ifPacket.select[4]<<4)
- | (ifPacket.select[5]<<5)
- | (ifPacket.select[6]<<6)
- | (ifPacket.select[7]<<7);
+ __mmask8 m = (ifPacket.select[0]) | (ifPacket.select[1] << 1) | (ifPacket.select[2] << 2) |
+ (ifPacket.select[3] << 3) | (ifPacket.select[4] << 4) | (ifPacket.select[5] << 5) |
+ (ifPacket.select[6] << 6) | (ifPacket.select[7] << 7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
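
Both pblend overloads pack the selector the same way: bit i of the blend mask is select[i]. Scalar sketch (assuming the selector entries are 0 or 1):

#include <cstdint>

std::uint8_t pack_selector8(const bool select[8]) {
  std::uint8_t m = 0;
  for (int i = 0; i < 8; ++i) m |= static_cast<std::uint8_t>(select[i]) << i;
  return m;  // a set bit makes _mm512_mask_blend_pd pick the thenPacket lane
}
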
// Packet math for Eigen::half
-template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
return _mm256_set1_epi16(from.x);
}
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
}
-template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_store_si256((__m256i*)(void*)to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_storeu_si256((__m256i*)(void*)to, from);
}
-template<> EIGEN_STRONG_INLINE Packet16h
-ploaddup<Packet16h>(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
@@ -1922,8 +1916,8 @@
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
}
-template<> EIGEN_STRONG_INLINE Packet16h
-ploadquad(const Eigen::half* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
unsigned short a = from[0].x;
unsigned short b = from[1].x;
unsigned short c = from[2].x;
@@ -1931,15 +1925,14 @@
return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
}
-EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
- return _mm512_cvtph_ps(a);
-}
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtph_ps(a); }
EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
- return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+ return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}
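
half2float/float2half bracket most of the Packet16h arithmetic below: widen to float, compute there, narrow once with round-to-nearest. A sketch of that pattern (hypothetical helper built on the internal API):

// Fused multiply-add on half packets via the float path, mirroring how
// padd/pmul/pdiv are implemented below; only one half-precision rounding.
Packet16h fma_via_float(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
}
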
-template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
return Packet16h(ptrue(Packet8i(a)));
}
@@ -1950,14 +1943,12 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a,
- const Packet16h& b) {
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a,
- const Packet16h& b) {
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
}
@@ -1966,96 +1957,118 @@
return float2half(plset<Packet16f>(static_cast<float>(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
  // In some cases Packet8i is a wrapper around __m256i, so we need to
  // cast to Packet8i to call the correct overload.
- return Packet16h(por(Packet8i(a),Packet8i(b)));
+ return Packet16h(por(Packet8i(a), Packet8i(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
- return Packet16h(pxor(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+ return Packet16h(pxor(Packet8i(a), Packet8i(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
- return Packet16h(pand(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+ return Packet16h(pand(Packet8i(a), Packet8i(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
- return Packet16h(pandnot(Packet8i(a),Packet8i(b)));
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+ return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
return _mm256_blendv_epi8(b, a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
return float2half(pround<Packet16f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
return float2half(print<Packet16f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
return float2half(pceil<Packet16f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
return float2half(pfloor<Packet16f>(half2float(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
return Pack32To16(pcmp_eq(af, bf));
}
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
return Pack32To16(pcmp_le(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a,const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
}
-template<> EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
return _mm256_xor_si256(a, sign_mask);
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = padd(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = psub(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = pmul(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
Packet16f af = half2float(a);
Packet16f bf = half2float(b);
Packet16f rf = pdiv(af, bf);
return float2half(rf);
}
-template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux(from_float));
}
@@ -2069,64 +2082,64 @@
return padd<Packet8h>(lane0, lane1);
}
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_max<Packet16f>(af);
return Eigen::half(reduced);
}
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet16h>(const Packet16h& a) {
Packet16f af = half2float(a);
float reduced = predux_min<Packet16f>(af);
return Eigen::half(reduced);
}
-template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
Packet16f from_float = half2float(from);
return half(predux_mul(from_float));
}
-template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
-{
- __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
- return _mm256_insertf128_si256(
- _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
- _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+ __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
+ _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
}
-template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
-{
- return _mm256_set_epi16(
- from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
- from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
- from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
- from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+template <>
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+ return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+ from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+ from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+ from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
}
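
pgather is the mirror of pscatter: lane i is read from from[i * stride]. Scalar model (a sketch, generic over the element type):

#include <cstddef>

template <typename T>
void pgather_ref(const T* from, T out[16], std::ptrdiff_t stride) {
  for (int i = 0; i < 16; ++i) out[i] = from[i * stride];
}
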
-template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
EIGEN_ALIGN64 half aux[16];
pstore(aux, from);
- to[stride*0] = aux[0];
- to[stride*1] = aux[1];
- to[stride*2] = aux[2];
- to[stride*3] = aux[3];
- to[stride*4] = aux[4];
- to[stride*5] = aux[5];
- to[stride*6] = aux[6];
- to[stride*7] = aux[7];
- to[stride*8] = aux[8];
- to[stride*9] = aux[9];
- to[stride*10] = aux[10];
- to[stride*11] = aux[11];
- to[stride*12] = aux[12];
- to[stride*13] = aux[13];
- to[stride*14] = aux[14];
- to[stride*15] = aux[15];
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
+ to[stride * 8] = aux[8];
+ to[stride * 9] = aux[9];
+ to[stride * 10] = aux[10];
+ to[stride * 11] = aux[11];
+ to[stride * 12] = aux[12];
+ to[stride * 13] = aux[13];
+ to[stride * 14] = aux[14];
+ to[stride * 15] = aux[15];
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,16>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
__m256i a = kernel.packet[0];
__m256i b = kernel.packet[1];
__m256i c = kernel.packet[2];
@@ -2233,8 +2246,7 @@
kernel.packet[15] = a_p_f;
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,8>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
EIGEN_ALIGN64 half in[8][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
@@ -2249,10 +2261,10 @@
for (int i = 0; i < 8; ++i) {
for (int j = 0; j < 8; ++j) {
- out[i][j] = in[j][2*i];
+ out[i][j] = in[j][2 * i];
}
for (int j = 0; j < 8; ++j) {
- out[i][j+8] = in[j][2*i+1];
+ out[i][j + 8] = in[j][2 * i + 1];
}
}
@@ -2266,8 +2278,7 @@
kernel.packet[7] = pload<Packet16h>(out[7]);
}
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
EIGEN_ALIGN64 half in[4][16];
pstore<half>(in[0], kernel.packet[0]);
pstore<half>(in[1], kernel.packet[1]);
@@ -2278,16 +2289,16 @@
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
- out[i][j] = in[j][4*i];
+ out[i][j] = in[j][4 * i];
}
for (int j = 0; j < 4; ++j) {
- out[i][j+4] = in[j][4*i+1];
+ out[i][j + 4] = in[j][4 * i + 1];
}
for (int j = 0; j < 4; ++j) {
- out[i][j+8] = in[j][4*i+2];
+ out[i][j + 8] = in[j][4 * i + 2];
}
for (int j = 0; j < 4; ++j) {
- out[i][j+12] = in[j][4*i+3];
+ out[i][j + 12] = in[j][4 * i + 3];
}
}
@@ -2297,7 +2308,10 @@
kernel.packet[3] = pload<Packet16h>(out[3]);
}
-template <> struct is_arithmetic<Packet16bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet16bf> {
+ enum { value = true };
+};
template <>
struct packet_traits<bfloat16> : default_packet_traits {
@@ -2315,24 +2329,29 @@
HasRsqrt = 1,
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1, // Currently fails test with bad accuracy.
- HasLog1p = 1,
- HasExpm1 = 1,
+ HasLog1p = 1,
+ HasExpm1 = 1,
HasNdtri = 1,
HasBessel = 1,
#endif
HasExp = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
- HasCmp = 1,
+ HasCmp = 1,
HasDiv = 1
};
};
template <>
-struct unpacket_traits<Packet16bf>
-{
+struct unpacket_traits<Packet16bf> {
typedef bfloat16 type;
- enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 16,
+ alignment = Aligned32,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet8bf half;
};
@@ -2359,19 +2378,17 @@
}
template <>
-EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to,
- const Packet16bf& from) {
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
_mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
}
template <>
-EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to,
- const Packet16bf& from) {
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
_mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
}
-template<> EIGEN_STRONG_INLINE Packet16bf
-ploaddup<Packet16bf>(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(const bfloat16* from) {
unsigned short a = from[0].value;
unsigned short b = from[1].value;
unsigned short c = from[2].value;
@@ -2383,8 +2400,8 @@
return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
}
-template<> EIGEN_STRONG_INLINE Packet16bf
-ploadquad(const bfloat16* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) {
unsigned short a = from[0].value;
unsigned short b = from[1].value;
unsigned short c = from[2].value;
@@ -2400,7 +2417,7 @@
EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
Packet16bf r;
-#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10,1,0)
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
  // GCC 10.1 and later support avx512bf16 and C-style explicit casts
  // (C++ static_cast is not supported yet), so do the conversion via the
  // intrinsic and register path for performance.
@@ -2426,7 +2443,7 @@
t = _mm512_mask_blend_epi32(mask, nan, t);
// output.value = static_cast<uint16_t>(input);
r = _mm512_cvtepi32_epi16(t);
-#endif // EIGEN_VECTORIZE_AVX512BF16
+#endif // EIGEN_VECTORIZE_AVX512BF16
return r;
}
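
The fallback branch of F32ToBf16 performs float-to-bfloat16 with round-to-nearest-even plus a quiet-NaN fixup; the per-lane computation it vectorizes is (scalar sketch, assuming IEEE-754 float):

#include <cstdint>
#include <cstring>

std::uint16_t f32_to_bf16(float f) {
  std::uint32_t input;
  std::memcpy(&input, &f, sizeof input);
  if ((input & 0x7fffffffu) > 0x7f800000u) return 0x7fc0;  // NaN -> quiet NaN
  std::uint32_t lsb = (input >> 16) & 1;  // round to nearest, ties to even:
  input += 0x7fffu + lsb;                 // bias by 0x7fff, +1 when odd
  return static_cast<std::uint16_t>(input >> 16);
}
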
@@ -2452,58 +2469,54 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) {
return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask,
- const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, const Packet16bf& a, const Packet16bf& b) {
  // Each mask element is expected to be either all-zeros or all-ones, so it
  // can be handled with the 8-bit blend intrinsic for performance.
return _mm256_blendv_epi8(b, a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a) {
return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, const Packet16bf& b) {
return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, const Packet16bf& b) {
return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet16bf& b) {
return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
}
@@ -2525,38 +2538,32 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(pmul<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a,
- const Packet16bf& b) {
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
}
@@ -2594,8 +2601,8 @@
template <>
EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
- __m256i m = _mm256_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,
- 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
+ __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
+ 4, 5, 2, 3, 0, 1);
Packet16bf res;
// Swap hi and lo first because shuffle is in 128-bit lanes.
@@ -2605,40 +2612,37 @@
}
template <>
-EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from,
- Index stride) {
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from, Index stride) {
return _mm256_set_epi16(
- from[15*stride].value, from[14*stride].value, from[13*stride].value, from[12*stride].value,
- from[11*stride].value, from[10*stride].value, from[9*stride].value, from[8*stride].value,
- from[7*stride].value, from[6*stride].value, from[5*stride].value, from[4*stride].value,
- from[3*stride].value, from[2*stride].value, from[1*stride].value, from[0*stride].value);
+ from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
+ from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
+ from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
+ from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
}
template <>
-EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to,
- const Packet16bf& from,
- Index stride) {
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to, const Packet16bf& from, Index stride) {
EIGEN_ALIGN64 bfloat16 aux[16];
pstore(aux, from);
- to[stride*0] = aux[0];
- to[stride*1] = aux[1];
- to[stride*2] = aux[2];
- to[stride*3] = aux[3];
- to[stride*4] = aux[4];
- to[stride*5] = aux[5];
- to[stride*6] = aux[6];
- to[stride*7] = aux[7];
- to[stride*8] = aux[8];
- to[stride*9] = aux[9];
- to[stride*10] = aux[10];
- to[stride*11] = aux[11];
- to[stride*12] = aux[12];
- to[stride*13] = aux[13];
- to[stride*14] = aux[14];
- to[stride*15] = aux[15];
+ to[stride * 0] = aux[0];
+ to[stride * 1] = aux[1];
+ to[stride * 2] = aux[2];
+ to[stride * 3] = aux[3];
+ to[stride * 4] = aux[4];
+ to[stride * 5] = aux[5];
+ to[stride * 6] = aux[6];
+ to[stride * 7] = aux[7];
+ to[stride * 8] = aux[8];
+ to[stride * 9] = aux[9];
+ to[stride * 10] = aux[10];
+ to[stride * 11] = aux[11];
+ to[stride * 12] = aux[12];
+ to[stride * 13] = aux[13];
+ to[stride * 14] = aux[14];
+ to[stride * 15] = aux[15];
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,16>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
__m256i a = kernel.packet[0];
__m256i b = kernel.packet[1];
__m256i c = kernel.packet[2];
@@ -2728,7 +2732,7 @@
kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf,4>& kernel) {
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
__m256i a = kernel.packet[0];
__m256i b = kernel.packet[1];
__m256i c = kernel.packet[2];
@@ -2751,8 +2755,8 @@
kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
}
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_AVX512_H
+#endif // EIGEN_PACKET_MATH_AVX512_H
diff --git a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
index faa3853..131e6f1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -10,7 +10,7 @@
#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
#define EIGEN_PACKET_MATH_FP16_AVX512_H
-// IWYU pragma: private
+// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
namespace Eigen {
diff --git a/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
index a3025ec..903bca5 100644
--- a/Eigen/src/Core/arch/AVX512/TrsmKernel.h
+++ b/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -108,7 +108,7 @@
int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
}
-#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
+#else // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0
#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
@@ -118,8 +118,8 @@
* Used by gemmKernel for the case A/B row-major and C col-major.
*/
template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
-EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
- Scalar *C_arr, int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
+EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, Scalar *C_arr,
+ int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
EIGEN_UNUSED_VARIABLE(remN_);
EIGEN_UNUSED_VARIABLE(remM_);
using urolls = unrolls::trans<Scalar>;
@@ -811,7 +811,7 @@
*/
template <typename Scalar, bool toTemp = true, bool remM = false>
EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
- int64_t remM_ = 0) {
+ int64_t remM_ = 0) {
EIGEN_UNUSED_VARIABLE(remM_);
using urolls = unrolls::transB<Scalar>;
using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
@@ -1062,7 +1062,8 @@
// Template specializations of trsmKernelL/R for float/double and inner strides of 1.
#if (EIGEN_USE_AVX512_TRSM_KERNELS)
#if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride, bool Specialized>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+ bool Specialized>
struct trsmKernelR;
template <typename Index, int Mode, int TriStorageOrder>
@@ -1085,7 +1086,7 @@
#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
- size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+ size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
return;
}
#endif
@@ -1101,7 +1102,7 @@
#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
- size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+ size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
return;
}
#endif
@@ -1112,7 +1113,8 @@
// These trsm kernels require temporary memory allocation
#if (EIGEN_USE_AVX512_TRSM_L_KERNELS)
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride, bool Specialized = true>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+ bool Specialized = true>
struct trsmKernelL;
template <typename Index, int Mode, int TriStorageOrder>
@@ -1135,7 +1137,7 @@
#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
- size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+ size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
return;
}
#endif
@@ -1151,7 +1153,7 @@
#ifdef EIGEN_RUNTIME_NO_MALLOC
if (!is_malloc_allowed()) {
trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
- size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+ size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
return;
}
#endif
diff --git a/Eigen/src/Core/arch/AVX512/TypeCasting.h b/Eigen/src/Core/arch/AVX512/TypeCasting.h
index 5053230..56a94f4 100644
--- a/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -17,161 +17,207 @@
namespace internal {
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
-template<> struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
-template<> struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
-template<> struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
-template<> struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
-template<> EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
__mmask16 mask = _mm512_cmpneq_ps_mask(a, pzero(a));
return _mm512_maskz_cvtepi32_epi8(mask, _mm512_set1_epi32(1));
}
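
The float-to-bool cast maps every nonzero lane (including NaN, since the compare is unordered) to 1; the scalar rule is simply:

// One byte per lane, matching Packet16b's layout.
unsigned char float_to_bool_lane(float a) { return a != 0.0f ? 1 : 0; }
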
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16b, Packet16f>(const Packet16b& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16b, Packet16f>(const Packet16b& a) {
return _mm512_cvtepi32_ps(_mm512_and_si512(_mm512_cvtepi8_epi32(a), _mm512_set1_epi32(1)));
}
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
return _mm512_cvttps_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
return _mm512_cvtps_pd(_mm512_castps512_ps256(a));
}
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
return _mm512_cvtps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
return _mm512_cvtepi32_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet16i, Packet8d>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16i, Packet8d>(const Packet16i& a) {
return _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
}
-template<> EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
return _mm512_cvtepi32_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
- return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
+ return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
}
-template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
- return cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
+ return cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
}
-template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8d, Packet8i>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8d, Packet8i>(const Packet8d& a) {
return _mm512_cvtpd_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8d, Packet8f>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8d, Packet8f>(const Packet8d& a) {
return _mm512_cvtpd_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
return _mm512_castps_si512(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
return _mm512_castsi512_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f& a) {
return _mm512_castps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
return _mm512_castpd_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet16f>(const Packet16f& a) {
return _mm512_castps512_ps256(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
return _mm512_castps512_ps128(a);
}
-template<> EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
return _mm512_castpd512_pd256(a);
}
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
return _mm512_castpd512_pd128(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
return _mm512_castps256_ps512(a);
}
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
return _mm512_castps128_ps512(a);
}
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
return _mm512_castpd256_pd512(a);
}
-template<> EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
return _mm512_castpd128_pd512(a);
}
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet16i>(const Packet16i& a) {
return _mm512_castsi512_si256(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i& a) {
return _mm512_castsi512_si128(a);
}
-template<> EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
return _mm256_castsi256_si128(a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
return _mm256_castsi256_si128(a);
}
#ifndef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
return half2float(a);
}
-template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
return float2half(a);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
return Bf16ToF32(a);
}
-template<> EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
return F32ToBf16(a);
}
#ifdef EIGEN_VECTORIZE_AVX512FP16
-template<> EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet32h>(const Packet32h& a) {
return _mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
}
-template<> EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet32h>(const Packet32h& a) {
return _mm256_castsi256_si128(preinterpret<Packet16h>(a));
}
@@ -182,12 +228,13 @@
return _mm512_cvtxph_ps(_mm256_castsi256_ph(low));
}
-
template <>
EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
__m512d result = _mm512_undefined_pd();
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0);
- result = _mm512_insertf64x4(result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1);
+ result = _mm512_insertf64x4(
+ result, _mm256_castsi256_pd(_mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
+ result = _mm512_insertf64x4(
+ result, _mm256_castsi256_pd(_mm512_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
return _mm512_castpd_ph(result);
}
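
Per lane, the conversion above is an IEEE float-to-half narrowing with round-to-nearest and exceptions suppressed; the vector code performs it for 16 lanes of each input and packs the two 256-bit results into one 512-bit register. A scalar sketch using the F16C intrinsic, assuming an F16C-capable toolchain:

#include <immintrin.h>
// Scalar model of pcast<Packet16f, Packet32h>: 32 half lanes, a's lanes first, then b's.
inline void cast_2x16f32_to_f16(const float* a, const float* b, unsigned short* out) {
  for (int i = 0; i < 16; ++i) {
    out[i] = _cvtss_sh(a[i], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    out[16 + i] = _cvtss_sh(b[i], _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  }
}
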
@@ -198,12 +245,13 @@
return _mm256_cvtxph_ps(_mm_castsi128_ph(low));
}
-
template <>
EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
__m256d result = _mm256_undefined_pd();
- result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 0);
- result = _mm256_insertf64x2(result, _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC)), 1);
+ result = _mm256_insertf64x2(result,
+ _mm_castsi128_pd(_mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 0);
+ result = _mm256_insertf64x2(result,
+ _mm_castsi128_pd(_mm256_cvtps_ph(b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)), 1);
return _mm256_castpd_si256(result);
}
@@ -214,7 +262,6 @@
return _mm256_extractf32x4_ps(full, 0);
}
-
template <>
EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
__m256 result = _mm256_undefined_ps();
@@ -223,11 +270,10 @@
return _mm256_cvtps_ph(result, _MM_FROUND_TO_NEAREST_INT);
}
-
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_AVX512_H
+#endif // EIGEN_TYPE_CASTING_AVX512_H
diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h
index 915b01b..7bfc61d 100644
--- a/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -18,25 +18,28 @@
namespace internal {
-static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+static Packet4ui p4ui_CONJ_XOR =
+ vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); //{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
#ifdef EIGEN_VECTORIZE_VSX
#if defined(_BIG_ENDIAN)
-static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+ (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8); //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+ (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8); //{ 0x8000000000000000, 0x0000000000000000 };
#else
-static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+ (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO, 8); //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+ (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO, 8); //{ 0x8000000000000000, 0x0000000000000000 };
#endif
#endif
//---------- float ----------
-struct Packet2cf
-{
+struct Packet2cf {
EIGEN_STRONG_INLINE explicit Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
- EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b)
- {
+ EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
Packet4f v1, v2;
// Permute and multiply the real parts of a and b
@@ -58,33 +61,25 @@
v = pmul(Packet2cf(*this), b).v;
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
- return Packet2cf(*this) *= b;
- }
+ EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
v = padd(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
- return Packet2cf(*this) += b;
- }
+ EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
v = psub(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
- return Packet2cf(*this) -= b;
- }
- EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
- return Packet2cf(-v);
- }
+ EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+ EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(-v); }
- Packet4f v;
+ Packet4f v;
};
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet2cf half;
typedef Packet4f as_real;
@@ -93,160 +88,232 @@
AlignedOnScalar = 1,
size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSqrt = 1,
#ifdef EIGEN_VECTORIZE_VSX
- HasBlend = 1,
+ HasBlend = 1,
#endif
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; typedef Packet4f as_real; };
+template <>
+struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet2cf half;
+ typedef Packet4f as_real;
+};
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
Packet2cf res;
#ifdef EIGEN_VECTORIZE_VSX
// Load a single std::complex<float> from memory and duplicate
//
// Using pload would read past the end of the reference in this case
  // Using vec_xl_len + vec_splat generates poor assembly
- __asm__ ("lxvdsx %x0,%y1" : "=wa" (res.v) : "Z" (from));
+ __asm__("lxvdsx %x0,%y1" : "=wa"(res.v) : "Z"(from));
#else
- if((std::ptrdiff_t(&from) % 16) == 0)
- res.v = pload<Packet4f>((const float *)&from);
+ if ((std::ptrdiff_t(&from) % 16) == 0)
+ res.v = pload<Packet4f>((const float*)&from);
else
- res.v = ploadu<Packet4f>((const float *)&from);
+ res.v = ploadu<Packet4f>((const float*)&from);
res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
#endif
return res;
}
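
Lane-wise, the broadcast above yields a packet whose four floats are {re, im, re, im}; the `lxvdsx` path and the `pload` + `vec_perm` fallback produce the same result. A scalar sketch of that layout:

#include <complex>
// Scalar model of pset1<Packet2cf>: duplicate one complex into both lanes.
inline void bcast_c32(const std::complex<float>& from, float out[4]) {
  out[0] = out[2] = from.real();
  out[1] = out[3] = from.imag();
}
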
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); }
-template<> EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
-{
- return Packet2cf(pload_partial<Packet4f>((const float *) from, n * 2, offset * 2));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ return Packet2cf(pload<Packet4f>((const float*)from));
}
-template<> EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n, const Index offset)
-{
- return Packet2cf(ploadu_partial<Packet4f>((const float*) from, n * 2, offset * 2));
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ return Packet2cf(ploadu<Packet4f>((const float*)from));
}
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+ const Index offset) {
+ return Packet2cf(pload_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+ const Index offset) {
+ return Packet2cf(ploadu_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ return pset1<Packet2cf>(*from);
+}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
-template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstore_partial((float*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float> * to, const Packet2cf& from, const Index n, const Index offset) { pstoreu_partial((float*)to, from.v, n * 2, offset * 2); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+ const Index n, const Index offset) {
+ pstore_partial((float*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+ const Index n, const Index offset) {
+ pstoreu_partial((float*)to, from.v, n * 2, offset * 2);
+}
-EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
-{
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1) {
Packet4f res0, res1;
#ifdef EIGEN_VECTORIZE_VSX
// Load two std::complex<float> from memory and combine
- __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
- __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
+ __asm__("lxsdx %x0,%y1" : "=wa"(res0) : "Z"(from0));
+ __asm__("lxsdx %x0,%y1" : "=wa"(res1) : "Z"(from1));
#ifdef _BIG_ENDIAN
- __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+ __asm__("xxpermdi %x0, %x1, %x2, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
#else
- __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
+ __asm__("xxpermdi %x0, %x2, %x1, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
#endif
#else
- *reinterpret_cast<std::complex<float> *>(&res0) = from0;
- *reinterpret_cast<std::complex<float> *>(&res1) = from1;
+ *reinterpret_cast<std::complex<float>*>(&res0) = from0;
+ *reinterpret_cast<std::complex<float>*>(&res1) = from1;
res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
#endif
return Packet2cf(res0);
}
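
For pload2 the resulting lane order is {from0, from1}, i.e. floats {re0, im0, re1, im1}, on both the VSX asm path and the permute fallback; the big/little-endian branches differ only in how the two doublewords are merged. In scalar terms:

#include <complex>
inline void load2_c32(const std::complex<float>& f0, const std::complex<float>& f1, float out[4]) {
  out[0] = f0.real(); out[1] = f0.imag();  // lane 0
  out[2] = f1.real(); out[3] = f1.imag();  // lane 1
}
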
-template<> EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from) {
Packet2cf res;
res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
return res;
}
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride, const Index n = 2)
-{
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride,
+ const Index n = 2) {
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
EIGEN_ALIGN16 Scalar af[2];
for (Index i = 0; i < n; i++) {
- af[i] = from[i*stride];
+ af[i] = from[i * stride];
}
return pload_ignore<Packet>(af);
}
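
The gather helper above reads `n` complex values spaced `stride` apart into an aligned staging array and then loads that as a packet; this is what lets, for example, a column of a row-major complex matrix be fetched as one Packet2cf. A scalar model of the addressing, with `n <= 2` as the helper asserts:

#include <complex>
inline void gather_c32(const std::complex<float>* from, long stride,
                       std::complex<float>* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = from[i * stride];  // mirrors af[i] = from[i*stride]
}
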
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+ Index stride) {
return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf
+pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n) {
return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
}
-template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride, const Index n = 2)
-{
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride,
+ const Index n = 2) {
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
EIGEN_ALIGN16 Scalar af[2];
- pstore<Scalar>((Scalar *) af, from);
+ pstore<Scalar>((Scalar*)af, from);
for (Index i = 0; i < n; i++) {
- to[i*stride] = af[i];
+ to[i * stride] = af[i];
}
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+ const Packet2cf& from,
+ Index stride) {
pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to,
+ const Packet2cf& from,
+ Index stride,
+ const Index n) {
pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
}
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ return Packet2cf(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+ return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+}
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ EIGEN_PPC_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
EIGEN_ALIGN16 std::complex<float> res[2];
- pstore((float *)&res, a.v);
+ pstore((float*)&res, a.v);
return res[0];
}
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
Packet4f rev_a;
rev_a = vec_sld(a.v, a.v, 8);
return Packet2cf(rev_a);
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
Packet4f b;
b = vec_sld(a.v, a.v, 8);
b = padd<Packet4f>(a.v, b);
return pfirst<Packet2cf>(Packet2cf(b));
}
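
predux above rotates the packet by one complex lane (8 bytes) and adds, so lane 0 of the sum holds a[0] + a[1], which pfirst then extracts. The scalar equivalent is just:

#include <complex>
inline std::complex<float> sum2_c32(const std::complex<float> a[2]) { return a[0] + a[1]; }
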
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
Packet4f b;
Packet2cf prod;
b = vec_sld(a.v, a.v, 8);
@@ -255,23 +322,24 @@
return pfirst<Packet2cf>(prod);
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
return pdiv_complex(a, b);
}
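
pdiv_complex is defined elsewhere; per lane it should reduce to the textbook quotient a * conj(b) / |b|^2. A scalar sketch of that identity (with none of the overflow or infinity handling a robust packet implementation may add):

#include <complex>
inline std::complex<float> div_c32(const std::complex<float>& a, const std::complex<float>& b) {
  float d = b.real() * b.real() + b.imag() * b.imag();  // |b|^2
  return std::complex<float>((a.real() * b.real() + a.imag() * b.imag()) / d,
                             (a.imag() * b.real() - a.real() * b.imag()) / d);
}
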
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
#ifdef EIGEN_VECTORIZE_VSX
- Packet4f tmp = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
- kernel.packet[1].v = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+ Packet4f tmp = reinterpret_cast<Packet4f>(
+ vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+ kernel.packet[1].v = reinterpret_cast<Packet4f>(
+ vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
#else
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
@@ -279,33 +347,35 @@
kernel.packet[0].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
- Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+ Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v, b.v));
return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
}
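
The AND-with-swapped-mask trick above implements "equal only if both components are equal": vec_cmpeq produces a per-float mask, and combining it with its re/im-swapped copy leaves a lane all-ones only when the real and imaginary parts both matched. Per complex lane this is simply:

#include <complex>
// Scalar model of pcmp_eq for one lane (all-ones/all-zeros mask modelled as bool).
inline bool lane_eq_c32(const std::complex<float>& a, const std::complex<float>& b) {
  return (a.real() == b.real()) && (a.imag() == b.imag());  // eq & eq_swapped
}
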
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
Packet2cf result;
- result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+ result.v = reinterpret_cast<Packet4f>(
+ pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
return result;
}
#endif
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
return psqrt_complex<Packet2cf>(a);
}
//---------- double ----------
#ifdef EIGEN_VECTORIZE_VSX
-struct Packet1cd
-{
+struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
- EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b)
- {
+ EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
Packet2d a_re, a_im, v1, v2;
// Permute and multiply the real parts of a and b
@@ -326,33 +396,25 @@
v = pmul(Packet1cd(*this), b).v;
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
- return Packet1cd(*this) *= b;
- }
+ EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
v = padd(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
- return Packet1cd(*this) += b;
- }
+ EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
v = psub(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
- return Packet1cd(*this) -= b;
- }
- EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
- return Packet1cd(-v);
- }
+ EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
+ EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(-v); }
Packet2d v;
};
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
typedef Packet2d as_real;
@@ -361,123 +423,204 @@
AlignedOnScalar = 0,
size = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasSqrt = 1,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; typedef Packet2d as_real; };
+template <>
+struct unpacket_traits<Packet1cd> {
+ typedef std::complex<double> type;
+ enum {
+ size = 1,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet1cd half;
+ typedef Packet2d as_real;
+};
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+ const Index offset) {
return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
}
-template<> EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+ const Index offset) {
return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); }
-template<> EIGEN_ALWAYS_INLINE void pstore_partial <std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstore_partial((double*)to, from.v, n * 2, offset * 2); }
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double> * to, const Packet1cd& from, const Index n, const Index offset) { pstoreu_partial((double*)to, from.v, n * 2, offset * 2); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ pstoreu((double*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+ const Index n, const Index offset) {
+ pstore_partial((double*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+ const Index n, const Index offset) {
+ pstoreu_partial((double*)to, from.v, n * 2, offset * 2);
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+ return ploadu<Packet1cd>(&from);
+}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index) {
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index) {
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+ const Packet1cd& from, Index) {
pstore<std::complex<double> >(to, from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index, const Index)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to,
+ const Packet1cd& from,
+ Index, const Index) {
pstore<std::complex<double> >(to, from);
}
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2)));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(pandnot(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ return pset1<Packet1cd>(*from);
+}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ EIGEN_PPC_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_ALIGN16 std::complex<double> res[1];
pstore<std::complex<double> >(res, a);
return res[0];
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+ return pfirst(a);
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+ return pfirst(a);
+}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
return pdiv_complex(a, b);
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
return Packet1cd(preverse(Packet2d(x.v)));
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
Packet2d tmp = vec_mergeh(kernel.packet[0].v, kernel.packet[1].v);
kernel.packet[1].v = vec_mergel(kernel.packet[0].v, kernel.packet[1].v);
kernel.packet[0].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
// Compare real and imaginary parts of a and b to get the mask vector:
// [re(a)==re(b), im(a)==im(b)]
- Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
+ Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v, b.v));
  // Swap real/imag elements in the mask to get:
// [im(a)==im(b), re(a)==re(b)]
- Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+ Packet2d eq_swapped =
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
// Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
return Packet1cd(vec_and(eq, eq_swapped));
}
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
return psqrt_complex<Packet1cd>(a);
}
-#endif // __VSX__
-} // end namespace internal
+#endif // __VSX__
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX32_ALTIVEC_H
+#endif // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index a8a2309..c95ee38 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -25,50 +25,47 @@
#endif
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
- return vec_sqrt(x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+ return vec_sqrt(x);
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x)
-{
- return vec_sqrt(x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+ return vec_sqrt(x);
}
#if !EIGEN_COMP_CLANG
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
-// vec_rsqrt returns different results from the generic version
-// return vec_rsqrt(x);
+ // vec_rsqrt returns different results from the generic version
+ // return vec_rsqrt(x);
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
-// vec_rsqrt returns different results from the generic version
-// return vec_rsqrt(x);
+ // vec_rsqrt returns different results from the generic version
+ // return vec_rsqrt(x);
}
#endif
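
The explicit `1 / psqrt(x)` above trades speed for exactness: a full square root plus division is correctly rounded, while hardware reciprocal-square-root estimates such as `vec_rsqrt` only guarantee limited relative accuracy, which is why the alternative stays disabled in the comments. The scalar equivalent:

#include <cmath>
inline float rsqrt_exact(float x) { return 1.0f / std::sqrt(x); }
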
-template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
}
#if !EIGEN_COMP_CLANG
-template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
}
#endif
#else
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
Packet4f a;
for (Index i = 0; i < packet_traits<float>::size; i++) {
a[i] = numext::sqrt(x[i]);
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
index e9a9307..94306da 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -12,17 +12,17 @@
#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
-#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
+#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
#endif
#if !defined(EIGEN_ALTIVEC_DISABLE_MMA)
#define EIGEN_ALTIVEC_DISABLE_MMA 0
#endif
-// Check for MMA builtin support.
+// Check for MMA builtin support.
#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin)
#if __has_builtin(__builtin_mma_assemble_acc)
- #define EIGEN_ALTIVEC_MMA_SUPPORT
+#define EIGEN_ALTIVEC_MMA_SUPPORT
#endif
#endif
@@ -41,12 +41,12 @@
#define EIGEN_ALTIVEC_MMA_ONLY 1
#endif
-#endif // EIGEN_ALTIVEC_MMA_SUPPORT
+#endif // EIGEN_ALTIVEC_MMA_SUPPORT
#include "MatrixProductCommon.h"
#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
- #include "MatrixProductMMA.h"
+#include "MatrixProductMMA.h"
#endif
// IWYU pragma: private
@@ -59,71 +59,41 @@
/**************************
* Constants and typedefs *
**************************/
-template<typename Scalar>
-struct quad_traits
-{
- typedef typename packet_traits<Scalar>::type vectortype;
- typedef PacketBlock<vectortype,4> type;
- typedef vectortype rhstype;
- enum
- {
- vectorsize = packet_traits<Scalar>::size,
- size = 4,
- rows = 4
- };
+template <typename Scalar>
+struct quad_traits {
+ typedef typename packet_traits<Scalar>::type vectortype;
+ typedef PacketBlock<vectortype, 4> type;
+ typedef vectortype rhstype;
+ enum { vectorsize = packet_traits<Scalar>::size, size = 4, rows = 4 };
};
-template<>
-struct quad_traits<double>
-{
- typedef Packet2d vectortype;
- typedef PacketBlock<vectortype,4> type;
- typedef PacketBlock<Packet2d,2> rhstype;
- enum
- {
- vectorsize = packet_traits<double>::size,
- size = 2,
- rows = 4
- };
+template <>
+struct quad_traits<double> {
+ typedef Packet2d vectortype;
+ typedef PacketBlock<vectortype, 4> type;
+ typedef PacketBlock<Packet2d, 2> rhstype;
+ enum { vectorsize = packet_traits<double>::size, size = 2, rows = 4 };
};
-template<>
-struct quad_traits<bfloat16>
-{
- typedef Packet8bf vectortype;
- typedef PacketBlock<vectortype,4> type;
- typedef vectortype rhstype;
- enum
- {
- vectorsize = packet_traits<bfloat16>::size,
- size = 8,
- rows = 4
- };
+template <>
+struct quad_traits<bfloat16> {
+ typedef Packet8bf vectortype;
+ typedef PacketBlock<vectortype, 4> type;
+ typedef vectortype rhstype;
+ enum { vectorsize = packet_traits<bfloat16>::size, size = 8, rows = 4 };
};
// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector; this turned out
// to be faster than Eigen's usual approach of keeping real/imaginary pairs in a single vector. These constants
// are responsible for extracting from, and converting between, Eigen's layout and MatrixProduct's.
-const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3,
- 8, 9, 10, 11,
- 16, 17, 18, 19,
- 24, 25, 26, 27};
+const static Packet16uc p16uc_GETREAL32 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
-const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
- 12, 13, 14, 15,
- 20, 21, 22, 23,
- 28, 29, 30, 31};
+const static Packet16uc p16uc_GETIMAG32 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
-const static Packet16uc p16uc_GETREAL32b = { 0, 1, 2, 3,
- 16, 17, 18, 19,
- 8, 9, 10, 11,
- 24, 25, 26, 27};
+const static Packet16uc p16uc_GETREAL32b = {0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27};
-const static Packet16uc p16uc_GETIMAG32b = { 4, 5, 6, 7,
- 20, 21, 22, 23,
- 12, 13, 14, 15,
- 28, 29, 30, 31};
+const static Packet16uc p16uc_GETIMAG32b = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
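
Reading the masks: byte indices 0-15 select from the first input vector of the permute and 16-31 from the second, so p16uc_GETREAL32 gathers the four real floats {r0,r1,r2,r3} from two interleaved complex vectors and p16uc_GETIMAG32 the four imaginaries. A scalar sketch of the split these permutes perform:

// ab holds two Packet4f's worth of interleaved complex floats:
// {r0,i0,r1,i1, r2,i2,r3,i3} -> re = {r0..r3}, im = {i0..i3}.
inline void split_c32x4(const float ab[8], float re[4], float im[4]) {
  for (int k = 0; k < 4; ++k) {
    re[k] = ab[2 * k];
    im[k] = ab[2 * k + 1];
  }
}
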
/*********************************************
* Single precision real and complex packing *
@@ -131,55 +101,50 @@
/**
 * Symm packing is related to packing of symmetric adjoint blocks; as expected, the packing leaves
- * the diagonal real, whatever is below it is copied from the respective upper diagonal element and
+ * the diagonal real, whatever is below it is copied from the respective upper diagonal element and
* conjugated. There's no PanelMode available for symm packing.
*
- * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using
+ * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using
* its respective rank-update instructions. The float32/64 versions are different because at this moment
* the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements.
- *
+ *
 * As mentioned earlier, MatrixProduct breaks complex numbers into a real vector and an imaginary vector, so
 * packing has to take that into account. At the moment we pack the real part and then the imaginary part; this is
 * the main reason why packing for complex is broken down into several different parts, and also why we end up
 * having float32/64 and complex float32/64 versions.
**/
-template<typename Scalar, int StorageOrder>
-EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(
+ Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt) {
std::complex<Scalar> v;
- if(i < j)
- {
- v.real( dt(j,i).real());
- v.imag(-dt(j,i).imag());
- } else if(i > j)
- {
- v.real( dt(i,j).real());
- v.imag( dt(i,j).imag());
+ if (i < j) {
+ v.real(dt(j, i).real());
+ v.imag(-dt(j, i).imag());
+ } else if (i > j) {
+ v.real(dt(i, j).real());
+ v.imag(dt(i, j).imag());
} else {
- v.real( dt(i,j).real());
+ v.real(dt(i, j).real());
v.imag((Scalar)0.0);
}
return v;
}
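
Concretely, getAdjointVal reconstructs a full Hermitian block from one stored triangle: entries above the diagonal are mirrored and conjugated, entries below are taken as stored, and the diagonal's imaginary part is forced to zero. A standalone sketch over a hypothetical 2x2 array standing in for the blas data mapper:

#include <complex>
inline std::complex<float> adjoint_val(int i, int j, const std::complex<float> m[2][2]) {
  if (i < j) return std::conj(m[j][i]);              // mirror + conjugate
  if (i > j) return m[i][j];                         // stored as-is
  return std::complex<float>(m[i][j].real(), 0.0f);  // diagonal forced real
}
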
-template<typename Scalar, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-{
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs,
+ Index rhsStride, Index rows, Index cols, Index k2) {
const Index depth = k2 + rows;
const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
- const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
+ const Index vectorSize = N * quad_traits<Scalar>::vectorsize;
const Index vectorDelta = vectorSize * rows;
- Scalar* blockBf = reinterpret_cast<Scalar *>(blockB);
+ Scalar* blockBf = reinterpret_cast<Scalar*>(blockB);
Index rir = 0, rii, j = 0;
- for(; j + vectorSize <= cols; j+=vectorSize)
- {
+ for (; j + vectorSize <= cols; j += vectorSize) {
rii = rir + vectorDelta;
- for(Index i = k2; i < depth; i++)
- {
- for(Index k = 0; k < vectorSize; k++)
- {
+ for (Index i = k2; i < depth; i++) {
+ for (Index k = 0; k < vectorSize; k++) {
std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j + k, rhs);
blockBf[rir + k] = v.real();
@@ -192,12 +157,10 @@
rir += vectorDelta;
}
- for(; j < cols; j++)
- {
+ for (; j < cols; j++) {
rii = rir + rows;
- for(Index i = k2; i < depth; i++)
- {
+ for (Index i = k2; i < depth; i++) {
std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j, rhs);
blockBf[rir] = v.real();
@@ -211,25 +174,22 @@
}
}
-template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs,
+ Index lhsStride, Index cols, Index rows) {
const Index depth = cols;
const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
const Index vectorSize = quad_traits<Scalar>::vectorsize;
const Index vectorDelta = vectorSize * depth;
- Scalar* blockAf = reinterpret_cast<Scalar *>(blockA);
+ Scalar* blockAf = reinterpret_cast<Scalar*>(blockA);
Index rir = 0, rii, j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
rii = rir + vectorDelta;
- for(Index i = 0; i < depth; i++)
- {
- for(Index k = 0; k < vectorSize; k++)
- {
- std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j+k, i, lhs);
+ for (Index i = 0; i < depth; i++) {
+ for (Index k = 0; k < vectorSize; k++) {
+ std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j + k, i, lhs);
blockAf[rir + k] = v.real();
blockAf[rii + k] = v.imag();
@@ -241,15 +201,12 @@
rir += vectorDelta;
}
- if (j < rows)
- {
+ if (j < rows) {
rii = rir + ((rows - j) * depth);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
+ for (; k < rows; k++) {
std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(k, i, lhs);
blockAf[rir] = v.real();
@@ -262,35 +219,30 @@
}
}
-template<typename Scalar, int StorageOrder, int N>
-EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-{
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows,
+ Index cols, Index k2) {
const Index depth = k2 + rows;
const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
const Index vectorSize = quad_traits<Scalar>::vectorsize;
Index ri = 0, j = 0;
- for(; j + N*vectorSize <= cols; j+=N*vectorSize)
- {
+ for (; j + N * vectorSize <= cols; j += N * vectorSize) {
Index i = k2;
- for(; i < depth; i++)
- {
- for(Index k = 0; k < N*vectorSize; k++)
- {
- if(i <= j+k)
- blockB[ri + k] = rhs(j+k, i);
+ for (; i < depth; i++) {
+ for (Index k = 0; k < N * vectorSize; k++) {
+ if (i <= j + k)
+ blockB[ri + k] = rhs(j + k, i);
else
- blockB[ri + k] = rhs(i, j+k);
+ blockB[ri + k] = rhs(i, j + k);
}
- ri += N*vectorSize;
+ ri += N * vectorSize;
}
}
- for(; j < cols; j++)
- {
- for(Index i = k2; i < depth; i++)
- {
- if(j <= i)
+ for (; j < cols; j++) {
+ for (Index i = k2; i < depth; i++) {
+ if (j <= i)
blockB[ri] = rhs(i, j);
else
blockB[ri] = rhs(j, i);
@@ -299,39 +251,33 @@
}
}
-template<typename Scalar, int StorageOrder>
-EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
-{
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols,
+ Index rows) {
const Index depth = cols;
const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
const Index vectorSize = quad_traits<Scalar>::vectorsize;
Index ri = 0, j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
Index i = 0;
- for(; i < depth; i++)
- {
- for(Index k = 0; k < vectorSize; k++)
- {
- if(i <= j+k)
- blockA[ri + k] = lhs(j+k, i);
+ for (; i < depth; i++) {
+ for (Index k = 0; k < vectorSize; k++) {
+ if (i <= j + k)
+ blockA[ri + k] = lhs(j + k, i);
else
- blockA[ri + k] = lhs(i, j+k);
+ blockA[ri + k] = lhs(i, j + k);
}
ri += vectorSize;
}
}
- if (j < rows)
- {
- for(Index i = 0; i < depth; i++)
- {
+ if (j < rows) {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
- if(i <= k)
+ for (; k < rows; k++) {
+ if (i <= k)
blockA[ri] = lhs(k, i);
else
blockA[ri] = lhs(i, k);
@@ -341,85 +287,73 @@
}
}
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
-{
- void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
- {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder> {
+ void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols,
+ Index k2) {
symm_pack_complex_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
}
};
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
-{
- void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
- {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder> {
+ void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols,
+ Index rows) {
symm_pack_complex_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
}
};
// *********** symm_pack std::complex<float64> ***********
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
-{
- void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
- {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder> {
+ void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows,
+ Index cols, Index k2) {
symm_pack_complex_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
}
};
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
-{
- void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
- {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder> {
+ void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols,
+ Index rows) {
symm_pack_complex_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
}
};
// *********** symm_pack float32 ***********
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<float, Index, nr, StorageOrder>
-{
- void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
- {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<float, Index, nr, StorageOrder> {
+ void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
symm_pack_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
}
};
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
-{
- void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows)
- {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder> {
+ void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) {
symm_pack_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
}
};
// *********** symm_pack float64 ***********
-template<typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs<double, Index, nr, StorageOrder>
-{
- void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
- {
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<double, Index, nr, StorageOrder> {
+ void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
symm_pack_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
}
};
-template<typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
-struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
-{
- void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows)
- {
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder> {
+ void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) {
symm_pack_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
}
};
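
The symm_pack specializations above all reduce to the same mirroring rule: an element requested from the unstored triangle of the symmetric matrix is fetched from its transposed position, so the packed block sees a fully populated matrix. A minimal scalar sketch of that rule (hypothetical names, plain arrays instead of Eigen's blas_data_mapper):

#include <cstdio>

// Hypothetical scalar model of the symm_pack mirroring rule: only one
// triangle is authoritative, so element (r, c) with c > r is read from the
// mirrored position (c, r).
static double symAt(const double* a, int n, int r, int c) {
  return (c <= r) ? a[r + c * n] : a[c + r * n];  // column-major, lower triangle stored
}

int main() {
  const int n = 3;
  // Column-major storage; only the lower triangle is meaningful.
  const double a[n * n] = {1, 2, 3,
                           0, 4, 5,
                           0, 0, 6};
  double packed[n * n];
  int ri = 0;
  for (int i = 0; i < n; i++)    // depth (columns)
    for (int k = 0; k < n; k++)  // rows
      packed[ri++] = symAt(a, n, k, i);
  for (int idx = 0; idx < n * n; idx++) std::printf("%g ", packed[idx]);
  std::printf("\n");  // prints: 1 2 3 2 4 5 3 5 6 (the mirrored dense matrix)
  return 0;
}
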
/**
* PanelMode
- * Packing might be called several times before being multiplied by gebp_kernel, this happens because
+ * Packing might be called several times before being multiplied by gebp_kernel; this happens because
* on special occasions it fills part of the block with other parts of the matrix. Two variables control
* how PanelMode should behave: offset and stride. The idea is that those variables represent the
* offset and stride that will eventually be used, and the packing must obey them. The process
@@ -428,9 +362,8 @@
* and offset and behaves accordingly.
**/
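
As a rough scalar illustration of the offset/stride contract described above (a hypothetical model, not Eigen's packing API): each packed panel reserves offset slots before its data and stride - offset - depth slots after it, mirroring the ri += adjustments in the packers below.

#include <cstdio>
#include <vector>

// Hypothetical sketch of PanelMode bookkeeping: pack `rows` scalar columns of
// length `depth` into a buffer laid out with panel `stride`, starting each
// panel `offset` elements in.
void packPanels(std::vector<double>& block, const double* src, int depth, int rows,
                int stride, int offset) {
  int ri = 0;
  for (int j = 0; j < rows; j++) {
    ri += offset;                        // leave room promised to other packing calls
    for (int i = 0; i < depth; i++)
      block[ri++] = src[j * depth + i];  // regular packing of this panel's data
    ri += stride - offset - depth;       // skip to the start of the next panel
  }
}

int main() {
  const int depth = 2, rows = 2, stride = 4, offset = 1;
  const double src[rows * depth] = {1, 2, 3, 4};
  std::vector<double> block(rows * stride, 0.0);
  packPanels(block, src, depth, rows, stride, offset);
  for (double v : block) std::printf("%g ", v);  // 0 1 2 0 0 3 4 0
  std::printf("\n");
}
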
-template<typename Scalar, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet,N>& block)
-{
+template <typename Scalar, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet, N>& block) {
const Index size = 16 / sizeof(Scalar);
pstore<Scalar>(to + (0 * size), block.packet[0]);
pstore<Scalar>(to + (1 * size), block.packet[1]);
@@ -443,11 +376,12 @@
}
// General template for lhs & rhs complex packing.
-template<typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode, bool UseLhs>
+template <typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate,
+ bool PanelMode, bool UseLhs>
struct dhs_cpack {
- template<bool transpose>
- EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC,8>& cblock, PacketBlock<Packet,4>& block, Packet16uc permute)
- {
+ template <bool transpose>
+ EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC, 8>& cblock, PacketBlock<Packet, 4>& block,
+ Packet16uc permute) {
if (transpose) {
block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, permute);
block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, permute);
@@ -456,10 +390,14 @@
Packet4f t0, t1, t2, t3;
#ifdef EIGEN_VECTORIZE_VSX
- t0 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
- t1 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
- t2 = reinterpret_cast<Packet>(vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
- t3 = reinterpret_cast<Packet>(vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+ t0 = reinterpret_cast<Packet>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+ t1 = reinterpret_cast<Packet>(
+ vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+ t2 = reinterpret_cast<Packet>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+ t3 = reinterpret_cast<Packet>(
+ vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
#else
t0 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_HI));
t1 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_LO));
@@ -479,21 +417,19 @@
}
}
- EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
- {
- PacketBlock<Packet,4> blockr, blocki;
- PacketBlock<PacketC,8> cblock;
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+ Index depth, const Index vectorSize) {
+ PacketBlock<Packet, 4> blockr, blocki;
+ PacketBlock<PacketC, 8> cblock;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
+ for (; i + vectorSize <= depth; i += vectorSize) {
if (UseLhs) {
bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
} else {
bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
}
- if(((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs)))
- {
+ if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
dhs_cblock<true>(cblock, blockr, p16uc_GETREAL32b);
dhs_cblock<true>(cblock, blocki, p16uc_GETIMAG32b);
} else {
@@ -501,8 +437,7 @@
dhs_cblock<false>(cblock, blocki, p16uc_GETIMAG32);
}
- if(Conjugate)
- {
+ if (Conjugate) {
blocki.packet[0] = -blocki.packet[0];
blocki.packet[1] = -blocki.packet[1];
blocki.packet[2] = -blocki.packet[2];
@@ -512,21 +447,20 @@
storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
- rir += 4*vectorSize;
- rii += 4*vectorSize;
+ rir += 4 * vectorSize;
+ rii += 4 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
const Index vectorSize = quad_traits<Scalar>::vectorsize;
const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
- Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
- Scalar* blockAt = reinterpret_cast<Scalar *>(blockA);
+ Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+ Scalar* blockAt = reinterpret_cast<Scalar*>(blockA);
Index j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
Index i = 0;
@@ -534,13 +468,11 @@
dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
- for(; i < depth; i++)
- {
- PacketBlock<Packet,1> blockr, blocki;
- PacketBlock<PacketC,2> cblock;
+ for (; i < depth; i++) {
+ PacketBlock<Packet, 1> blockr, blocki;
+ PacketBlock<PacketC, 2> cblock;
- if(((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs)))
- {
+ if (((StorageOrder == ColMajor) && UseLhs) || ((StorageOrder == RowMajor) && !UseLhs)) {
if (UseLhs) {
cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
cblock.packet[1] = lhs2.template loadPacket<PacketC>(2, i);
@@ -561,8 +493,7 @@
blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
- if(Conjugate)
- {
+ if (Conjugate) {
blocki.packet[0] = -blocki.packet[0];
}
@@ -573,50 +504,44 @@
rii += vectorSize;
}
- rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+ rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
}
- if (!UseLhs)
- {
- if(PanelMode) rir -= (offset*(vectorSize - 1));
+ if (!UseLhs) {
+ if (PanelMode) rir -= (offset * (vectorSize - 1));
- for(; j < rows; j++)
- {
+ for (; j < rows; j++) {
const DataMapper lhs2 = lhs.getSubMapper(0, j);
rii = rir + ((PanelMode) ? stride : depth);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
blockAt[rir] = lhs2(i, 0).real();
- if(Conjugate)
+ if (Conjugate)
blockAt[rii] = -lhs2(i, 0).imag();
else
- blockAt[rii] = lhs2(i, 0).imag();
+ blockAt[rii] = lhs2(i, 0).imag();
rir += 1;
rii += 1;
}
- rir += ((PanelMode) ? (2*stride - depth) : depth);
+ rir += ((PanelMode) ? (2 * stride - depth) : depth);
}
} else {
- if (j < rows)
- {
- if(PanelMode) rir += (offset*(rows - j - vectorSize));
+ if (j < rows) {
+ if (PanelMode) rir += (offset * (rows - j - vectorSize));
rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
+ for (; k < rows; k++) {
blockAt[rir] = lhs(k, i).real();
- if(Conjugate)
+ if (Conjugate)
blockAt[rii] = -lhs(k, i).imag();
else
- blockAt[rii] = lhs(k, i).imag();
+ blockAt[rii] = lhs(k, i).imag();
rir += 1;
rii += 1;
@@ -628,68 +553,63 @@
};
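
dhs_cpack stores all real parts in one region (indexed by rir) and all imaginary parts in another (rii), negating the imaginary stream when Conjugate is set. A scalar sketch of that layout for a single column (assumed layout, no VSX intrinsics):

#include <complex>
#include <cstdio>

// Hypothetical scalar model of dhs_cpack's split layout: for each column of
// `depth` complex values, real parts are written starting at rir and
// imaginary parts at rii = rir + depth, negated when Conjugate is set.
template <bool Conjugate>
void cpackColumn(double* blockAt, const std::complex<double>* col, int depth) {
  int rir = 0, rii = depth;
  for (int i = 0; i < depth; i++) {
    blockAt[rir++] = col[i].real();
    blockAt[rii++] = Conjugate ? -col[i].imag() : col[i].imag();
  }
}

int main() {
  const std::complex<double> col[2] = {{1, 2}, {3, 4}};
  double packed[4];
  cpackColumn<true>(packed, col, 2);
  for (double v : packed) std::printf("%g ", v);  // 1 3 -2 -4
  std::printf("\n");
}
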
// General template for lhs & rhs packing.
-template<typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
-struct dhs_pack{
- template<Index n>
- EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
- {
- PacketBlock<Packet,4> block[n];
+template <typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack {
+ template <Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+ const Index vectorSize) {
+ PacketBlock<Packet, 4> block[n];
- for(; i + n*vectorSize <= depth; i+=n*vectorSize)
- {
+ for (; i + n * vectorSize <= depth; i += n * vectorSize) {
for (Index k = 0; k < n; k++) {
if (UseLhs) {
- bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k*vectorSize);
+ bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k * vectorSize);
} else {
- bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k*vectorSize, 0);
+ bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k * vectorSize, 0);
}
}
- if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
- {
+ if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
for (Index k = 0; k < n; k++) {
ptranspose(block[k]);
}
}
for (Index k = 0; k < n; k++) {
- storeBlock<Scalar, Packet, 4>(blockA + ri + k*4*vectorSize, block[k]);
+ storeBlock<Scalar, Packet, 4>(blockA + ri + k * 4 * vectorSize, block[k]);
}
- ri += n*4*vectorSize;
+ ri += n * 4 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+ Index offset) {
const Index vectorSize = quad_traits<Scalar>::vectorsize;
Index ri = 0, j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
Index i = 0;
- if(PanelMode) ri += vectorSize*offset;
+ if (PanelMode) ri += vectorSize * offset;
dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
- for(; i < depth; i++)
- {
- if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs))
- {
+ for (; i < depth; i++) {
+ if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
if (UseLhs) {
- blockA[ri+0] = lhs2(0, i);
- blockA[ri+1] = lhs2(1, i);
- blockA[ri+2] = lhs2(2, i);
- blockA[ri+3] = lhs2(3, i);
+ blockA[ri + 0] = lhs2(0, i);
+ blockA[ri + 1] = lhs2(1, i);
+ blockA[ri + 2] = lhs2(2, i);
+ blockA[ri + 3] = lhs2(3, i);
} else {
- blockA[ri+0] = lhs2(i, 0);
- blockA[ri+1] = lhs2(i, 1);
- blockA[ri+2] = lhs2(i, 2);
- blockA[ri+3] = lhs2(i, 3);
+ blockA[ri + 0] = lhs2(i, 0);
+ blockA[ri + 1] = lhs2(i, 1);
+ blockA[ri + 2] = lhs2(i, 2);
+ blockA[ri + 3] = lhs2(i, 3);
}
} else {
Packet lhsV;
@@ -704,34 +624,28 @@
ri += vectorSize;
}
- if(PanelMode) ri += vectorSize*(stride - offset - depth);
+ if (PanelMode) ri += vectorSize * (stride - offset - depth);
}
- if (!UseLhs)
- {
- if(PanelMode) ri += offset;
+ if (!UseLhs) {
+ if (PanelMode) ri += offset;
- for(; j < rows; j++)
- {
+ for (; j < rows; j++) {
const DataMapper lhs2 = lhs.getSubMapper(0, j);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
blockA[ri] = lhs2(i, 0);
ri += 1;
}
- if(PanelMode) ri += stride - depth;
+ if (PanelMode) ri += stride - depth;
}
} else {
- if (j < rows)
- {
- if(PanelMode) ri += offset*(rows - j);
+ if (j < rows) {
+ if (PanelMode) ri += offset * (rows - j);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
+ for (; k < rows; k++) {
blockA[ri] = lhs(k, i);
ri += 1;
}
@@ -742,64 +656,57 @@
};
// General template for lhs packing, float64 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true>
-{
- template<Index n>
- EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
- {
- PacketBlock<Packet2d,2> block[n];
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true> {
+ template <Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+ const Index vectorSize) {
+ PacketBlock<Packet2d, 2> block[n];
- for(; i + n*vectorSize <= depth; i+=n*vectorSize)
- {
+ for (; i + n * vectorSize <= depth; i += n * vectorSize) {
for (Index k = 0; k < n; k++) {
- if(StorageOrder == RowMajor)
- {
- block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize);
- block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k*vectorSize);
+ if (StorageOrder == RowMajor) {
+ block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize);
+ block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k * vectorSize);
} else {
- block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 0);
- block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k*vectorSize + 1);
+ block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 0);
+ block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 1);
}
}
- if(StorageOrder == RowMajor)
- {
+ if (StorageOrder == RowMajor) {
for (Index k = 0; k < n; k++) {
ptranspose(block[k]);
}
}
for (Index k = 0; k < n; k++) {
- storeBlock<double, Packet2d, 2>(blockA + ri + k*2*vectorSize, block[k]);
+ storeBlock<double, Packet2d, 2>(blockA + ri + k * 2 * vectorSize, block[k]);
}
- ri += n*2*vectorSize;
+ ri += n * 2 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+ Index offset) {
const Index vectorSize = quad_traits<double>::vectorsize;
Index ri = 0, j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
Index i = 0;
- if(PanelMode) ri += vectorSize*offset;
+ if (PanelMode) ri += vectorSize * offset;
dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
- for(; i < depth; i++)
- {
- if(StorageOrder == RowMajor)
- {
- blockA[ri+0] = lhs2(0, i);
- blockA[ri+1] = lhs2(1, i);
+ for (; i < depth; i++) {
+ if (StorageOrder == RowMajor) {
+ blockA[ri + 0] = lhs2(0, i);
+ blockA[ri + 1] = lhs2(1, i);
} else {
Packet2d lhsV = lhs2.template loadPacket<Packet2d>(0, i);
pstore<double>(blockA + ri, lhsV);
@@ -808,18 +715,15 @@
ri += vectorSize;
}
- if(PanelMode) ri += vectorSize*(stride - offset - depth);
+ if (PanelMode) ri += vectorSize * (stride - offset - depth);
}
- if (j < rows)
- {
- if(PanelMode) ri += offset*(rows - j);
+ if (j < rows) {
+ if (PanelMode) ri += offset * (rows - j);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
+ for (; k < rows; k++) {
blockA[ri] = lhs(k, i);
ri += 1;
}
@@ -829,34 +733,30 @@
};
// General template for rhs packing, float64 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false>
-{
- template<Index n>
- EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth, const Index vectorSize)
- {
- PacketBlock<Packet2d,2> block1[n], block2[n];
- PacketBlock<Packet2d,4> block3[n];
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false> {
+ template <Index n>
+ EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth,
+ const Index vectorSize) {
+ PacketBlock<Packet2d, 2> block1[n], block2[n];
+ PacketBlock<Packet2d, 4> block3[n];
- for(; i + n*vectorSize <= depth; i+=n*vectorSize)
- {
+ for (; i + n * vectorSize <= depth; i += n * vectorSize) {
for (Index k = 0; k < n; k++) {
- if(StorageOrder == ColMajor)
- {
- block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 0);
- block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 1);
- block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 2);
- block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize, 3);
+ if (StorageOrder == ColMajor) {
+ block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 0);
+ block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 1);
+ block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 2);
+ block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 3);
} else {
- block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 0); //[a1 a2]
- block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 0, 2); //[a3 a4]
- block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 0); //[b1 b2]
- block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k*vectorSize + 1, 2); //[b3 b4]
+ block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 0); //[a1 a2]
+ block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 2); //[a3 a4]
+ block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 0); //[b1 b2]
+ block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 2); //[b3 b4]
}
}
- if(StorageOrder == ColMajor)
- {
+ if (StorageOrder == ColMajor) {
for (Index k = 0; k < n; k++) {
ptranspose(block1[k]);
ptranspose(block2[k]);
@@ -864,48 +764,44 @@
}
for (Index k = 0; k < n; k++) {
- if(StorageOrder == ColMajor)
- {
- pstore<double>(blockB + ri + k*4*vectorSize , block1[k].packet[0]);
- pstore<double>(blockB + ri + k*4*vectorSize + 2, block2[k].packet[0]);
- pstore<double>(blockB + ri + k*4*vectorSize + 4, block1[k].packet[1]);
- pstore<double>(blockB + ri + k*4*vectorSize + 6, block2[k].packet[1]);
+ if (StorageOrder == ColMajor) {
+ pstore<double>(blockB + ri + k * 4 * vectorSize, block1[k].packet[0]);
+ pstore<double>(blockB + ri + k * 4 * vectorSize + 2, block2[k].packet[0]);
+ pstore<double>(blockB + ri + k * 4 * vectorSize + 4, block1[k].packet[1]);
+ pstore<double>(blockB + ri + k * 4 * vectorSize + 6, block2[k].packet[1]);
} else {
- storeBlock<double, Packet2d, 4>(blockB + ri + k*4*vectorSize, block3[k]);
+ storeBlock<double, Packet2d, 4>(blockB + ri + k * 4 * vectorSize, block3[k]);
}
}
- ri += n*4*vectorSize;
+ ri += n * 4 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+ Index offset) {
const Index vectorSize = quad_traits<double>::vectorsize;
Index ri = 0, j = 0;
- for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
- {
+ for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
const DataMapper rhs2 = rhs.getSubMapper(0, j);
Index i = 0;
- if(PanelMode) ri += offset*(2*vectorSize);
+ if (PanelMode) ri += offset * (2 * vectorSize);
dhs_copy<4>(blockB, rhs2, i, ri, depth, vectorSize);
dhs_copy<2>(blockB, rhs2, i, ri, depth, vectorSize);
dhs_copy<1>(blockB, rhs2, i, ri, depth, vectorSize);
- for(; i < depth; i++)
- {
- if(StorageOrder == ColMajor)
- {
- blockB[ri+0] = rhs2(i, 0);
- blockB[ri+1] = rhs2(i, 1);
+ for (; i < depth; i++) {
+ if (StorageOrder == ColMajor) {
+ blockB[ri + 0] = rhs2(i, 0);
+ blockB[ri + 1] = rhs2(i, 1);
ri += vectorSize;
- blockB[ri+0] = rhs2(i, 2);
- blockB[ri+1] = rhs2(i, 3);
+ blockB[ri + 0] = rhs2(i, 2);
+ blockB[ri + 1] = rhs2(i, 3);
} else {
Packet2d rhsV = rhs2.template loadPacket<Packet2d>(i, 0);
pstore<double>(blockB + ri, rhsV);
@@ -918,46 +814,40 @@
ri += vectorSize;
}
- if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
+ if (PanelMode) ri += (2 * vectorSize) * (stride - offset - depth);
}
- if(PanelMode) ri += offset;
+ if (PanelMode) ri += offset;
- for(; j < cols; j++)
- {
+ for (; j < cols; j++) {
const DataMapper rhs2 = rhs.getSubMapper(0, j);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
blockB[ri] = rhs2(i, 0);
ri += 1;
}
- if(PanelMode) ri += stride - depth;
+ if (PanelMode) ri += stride - depth;
}
}
};
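
The vec_mergeh/vec_mergel pairs used throughout these float64 packers are a 2x2 transpose in disguise: merging the high halves of two 2-element registers yields the first transposed row, merging the low halves yields the second. A portable scalar model (assumed merge semantics, std::array standing in for Packet2d):

#include <array>
#include <cstdio>

using P2 = std::array<double, 2>;  // scalar stand-in for Packet2d

// Scalar models of the two merge intrinsics (assumed semantics): mergeh takes
// the first element of each input, mergel the second.
static P2 mergeh(P2 a, P2 b) { return {a[0], b[0]}; }
static P2 mergel(P2 a, P2 b) { return {a[1], b[1]}; }

int main() {
  P2 row0 = {1, 2}, row1 = {3, 4};  // a 2x2 block, one row per "packet"
  P2 col0 = mergeh(row0, row1);     // {1, 3}
  P2 col1 = mergel(row0, row1);     // {2, 4} -- together: the transpose
  std::printf("%g %g / %g %g\n", col0[0], col0[1], col1[0], col1[1]);
}
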
// General template for lhs packing, bfloat16 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true>
-{
- EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
- {
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true> {
+ EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+ Index offset) {
const Index vectorSize = quad_traits<bfloat16>::vectorsize;
Index ri = 0, j = 0;
- for(; j + 2*vectorSize <= rows; j+=2*vectorSize)
- {
+ for (; j + 2 * vectorSize <= rows; j += 2 * vectorSize) {
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
Index i = 0;
- if(PanelMode) ri += 2*vectorSize*offset;
+ if (PanelMode) ri += 2 * vectorSize * offset;
- if(StorageOrder == ColMajor)
- {
- for(; i + 2 <= depth; i+=2)
- {
- PacketBlock<Packet8bf,4> block;
+ if (StorageOrder == ColMajor) {
+ for (; i + 2 <= depth; i += 2) {
+ PacketBlock<Packet8bf, 4> block;
block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
@@ -965,8 +855,8 @@
block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
Packet8bf t0, t1;
- t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
- t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
+ t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
+ t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
block.packet[0] = t0;
@@ -974,200 +864,237 @@
storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
- ri += 2*2*vectorSize;
+ ri += 2 * 2 * vectorSize;
}
- if (depth & 1)
- {
- PacketBlock<Packet8bf,2> block;
+ if (depth & 1) {
+ PacketBlock<Packet8bf, 2> block;
block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
- ri += 2*vectorSize;
+ ri += 2 * vectorSize;
}
} else {
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet8bf,8> block1, block2;
+ for (; i + vectorSize <= depth; i += vectorSize) {
+ PacketBlock<Packet8bf, 8> block1, block2;
bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
Packet4ui v1[8], v2[8];
- v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
- v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
- v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
- v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
- v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
- v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
- v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
- v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
- v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
- v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val), reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
- v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
- v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val), reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
- v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
- v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val), reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
- v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
- v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val), reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+ v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+ v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+ v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+ v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+ v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+ v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+ v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+ v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+ v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+ v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+ v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+ v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+ v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+ v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+ v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+ v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
#ifdef EIGEN_VECTORIZE_VSX
- block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
- block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
- block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
- block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
- block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
- block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
- block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
- block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
- block2.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
- block2.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[0]),reinterpret_cast<Packet2ul>(v2[2])));
- block2.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
- block2.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[1]),reinterpret_cast<Packet2ul>(v2[3])));
- block2.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
- block2.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[4]),reinterpret_cast<Packet2ul>(v2[6])));
- block2.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
- block2.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v2[5]),reinterpret_cast<Packet2ul>(v2[7])));
+ block1.packet[0] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+ block1.packet[2] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+ block1.packet[4] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+ block1.packet[6] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+ block1.packet[1] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+ block1.packet[3] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+ block1.packet[5] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+ block1.packet[7] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+ block2.packet[0] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+ block2.packet[2] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+ block2.packet[4] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+ block2.packet[6] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+ block2.packet[1] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+ block2.packet[3] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+ block2.packet[5] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
+ block2.packet[7] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
#else
- block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
- block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
- block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
- block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
- block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
- block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
- block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
- block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
- block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_HI));
- block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0],v2[2],p16uc_TRANSPOSE64_LO));
- block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_HI));
- block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1],v2[3],p16uc_TRANSPOSE64_LO));
- block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_HI));
- block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4],v2[6],p16uc_TRANSPOSE64_LO));
- block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_HI));
- block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5],v2[7],p16uc_TRANSPOSE64_LO));
+ block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+ block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+ block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+ block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+ block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+ block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+ block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+ block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
+ block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_HI));
+ block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_LO));
+ block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_HI));
+ block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_LO));
+ block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_HI));
+ block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_LO));
+ block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_HI));
+ block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_LO));
#endif
- for(Index M = 0; M < 8; M+=2) {
- pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2*vectorSize * M), block1.packet[M+0]);
- pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2*vectorSize * M), block1.packet[M+1]);
- pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2*vectorSize * M), block2.packet[M+0]);
- pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2*vectorSize * M), block2.packet[M+1]);
+ for (Index M = 0; M < 8; M += 2) {
+ pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 0]);
+ pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 1]);
+ pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 0]);
+ pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 1]);
}
- ri += 2*vectorSize*vectorSize;
+ ri += 2 * vectorSize * vectorSize;
}
- for(; i + 2 <= depth; i+=2)
- {
- for(Index M = 0; M < 2*vectorSize; M++) {
+ for (; i + 2 <= depth; i += 2) {
+ for (Index M = 0; M < 2 * vectorSize; M++) {
blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
}
- ri += 2*2*vectorSize;
+ ri += 2 * 2 * vectorSize;
}
- if (depth & 1)
- {
- for(Index M = 0; M < 2*vectorSize; M++) {
+ if (depth & 1) {
+ for (Index M = 0; M < 2 * vectorSize; M++) {
blockA[ri + M] = lhs2(M, i);
}
- ri += 2*vectorSize;
+ ri += 2 * vectorSize;
}
}
- if(PanelMode) ri += 2*vectorSize*(stride - offset - depth);
+ if (PanelMode) ri += 2 * vectorSize * (stride - offset - depth);
}
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
Index i = 0;
- if(PanelMode) ri += vectorSize*offset;
+ if (PanelMode) ri += vectorSize * offset;
- if(StorageOrder == ColMajor)
- {
- for(; i + 2 <= depth; i+=2)
- {
- PacketBlock<Packet8bf,2> block;
+ if (StorageOrder == ColMajor) {
+ for (; i + 2 <= depth; i += 2) {
+ PacketBlock<Packet8bf, 2> block;
block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
Packet8bf t0;
- t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+ t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
block.packet[0] = t0;
storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
- ri += 2*vectorSize;
+ ri += 2 * vectorSize;
}
- if (depth & 1)
- {
+ if (depth & 1) {
Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
pstore<bfloat16>(blockA + ri, lhsV);
ri += vectorSize;
}
} else {
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- PacketBlock<Packet8bf,8> block1;
+ for (; i + vectorSize <= depth; i += vectorSize) {
+ PacketBlock<Packet8bf, 8> block1;
bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
Packet4ui v1[8];
// This is transposing and interleaving data
- v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
- v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val), reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
- v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
- v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val), reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
- v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
- v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val), reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
- v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
- v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val), reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+ v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+ v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+ v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+ v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+ v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+ v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+ v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+ v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+ reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
#ifdef EIGEN_VECTORIZE_VSX
- block1.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
- block1.packet[2] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[0]),reinterpret_cast<Packet2ul>(v1[2])));
- block1.packet[4] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
- block1.packet[6] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[1]),reinterpret_cast<Packet2ul>(v1[3])));
- block1.packet[1] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
- block1.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[4]),reinterpret_cast<Packet2ul>(v1[6])));
- block1.packet[5] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
- block1.packet[7] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(v1[5]),reinterpret_cast<Packet2ul>(v1[7])));
+ block1.packet[0] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+ block1.packet[2] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+ block1.packet[4] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+ block1.packet[6] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+ block1.packet[1] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+ block1.packet[3] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+ block1.packet[5] = reinterpret_cast<Packet8us>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+ block1.packet[7] = reinterpret_cast<Packet8us>(
+ vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
#else
- block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_HI));
- block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0],v1[2],p16uc_TRANSPOSE64_LO));
- block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_HI));
- block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1],v1[3],p16uc_TRANSPOSE64_LO));
- block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_HI));
- block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4],v1[6],p16uc_TRANSPOSE64_LO));
- block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_HI));
- block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5],v1[7],p16uc_TRANSPOSE64_LO));
+ block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+ block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+ block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+ block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+ block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+ block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+ block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+ block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
#endif
- for(Index M = 0; M < 8; M++) {
+ for (Index M = 0; M < 8; M++) {
pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
}
- ri += vectorSize*vectorSize;
+ ri += vectorSize * vectorSize;
}
- for(; i + 2 <= depth; i+=2)
- {
- for(Index M = 0; M < vectorSize; M++) {
+ for (; i + 2 <= depth; i += 2) {
+ for (Index M = 0; M < vectorSize; M++) {
blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
}
- ri += 2*vectorSize;
+ ri += 2 * vectorSize;
}
- if (depth & 1)
- {
- for(Index M = 0; M < vectorSize; M++) {
+ if (depth & 1) {
+ for (Index M = 0; M < vectorSize; M++) {
blockA[ri + M] = lhs2(M, i);
}
@@ -1175,20 +1102,17 @@
}
}
- if(PanelMode) ri += vectorSize*(stride - offset - depth);
+ if (PanelMode) ri += vectorSize * (stride - offset - depth);
}
- if(j + 4 <= rows)
- {
+ if (j + 4 <= rows) {
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
Index i = 0;
- if(PanelMode) ri += 4*offset;
+ if (PanelMode) ri += 4 * offset;
- for(; i + 2 <= depth; i+=2)
- {
- if(StorageOrder == ColMajor)
- {
- PacketBlock<Packet8bf,2> block;
+ for (; i + 2 <= depth; i += 2) {
+ if (StorageOrder == ColMajor) {
+ PacketBlock<Packet8bf, 2> block;
block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
@@ -1197,58 +1121,51 @@
pstore<bfloat16>(blockA + ri, block.packet[0]);
} else {
- blockA[ri+0] = lhs2(0, i + 0);
- blockA[ri+1] = lhs2(0, i + 1);
- blockA[ri+2] = lhs2(1, i + 0);
- blockA[ri+3] = lhs2(1, i + 1);
- blockA[ri+4] = lhs2(2, i + 0);
- blockA[ri+5] = lhs2(2, i + 1);
- blockA[ri+6] = lhs2(3, i + 0);
- blockA[ri+7] = lhs2(3, i + 1);
+ blockA[ri + 0] = lhs2(0, i + 0);
+ blockA[ri + 1] = lhs2(0, i + 1);
+ blockA[ri + 2] = lhs2(1, i + 0);
+ blockA[ri + 3] = lhs2(1, i + 1);
+ blockA[ri + 4] = lhs2(2, i + 0);
+ blockA[ri + 5] = lhs2(2, i + 1);
+ blockA[ri + 6] = lhs2(3, i + 0);
+ blockA[ri + 7] = lhs2(3, i + 1);
}
- ri += 2*4;
+ ri += 2 * 4;
}
- if (depth & 1)
- {
- if(StorageOrder == ColMajor)
- {
+ if (depth & 1) {
+ if (StorageOrder == ColMajor) {
Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
} else {
- blockA[ri+0] = lhs2(0, i);
- blockA[ri+1] = lhs2(1, i);
- blockA[ri+2] = lhs2(2, i);
- blockA[ri+3] = lhs2(3, i);
+ blockA[ri + 0] = lhs2(0, i);
+ blockA[ri + 1] = lhs2(1, i);
+ blockA[ri + 2] = lhs2(2, i);
+ blockA[ri + 3] = lhs2(3, i);
}
ri += 4;
}
- if(PanelMode) ri += 4*(stride - offset - depth);
+ if (PanelMode) ri += 4 * (stride - offset - depth);
j += 4;
}
- if (j < rows)
- {
- if(PanelMode) ri += offset*(rows - j);
+ if (j < rows) {
+ if (PanelMode) ri += offset * (rows - j);
Index i = 0;
- for(; i + 2 <= depth; i+=2)
- {
+ for (; i + 2 <= depth; i += 2) {
Index k = j;
- for(; k < rows; k++)
- {
- blockA[ri+0] = lhs(k, i + 0);
- blockA[ri+1] = lhs(k, i + 1);
+ for (; k < rows; k++) {
+ blockA[ri + 0] = lhs(k, i + 0);
+ blockA[ri + 1] = lhs(k, i + 1);
ri += 2;
}
}
- if (depth & 1)
- {
- for(; j < rows; j++)
- {
+ if (depth & 1) {
+ for (; j < rows; j++) {
blockA[ri] = lhs(j, i);
ri += 1;
}
@@ -1258,51 +1175,55 @@
};
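
The scalar tail loops in the bfloat16 lhs packer make the target layout explicit: values from two consecutive depth indices end up adjacent for every row, which is the pairwise interleave the vector paths above build with merges. A tiny standalone model (float standing in for bfloat16, hypothetical helper):

#include <cstdio>

// Hypothetical scalar model of the bfloat16 lhs tail loop: for each of `rows`
// rows, the elements at depth i and i+1 are stored adjacently.
void packPairs(float* blockA, const float* lhs, int rows, int depthPair /* = i */,
               int lda) {
  int ri = 0;
  for (int M = 0; M < rows; M++) {
    blockA[ri++] = lhs[M + (depthPair + 0) * lda];  // lhs(M, i + 0)
    blockA[ri++] = lhs[M + (depthPair + 1) * lda];  // lhs(M, i + 1)
  }
}

int main() {
  const int rows = 2, lda = 2;
  const float lhs[4] = {1, 2, 3, 4};  // column-major 2x2
  float blockA[4];
  packPairs(blockA, lhs, rows, 0, lda);
  for (float v : blockA) std::printf("%g ", v);  // 1 3 2 4
  std::printf("\n");
}
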
// General template for rhs packing, bfloat16 specialization.
-template<typename DataMapper, int StorageOrder, bool PanelMode>
-struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false>
-{
- EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
- {
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false> {
+ EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+ Index offset) {
const Index vectorSize = quad_traits<bfloat16>::vectorsize;
Index ri = 0, j = 0;
- for(; j + 4 <= cols; j+=4)
- {
+ for (; j + 4 <= cols; j += 4) {
const DataMapper rhs2 = rhs.getSubMapper(0, j);
Index i = 0;
- if(PanelMode) ri += 4*offset;
+ if (PanelMode) ri += 4 * offset;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- if(StorageOrder == ColMajor)
- {
- PacketBlock<Packet8bf,4> block;
+ for (; i + vectorSize <= depth; i += vectorSize) {
+ if (StorageOrder == ColMajor) {
+ PacketBlock<Packet8bf, 4> block;
bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
Packet4ui t0, t1, t2, t3;
- t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
- t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val), reinterpret_cast<Packet4ui>(block.packet[1].m_val));
- t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
- t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val), reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+ t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+ t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+ reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+ t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+ t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+ reinterpret_cast<Packet4ui>(block.packet[3].m_val));
#ifdef EIGEN_VECTORIZE_VSX
- block.packet[0] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
- block.packet[1] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0),reinterpret_cast<Packet2ul>(t2)));
- block.packet[2] = reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
- block.packet[3] = reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1),reinterpret_cast<Packet2ul>(t3)));
+ block.packet[0] =
+ reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+ block.packet[1] =
+ reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+ block.packet[2] =
+ reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
+ block.packet[3] =
+ reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
#else
- block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_HI));
- block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0,t2,p16uc_TRANSPOSE64_LO));
- block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_HI));
- block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1,t3,p16uc_TRANSPOSE64_LO));
+ block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_HI));
+ block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_LO));
+ block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_HI));
+ block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_LO));
#endif
storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
} else {
- PacketBlock<Packet8bf,8> block;
+ PacketBlock<Packet8bf, 8> block;
for (int M = 0; M < 8; M++) {
block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
@@ -1320,21 +1241,20 @@
}
}
- ri += 4*vectorSize;
+ ri += 4 * vectorSize;
}
for (; i + 2 <= depth; i += 2) {
- if(StorageOrder == ColMajor)
- {
- blockB[ri+0] = rhs2(i + 0, 0);
- blockB[ri+1] = rhs2(i + 1, 0);
- blockB[ri+2] = rhs2(i + 0, 1);
- blockB[ri+3] = rhs2(i + 1, 1);
- blockB[ri+4] = rhs2(i + 0, 2);
- blockB[ri+5] = rhs2(i + 1, 2);
- blockB[ri+6] = rhs2(i + 0, 3);
- blockB[ri+7] = rhs2(i + 1, 3);
+ if (StorageOrder == ColMajor) {
+ blockB[ri + 0] = rhs2(i + 0, 0);
+ blockB[ri + 1] = rhs2(i + 1, 0);
+ blockB[ri + 2] = rhs2(i + 0, 1);
+ blockB[ri + 3] = rhs2(i + 1, 1);
+ blockB[ri + 4] = rhs2(i + 0, 2);
+ blockB[ri + 5] = rhs2(i + 1, 2);
+ blockB[ri + 6] = rhs2(i + 0, 3);
+ blockB[ri + 7] = rhs2(i + 1, 3);
} else {
- PacketBlock<Packet8bf,2> block;
+ PacketBlock<Packet8bf, 2> block;
for (int M = 0; M < 2; M++) {
block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
@@ -1345,40 +1265,34 @@
pstore<bfloat16>(blockB + ri, block.packet[0]);
}
- ri += 4*2;
+ ri += 4 * 2;
}
- if (depth & 1)
- {
- blockB[ri+0] = rhs2(i, 0);
- blockB[ri+1] = rhs2(i, 1);
- blockB[ri+2] = rhs2(i, 2);
- blockB[ri+3] = rhs2(i, 3);
+ if (depth & 1) {
+ blockB[ri + 0] = rhs2(i, 0);
+ blockB[ri + 1] = rhs2(i, 1);
+ blockB[ri + 2] = rhs2(i, 2);
+ blockB[ri + 3] = rhs2(i, 3);
ri += 4;
}
- if(PanelMode) ri += 4*(stride - offset - depth);
+ if (PanelMode) ri += 4 * (stride - offset - depth);
}
- if (j < cols)
- {
- if(PanelMode) ri += offset*(cols - j);
+ if (j < cols) {
+ if (PanelMode) ri += offset * (cols - j);
Index i = 0;
- for(; i + 2 <= depth; i+=2)
- {
+ for (; i + 2 <= depth; i += 2) {
Index k = j;
- for(; k < cols; k++)
- {
- blockB[ri+0] = rhs(i + 0, k);
- blockB[ri+1] = rhs(i + 1, k);
+ for (; k < cols; k++) {
+ blockB[ri + 0] = rhs(i + 0, k);
+ blockB[ri + 1] = rhs(i + 1, k);
ri += 2;
}
}
- if (depth & 1)
- {
- for(; j < cols; j++)
- {
+ if (depth & 1) {
+ for (; j < cols; j++) {
blockB[ri] = rhs(i, j);
ri += 1;
}
@@ -1388,45 +1302,41 @@
};
// General template for lhs complex packing, float64 specialization.
-template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
-{
- EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
- {
- PacketBlock<Packet,2> blockr, blocki;
- PacketBlock<PacketC,4> cblock;
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true> {
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+ Index depth, const Index vectorSize) {
+ PacketBlock<Packet, 2> blockr, blocki;
+ PacketBlock<PacketC, 4> cblock;
- for(; i + vectorSize <= depth; i+=vectorSize)
- {
- if(StorageOrder == ColMajor)
- {
- cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
- cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+ for (; i + vectorSize <= depth; i += vectorSize) {
+ if (StorageOrder == ColMajor) {
+ cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0); //[a1 a1i]
+ cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
- cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
- cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
+ cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0); //[a2 a2i]
+ cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
- blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
- blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
+ blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v); //[a1 a2]
+ blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v); //[b1 b2]
blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
} else {
- cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
- cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
+ cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i); //[a1 a1i]
+ cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i); //[a2 a2i]
- cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
- cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i
+ cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1); //[b1 b1i]
+ cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1); //[b2 b2i]
- blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
- blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
+ blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v); //[a1 a2]
+ blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v); //[b1 b2]
blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
}
- if(Conjugate)
- {
+ if (Conjugate) {
blocki.packet[0] = -blocki.packet[0];
blocki.packet[1] = -blocki.packet[1];
}
@@ -1434,21 +1344,20 @@
storeBlock<double, Packet, 2>(blockAt + rir, blockr);
storeBlock<double, Packet, 2>(blockAt + rii, blocki);
- rir += 2*vectorSize;
- rii += 2*vectorSize;
+ rir += 2 * vectorSize;
+ rii += 2 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
const Index vectorSize = quad_traits<double>::vectorsize;
const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
- Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
- double* blockAt = reinterpret_cast<double *>(blockA);
+ Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+ double* blockAt = reinterpret_cast<double*>(blockA);
Index j = 0;
- for(; j + vectorSize <= rows; j+=vectorSize)
- {
+ for (; j + vectorSize <= rows; j += vectorSize) {
const DataMapper lhs2 = lhs.getSubMapper(j, 0);
Index i = 0;
@@ -1456,10 +1365,9 @@
dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
- for(; i < depth; i++)
- {
- PacketBlock<Packet,1> blockr, blocki;
- PacketBlock<PacketC,2> cblock;
+ for (; i < depth; i++) {
+ PacketBlock<Packet, 1> blockr, blocki;
+ PacketBlock<PacketC, 2> cblock;
cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);
@@ -1467,8 +1375,7 @@
blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
- if(Conjugate)
- {
+ if (Conjugate) {
blocki.packet[0] = -blocki.packet[0];
}
@@ -1479,25 +1386,22 @@
rii += vectorSize;
}
- rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
+ rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
}
- if (j < rows)
- {
- if(PanelMode) rir += (offset*(rows - j - vectorSize));
+ if (j < rows) {
+ if (PanelMode) rir += (offset * (rows - j - vectorSize));
rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
Index k = j;
- for(; k < rows; k++)
- {
+ for (; k < rows; k++) {
blockAt[rir] = lhs(k, i).real();
- if(Conjugate)
+ if (Conjugate)
blockAt[rii] = -lhs(k, i).imag();
else
- blockAt[rii] = lhs(k, i).imag();
+ blockAt[rii] = lhs(k, i).imag();
rir += 1;
rii += 1;
@@ -1508,15 +1412,13 @@
};
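// A hedged layout sketch for the packer above: real and imaginary parts are
// stored decoupled, with the imaginary block of each vectorSize-row panel
// placed vectorDelta doubles after its real block (assuming PanelMode is off):
inline Index imag_block_start(Index real_block_start, Index vectorSize, Index depth) {
  return real_block_start + vectorSize * depth;  // mirrors vectorDelta above
}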
// General template for rhs complex packing, float64 specialization.
-template<typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
-struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
-{
- EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii, Index depth, const Index vectorSize)
- {
- for(; i < depth; i++)
- {
- PacketBlock<PacketC,4> cblock;
- PacketBlock<Packet,2> blockr, blocki;
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false> {
+ EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii,
+ Index depth, const Index vectorSize) {
+ for (; i < depth; i++) {
+ PacketBlock<PacketC, 4> cblock;
+ PacketBlock<Packet, 2> blockr, blocki;
bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
@@ -1526,8 +1428,7 @@
blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
- if(Conjugate)
- {
+ if (Conjugate) {
blocki.packet[0] = -blocki.packet[0];
blocki.packet[1] = -blocki.packet[1];
}
@@ -1535,21 +1436,20 @@
storeBlock<double, Packet, 2>(blockBt + rir, blockr);
storeBlock<double, Packet, 2>(blockBt + rii, blocki);
- rir += 2*vectorSize;
- rii += 2*vectorSize;
+ rir += 2 * vectorSize;
+ rii += 2 * vectorSize;
}
}
- EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
- {
+ EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols,
+ Index stride, Index offset) {
const Index vectorSize = quad_traits<double>::vectorsize;
- const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
- Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
- double* blockBt = reinterpret_cast<double *>(blockB);
+ const Index vectorDelta = 2 * vectorSize * ((PanelMode) ? stride : depth);
+ Index rir = ((PanelMode) ? (2 * vectorSize * offset) : 0), rii;
+ double* blockBt = reinterpret_cast<double*>(blockB);
Index j = 0;
- for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
- {
+ for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
const DataMapper rhs2 = rhs.getSubMapper(0, j);
Index i = 0;
@@ -1557,30 +1457,28 @@
dhs_ccopy(blockBt, rhs2, i, rir, rii, depth, vectorSize);
- rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
+ rir += ((PanelMode) ? (2 * vectorSize * (2 * stride - depth)) : vectorDelta);
}
- if(PanelMode) rir -= (offset*(2*vectorSize - 1));
+ if (PanelMode) rir -= (offset * (2 * vectorSize - 1));
- for(; j < cols; j++)
- {
+ for (; j < cols; j++) {
const DataMapper rhs2 = rhs.getSubMapper(0, j);
rii = rir + ((PanelMode) ? stride : depth);
- for(Index i = 0; i < depth; i++)
- {
+ for (Index i = 0; i < depth; i++) {
blockBt[rir] = rhs2(i, 0).real();
- if(Conjugate)
+ if (Conjugate)
blockBt[rii] = -rhs2(i, 0).imag();
else
- blockBt[rii] = rhs2(i, 0).imag();
+ blockBt[rii] = rhs2(i, 0).imag();
rir += 1;
rii += 1;
}
- rir += ((PanelMode) ? (2*stride - depth) : depth);
+ rir += ((PanelMode) ? (2 * stride - depth) : depth);
}
}
};
@@ -1590,11 +1488,9 @@
**************/
// 512-bit rank-1 update of acc. It can accumulate either positively or negatively (useful for complex gemm).
-template<typename Packet, bool NegativeAccumulate, int N>
-EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet,N>* acc, const Packet& lhsV, const Packet* rhsV)
-{
- if(NegativeAccumulate)
- {
+template <typename Packet, bool NegativeAccumulate, int N>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet, N>* acc, const Packet& lhsV, const Packet* rhsV) {
+ if (NegativeAccumulate) {
for (int M = 0; M < N; M++) {
acc->packet[M] = vec_nmsub(lhsV, rhsV[M], acc->packet[M]);
}
@@ -1605,21 +1501,20 @@
}
}
-template<int N, typename Scalar, typename Packet, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet,N>* acc, const Scalar* lhs, const Packet* rhsV)
-{
+template <int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet, N>* acc, const Scalar* lhs, const Packet* rhsV) {
Packet lhsV = pload<Packet>(lhs);
pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
}
-// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real.
-template<int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Packet &lhsV, Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi)
-{
+// 512-bit rank-1 update of complex acc. It takes decoupled accumulators as entries. It also takes care of mixed types
+// real * complex and complex * real.
+template <int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag,
+ const Packet& lhsV, Packet& lhsVi, const Packet* rhsV, const Packet* rhsVi) {
pger_common<Packet, false, N>(accReal, lhsV, rhsV);
- if(LhsIsReal)
- {
+ if (LhsIsReal) {
pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
EIGEN_UNUSED_VARIABLE(lhsVi);
} else {
@@ -1633,52 +1528,52 @@
}
}
-template<int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi)
-{
+template <int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag, const Scalar* lhs_ptr,
+ const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) {
Packet lhsV = ploadLhs<Packet>(lhs_ptr);
Packet lhsVi;
- if(!LhsIsReal) lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
- else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+ if (!LhsIsReal)
+ lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
+ else
+ EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
}
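// A minimal scalar sketch of the complex rank-1 update pgerc performs per lane
// (assuming N == 1 and ignoring vectorization; the flags mirror the template
// parameters, and the Lhs/RhsIsReal cases simply drop the corresponding terms):
inline void pgerc_scalar(double& accReal, double& accImag, double lhsR, double lhsI,
                         double rhsR, double rhsI, bool conjLhs, bool conjRhs) {
  if (conjLhs) lhsI = -lhsI;             // conjugate the lhs operand
  if (conjRhs) rhsI = -rhsI;             // conjugate the rhs operand
  accReal += lhsR * rhsR - lhsI * rhsI;  // real part of lhs * rhs
  accImag += lhsR * rhsI + lhsI * rhsR;  // imaginary part of lhs * rhs
}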
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs) {
return ploadu<Packet>(lhs);
}
// Zero the accumulators of a PacketBlock.
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet,N>& acc)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet, N>& acc) {
for (int M = 0; M < N; M++) {
acc.packet[M] = pset1<Packet>((__UNPACK_TYPE__(Packet))0);
}
}
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ,
+ const Packet& pAlpha) {
for (int M = 0; M < N; M++) {
acc.packet[M] = vec_mul(accZ.packet[M], pAlpha);
}
}
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet,N>& acc, const Packet& pMask)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet, N>& acc, const Packet& pMask) {
for (int M = 0; M < N; M++) {
acc.packet[M] = pand<Packet>(acc.packet[M], pMask);
}
}
// Complex version of PacketBlock scaling.
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask)
-{
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+ const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+ const Packet& pMask) {
if (mask && (sizeof(__UNPACK_TYPE__(Packet)) == sizeof(float))) {
band<Packet, N>(aReal, pMask);
band<Packet, N>(aImag, pMask);
@@ -1698,16 +1593,16 @@
// Load a PacketBlock; the N parameters make tuning gemm easier, so we can add more accumulators as needed.
//
// full = operate (load) on the entire PacketBlock or only half
-template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col)
-{
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+ Index col) {
if (StorageOrder == RowMajor) {
for (int M = 0; M < N; M++) {
acc.packet[M] = res.template loadPacket<Packet>(row + M, col);
}
if (Complex) {
for (int M = 0; M < N; M++) {
- acc.packet[M+N] = res.template loadPacket<Packet>(row + M, col + accCols);
+ acc.packet[M + N] = res.template loadPacket<Packet>(row + M, col + accCols);
}
}
} else {
@@ -1716,37 +1611,35 @@
}
if (Complex && full) {
for (int M = 0; M < N; M++) {
- acc.packet[M+N] = res.template loadPacket<Packet>(row + accCols, col + M);
+ acc.packet[M + N] = res.template loadPacket<Packet>(row + accCols, col + M);
}
}
}
}
-template<typename DataMapper, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row)
-{
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row) {
for (int M = 0; M < N; M++) {
res.template storePacket<Packet>(row, M, acc.packet[M]);
}
}
#ifdef USE_PARTIAL_PACKETS
-template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
-EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements)
-{
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+ Index elements) {
for (Index M = 0; M < N; M++) {
acc.packet[M] = res.template loadPacketPartial<Packet>(row, M, elements);
}
if (Complex && full) {
for (Index M = 0; M < N; M++) {
- acc.packet[M+N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
+ acc.packet[M + N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
}
}
}
-template<typename DataMapper, typename Packet, Index N>
-EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements)
-{
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements) {
for (Index M = 0; M < N; M++) {
res.template storePacketPartial<Packet>(row, M, acc.packet[M], elements);
}
@@ -1760,12 +1653,11 @@
#endif
#if !USE_P10_AND_PVIPR2_0
-const static Packet4i mask4[4] = { { 0, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, -1, 0, 0 }, { -1, -1, -1, 0 } };
+const static Packet4i mask4[4] = {{0, 0, 0, 0}, {-1, 0, 0, 0}, {-1, -1, 0, 0}, {-1, -1, -1, 0}};
#endif
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) {
#if USE_P10_AND_PVIPR2_0
#ifdef _BIG_ENDIAN
return Packet(vec_reve(vec_genwm((1 << remaining_rows) - 1)));
@@ -1777,9 +1669,8 @@
#endif
}
-template<>
-EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows) {
#if USE_P10_AND_PVIPR2_0
Packet2d mask2 = Packet2d(vec_gendm(remaining_rows));
#ifdef _BIG_ENDIAN
@@ -1788,23 +1679,22 @@
return mask2;
#endif
#else
- Packet2l ret = { -remaining_rows, 0 };
+ Packet2l ret = {-remaining_rows, 0};
return Packet2d(ret);
#endif
}
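// A portable sketch of the mask semantics, assuming 4 float lanes: lane i of
// bmask<Packet4f>(remaining_rows) is all-ones exactly when i < remaining_rows,
// which is also what the mask4 table on the non-P10 path encodes.
inline void bmask_sketch(unsigned mask[4], int remaining_rows) {
  for (int i = 0; i < 4; i++) mask[i] = (i < remaining_rows) ? 0xFFFFFFFFu : 0u;
}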
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha)
-{
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha) {
for (int M = 0; M < N; M++) {
acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
}
}
// Scale the PacketBlock vectors by alpha.
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask)
-{
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+ const Packet& pMask) {
if (mask) {
band<Packet, N>(accZ, pMask);
} else {
@@ -1814,11 +1704,10 @@
bscale<Packet, N>(acc, accZ, pAlpha);
}
-template<typename Packet, int N, bool real>
-EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) *ap0,
- const __UNPACK_TYPE__(Packet) *ap1, const __UNPACK_TYPE__(Packet) *ap2,
- Packet& a0, Packet& a1, Packet& a2, Packet& a3)
-{
+template <typename Packet, int N, bool real>
+EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) * ap0, const __UNPACK_TYPE__(Packet) * ap1,
+ const __UNPACK_TYPE__(Packet) * ap2, Packet& a0, Packet& a1, Packet& a2,
+ Packet& a3) {
a0 = pset1<Packet>(ap0[0]);
if (N == 4) {
a1 = pset1<Packet>(ap0[1]);
@@ -1842,24 +1731,21 @@
}
}
-template<> EIGEN_ALWAYS_INLINE void
-pbroadcastN<Packet4f,4,true>(const float *ap0, const float *, const float *,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, true>(const float* ap0, const float*, const float*, Packet4f& a0,
+ Packet4f& a1, Packet4f& a2, Packet4f& a3) {
pbroadcast4<Packet4f>(ap0, a0, a1, a2, a3);
}
-template<> EIGEN_ALWAYS_INLINE void
-pbroadcastN<Packet4f,4,false>(const float *ap0, const float *ap1, const float *ap2,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
- pbroadcastN<Packet4f,4,true>(ap0, ap1, ap2, a0, a1, a2, a3);
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, false>(const float* ap0, const float* ap1, const float* ap2,
+ Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+ pbroadcastN<Packet4f, 4, true>(ap0, ap1, ap2, a0, a1, a2, a3);
}
-template<>
-EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d,4,false>(const double* ap0, const double *,
- const double *, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d, 4, false>(const double* ap0, const double*, const double*, Packet2d& a0,
+ Packet2d& a1, Packet2d& a2, Packet2d& a3) {
a1 = pload<Packet2d>(ap0);
a3 = pload<Packet2d>(ap0 + 2);
a0 = vec_splat(a1, 0);
@@ -1869,9 +1755,9 @@
}
// Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
-{
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+ PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2) {
for (int M = 0; M < N; M++) {
acc1.packet[M].v = vec_mergeh(taccReal.packet[M], taccImag.packet[M]);
}
@@ -1883,9 +1769,10 @@
}
}
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
-{
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+ PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+ PacketBlock<Packetc, N>& acc2) {
bcouple_common<Packet, Packetc, N, full>(taccReal, taccImag, acc1, acc2);
for (int M = 0; M < N; M++) {
@@ -1894,7 +1781,7 @@
if (full) {
for (int M = 0; M < N; M++) {
- acc2.packet[M] = padd<Packetc>(tRes.packet[M+N], acc2.packet[M]);
+ acc2.packet[M] = padd<Packetc>(tRes.packet[M + N], acc2.packet[M]);
}
}
}
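// A small sketch of the re-coupling step, assuming Packet2d (2 double lanes):
// decoupled vectors [r0 r1] and [i0 i1] are interleaved back into the complex
// pairs [r0 i0] (vec_mergeh) and [r1 i1] (vec_mergel) before being added to tRes.
inline void couple_sketch(const double real[2], const double imag[2],
                          double lo[2], double hi[2]) {
  lo[0] = real[0]; lo[1] = imag[0];  // vec_mergeh: first lanes interleaved
  hi[0] = real[1]; hi[1] = imag[1];  // vec_mergel: second lanes interleaved
}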
@@ -1903,143 +1790,132 @@
#define PEEL 7
#define PEEL_ROW 7
-#define MICRO_UNROLL(func) \
- func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+#define MICRO_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
-#define MICRO_NORMAL_ROWS \
- accRows == quad_traits<Scalar>::rows || accRows == 1
+#define MICRO_NORMAL_ROWS accRows == quad_traits<Scalar>::rows || accRows == 1
#define MICRO_NEW_ROWS ((MICRO_NORMAL_ROWS) ? accRows : 1)
#define MICRO_RHS(ptr, N) rhs_##ptr##N
-#define MICRO_ZERO_PEEL(peel) \
- if ((PEEL_ROW > peel) && (peel != 0)) { \
+#define MICRO_ZERO_PEEL(peel) \
+ if ((PEEL_ROW > peel) && (peel != 0)) { \
bsetzero<Packet, accRows>(accZero##peel); \
- } else { \
- EIGEN_UNUSED_VARIABLE(accZero##peel); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(accZero##peel); \
}
-#define MICRO_ADD(ptr, N) \
- if (MICRO_NORMAL_ROWS) { \
- MICRO_RHS(ptr,0) += (accRows * N); \
- } else { \
- MICRO_RHS(ptr,0) += N; \
- MICRO_RHS(ptr,1) += N; \
- if (accRows == 3) { \
- MICRO_RHS(ptr,2) += N; \
- } \
+#define MICRO_ADD(ptr, N) \
+ if (MICRO_NORMAL_ROWS) { \
+ MICRO_RHS(ptr, 0) += (accRows * N); \
+ } else { \
+ MICRO_RHS(ptr, 0) += N; \
+ MICRO_RHS(ptr, 1) += N; \
+ if (accRows == 3) { \
+ MICRO_RHS(ptr, 2) += N; \
+ } \
}
#define MICRO_ADD_ROWS(N) MICRO_ADD(ptr, N)
-#define MICRO_BROADCAST1(peel, ptr, rhsV, real) \
- if (MICRO_NORMAL_ROWS) { \
- pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + (accRows * peel), MICRO_RHS(ptr,0), MICRO_RHS(ptr,0), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
- } else { \
- pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0) + peel, MICRO_RHS(ptr,1) + peel, MICRO_RHS(ptr,2) + peel, rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+#define MICRO_BROADCAST1(peel, ptr, rhsV, real) \
+ if (MICRO_NORMAL_ROWS) { \
+ pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + (accRows * peel), MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 0), \
+ rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
+ } else { \
+ pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + peel, MICRO_RHS(ptr, 1) + peel, MICRO_RHS(ptr, 2) + peel, \
+ rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
}
#define MICRO_BROADCAST(peel) MICRO_BROADCAST1(peel, ptr, rhsV, true)
-#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real) \
- pbroadcastN<Packet,accRows,real>(MICRO_RHS(ptr,0), MICRO_RHS(ptr,1), MICRO_RHS(ptr,2), rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
+#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real) \
+ pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 1), MICRO_RHS(ptr, 2), rhsV[0], rhsV[1], \
+ rhsV[2], rhsV[3]);
-#define MICRO_BROADCAST_EXTRA \
- Packet rhsV[4]; \
+#define MICRO_BROADCAST_EXTRA \
+ Packet rhsV[4]; \
MICRO_BROADCAST_EXTRA1(ptr, rhsV, true) \
MICRO_ADD_ROWS(1)
-#define MICRO_SRC2(ptr, N, M) \
- if (MICRO_NORMAL_ROWS) { \
- EIGEN_UNUSED_VARIABLE(strideB); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,1)); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
- } else { \
- MICRO_RHS(ptr,1) = rhs_base + N + M; \
- if (accRows == 3) { \
- MICRO_RHS(ptr,2) = rhs_base + N*2 + M; \
- } else { \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr,2)); \
- } \
+#define MICRO_SRC2(ptr, N, M) \
+ if (MICRO_NORMAL_ROWS) { \
+ EIGEN_UNUSED_VARIABLE(strideB); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 1)); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2)); \
+ } else { \
+ MICRO_RHS(ptr, 1) = rhs_base + N + M; \
+ if (accRows == 3) { \
+ MICRO_RHS(ptr, 2) = rhs_base + N * 2 + M; \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2)); \
+ } \
}
#define MICRO_SRC2_PTR MICRO_SRC2(ptr, strideB, 0)
#define MICRO_ZERO_PEEL_ROW MICRO_UNROLL(MICRO_ZERO_PEEL)
-#define MICRO_WORK_PEEL(peel) \
- if (PEEL_ROW > peel) { \
- MICRO_BROADCAST(peel) \
+#define MICRO_WORK_PEEL(peel) \
+ if (PEEL_ROW > peel) { \
+ MICRO_BROADCAST(peel) \
pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsV##peel); \
}
-#define MICRO_WORK_PEEL_ROW \
+#define MICRO_WORK_PEEL_ROW \
Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
- MICRO_UNROLL(MICRO_WORK_PEEL) \
- lhs_ptr += (remaining_rows * PEEL_ROW); \
+ MICRO_UNROLL(MICRO_WORK_PEEL) \
+ lhs_ptr += (remaining_rows * PEEL_ROW); \
MICRO_ADD_ROWS(PEEL_ROW)
-#define MICRO_ADD_PEEL(peel, sum) \
- if (PEEL_ROW > peel) { \
- for (Index i = 0; i < accRows; i++) { \
+#define MICRO_ADD_PEEL(peel, sum) \
+ if (PEEL_ROW > peel) { \
+ for (Index i = 0; i < accRows; i++) { \
accZero##sum.packet[i] += accZero##peel.packet[i]; \
- } \
+ } \
}
#define MICRO_ADD_PEEL_ROW \
- MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \
- MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
+ MICRO_ADD_PEEL(4, 0) \
+ MICRO_ADD_PEEL(5, 1) \
+ MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
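// For reference, the reduction above folds the eight peel accumulators
// pairwise: (4->0, 5->1, 6->2, 7->3), then (2->0, 3->1), then (1->0),
// leaving the final sums in accZero0.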
-#define MICRO_PREFETCHN1(ptr, N) \
- EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,0)); \
- if (N == 2 || N == 3) { \
- EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,1)); \
- if (N == 3) { \
- EIGEN_POWER_PREFETCH(MICRO_RHS(ptr,2)); \
- } \
+#define MICRO_PREFETCHN1(ptr, N) \
+ EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 0)); \
+ if (N == 2 || N == 3) { \
+ EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 1)); \
+ if (N == 3) { \
+ EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 2)); \
+ } \
}
#define MICRO_PREFETCHN(N) MICRO_PREFETCHN1(ptr, N)
#define MICRO_COMPLEX_PREFETCHN(N) \
- MICRO_PREFETCHN1(ptr_real, N); \
- if(!RhsIsReal) { \
+ MICRO_PREFETCHN1(ptr_real, N); \
+ if (!RhsIsReal) { \
MICRO_PREFETCHN1(ptr_imag, N); \
}
-template<typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(
- const Scalar* &lhs_ptr,
- const Scalar* &rhs_ptr0,
- const Scalar* &rhs_ptr1,
- const Scalar* &rhs_ptr2,
- PacketBlock<Packet,accRows> &accZero)
-{
+template <typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(const Scalar*& lhs_ptr, const Scalar*& rhs_ptr0, const Scalar*& rhs_ptr1,
+ const Scalar*& rhs_ptr2, PacketBlock<Packet, accRows>& accZero) {
MICRO_BROADCAST_EXTRA
pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
lhs_ptr += remaining_rows;
}
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- const Packet& pAlpha,
- const Packet& pMask)
-{
- const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
- const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
- PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols,
+ const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+ const Scalar* rhs_base, Index depth, Index strideA, Index offsetA,
+ Index strideB, Index row, Index rows, const Packet& pAlpha,
+ const Packet& pMask) {
+ const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+ const Scalar* lhs_ptr = lhs_base + row * strideA + remaining_rows * offsetA;
+ PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
MICRO_SRC2_PTR
bsetzero<Packet, accRows>(accZero0);
@@ -2048,16 +1924,14 @@
Index k = 0;
if (remaining_depth >= PEEL_ROW) {
MICRO_ZERO_PEEL_ROW
- do
- {
+ do {
MICRO_PREFETCHN(accRows)
EIGEN_POWER_PREFETCH(lhs_ptr);
MICRO_WORK_PEEL_ROW
} while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
MICRO_ADD_PEEL_ROW
}
- for(; k < depth; k++)
- {
+ for (; k < depth; k++) {
MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
}
@@ -2065,18 +1939,17 @@
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(pMask);
bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row, remaining_rows);
- bscale<Packet,accRows>(acc, accZero0, pAlpha);
+ bscale<Packet, accRows>(acc, accZero0, pAlpha);
bstore_partial<DataMapper, Packet, accRows>(acc, res, row, remaining_rows);
#else
bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
- if ((accRows == 1) || (rows >= accCols))
- {
- bscale<Packet,accRows,true>(acc, accZero0, pAlpha, pMask);
+ if ((accRows == 1) || (rows >= accCols)) {
+ bscale<Packet, accRows, true>(acc, accZero0, pAlpha, pMask);
bstore<DataMapper, Packet, accRows>(acc, res, row);
} else {
- bscale<Packet,accRows,false>(acc, accZero0, pAlpha, pMask);
- for(Index j = 0; j < accRows; j++) {
- for(Index i = 0; i < remaining_rows; i++) {
+ bscale<Packet, accRows, false>(acc, accZero0, pAlpha, pMask);
+ for (Index j = 0; j < accRows; j++) {
+ for (Index i = 0; i < remaining_rows; i++) {
res(row + i, j) = acc.packet[j][i];
}
}
@@ -2084,75 +1957,62 @@
#endif
}
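// For reference, the remainder-row store above has two variants: with
// USE_PARTIAL_PACKETS it loads, scales and stores only remaining_rows lanes;
// the fallback either applies pMask and stores full packets (when at least
// accCols rows exist or accRows == 1), or scales unmasked and copies the
// surviving remaining_rows lanes element by element.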
-#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
- switch(value) { \
- default: \
- MICRO_EXTRA_UNROLL(1) \
- break; \
- case 2: \
+#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col) \
+ switch (value) { \
+ default: \
+ MICRO_EXTRA_UNROLL(1) \
+ break; \
+ case 2: \
if (is_col || (sizeof(Scalar) == sizeof(float))) { \
- MICRO_EXTRA_UNROLL(2) \
- } \
- break; \
- case 3: \
+ MICRO_EXTRA_UNROLL(2) \
+ } \
+ break; \
+ case 3: \
if (is_col || (sizeof(Scalar) == sizeof(float))) { \
- MICRO_EXTRA_UNROLL(3) \
- } \
- break; \
+ MICRO_EXTRA_UNROLL(3) \
+ } \
+ break; \
}
-#define MICRO_EXTRA_ROWS(N) \
- gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
+#define MICRO_EXTRA_ROWS(N) \
+ gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>( \
+ res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_row(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+ Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
MICRO_EXTRA(MICRO_EXTRA_ROWS, remaining_rows, false)
}
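// Dispatch note: MICRO_EXTRA switches on the runtime remainder (1, 2 or 3) and
// instantiates the kernel with that value as a compile-time constant; the 2-
// and 3-row cases are only reachable for float or in the column variant, per
// the is_col / sizeof(Scalar) guards above.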
#define MICRO_UNROLL_WORK(func, func2, peel) \
- MICRO_UNROLL(func2); \
- func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
- func(4,peel) func(5,peel) func(6,peel) func(7,peel)
+ MICRO_UNROLL(func2); \
+ func(0, peel) func(1, peel) func(2, peel) func(3, peel) func(4, peel) func(5, peel) func(6, peel) func(7, peel)
-#define MICRO_WORK_ONE(iter, peel) \
- if (unroll_factor > iter) { \
+#define MICRO_WORK_ONE(iter, peel) \
+ if (unroll_factor > iter) { \
pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
}
-#define MICRO_TYPE_PEEL4(func, func2, peel) \
- if (PEEL > peel) { \
+#define MICRO_TYPE_PEEL4(func, func2, peel) \
+ if (PEEL > peel) { \
Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
- MICRO_BROADCAST(peel) \
- MICRO_UNROLL_WORK(func, func2, peel) \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+ MICRO_BROADCAST(peel) \
+ MICRO_UNROLL_WORK(func, func2, peel) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsV##peel); \
}
-#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
- Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
- func(func1,func2,0) func(func1,func2,1) \
- func(func1,func2,2) func(func1,func2,3) \
- func(func1,func2,4) func(func1,func2,5) \
- func(func1,func2,6) func(func1,func2,7)
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+ Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \
+ func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3) func(func1, func2, 4) \
+ func(func1, func2, 5) func(func1, func2, 6) func(func1, func2, 7)
#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
- Packet rhsV0[M]; \
- func(func1,func2,0)
+ Packet rhsV0[M]; \
+ func(func1, func2, 0)
-#define MICRO_UNROLL_TYPE(MICRO_TYPE, size) \
+#define MICRO_UNROLL_TYPE(MICRO_TYPE, size) \
MICRO_TYPE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE) \
MICRO_ADD_ROWS(size)
@@ -2160,11 +2020,11 @@
#define MICRO_ONE4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_ONE, 1)
-#define MICRO_DST_PTR_ONE(iter) \
- if (unroll_factor > iter) { \
+#define MICRO_DST_PTR_ONE(iter) \
+ if (unroll_factor > iter) { \
bsetzero<Packet, accRows>(accZero##iter); \
- } else { \
- EIGEN_UNUSED_VARIABLE(accZero##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(accZero##iter); \
}
#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
@@ -2174,69 +2034,62 @@
#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
#ifdef USE_PARTIAL_PACKETS
-#define MICRO_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- if (MICRO_NORMAL_PARTIAL(iter)) { \
- bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
- bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
- bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
- } else { \
- bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter*accCols, accCols2); \
- bscale<Packet,accRows>(acc, accZero##iter, pAlpha); \
- bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter*accCols, accCols2); \
- } \
+#define MICRO_STORE_ONE(iter) \
+ if (unroll_factor > iter) { \
+ if (MICRO_NORMAL_PARTIAL(iter)) { \
+ bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0); \
+ bscale<Packet, accRows>(acc, accZero##iter, pAlpha); \
+ bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols); \
+ } else { \
+ bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter * accCols, accCols2); \
+ bscale<Packet, accRows>(acc, accZero##iter, pAlpha); \
+ bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter * accCols, accCols2); \
+ } \
}
#else
-#define MICRO_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter*accCols, 0); \
- bscale<Packet,accRows,!(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
- bstore<DataMapper, Packet, accRows>(acc, res, row + iter*accCols); \
+#define MICRO_STORE_ONE(iter) \
+ if (unroll_factor > iter) { \
+ bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0); \
+ bscale<Packet, accRows, !(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask); \
+ bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols); \
}
#endif
#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
#ifdef USE_PARTIAL_PACKETS
-template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, bool full>
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+ const Index accCols, bool full>
#else
-template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2>
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+ const Index accCols, const Index accCols2>
#endif
-EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index& row,
- const Packet& pAlpha,
+EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index& row,
+ const Packet& pAlpha,
#ifdef USE_PARTIAL_PACKETS
- Index accCols2
+ Index accCols2
#else
- const Packet& pMask
+ const Packet& pMask
#endif
- )
-{
- const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL;
- const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
- PacketBlock<Packet,accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
- PacketBlock<Packet,accRows> acc;
+) {
+ const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+ const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+ *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
+ PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+ PacketBlock<Packet, accRows> acc;
MICRO_SRC2_PTR
MICRO_SRC_PTR
MICRO_DST_PTR
Index k = 0;
- for(; k + PEEL <= depth; k+= PEEL)
- {
+ for (; k + PEEL <= depth; k += PEEL) {
MICRO_PREFETCHN(accRows)
MICRO_PREFETCH
MICRO_ONE_PEEL4
}
- for(; k < depth; k++)
- {
+ for (; k < depth; k++) {
MICRO_ONE4
}
MICRO_STORE
@@ -2245,42 +2098,32 @@
}
#ifdef USE_PARTIAL_PACKETS
-#define MICRO_UNROLL_ITER2(N, M) \
- gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
+#define MICRO_UNROLL_ITER2(N, M) \
+ gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>( \
+ res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
if (M) return;
#else
-#define MICRO_UNROLL_ITER2(N, M) \
- gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
+#define MICRO_UNROLL_ITER2(N, M) \
+ gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>( \
+ res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask); \
if (M) return;
#endif
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+ Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+ Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
const DataMapper res3 = res.getSubMapper(0, col);
- const Scalar* rhs_base = blockB + col*strideB + MICRO_NEW_ROWS*offsetB;
- const Scalar* lhs_base = blockA + accCols*offsetA;
+ const Scalar* rhs_base = blockB + col * strideB + MICRO_NEW_ROWS * offsetB;
+ const Scalar* lhs_base = blockA + accCols * offsetA;
Index row = 0;
#define MAX_UNROLL 7
- while(row + MAX_UNROLL*accCols <= rows) {
+ while (row + MAX_UNROLL * accCols <= rows) {
MICRO_UNROLL_ITER2(MAX_UNROLL, 0);
}
- switch( (rows-row)/accCols ) {
+ switch ((rows - row) / accCols) {
#if MAX_UNROLL > 7
case 7:
MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 7)
@@ -2321,59 +2164,50 @@
}
#undef MAX_UNROLL
- if(remaining_rows > 0)
- {
- gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+ if (remaining_rows > 0) {
+ gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA,
+ strideB, row, rows, remaining_rows, pAlpha, pMask);
}
}
-#define MICRO_EXTRA_COLS(N) \
- gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
+#define MICRO_EXTRA_COLS(N) \
+ gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, \
+ col, rows, remaining_rows, pAlpha, pMask);
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index cols,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask)
-{
- MICRO_EXTRA(MICRO_EXTRA_COLS, cols-col, true)
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+ Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+ Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+ const Packet& pMask) {
+ MICRO_EXTRA(MICRO_EXTRA_COLS, cols - col, true)
}
/****************
* GEMM kernels *
* **************/
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
- const Index remaining_rows = rows % accCols;
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+ const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows,
+ Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA,
+ Index offsetB) {
+ const Index remaining_rows = rows % accCols;
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
- const Packet pAlpha = pset1<Packet>(alpha);
- const Packet pMask = bmask<Packet>(remaining_rows);
+ const Packet pAlpha = pset1<Packet>(alpha);
+ const Packet pMask = bmask<Packet>(remaining_rows);
- Index col = 0;
- for(; col + accRows <= cols; col += accRows)
- {
- gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask);
- }
+ Index col = 0;
+ for (; col + accRows <= cols; col += accRows) {
+ gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB,
+ offsetB, col, rows, remaining_rows, pAlpha, pMask);
+ }
- if (col != cols)
- {
- gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
- }
+ if (col != cols) {
+ gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+ col, rows, cols, remaining_rows, pAlpha, pMask);
+ }
}
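// A structural sketch of the driver above (the names are the existing template
// parameters, nothing new): the output is tiled into accRows-wide column
// strips, with leftover columns and rows handled by the *_extra_* paths.
inline void gemm_tiling_sketch(Index rows, Index cols, Index accRows, Index accCols) {
  Index col = 0;
  for (; col + accRows <= cols; col += accRows) {
    // gemm_cols: row loop peels MAX_UNROLL * accCols rows at a time,
    // then gemm_extra_row handles the rows % accCols remainder.
  }
  if (col != cols) {
    // gemm_extra_cols: dispatches on the 1..accRows-1 leftover columns.
  }
  (void)rows; (void)accCols;  // used only by the real kernels sketched above
}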
#define accColsC (accCols / 2)
@@ -2384,129 +2218,128 @@
#define PEEL_COMPLEX 3
#define PEEL_COMPLEX_ROW 3
-#define MICRO_COMPLEX_UNROLL(func) \
- func(0) func(1) func(2) func(3)
+#define MICRO_COMPLEX_UNROLL(func) func(0) func(1) func(2) func(3)
-#define MICRO_COMPLEX_ZERO_PEEL(peel) \
+#define MICRO_COMPLEX_ZERO_PEEL(peel) \
if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
- bsetzero<Packet, accRows>(accReal##peel); \
- bsetzero<Packet, accRows>(accImag##peel); \
- } else { \
- EIGEN_UNUSED_VARIABLE(accReal##peel); \
- EIGEN_UNUSED_VARIABLE(accImag##peel); \
+ bsetzero<Packet, accRows>(accReal##peel); \
+ bsetzero<Packet, accRows>(accImag##peel); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(accReal##peel); \
+ EIGEN_UNUSED_VARIABLE(accImag##peel); \
}
-#define MICRO_COMPLEX_ADD_ROWS(N, used) \
- MICRO_ADD(ptr_real, N) \
- if (!RhsIsReal) { \
- MICRO_ADD(ptr_imag, N) \
- } else if (used) { \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
+#define MICRO_COMPLEX_ADD_ROWS(N, used) \
+ MICRO_ADD(ptr_real, N) \
+ if (!RhsIsReal) { \
+ MICRO_ADD(ptr_imag, N) \
+ } else if (used) { \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0)); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1)); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2)); \
}
-#define MICRO_COMPLEX_BROADCAST(peel) \
- MICRO_BROADCAST1(peel, ptr_real, rhsV, false) \
- if (!RhsIsReal) { \
+#define MICRO_COMPLEX_BROADCAST(peel) \
+ MICRO_BROADCAST1(peel, ptr_real, rhsV, false) \
+ if (!RhsIsReal) { \
MICRO_BROADCAST1(peel, ptr_imag, rhsVi, false) \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
}
-#define MICRO_COMPLEX_BROADCAST_EXTRA \
- Packet rhsV[4], rhsVi[4]; \
- MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false) \
- if(!RhsIsReal) { \
+#define MICRO_COMPLEX_BROADCAST_EXTRA \
+ Packet rhsV[4], rhsVi[4]; \
+ MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false) \
+ if (!RhsIsReal) { \
MICRO_BROADCAST_EXTRA1(ptr_imag, rhsVi, false) \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsVi); \
- } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsVi); \
+ } \
MICRO_COMPLEX_ADD_ROWS(1, true)
-#define MICRO_COMPLEX_SRC2_PTR \
- MICRO_SRC2(ptr_real, strideB*advanceCols, 0) \
- if (!RhsIsReal) { \
- MICRO_RHS(ptr_imag,0) = rhs_base + MICRO_NEW_ROWS*strideB; \
- MICRO_SRC2(ptr_imag, strideB*advanceCols, strideB) \
- } else { \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,0)); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,1)); \
- EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag,2)); \
+#define MICRO_COMPLEX_SRC2_PTR \
+ MICRO_SRC2(ptr_real, strideB* advanceCols, 0) \
+ if (!RhsIsReal) { \
+ MICRO_RHS(ptr_imag, 0) = rhs_base + MICRO_NEW_ROWS * strideB; \
+ MICRO_SRC2(ptr_imag, strideB* advanceCols, strideB) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0)); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1)); \
+ EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2)); \
}
#define MICRO_COMPLEX_ZERO_PEEL_ROW MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_ZERO_PEEL)
-#define MICRO_COMPLEX_WORK_PEEL(peel) \
- if (PEEL_COMPLEX_ROW > peel) { \
- MICRO_COMPLEX_BROADCAST(peel) \
- pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsV##peel); \
- EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+#define MICRO_COMPLEX_WORK_PEEL(peel) \
+ if (PEEL_COMPLEX_ROW > peel) { \
+ MICRO_COMPLEX_BROADCAST(peel) \
+ pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+ &accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), \
+ lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
}
-#define MICRO_COMPLEX_ADD_COLS(size) \
- lhs_ptr_real += (remaining_rows * size); \
- if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * size); \
- else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+#define MICRO_COMPLEX_ADD_COLS(size) \
+ lhs_ptr_real += (remaining_rows * size); \
+ if (!LhsIsReal) \
+ lhs_ptr_imag += (remaining_rows * size); \
+ else \
+ EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
-#define MICRO_COMPLEX_WORK_PEEL_ROW \
- Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
+#define MICRO_COMPLEX_WORK_PEEL_ROW \
+ Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \
Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
- MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL) \
- MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW) \
+ MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL) \
+ MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW) \
MICRO_COMPLEX_ADD_ROWS(PEEL_COMPLEX_ROW, false)
-#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
- if (PEEL_COMPLEX_ROW > peel) { \
- for (Index i = 0; i < accRows; i++) { \
+#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \
+ if (PEEL_COMPLEX_ROW > peel) { \
+ for (Index i = 0; i < accRows; i++) { \
accReal##sum.packet[i] += accReal##peel.packet[i]; \
accImag##sum.packet[i] += accImag##peel.packet[i]; \
- } \
+ } \
}
#define MICRO_COMPLEX_ADD_PEEL_ROW \
- MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \
- MICRO_COMPLEX_ADD_PEEL(1, 0)
+ MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) MICRO_COMPLEX_ADD_PEEL(1, 0)
-template<typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(
- const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag,
- const Scalar* &rhs_ptr_real0, const Scalar* &rhs_ptr_real1, const Scalar* &rhs_ptr_real2,
- const Scalar* &rhs_ptr_imag0, const Scalar* &rhs_ptr_imag1, const Scalar* &rhs_ptr_imag2,
- PacketBlock<Packet,accRows> &accReal, PacketBlock<Packet,accRows> &accImag)
-{
+template <typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+ bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(const Scalar*& lhs_ptr_real, const Scalar*& lhs_ptr_imag,
+ const Scalar*& rhs_ptr_real0, const Scalar*& rhs_ptr_real1,
+ const Scalar*& rhs_ptr_real2, const Scalar*& rhs_ptr_imag0,
+ const Scalar*& rhs_ptr_imag1, const Scalar*& rhs_ptr_imag2,
+ PacketBlock<Packet, accRows>& accReal,
+ PacketBlock<Packet, accRows>& accImag) {
MICRO_COMPLEX_BROADCAST_EXTRA
- pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
+ pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real,
+ lhs_ptr_imag, rhsV, rhsVi);
MICRO_COMPLEX_ADD_COLS(1)
}
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index remaining_rows>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
- const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
- const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
- const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+ const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal,
+ const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+ const Scalar* rhs_base, Index depth, Index strideA,
+ Index offsetA, Index strideB, Index row, Index rows,
+ const Packet& pAlphaReal, const Packet& pAlphaImag,
+ const Packet& pMask) {
+ const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+ const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+ const Scalar* lhs_ptr_real = lhs_base + advanceRows * row * strideA + remaining_rows * offsetA;
const Scalar* lhs_ptr_imag = NULL;
- if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
- else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
- PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
- PacketBlock<Packet,accRows> taccReal, taccImag;
- PacketBlock<Packetc,accRows> acc0, acc1;
- PacketBlock<Packetc,accRows*2> tRes;
+ if (!LhsIsReal)
+ lhs_ptr_imag = lhs_ptr_real + remaining_rows * strideA;
+ else
+ EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+ PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+ PacketBlock<Packet, accRows> taccReal, taccImag;
+ PacketBlock<Packetc, accRows> acc0, acc1;
+ PacketBlock<Packetc, accRows * 2> tRes;
MICRO_COMPLEX_SRC2_PTR
@@ -2517,45 +2350,43 @@
Index k = 0;
if (remaining_depth >= PEEL_COMPLEX_ROW) {
MICRO_COMPLEX_ZERO_PEEL_ROW
- do
- {
+ do {
MICRO_COMPLEX_PREFETCHN(accRows)
EIGEN_POWER_PREFETCH(lhs_ptr_real);
- if(!LhsIsReal) {
+ if (!LhsIsReal) {
EIGEN_POWER_PREFETCH(lhs_ptr_imag);
}
MICRO_COMPLEX_WORK_PEEL_ROW
} while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
MICRO_COMPLEX_ADD_PEEL_ROW
}
- for(; k < depth; k++)
- {
- MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1, rhs_ptr_imag2, accReal0, accImag0);
+ for (; k < depth; k++) {
+ MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(
+ lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1,
+ rhs_ptr_imag2, accReal0, accImag0);
}
constexpr bool full = (remaining_rows > accColsC);
bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
- if ((accRows == 1) || (rows >= accCols))
- {
- bscalec<Packet,accRows,true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+ if ((accRows == 1) || (rows >= accCols)) {
+ bscalec<Packet, accRows, true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
if (full) {
bstore<DataMapper, Packetc, accRows>(acc1, res, row + accColsC);
}
} else {
- bscalec<Packet,accRows,false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+ bscalec<Packet, accRows, false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
- if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1))
- {
- for(Index j = 0; j < accRows; j++) {
+ if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) {
+ for (Index j = 0; j < accRows; j++) {
res(row + 0, j) = pfirst<Packetc>(acc0.packet[j]);
}
} else {
bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
if (full) {
- for(Index j = 0; j < accRows; j++) {
+ for (Index j = 0; j < accRows; j++) {
res(row + accColsC, j) = pfirst<Packetc>(acc1.packet[j]);
}
}
@@ -2563,59 +2394,51 @@
}
}
-#define MICRO_COMPLEX_EXTRA_ROWS(N) \
- gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, N>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_EXTRA_ROWS(N) \
+ gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, \
+ ConjugateRhs, LhsIsReal, RhsIsReal, N>( \
+ res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+ const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+ Index rows, Index remaining_rows, const Packet& pAlphaReal,
+ const Packet& pAlphaImag, const Packet& pMask) {
MICRO_EXTRA(MICRO_COMPLEX_EXTRA_ROWS, remaining_rows, false)
}
#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
- MICRO_COMPLEX_UNROLL(func2); \
- func(0,peel) func(1,peel) func(2,peel) func(3,peel)
+ MICRO_COMPLEX_UNROLL(func2); \
+ func(0, peel) func(1, peel) func(2, peel) func(3, peel)
-#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
- if (unroll_factor > iter) { \
- pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
+ if (unroll_factor > iter) { \
+ pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+ &accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
}
#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
- if (PEEL_COMPLEX > peel) { \
- Packet lhsV0, lhsV1, lhsV2, lhsV3; \
- Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
- MICRO_COMPLEX_BROADCAST(peel) \
- MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
- } else { \
- EIGEN_UNUSED_VARIABLE(rhsV##peel); \
- EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
+ if (PEEL_COMPLEX > peel) { \
+ Packet lhsV0, lhsV1, lhsV2, lhsV3; \
+ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
+ MICRO_COMPLEX_BROADCAST(peel) \
+ MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(rhsV##peel); \
+ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
}
#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
- Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
- Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
- func(func1,func2,0) func(func1,func2,1) \
- func(func1,func2,2) func(func1,func2,3)
+ Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \
+ Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \
+ func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3)
#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
- Packet rhsV0[M], rhsVi0[M];\
- func(func1,func2,0)
+ Packet rhsV0[M], rhsVi0[M]; \
+ func(func1, func2, 0)
-#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size) \
+#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size) \
MICRO_COMPLEX_TYPE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE) \
MICRO_COMPLEX_ADD_ROWS(size, false)
@@ -2623,13 +2446,13 @@
#define MICRO_COMPLEX_ONE4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_ONE, 1)
-#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
- if (unroll_factor > iter) { \
+#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
+ if (unroll_factor > iter) { \
bsetzero<Packet, accRows>(accReal##iter); \
bsetzero<Packet, accRows>(accImag##iter); \
- } else { \
- EIGEN_UNUSED_VARIABLE(accReal##iter); \
- EIGEN_UNUSED_VARIABLE(accImag##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(accReal##iter); \
+ EIGEN_UNUSED_VARIABLE(accImag##iter); \
}
#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
@@ -2638,59 +2461,52 @@
#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
-#define MICRO_COMPLEX_STORE_ONE(iter) \
- if (unroll_factor > iter) { \
- constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
- bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter*accCols, 0); \
- bscalec<Packet,accRows,!(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); \
- bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1); \
- bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter*accCols + 0); \
- if (full) { \
- bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter*accCols + accColsC); \
- } \
+#define MICRO_COMPLEX_STORE_ONE(iter) \
+ if (unroll_factor > iter) { \
+ constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC)); \
+ bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter * accCols, 0); \
+ bscalec<Packet, accRows, !(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, \
+ taccImag, pMask); \
+ bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1); \
+ bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter * accCols + 0); \
+ if (full) { \
+ bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter * accCols + accColsC); \
+ } \
}
#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index& row,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
- const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL;
- const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL;
- const Index imag_delta = accCols*strideA;
- const Index imag_delta2 = accCols2*strideA;
- const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
- const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
- PacketBlock<Packet,accRows> accReal0, accImag0, accReal1, accImag1;
- PacketBlock<Packet,accRows> accReal2, accImag2, accReal3, accImag3;
- PacketBlock<Packet,accRows> taccReal, taccImag;
- PacketBlock<Packetc,accRows> acc0, acc1;
- PacketBlock<Packetc,accRows*2> tRes;
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper,
+ const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs,
+ bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base,
+ const Scalar* rhs_base, Index depth, Index strideA,
+ Index offsetA, Index strideB, Index& row,
+ const Packet& pAlphaReal, const Packet& pAlphaImag,
+ const Packet& pMask) {
+ const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+ const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+ const Index imag_delta = accCols * strideA;
+ const Index imag_delta2 = accCols2 * strideA;
+ const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+ const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
+ PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1;
+ PacketBlock<Packet, accRows> accReal2, accImag2, accReal3, accImag3;
+ PacketBlock<Packet, accRows> taccReal, taccImag;
+ PacketBlock<Packetc, accRows> acc0, acc1;
+ PacketBlock<Packetc, accRows * 2> tRes;
MICRO_COMPLEX_SRC2_PTR
MICRO_COMPLEX_SRC_PTR
MICRO_COMPLEX_DST_PTR
Index k = 0;
- for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
- {
+ for (; k + PEEL_COMPLEX <= depth; k += PEEL_COMPLEX) {
MICRO_COMPLEX_PREFETCHN(accRows)
MICRO_COMPLEX_PREFETCH
MICRO_COMPLEX_ONE_PEEL4
}
- for(; k < depth; k++)
- {
+ for (; k < depth; k++) {
MICRO_COMPLEX_ONE4
}
MICRO_COMPLEX_STORE
@@ -2698,38 +2514,29 @@
MICRO_COMPLEX_UPDATE
}
-#define MICRO_COMPLEX_UNROLL_ITER2(N, M) \
- gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_UNROLL_ITER2(N, M) \
+ gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, \
+ M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+ res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
if (M) return;
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+ const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+ Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+ const Packet& pAlphaImag, const Packet& pMask) {
const DataMapper res3 = res.getSubMapper(0, col);
- const Scalar* rhs_base = blockB + advanceCols*col*strideB + MICRO_NEW_ROWS*offsetB;
- const Scalar* lhs_base = blockA + accCols*offsetA;
+ const Scalar* rhs_base = blockB + advanceCols * col * strideB + MICRO_NEW_ROWS * offsetB;
+ const Scalar* lhs_base = blockA + accCols * offsetA;
Index row = 0;
#define MAX_COMPLEX_UNROLL 4
- while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
+ while (row + MAX_COMPLEX_UNROLL * accCols <= rows) {
MICRO_COMPLEX_UNROLL_ITER2(MAX_COMPLEX_UNROLL, 0);
}
- switch( (rows-row)/accCols ) {
+ switch ((rows - row) / accCols) {
#if MAX_COMPLEX_UNROLL > 4
case 4:
MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
@@ -2755,87 +2562,81 @@
}
#undef MAX_COMPLEX_UNROLL
- if(remaining_rows > 0)
- {
- gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+ if (remaining_rows > 0) {
+ gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+ RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows,
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
}
}
-#define MICRO_COMPLEX_EXTRA_COLS(N) \
- gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_EXTRA_COLS(N) \
+ gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+ RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, \
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index cols,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
- MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols-col, true)
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+ bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+ Index depth, Index strideA, Index offsetA, Index strideB,
+ Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+ const Packet& pAlphaReal, const Packet& pAlphaImag,
+ const Packet& pMask) {
+ MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols - col, true)
}
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
- const Index remaining_rows = rows % accCols;
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+ typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+ bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
+ Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB,
+ Index offsetA, Index offsetB) {
+ const Index remaining_rows = rows % accCols;
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
- const Packet pAlphaReal = pset1<Packet>(alpha.real());
- const Packet pAlphaImag = pset1<Packet>(alpha.imag());
- const Packet pMask = bmask<Packet>(remaining_rows);
+ const Packet pAlphaReal = pset1<Packet>(alpha.real());
+ const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+ const Packet pMask = bmask<Packet>(remaining_rows);
- const Scalar* blockA = (Scalar *) blockAc;
- const Scalar* blockB = (Scalar *) blockBc;
+ const Scalar* blockA = (Scalar*)blockAc;
+ const Scalar* blockB = (Scalar*)blockBc;
- Index col = 0;
- for(; col + accRows <= cols; col += accRows)
- {
- gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
- }
+ Index col = 0;
+ for (; col + accRows <= cols; col += accRows) {
+ gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+ RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows,
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
+ }
- if (col != cols)
- {
- gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
- }
+ if (col != cols) {
+ gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+ RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
+ }
}
#undef accColsC
#undef advanceCols
#undef advanceRows
-EIGEN_ALWAYS_INLINE bool supportsMMA()
-{
+EIGEN_ALWAYS_INLINE bool supportsMMA() {
#if defined(EIGEN_ALTIVEC_MMA_ONLY)
return true;
#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) && defined(__BUILTIN_CPU_SUPPORTS__)
- return __builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma");
+ return __builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma");
#else
return false; // No dynamic dispatch for LLVM or older GCC
#endif
}
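
// A standalone sketch of consuming the runtime check above, assuming a
// PowerPC GCC toolchain that defines __BUILTIN_CPU_SUPPORTS__; "arch_3_1"
// (Power ISA 3.1) and "mma" are the hwcap feature names the function queries.
#include <cstdio>
int main() {
#if defined(__BUILTIN_CPU_SUPPORTS__)
  std::printf("ISA 3.1: %d, MMA: %d\n", __builtin_cpu_supports("arch_3_1"), __builtin_cpu_supports("mma"));
#else
  std::printf("no dynamic dispatch on this compiler\n");
#endif
  return 0;
}
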
-EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result)
-{
+EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result) {
Packet4f result_block = ploadu<Packet4f>(result);
return pmadd(acc, pAlpha, result_block);
}
-template<bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows)
-{
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows) {
if (lhsExtraRows) {
pstoreu_partial(result, result_block, extra_rows);
} else {
@@ -2844,31 +2645,30 @@
result += rows;
}
-template<bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows)
-{
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+ Index extra_cols, Index extra_rows) {
Index x = 0;
if (rhsExtraCols) {
- do{
+ do {
Packet4f result_block = loadAndMultiplyF32(acc[x], pAlpha, result);
storeF32<lhsExtraRows>(result, result_block, rows, extra_rows);
} while (++x < extra_cols);
} else {
Packet4f result_block[4];
- float *result2 = result;
- do{
+ float* result2 = result;
+ do {
result_block[x] = loadAndMultiplyF32(acc[x], pAlpha, result);
result += rows;
} while (++x < 4);
x = 0;
- do{
+ do {
storeF32<lhsExtraRows>(result2, result_block[x], rows, extra_rows);
} while (++x < 4);
}
}
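
// loadAndMultiplyF32/storeResults implement the usual GEMM write-back
// C = alpha * acc + C via pmadd; a scalar sketch of the same update
// (scaleAndAccumulate is an illustrative name, not an Eigen function):
void scaleAndAccumulate(float* C, const float* acc, float alpha, int n) {
  for (int i = 0; i < n; ++i) {
    C[i] = alpha * acc[i] + C[i];  // matches pmadd(acc, pAlpha, result_block)
  }
}
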
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data) {
Packet8us z = pset1<Packet8us>(0);
#ifdef _BIG_ENDIAN
return reinterpret_cast<Packet4f>(vec_mergeh(data, z));
@@ -2877,8 +2677,7 @@
#endif
}
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data) {
Packet8us z = pset1<Packet8us>(0);
#ifdef _BIG_ENDIAN
return reinterpret_cast<Packet4f>(vec_mergel(data, z));
@@ -2887,12 +2686,11 @@
#endif
}
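
// The merge-with-zeros in oneConvertBF16Hi/Lo works because bfloat16 is the
// top 16 bits of an IEEE-754 binary32, so widening is exact; a scalar sketch
// of the same conversion (bf16BitsToF32 is an illustrative name):
#include <cstdint>
#include <cstring>
float bf16BitsToF32(uint16_t bits) {
  uint32_t w = uint32_t(bits) << 16;  // payload into the high half, zeros below
  float f;
  std::memcpy(&f, &w, sizeof(f));
  return f;
}
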
-template<Index N, Index M>
-EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra = 0)
-{
+template <Index N, Index M>
+EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra = 0) {
if (N < 4) {
- pstoreu_partial(to + 0, oneConvertBF16Hi(block.packet[0].m_val), extra);
- } else if (N >= (M*8+4)) {
+ pstoreu_partial(to + 0, oneConvertBF16Hi(block.packet[0].m_val), extra);
+ } else if (N >= (M * 8 + 4)) {
pstoreu(to + 0, oneConvertBF16Hi(block.packet[M].m_val));
if (N >= 8) {
pstoreu(to + 4, oneConvertBF16Lo(block.packet[M].m_val));
@@ -2900,9 +2698,8 @@
}
}
-template<Index N>
-EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf,(N+7)/8>& block, Index extra)
-{
+template <Index N>
+EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra) {
storeConvertTwoBF16<N, 0>(to + 0, block, extra);
if (N >= 16) {
storeConvertTwoBF16<N, 1>(to + 8, block);
@@ -2913,28 +2710,26 @@
}
}
-template<bool non_unit_stride, Index delta>
-EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc)
-{
+template <bool non_unit_stride, Index delta>
+EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc) {
if (non_unit_stride) {
- return pgather<bfloat16, Packet8bf>(src + delta*resInc, resInc);
+ return pgather<bfloat16, Packet8bf>(src + delta * resInc, resInc);
} else {
return ploadu<Packet8bf>(src + delta);
}
}
-static Packet16uc p16uc_MERGE16_32_1 = { 0, 1, 16,17, 2, 3, 18,19, 0, 1, 16,17, 2, 3, 18,19 };
-static Packet16uc p16uc_MERGE16_32_2 = { 4, 5, 20,21, 6, 7, 22,23, 4, 5, 20,21, 6, 7, 22,23 };
-static Packet16uc p16uc_MERGE16_32_3 = { 8, 9, 24,25, 10,11, 26,27, 8, 9, 24,25, 10,11, 26,27 };
-static Packet16uc p16uc_MERGE16_32_4 = { 12,13, 28,29, 14,15, 30,31, 12,13, 28,29, 14,15, 30,31 };
+static Packet16uc p16uc_MERGE16_32_1 = {0, 1, 16, 17, 2, 3, 18, 19, 0, 1, 16, 17, 2, 3, 18, 19};
+static Packet16uc p16uc_MERGE16_32_2 = {4, 5, 20, 21, 6, 7, 22, 23, 4, 5, 20, 21, 6, 7, 22, 23};
+static Packet16uc p16uc_MERGE16_32_3 = {8, 9, 24, 25, 10, 11, 26, 27, 8, 9, 24, 25, 10, 11, 26, 27};
+static Packet16uc p16uc_MERGE16_32_4 = {12, 13, 28, 29, 14, 15, 30, 31, 12, 13, 28, 29, 14, 15, 30, 31};
-static Packet16uc p16uc_MERGE16_32_5 = { 0,1, 16,17, 16,17, 16,17, 0,1, 16,17, 16,17, 16,17 };
-static Packet16uc p16uc_MERGE16_32_6 = { 2,3, 18,19, 18,19, 18,19, 2,3, 18,19, 18,19, 18,19 };
-static Packet16uc p16uc_MERGE16_32_7 = { 4,5, 20,21, 20,21, 20,21, 4,5, 20,21, 20,21, 20,21 };
-static Packet16uc p16uc_MERGE16_32_8 = { 6,7, 22,23, 22,23, 22,23, 6,7, 22,23, 22,23, 22,23 };
+static Packet16uc p16uc_MERGE16_32_5 = {0, 1, 16, 17, 16, 17, 16, 17, 0, 1, 16, 17, 16, 17, 16, 17};
+static Packet16uc p16uc_MERGE16_32_6 = {2, 3, 18, 19, 18, 19, 18, 19, 2, 3, 18, 19, 18, 19, 18, 19};
+static Packet16uc p16uc_MERGE16_32_7 = {4, 5, 20, 21, 20, 21, 20, 21, 4, 5, 20, 21, 20, 21, 20, 21};
+static Packet16uc p16uc_MERGE16_32_8 = {6, 7, 22, 23, 22, 23, 22, 23, 6, 7, 22, 23, 22, 23, 22, 23};
-EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask)
-{
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask) {
Packet8us z = pset1<Packet8us>(0);
#ifdef _BIG_ENDIAN
return reinterpret_cast<Packet4f>(vec_perm(data, z, mask));
@@ -2943,63 +2738,62 @@
#endif
}
-template<bool lhsExtraRows, bool odd, Index size>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float *result, Index rows, const bfloat16* src, Index extra_rows)
-{
- Packet4f dup[4*4];
+template <bool lhsExtraRows, bool odd, Index size>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float* result, Index rows, const bfloat16* src,
+ Index extra_rows) {
+ Packet4f dup[4 * 4];
Packet8bf data[4];
for (Index i = 0; i < size; i++) {
- data[i] = ploadu<Packet8bf>(src + rows*i);
+ data[i] = ploadu<Packet8bf>(src + rows * i);
}
for (Index i = 0, j = 0; i < size; i++, j += 4) {
- dup[j+0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
- dup[j+1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
- dup[j+2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
- dup[j+3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
+ dup[j + 0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
+ dup[j + 1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
+ dup[j + 2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
+ dup[j + 3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
}
- for (Index j = 0; j < 4*size; j += 4) {
+ for (Index j = 0; j < 4 * size; j += 4) {
if (lhsExtraRows) {
Packet4f z = pset1<Packet4f>(float(0));
Index i = 0;
do {
- pstoreu(result + (j+i)*4, dup[j+i]);
+ pstoreu(result + (j + i) * 4, dup[j + i]);
} while (++i < extra_rows);
do {
- pstoreu(result + (j+i)*4, z);
+ pstoreu(result + (j + i) * 4, z);
} while (++i < 4);
} else {
for (Index i = 0; i < 4; i++) {
- pstoreu(result + (j+i)*4, dup[j+i]);
+ pstoreu(result + (j + i) * 4, dup[j + i]);
}
}
}
}
-template<bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float *result, Index cols, Index rows, const bfloat16* src, Index delta, Index extra_rows)
-{
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float* result, Index cols, Index rows, const bfloat16* src,
+ Index delta, Index extra_rows) {
Index col = 0;
- src += delta*2;
- for(; col + 4*2 <= cols; col += 4*2, result += 4*4*4, src += 4*rows) {
- convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,4>(result, rows, src, extra_rows);
+ src += delta * 2;
+ for (; col + 4 * 2 <= cols; col += 4 * 2, result += 4 * 4 * 4, src += 4 * rows) {
+ convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 4>(result, rows, src, extra_rows);
}
- for(; col + 2 <= cols; col += 2, result += 4*4, src += rows) {
- convertArrayPointerBF16toF32DupOne<lhsExtraRows,false,1>(result, rows, src, extra_rows);
+ for (; col + 2 <= cols; col += 2, result += 4 * 4, src += rows) {
+ convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 1>(result, rows, src, extra_rows);
}
if (cols & 1) {
- convertArrayPointerBF16toF32DupOne<lhsExtraRows,true,1>(result, rows, src - delta, extra_rows);
+ convertArrayPointerBF16toF32DupOne<lhsExtraRows, true, 1>(result, rows, src - delta, extra_rows);
}
}
-template<const Index size, bool non_unit_stride>
-EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float *result, Index rows, bfloat16*& src, Index resInc)
-{
+template <const Index size, bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float* result, Index rows, bfloat16*& src, Index resInc) {
constexpr Index extra = ((size < 4) ? 4 : size);
while (i + size <= rows) {
- PacketBlock<Packet8bf,(size+7)/8> r32;
+ PacketBlock<Packet8bf, (size + 7) / 8> r32;
r32.packet[0] = loadBF16fromResult<non_unit_stride, 0>(src, resInc);
if (size >= 16) {
r32.packet[1] = loadBF16fromResult<non_unit_stride, 8>(src, resInc);
@@ -3009,41 +2803,40 @@
r32.packet[3] = loadBF16fromResult<non_unit_stride, 24>(src, resInc);
}
storeConvertBlockBF16<size>(result + i, r32, rows & 3);
- i += extra; src += extra*resInc;
+ i += extra;
+ src += extra * resInc;
if (size != 32) break;
}
}
-template<bool non_unit_stride>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc)
-{
- for(Index col = 0; col < cols; col++, src += (rows*resInc), result += rows) {
+template <bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+ Index resInc) {
+ for (Index col = 0; col < cols; col++, src += (rows * resInc), result += rows) {
Index i = 0;
bfloat16* src2 = src;
convertPointerBF16toF32<32, non_unit_stride>(i, result, rows, src2, resInc);
convertPointerBF16toF32<16, non_unit_stride>(i, result, rows, src2, resInc);
- convertPointerBF16toF32<8, non_unit_stride>(i, result, rows, src2, resInc);
- convertPointerBF16toF32<4, non_unit_stride>(i, result, rows, src2, resInc);
- convertPointerBF16toF32<1, non_unit_stride>(i, result, rows, src2, resInc);
+ convertPointerBF16toF32<8, non_unit_stride>(i, result, rows, src2, resInc);
+ convertPointerBF16toF32<4, non_unit_stride>(i, result, rows, src2, resInc);
+ convertPointerBF16toF32<1, non_unit_stride>(i, result, rows, src2, resInc);
}
}
-template<Index num_acc, Index size = 4>
-EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size])
-{
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size]) {
Packet4f z = pset1<Packet4f>(float(0));
- for(Index k = 0; k < num_acc; k++) {
- for(Index j = 0; j < size; j++) {
+ for (Index k = 0; k < num_acc; k++) {
+ for (Index j = 0; j < size; j++) {
acc[k][j] = z;
}
}
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4])
-{
- for(Index i = 0; i < num_acc; i++) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4]) {
+ for (Index i = 0; i < num_acc; i++) {
Packet4ui t0, t1, t2, t3;
t0 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
t1 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
@@ -3056,85 +2849,75 @@
}
}
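
// The vec_mergeh/vec_mergel pairs above perform the classic two-stage 4x4
// transpose of each accumulator tile; in scalar terms the effect is simply:
void transpose4x4(float m[4][4]) {
  for (int r = 0; r < 4; ++r) {
    for (int c = r + 1; c < 4; ++c) {
      float t = m[r][c];  // swap across the diagonal
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}
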
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4])
-{
- for(Index i = 0, j = 0; j < num_acc; i++, j += 2) {
- for(Index x = 0, y = 0; x < 2; x++, y += 2) {
- for(Index w = 0, z = 0; w < 2; w++, z += 2) {
- acc[i][y+w] = acc[j+x][z+0] + acc[j+x][z+1];
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4]) {
+ for (Index i = 0, j = 0; j < num_acc; i++, j += 2) {
+ for (Index x = 0, y = 0; x < 2; x++, y += 2) {
+ for (Index w = 0, z = 0; w < 2; w++, z += 2) {
+ acc[i][y + w] = acc[j + x][z + 0] + acc[j + x][z + 1];
}
}
}
}
-template<Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
-EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result, const Index extra_cols, Index extra_rows)
-{
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
+EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+ const Index extra_cols, Index extra_rows) {
tranposeResults<num_acc>(acc);
addResults<num_acc>(acc);
constexpr Index real_rhs = ((num_rhs / 2) - (rhsExtraCols ? 1 : 0));
Index k = 0;
- for(Index i = 0; i < real_rhs; i++, result += 4*rows, k++){
+ for (Index i = 0; i < real_rhs; i++, result += 4 * rows, k++) {
storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
}
- if(rhsExtraCols) {
+ if (rhsExtraCols) {
storeResults<rhsExtraCols, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
}
}
-template<bool zero>
-EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f &dhs1)
-{
- dhs0 = ploadu<Packet4f>(block + strideB*i + 0);
+template <bool zero>
+EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f& dhs1) {
+ dhs0 = ploadu<Packet4f>(block + strideB * i + 0);
if (zero) {
Packet4f dhs2 = pset1<Packet4f>(float(0));
dhs1 = vec_mergel(dhs0, dhs2);
dhs0 = vec_mergeh(dhs0, dhs2);
} else {
- dhs1 = ploadu<Packet4f>(block + strideB*i + 4);
+ dhs1 = ploadu<Packet4f>(block + strideB * i + 4);
}
}
-template<Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
-EIGEN_ALWAYS_INLINE void KLoop
-(
- const float* indexA,
- const float* indexB,
- Packet4f (&acc)[num_acc][4],
- Index strideB,
- Index k,
- Index offsetB,
- Index extra_cols
-)
-{
+template <Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
+EIGEN_ALWAYS_INLINE void KLoop(const float* indexA, const float* indexB, Packet4f (&acc)[num_acc][4], Index strideB,
+ Index k, Index offsetB, Index extra_cols) {
constexpr Index num_lhs = 4;
Packet4f lhs[num_lhs], rhs[num_rhs];
constexpr Index real_rhs = (num_rhs - (rhsExtraCols ? 2 : 0));
- for(Index i = 0; i < real_rhs; i += 2){
- loadTwoRhsFloat32<zero>(indexB + k*4, strideB, i, rhs[i + 0], rhs[i + 1]);
+ for (Index i = 0; i < real_rhs; i += 2) {
+ loadTwoRhsFloat32<zero>(indexB + k * 4, strideB, i, rhs[i + 0], rhs[i + 1]);
}
- if(rhsExtraCols) {
- loadTwoRhsFloat32<zero>(indexB + k*extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
+ if (rhsExtraCols) {
+ loadTwoRhsFloat32<zero>(indexB + k * extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
}
- indexA += 2*k*4;
- for(Index j = 0; j < num_lhs; j++) {
- lhs[j] = ploadu<Packet4f>(indexA + j*4);
+ indexA += 2 * k * 4;
+ for (Index j = 0; j < num_lhs; j++) {
+ lhs[j] = ploadu<Packet4f>(indexA + j * 4);
}
- for(Index j = 0; j < num_rhs; j++) {
- for(Index i = 0; i < num_lhs; i++) {
+ for (Index j = 0; j < num_rhs; j++) {
+ for (Index i = 0; i < num_lhs; i++) {
acc[j][i] = pmadd(rhs[j], lhs[i], acc[j][i]);
}
}
}
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result, const Index extra_cols, const Index extra_rows)
-{
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA,
+ const float* indexB, Index strideB, Index offsetB, float* result,
+ const Index extra_cols, const Index extra_rows) {
constexpr Index num_rhs = num_acc;
Packet4f acc[num_acc][4];
@@ -3142,10 +2925,10 @@
zeroAccumulators<num_acc>(acc);
Index k;
- for(k = 0; k + 2 <= depth; k += 2){
+ for (k = 0; k + 2 <= depth; k += 2) {
KLoop<num_acc, false, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
}
- if(depth&1){
+ if (depth & 1) {
KLoop<num_acc, true, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
}
@@ -3153,97 +2936,108 @@
}
// No more than 4 (uses 2X the accumulators or 8X the number of VSX registers)
-#define MAX_BFLOAT16_ACC_VSX 4
+#define MAX_BFLOAT16_ACC_VSX 4
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* indexB, Index strideB, Index offsetB, float* result)
-{
- constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+ const float* indexB, Index strideB, Index offsetB, float* result) {
+ constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC_VSX);
- do{
- colVSXLoopBodyIter<num_acc*2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
+ do {
+ colVSXLoopBodyIter<num_acc * 2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB,
+ result, extra_cols, extra_rows);
- indexB += strideB*(num_acc * 2);
- result += rows*step;
- } while(multiIters && (step <= cols - (col += step)));
+ indexB += strideB * (num_acc * 2);
+ result += rows * step;
+ } while (multiIters && (step <= cols - (col += step)));
}
-template<const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+ const float* indexA, const float* blockB, Index strideB, Index offsetB,
+ float* result) {
if (MAX_BFLOAT16_ACC_VSX > num_acc) {
- colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+ colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA,
+ blockB, strideB, offsetB, result);
}
}
-template<bool rhsExtraCols, bool lhsExtraRows>
-void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA, const float* blockB, Index strideB, Index offsetB, float* result)
-{
+template <bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+ const float* blockB, Index strideB, Index offsetB, float* result) {
switch ((cols - col) >> 2) {
- case 3:
- colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 2:
- colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 1:
- colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- default:
- if (rhsExtraCols) {
- colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- }
- break;
+ case 3:
+ colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+ offsetB, result);
+ break;
+ case 2:
+ colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+ offsetB, result);
+ break;
+ case 1:
+ colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+ offsetB, result);
+ break;
+ default:
+ if (rhsExtraCols) {
+ colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+ }
+ break;
}
}
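
// The switch above lowers a runtime column remainder into a compile-time
// template argument so each tail width gets its own unrolled kernel;
// schematically (tailKernel/dispatchTail are hypothetical names):
template <int N>
void tailKernel() { /* process N remaining 4-column groups */ }
void dispatchTail(int remainder) {
  switch (remainder) {
    case 3: tailKernel<3>(); break;
    case 2: tailKernel<2>(); break;
    case 1: tailKernel<1>(); break;
    default: break;  // nothing left over
  }
}
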
-template<Index size, bool lhsExtraRows = false>
-EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const float* indexA2, const float* blockB2, Index strideA, Index strideB, Index offsetB, float* result2)
-{
- Index delta_rows = 2*(lhsExtraRows ? (rows & 3) : size);
+template <Index size, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+ const float* indexA2, const float* blockB2, Index strideA, Index strideB,
+ Index offsetB, float* result2) {
+ Index delta_rows = 2 * (lhsExtraRows ? (rows & 3) : size);
for (Index row = 0; row < size; row += 4) {
- convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float *>(indexA2), strideA, delta_rows, indexA, row, rows & 3);
+ convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float*>(indexA2), strideA, delta_rows, indexA, row,
+ rows & 3);
- const float *blockB = blockB2;
- float *result = result2 + row;
+ const float* blockB = blockB2;
+ float* result = result2 + row;
Index col = 0;
if (cols >= (MAX_BFLOAT16_ACC_VSX * 4)) {
- colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
- blockB += (strideB >> 1)*col;
- result += rows*col;
+ colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB,
+ strideB, 0, result);
+ blockB += (strideB >> 1) * col;
+ result += rows * col;
}
if (cols & 3) {
- colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB, result);
+ colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB,
+ result);
} else {
colVSXLoopBodyExtra<false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
}
}
}
-template<Index size>
-EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexB, Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix, float *result)
-{
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth,
+ Index cols, Index rows, const Packet4f pAlpha, const float* indexB,
+ Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix,
+ float* result) {
if ((size == 16) || (rows & size)) {
- indexA += size*offsetA;
+ indexA += size * offsetA;
colVSXLoops<size>(depth, cols, rows, pAlpha, indexA, indexA2, indexB, strideA, strideB, offsetB, result + row);
row += size;
- indexA += bigSuffix*size/16;
+ indexA += bigSuffix * size / 16;
}
}
-template<const Index size, typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float *result, Index rows, const DataMapper& src)
-{
+template <const Index size, typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float* result, Index rows, const DataMapper& src) {
constexpr Index extra = ((size < 4) ? 4 : size);
while (i + size <= rows) {
- PacketBlock<Packet8bf,(size+7)/8> r32;
- r32.packet[0] = src.template loadPacket<Packet8bf>(i + 0);
+ PacketBlock<Packet8bf, (size + 7) / 8> r32;
+ r32.packet[0] = src.template loadPacket<Packet8bf>(i + 0);
if (size >= 16) {
- r32.packet[1] = src.template loadPacket<Packet8bf>(i + 8);
+ r32.packet[1] = src.template loadPacket<Packet8bf>(i + 8);
}
if (size >= 32) {
r32.packet[2] = src.template loadPacket<Packet8bf>(i + 16);
@@ -3255,104 +3049,104 @@
}
}
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src) {
typedef typename DataMapper::LinearMapper LinearMapper;
- for(Index j = 0; j < cols; j++, result += rows){
+ for (Index j = 0; j < cols; j++, result += rows) {
const LinearMapper src2 = src.getLinearMapper(0, j);
Index i = 0;
convertBF16toF32<32, LinearMapper>(i, result, rows, src2);
convertBF16toF32<16, LinearMapper>(i, result, rows, src2);
- convertBF16toF32<8, LinearMapper>(i, result, rows, src2);
- convertBF16toF32<4, LinearMapper>(i, result, rows, src2);
- convertBF16toF32<1, LinearMapper>(i, result, rows, src2);
+ convertBF16toF32<8, LinearMapper>(i, result, rows, src2);
+ convertBF16toF32<4, LinearMapper>(i, result, rows, src2);
+ convertBF16toF32<1, LinearMapper>(i, result, rows, src2);
}
}
-EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float *res)
-{
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float* res) {
return F32ToBf16Both(ploadu<Packet4f>(res + 0), ploadu<Packet4f>(res + 4));
}
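
// F32ToBf16Both narrows two Packet4f to one Packet8bf of bfloat16; the scalar
// rounding it corresponds to is the standard round-to-nearest-even truncation
// sketched below (bit-exact equivalence is an assumption; NaN handling omitted):
#include <cstdint>
#include <cstring>
uint16_t f32ToBf16Bits(float f) {
  uint32_t w;
  std::memcpy(&w, &f, sizeof(w));
  w += 0x7FFFu + ((w >> 16) & 1);  // round to nearest, ties to even
  return uint16_t(w >> 16);
}
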
-template<typename DataMapper, const Index size>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float *result, Index col, Index rows, const DataMapper& res)
-{
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float* result, Index col, Index rows, const DataMapper& res) {
const DataMapper res2 = res.getSubMapper(0, col);
Index row;
- float *result2 = result + col*rows;
- for(row = 0; row + 8 <= rows; row += 8, result2 += 8){
+ float* result2 = result + col * rows;
+ for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
// get and save block
- PacketBlock<Packet8bf,size> block;
- for(Index j = 0; j < size; j++){
- block.packet[j] = convertF32toBF16VSX(result2 + j*rows);
+ PacketBlock<Packet8bf, size> block;
+ for (Index j = 0; j < size; j++) {
+ block.packet[j] = convertF32toBF16VSX(result2 + j * rows);
}
- res2.template storePacketBlock<Packet8bf,size>(row, 0, block);
+ res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
}
// extra rows
- if(row < rows){
- for(Index j = 0; j < size; j++){
- Packet8bf fp16 = convertF32toBF16VSX(result2 + j*rows);
+ if (row < rows) {
+ for (Index j = 0; j < size; j++) {
+ Packet8bf fp16 = convertF32toBF16VSX(result2 + j * rows);
res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
}
}
}
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float *result, Index cols, Index rows, const DataMapper& res)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float* result, Index cols, Index rows, const DataMapper& res) {
Index col;
- for(col = 0; col + 4 <= cols; col += 4){
- convertArrayF32toBF16ColVSX<DataMapper,4>(result, col, rows, res);
+ for (col = 0; col + 4 <= cols; col += 4) {
+ convertArrayF32toBF16ColVSX<DataMapper, 4>(result, col, rows, res);
}
// extra cols
switch (cols - col) {
- case 1:
- convertArrayF32toBF16ColVSX<DataMapper,1>(result, col, rows, res);
- break;
- case 2:
- convertArrayF32toBF16ColVSX<DataMapper,2>(result, col, rows, res);
- break;
- case 3:
- convertArrayF32toBF16ColVSX<DataMapper,3>(result, col, rows, res);
- break;
+ case 1:
+ convertArrayF32toBF16ColVSX<DataMapper, 1>(result, col, rows, res);
+ break;
+ case 2:
+ convertArrayF32toBF16ColVSX<DataMapper, 2>(result, col, rows, res);
+ break;
+ case 3:
+ convertArrayF32toBF16ColVSX<DataMapper, 3>(result, col, rows, res);
+ break;
}
}
-template<typename DataMapper>
-void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename DataMapper>
+void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+ Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
const Packet4f pAlpha = pset1<Packet4f>(falpha);
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
- ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
- ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB*cols, 0);
- ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2)*4*2, 0);
+ ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
+ ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB* cols, 0);
+ ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2) * 4 * 2, 0);
convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
- convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16 *>(indexB));
+ convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16*>(indexB));
- Index bigSuffix = 2*8*(strideA-offsetA);
- float* indexBF32 = indexB2 + 4*offsetB;
+ Index bigSuffix = 2 * 8 * (strideA - offsetA);
+ float* indexBF32 = indexB2 + 4 * offsetB;
offsetB *= 3;
strideB *= 2;
Index row = 0;
// LHS (8x16) block
- while(row + 16 <= rows){
- calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+ while (row + 16 <= rows) {
+ calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+ bigSuffix, result);
}
// LHS (8x8) block
- calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+ calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+ bigSuffix, result);
// LHS (8x4) block
- calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB, bigSuffix, result);
+ calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+ bigSuffix, result);
// extra rows
- if(rows & 3){
+ if (rows & 3) {
    // This index is the beginning of the remaining block.
- colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB, result + row);
+ colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB,
+ result + row);
}
// Convert back to bfloat16
@@ -3366,554 +3160,527 @@
/************************************
* ppc64le template specializations *
* **********************************/
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
- void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+ void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
- ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
- dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
- pack(blockA, lhs, depth, rows, stride, offset);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+ double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+ dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+ pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
- void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+ void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
- ::operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
- dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
- pack(blockA, lhs, depth, rows, stride, offset);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+ double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+ dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+ pack(blockA, lhs, depth, rows, stride, offset);
}
#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
- void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+ void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+ double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
- void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+ void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+ double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
- void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+ void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+ bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
- void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+ void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+ bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
#endif
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
- void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+ void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
- ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+ bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
- void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+ void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
- ::operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+ bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
- void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+ void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
- ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+ float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
- void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+ void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
- ::operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+ float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
- ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+ PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
- ::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+ PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
- void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+ void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+ float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
- void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+ void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+ float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
#endif
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+ std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+ std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
- ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+ PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
- ::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+ PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+ Index stride, Index offset) {
dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
pack(blockA, lhs, depth, rows, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
- ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+ std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
-{
- void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+ void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+ Index offset = 0);
};
-template<typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
-void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
- ::operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+ std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
pack(blockB, rhs, depth, cols, stride, offset);
}
// ********* gebp specializations *********
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef typename quad_traits<float>::vectortype Packet;
- typedef typename quad_traits<float>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef typename quad_traits<float>::vectortype Packet;
+ typedef typename quad_traits<float>::rhstype RhsPacket;
- void operator()(const DataMapper& res, const float* blockA, const float* blockB,
- Index rows, Index depth, Index cols, float alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols,
+ float alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const float* blockA, const float* blockB,
- Index rows, Index depth, Index cols, float alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<float>::rows;
- const Index accCols = quad_traits<float>::size;
- static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
- #endif
- &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols, float alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<float>::rows;
+ const Index accCols = quad_traits<float>::size;
+ static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index,
+ Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+ &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef Packet4f Packet;
- typedef Packet2cf Packetc;
- typedef Packet4f RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef Packet4f Packet;
+ typedef Packet2cf Packetc;
+ typedef Packet4f RhsPacket;
void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ Index rows, Index depth, Index cols, std::complex<float> alpha, Index strideA = -1,
+ Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<float>::rows;
- const Index accCols = quad_traits<float>::size;
- static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*,
- Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
- #endif
- &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs,
+ ConjugateRhs>::operator()(const DataMapper& res, const std::complex<float>* blockA,
+ const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+ std::complex<float> alpha, Index strideA, Index strideB, Index offsetA,
+ Index offsetB) {
+ const Index accRows = quad_traits<float>::rows;
+ const Index accCols = quad_traits<float>::size;
+ static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*, Index, Index,
+ Index, std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>,
+ float, Packet, Packetc, RhsPacket, DataMapper, accRows,
+ accCols, ConjugateLhs, ConjugateRhs, false, false>
+ :
+#endif
+ &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>,
+ float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+ ConjugateLhs, ConjugateRhs, false, false>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef Packet4f Packet;
- typedef Packet2cf Packetc;
- typedef Packet4f RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef Packet4f Packet;
+ typedef Packet2cf Packetc;
+ typedef Packet4f RhsPacket;
- void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows,
+ Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<float>::rows;
- const Index accCols = quad_traits<float>::size;
- static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*,
- Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
- #endif
- &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+ std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<float>::rows;
+ const Index accCols = quad_traits<float>::size;
+ static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*, Index, Index, Index,
+ std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+ ConjugateLhs, ConjugateRhs, true, false>
+ :
+#endif
+ &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet,
+ Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, true, false>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef Packet4f Packet;
- typedef Packet2cf Packetc;
- typedef Packet4f RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef Packet4f Packet;
+ typedef Packet2cf Packetc;
+ typedef Packet4f RhsPacket;
- void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows,
+ Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB,
- Index rows, Index depth, Index cols, std::complex<float> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<float>::rows;
- const Index accCols = quad_traits<float>::size;
- static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*,
- Index, Index, Index, std::complex<float>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
- #endif
- &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows, Index depth, Index cols,
+ std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<float>::rows;
+ const Index accCols = quad_traits<float>::size;
+ static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*, Index, Index, Index,
+ std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+ ConjugateLhs, ConjugateRhs, false, true>
+ :
+#endif
+ &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet,
+ Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, false, true>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef typename quad_traits<double>::vectortype Packet;
- typedef typename quad_traits<double>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef typename quad_traits<double>::vectortype Packet;
+ typedef typename quad_traits<double>::rhstype RhsPacket;
- void operator()(const DataMapper& res, const double* blockA, const double* blockB,
- Index rows, Index depth, Index cols, double alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth,
+ Index cols, double alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+ Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const double* blockA, const double* blockB,
- Index rows, Index depth, Index cols, double alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<double>::rows;
- const Index accCols = quad_traits<double>::size;
- static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
- #endif
- &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth, Index cols,
+ double alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<double>::rows;
+ const Index accCols = quad_traits<double>::size;
+ static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index,
+ Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+ &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef quad_traits<double>::vectortype Packet;
- typedef Packet1cd Packetc;
- typedef quad_traits<double>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef quad_traits<double>::vectortype Packet;
+ typedef Packet1cd Packetc;
+ typedef quad_traits<double>::rhstype RhsPacket;
void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ Index rows, Index depth, Index cols, std::complex<double> alpha, Index strideA = -1,
+ Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<double>::rows;
- const Index accCols = quad_traits<double>::size;
- static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*,
- Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false> :
- #endif
- &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs,
+ ConjugateRhs>::operator()(const DataMapper& res, const std::complex<double>* blockA,
+ const std::complex<double>* blockB, Index rows, Index depth, Index cols,
+ std::complex<double> alpha, Index strideA, Index strideB, Index offsetA,
+ Index offsetB) {
+ const Index accRows = quad_traits<double>::rows;
+ const Index accCols = quad_traits<double>::size;
+ static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*, Index,
+ Index, Index, std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA())
+ ? &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, false, false>
+ :
+#endif
+ &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, false, false>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef quad_traits<double>::vectortype Packet;
- typedef Packet1cd Packetc;
- typedef quad_traits<double>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef quad_traits<double>::vectortype Packet;
+ typedef Packet1cd Packetc;
+ typedef quad_traits<double>::rhstype RhsPacket;
- void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows,
+ Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<double>::rows;
- const Index accCols = quad_traits<double>::size;
- static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*,
- Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true> :
- #endif
- &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows, Index depth,
+ Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<double>::rows;
+ const Index accCols = quad_traits<double>::size;
+ static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*, Index, Index, Index,
+ std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+ ConjugateLhs, ConjugateRhs, false, true>
+ :
+#endif
+ &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet,
+ Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, false, true>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef quad_traits<double>::vectortype Packet;
- typedef Packet1cd Packetc;
- typedef quad_traits<double>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef quad_traits<double>::vectortype Packet;
+ typedef Packet1cd Packetc;
+ typedef quad_traits<double>::rhstype RhsPacket;
- void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows,
+ Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+ Index offsetA = 0, Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB,
- Index rows, Index depth, Index cols, std::complex<double> alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- const Index accRows = quad_traits<double>::rows;
- const Index accCols = quad_traits<double>::size;
- static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*,
- Index, Index, Index, std::complex<double>, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false> :
- #endif
- &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows, Index depth,
+ Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index accRows = quad_traits<double>::rows;
+ const Index accCols = quad_traits<double>::size;
+ static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*, Index, Index, Index,
+ std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
+ Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+ ConjugateLhs, ConjugateRhs, true, false>
+ :
+#endif
+ &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet,
+ Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+ ConjugateRhs, true, false>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
-{
- typedef typename quad_traits<bfloat16>::vectortype Packet;
- typedef typename quad_traits<bfloat16>::rhstype RhsPacket;
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+ typedef typename quad_traits<bfloat16>::vectortype Packet;
+ typedef typename quad_traits<bfloat16>::rhstype RhsPacket;
- void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
- Index rows, Index depth, Index cols, bfloat16 alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
+ void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth,
+ Index cols, bfloat16 alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+ Index offsetB = 0);
};
-template<typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
- ::operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB,
- Index rows, Index depth, Index cols, bfloat16 alpha,
- Index strideA, Index strideB, Index offsetA, Index offsetB)
- {
- static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16, Index, Index, Index, Index) =
- #ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
- (supportsMMA()) ?
- &Eigen::internal::gemmMMAbfloat16<DataMapper> :
- #endif
- &Eigen::internal::gemmbfloat16<DataMapper>;
- gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
- }
-} // end namespace internal
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+ const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols,
+ bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16,
+ Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+ (supportsMMA()) ? &Eigen::internal::gemmMMAbfloat16<DataMapper> :
+#endif
+ &Eigen::internal::gemmbfloat16<DataMapper>;
+ gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#endif // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
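
Aside from the reflowed template headers, every gebp specialization above repeats one idiom: `operator()` initializes a function-local static function pointer once, selecting the MMA kernel when `supportsMMA()` reports hardware support and otherwise falling back to the generic `gemm`/`gemm_complex` path, so the `#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H` churn in this patch is purely cosmetic. A minimal standalone sketch of that idiom follows, using made-up names (`supports_mma`, `run_gemm`, `gemm_mma`) rather than the Eigen API:

#include <cstdio>

// Hypothetical stand-in for a CPUID-style capability probe.
static bool supports_mma() { return false; }

static void gemm_generic(int rows, int cols) { std::printf("generic %dx%d\n", rows, cols); }
static void gemm_mma(int rows, int cols) { std::printf("MMA %dx%d\n", rows, cols); }

static void run_gemm(int rows, int cols) {
  // Function-local static: the ternary runs once, on first use, and the
  // chosen kernel is reused by every subsequent call (C++11 magic statics).
  static void (*kernel)(int, int) = supports_mma() ? &gemm_mma : &gemm_generic;
  kernel(rows, cols);
}

int main() {
  run_gemm(4, 4);  // capability probe runs here
  run_gemm(8, 8);  // reuses the cached pointer, no re-probe
  return 0;
}

Because the pointer is a function-local static, the probe executes once per specialization and every later call jumps straight through the cached pointer.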
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
index fa1755f..e78ca5a 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -1,6 +1,6 @@
-//#define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines
+// #define EIGEN_POWER_USE_PREFETCH // Use prefetching in gemm routines
#ifdef EIGEN_POWER_USE_PREFETCH
-#define EIGEN_POWER_PREFETCH(p) prefetch(p)
+#define EIGEN_POWER_PREFETCH(p) prefetch(p)
#else
#define EIGEN_POWER_PREFETCH(p)
#endif
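
The hunk above only re-spaces the `EIGEN_POWER_PREFETCH` definition; the underlying pattern is the usual feature-flag macro that expands to a real call when `EIGEN_POWER_USE_PREFETCH` is defined and to nothing otherwise, so disabled call sites vanish at preprocessing time. A small self-contained illustration of the same pattern, with hypothetical names (`USE_TRACE`, `TRACE`) instead of the Eigen macros:

#include <cstdio>

// #define USE_TRACE  // define (e.g. via -DUSE_TRACE) to turn the calls on
#ifdef USE_TRACE
#define TRACE(msg) std::puts(msg)
#else
#define TRACE(msg)  // expands to nothing: zero runtime and code-size cost
#endif

int main() {
  TRACE("entering main");  // disappears entirely unless USE_TRACE is defined
  std::puts("doing work");
  return 0;
}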
@@ -16,158 +16,125 @@
namespace internal {
-template<typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_row(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask);
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+ Index remaining_rows, const Packet& pAlpha, const Packet& pMask);
-template<typename Scalar, typename Packet, typename DataMapper, const Index accCols>
-EIGEN_ALWAYS_INLINE void gemm_extra_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index cols,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask);
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+ Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+ Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+ const Packet& pMask);
-template<typename Packet>
+template <typename Packet>
EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
- const DataMapper& res,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index row,
- Index rows,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask);
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+ const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+ Index rows, Index remaining_rows, const Packet& pAlphaReal,
+ const Packet& pAlphaImag, const Packet& pMask);
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index cols,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask);
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+ bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+ Index depth, Index strideA, Index offsetA, Index strideB,
+ Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+ const Packet& pAlphaReal, const Packet& pAlphaImag,
+ const Packet& pMask);
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float *result, Index cols, Index rows, const DataMapper& src);
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src);
-template<const Index size, bool non_unit_stride, Index delta>
+template <const Index size, bool non_unit_stride, Index delta>
EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra = 0);
-template<bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float *result, Index cols, Index rows, bfloat16* src, Index resInc = 1);
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+ Index resInc = 1);
-template<bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result, Index extra_cols, Index extra_rows);
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+ Index extra_cols, Index extra_rows);
-template<Index num_acc, bool extraRows, Index size = 4>
-EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows);
+template <Index num_acc, bool extraRows, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+ Index extra_rows);
-template<Index num_acc, Index size = 4>
-EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha);
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha);
-template<typename RhsMapper, bool linear>
+template <typename RhsMapper, bool linear>
EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j);
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet)* lhs);
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs);
-template<typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full = true>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index col);
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N,
+ bool full = true>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+ Index col);
-template<typename DataMapper, typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row);
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row);
#ifdef USE_PARTIAL_PACKETS
-template<typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
-EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet,N*(Complex?2:1)>& acc, const DataMapper& res, Index row, Index elements);
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+ Index elements);
-template<typename DataMapper, typename Packet, Index N>
-EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index elements);
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements);
#endif
-template<typename Packet, int N>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha);
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha, const Packet& pMask);
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+ const Packet& pMask);
-template<typename Packet, int N, bool mask>
-EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag, const Packet& pMask);
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+ const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+ const Packet& pMask);
-template<typename Packet, typename Packetc, int N, bool full>
-EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2);
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+ PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+ PacketBlock<Packetc, N>& acc2);
-#define MICRO_NORMAL(iter) \
- (accCols == accCols2) || (unroll_factor != (iter + 1))
+#define MICRO_NORMAL(iter) (accCols == accCols2) || (unroll_factor != (iter + 1))
-#define MICRO_UNROLL_ITER1(func, N) \
- switch (remaining_rows) { \
- default: \
- func(N, 0) \
- break; \
- case 1: \
- func(N, 1) \
- break; \
- case 2: \
+#define MICRO_UNROLL_ITER1(func, N) \
+ switch (remaining_rows) { \
+ default: \
+ func(N, 0) break; \
+ case 1: \
+ func(N, 1) break; \
+ case 2: \
if (sizeof(Scalar) == sizeof(float)) { \
- func(N, 2) \
- } \
- break; \
- case 3: \
+ func(N, 2) \
+ } \
+ break; \
+ case 3: \
if (sizeof(Scalar) == sizeof(float)) { \
- func(N, 3) \
- } \
- break; \
+ func(N, 3) \
+ } \
+ break; \
}
#ifdef USE_PARTIAL_PACKETS
#define MICRO_UNROLL_ITER(func, N) \
- if (remaining_rows) { \
- func(N, true); \
- } else { \
- func(N, false); \
+ if (remaining_rows) { \
+ func(N, true); \
+ } else { \
+ func(N, false); \
}
-#define MICRO_NORMAL_PARTIAL(iter) \
- full || (unroll_factor != (iter + 1))
+#define MICRO_NORMAL_PARTIAL(iter) full || (unroll_factor != (iter + 1))
#else
#define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
#endif
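
MICRO_UNROLL_ITER1 above turns the runtime remainder `remaining_rows` into a call whose tail width is a compile-time constant, so the selected kernel can be fully unrolled and masked at compile time; remainders of 2 and 3 are only reachable when the scalar is float. A minimal stand-alone sketch of the same dispatch pattern, using hypothetical names (`process_tail`, `dispatch_tail`) rather than the real kernels:

```cpp
#include <cstdio>

// Hypothetical stand-in for a kernel specialized on the compile-time
// remainder R (0 meaning "full packet, no masking needed").
template <int R>
void process_tail(int base) {
  std::printf("row %d handled with compile-time remainder %d\n", base, R);
}

// Same shape as MICRO_UNROLL_ITER1: a runtime remainder selects a kernel
// whose tail width is a template parameter.
void dispatch_tail(int base, int remaining_rows) {
  switch (remaining_rows) {
    case 1:  process_tail<1>(base); break;
    case 2:  process_tail<2>(base); break;  // float-only in the real macro
    case 3:  process_tail<3>(base); break;  // float-only in the real macro
    default: process_tail<0>(base); break;
  }
}

int main() {
  for (int r = 0; r < 4; ++r) dispatch_tail(100, r);
}
```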
@@ -176,37 +143,38 @@
#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
-#define MICRO_LOAD1(lhs_ptr, iter) \
- if (unroll_factor > iter) { \
- lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
+#define MICRO_LOAD1(lhs_ptr, iter) \
+ if (unroll_factor > iter) { \
+ lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter); \
lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
- } else { \
- EIGEN_UNUSED_VARIABLE(lhsV##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(lhsV##iter); \
}
#define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)
-#define MICRO_COMPLEX_LOAD_ONE(iter) \
- if (!LhsIsReal && (unroll_factor > iter)) { \
+#define MICRO_COMPLEX_LOAD_ONE(iter) \
+ if (!LhsIsReal && (unroll_factor > iter)) { \
lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
- } else { \
- EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
- } \
- MICRO_LOAD1(lhs_ptr_real, iter) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
+ } \
+ MICRO_LOAD1(lhs_ptr_real, iter)
-#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
- if (unroll_factor > iter) { \
- lhs_ptr##iter = lhs_base + (row+(iter*accCols))*strideA*advRows - MICRO_NORMAL_COLS(iter, 0, (accCols-accCols2)*offsetA); \
- } else { \
- EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
+#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter) \
+ if (unroll_factor > iter) { \
+ lhs_ptr##iter = lhs_base + (row + (iter * accCols)) * strideA * advRows - \
+ MICRO_NORMAL_COLS(iter, 0, (accCols - accCols2) * offsetA); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
}
#define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)
#define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)
-#define MICRO_PREFETCH1(lhs_ptr, iter) \
- if (unroll_factor > iter) { \
+#define MICRO_PREFETCH1(lhs_ptr, iter) \
+ if (unroll_factor > iter) { \
EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
}
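
MICRO_PREFETCH1 only issues a prefetch for the lanes the current unroll factor actually uses. EIGEN_POWER_PREFETCH presumably lowers to a data-prefetch hint; assuming it behaves like the GCC/Clang builtin, a generic sketch of the pattern (an assumption about the macro's intent, not its exact expansion):

```cpp
// Sketch: hint the next panel of a streaming read into cache ahead of use.
// __builtin_prefetch(addr, rw, locality) is a GCC/Clang extension; the
// lookahead distance (64 floats) is an arbitrary illustrative choice.
void stream_sum(const float* p, int n, float* out) {
  float s = 0.0f;
  for (int i = 0; i < n; ++i) {
    if (i + 64 < n) __builtin_prefetch(p + i + 64, /*rw=*/0, /*locality=*/3);
    s += p[i];
  }
  *out = s;
}
```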
@@ -220,19 +188,18 @@
#define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
#endif
-#define MICRO_UPDATE \
- if (accCols == accCols2) { \
- MICRO_UPDATE_MASK \
+#define MICRO_UPDATE \
+ if (accCols == accCols2) { \
+ MICRO_UPDATE_MASK \
EIGEN_UNUSED_VARIABLE(offsetA); \
- row += unroll_factor*accCols; \
+ row += unroll_factor * accCols; \
}
-#define MICRO_COMPLEX_UPDATE \
- MICRO_UPDATE \
- if(LhsIsReal || (accCols == accCols2)) { \
- EIGEN_UNUSED_VARIABLE(imag_delta2); \
+#define MICRO_COMPLEX_UPDATE \
+ MICRO_UPDATE \
+ if (LhsIsReal || (accCols == accCols2)) { \
+ EIGEN_UNUSED_VARIABLE(imag_delta2); \
}
-
-} // end namespace internal
-} // end namespace Eigen
+} // end namespace internal
+} // end namespace Eigen
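
Before the next file: bmask<Packet>(remaining_rows), declared in the header above, builds a per-lane mask that keeps the first remaining_rows lanes and zeroes the rest; the masked bscale/bstore paths blend through it so a partial row block never writes past the matrix edge. A portable scalar sketch of that idea (plain arrays standing in for VSX vectors; names illustrative):

```cpp
#include <array>
#include <cstdio>

constexpr int kLanes = 4;  // one Packet4f worth of lanes

// Keep lane i iff i < remaining_rows -- the scalar analogue of bmask.
std::array<float, kLanes> make_mask(int remaining_rows) {
  std::array<float, kLanes> m{};
  for (int i = 0; i < kLanes; ++i) m[i] = (i < remaining_rows) ? 1.0f : 0.0f;
  return m;
}

// Masked accumulate: res += mask * (alpha * acc), so lanes past the
// matrix edge are left untouched.
void masked_axpy(float* res, const float* acc, float alpha,
                 const std::array<float, kLanes>& m) {
  for (int i = 0; i < kLanes; ++i) res[i] += m[i] * alpha * acc[i];
}

int main() {
  float res[kLanes] = {1, 1, 1, 1}, acc[kLanes] = {10, 20, 30, 40};
  masked_axpy(res, acc, 0.5f, make_mask(3));  // only 3 remaining rows
  for (float v : res) std::printf("%g ", v);  // prints: 6 11 16 1
  std::printf("\n");
}
```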
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
index 72e8c31..94c5dd2 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -37,14 +37,11 @@
#define accColsC (accCols / 2)
-EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
-{
- __builtin_mma_xxsetaccz(acc);
-}
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); }
-template<typename DataMapper, typename Packet, bool full>
-EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements, __vector_quad* acc)
-{
+template <typename DataMapper, typename Packet, bool full>
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements,
+ __vector_quad* acc) {
PacketBlock<Packet, 4> result;
__builtin_mma_disassemble_acc(&result.packet, acc);
@@ -61,9 +58,10 @@
}
}
-template<typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
-EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal, __vector_quad* accImag)
-{
+template <typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal,
+ const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal,
+ __vector_quad* accImag) {
constexpr bool full = (accCols2 > accColsC);
PacketBlock<Packet, 4> resultReal, resultImag;
__builtin_mma_disassemble_acc(&resultReal.packet, accReal);
@@ -85,80 +83,70 @@
}
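
storeAccumulator and storeComplexAccumulator above disassemble a __vector_quad into a PacketBlock of four vectors, scale by alpha, and accumulate into the result mapper. The builtins only compile on POWER10 MMA targets, so here is a scalar sketch of just the data flow (a 4x4 tile in plain arrays; all names illustrative):

```cpp
#include <cstdio>

constexpr int kTile = 4;  // a __vector_quad holds four 4-float vectors

// Scalar stand-in for: disassemble the accumulator, then res += alpha * acc
// one vector (here: one column of the tile) at a time.
void store_tile(float (&res)[kTile][kTile], const float (&acc)[kTile][kTile],
                float alpha) {
  for (int j = 0; j < kTile; ++j)
    for (int i = 0; i < kTile; ++i) res[i][j] += alpha * acc[i][j];
}

int main() {
  float res[kTile][kTile] = {}, acc[kTile][kTile] = {};
  acc[1][2] = 3.0f;
  store_tile(res, acc, 2.0f);
  std::printf("%g\n", res[1][2]);  // 6
}
```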
// Defaults to float32; since Eigen still supports C++03, we can't use default template arguments
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
-{
- if(NegativeAccumulate)
- {
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+ if (NegativeAccumulate) {
__builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
} else {
__builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
}
}
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b)
-{
- if(NegativeAccumulate)
- {
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) {
+ if (NegativeAccumulate) {
__builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
} else {
__builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
}
}
-template<typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi, const RhsPacket& rhsV, RhsPacket& rhsVi)
-{
- pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
- if(LhsIsReal) {
- pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
+template <typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi,
+ const RhsPacket& rhsV, RhsPacket& rhsVi) {
+ pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
+ if (LhsIsReal) {
+ pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
EIGEN_UNUSED_VARIABLE(lhsVi);
} else {
- if(!RhsIsReal) {
+ if (!RhsIsReal) {
pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
- pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
+ pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
} else {
EIGEN_UNUSED_VARIABLE(rhsVi);
}
- pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
+ pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
}
}
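
pgercMMA keeps separate real and imaginary accumulators and expands one complex rank-1 update into up to four real ger updates, with the NegativeAccumulate flag (driven by the Conjugate* parameters) absorbing the signs. The identity it relies on, checked in scalar form (a self-contained sanity check, not the kernel itself):

```cpp
#include <cassert>
#include <complex>

int main() {
  std::complex<float> a(2, 3), b(5, -7);
  float ar = a.real(), ai = a.imag(), br = b.real(), bi = b.imag();

  // accReal receives ar*br and (via a negative accumulate) -ai*bi;
  // accImag receives ar*bi and ai*br -- the four real updates.
  float accReal = ar * br - ai * bi;
  float accImag = ar * bi + ai * br;

  std::complex<float> p = a * b;
  assert(accReal == p.real() && accImag == p.imag());
  return 0;
}
```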
// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet)* rhs)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet) * rhs) {
return ploadu<Packet>(rhs);
}
-template<typename Scalar, typename Packet>
-EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
-{
+template <typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) {
rhsV = ploadRhs<Packet>(rhs);
-}
+}
-template<>
-EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV)
-{
+template <>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) {
#if EIGEN_COMP_LLVM
- __builtin_vsx_assemble_pair(&rhsV,
- reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
- reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
+ __builtin_vsx_assemble_pair(
+ &rhsV, reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
+ reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
#else
- rhsV = *reinterpret_cast<__vector_pair *>(const_cast<double *>(rhs));
+ rhsV = *reinterpret_cast<__vector_pair*>(const_cast<double*>(rhs));
#endif
}
-EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV)
-{
- ploadRhsMMA(lhs, lhsV);
-}
+EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) { ploadRhsMMA(lhs, lhsV); }
#define GEMM_MULTIPLE_COLS
// Disable in GCC until unnecessary register moves are fixed
-//#if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+// #if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
#if EIGEN_COMP_LLVM
#define VECTOR_PAIR_LOADS_LHS
#endif
@@ -175,134 +163,127 @@
#endif
#endif
-#define MICRO_MMA_UNROLL(func) \
- func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+#define MICRO_MMA_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
-#define MICRO_MMA_WORK(func, type, peel) \
- if (accItr == 1) { \
- func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
- func(4,type,peel,4,0) func(5,type,peel,5,0) func(6,type,peel,6,0) func(7,type,peel,7,0) \
- } else if (accItr == 2) { \
- func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
- func(4,type,peel,2,0) func(5,type,peel,2,1) func(6,type,peel,3,0) func(7,type,peel,3,1) \
- } else { \
- func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
- func(4,type,peel,1,0) func(5,type,peel,1,1) func(6,type,peel,1,2) func(7,type,peel,1,3) \
+#define MICRO_MMA_WORK(func, type, peel) \
+ if (accItr == 1) { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
+ func(4, type, peel, 4, 0) func(5, type, peel, 5, 0) func(6, type, peel, 6, 0) func(7, type, peel, 7, 0) \
+ } else if (accItr == 2) { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
+ func(4, type, peel, 2, 0) func(5, type, peel, 2, 1) func(6, type, peel, 3, 0) func(7, type, peel, 3, 1) \
+ } else { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
+ func(4, type, peel, 1, 0) func(5, type, peel, 1, 1) func(6, type, peel, 1, 2) func(7, type, peel, 1, 3) \
}
-#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
- if (unroll_factor > left) { \
+#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
}
#ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
- if (unroll_factor > left) { \
+#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
}
-#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
- if (unroll_factor > left) { \
- if (MICRO_NORMAL(left)) { \
- ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
+#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left) \
+ if (unroll_factor > left) { \
+ if (MICRO_NORMAL(left)) { \
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left); \
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
- lhs_ptr##left += accCols*2; \
- } else { \
- lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
- lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
- lhs_ptr##left += accCols2*2; \
- EIGEN_UNUSED_VARIABLE(plhsV##left); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(lhsV2##left); \
- EIGEN_UNUSED_VARIABLE(plhsV##left); \
+ lhs_ptr##left += accCols * 2; \
+ } else { \
+ lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left); \
+ lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2); \
+ lhs_ptr##left += accCols2 * 2; \
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(lhsV2##left); \
+ EIGEN_UNUSED_VARIABLE(plhsV##left); \
}
#define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
#endif
-#define MICRO_MMA_UNROLL_ITER(func, val) \
- func(val,0) \
- if (accItr > 1) { \
- func(val,1) \
- if (accItr > 2) { \
- func(val,2) \
- func(val,3) \
- } \
+#define MICRO_MMA_UNROLL_ITER(func, val) \
+ func(val, 0) if (accItr > 1) { \
+ func(val, 1) if (accItr > 2) { func(val, 2) func(val, 3) } \
}
-#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) \
- ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
+#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
-#define MICRO_MMA_LOAD_ONE_RHS(peel) \
- MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
+#define MICRO_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
-#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
- if (PEEL_MMA > peel) { \
+#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
+ if (PEEL_MMA > peel) { \
Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
- MICRO_MMA_LOAD_ONE_RHS(peel) \
- MICRO_MMA_UNROLL(funcl) \
- MICRO_MMA_WORK(funcw, type, peel) \
+ MICRO_MMA_LOAD_ONE_RHS(peel) \
+ MICRO_MMA_UNROLL(funcl) \
+ MICRO_MMA_WORK(funcw, type, peel) \
}
#ifndef VECTOR_PAIR_LOADS_LHS
-#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
- MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,1) \
- MICRO_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,3) \
- MICRO_MMA_TYPE_PEEL(funcw,funcl,type,4) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,5) \
- MICRO_MMA_TYPE_PEEL(funcw,funcl,type,6) MICRO_MMA_TYPE_PEEL(funcw,funcl,type,7)
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 1) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 2) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 3) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 4) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 5) \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 6) MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 7)
#else
-#define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
+#define MICRO_MMA_LOAD_TWO_RHS(peel1, right) \
ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
-#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
- if (PEEL_MMA > peel2) { \
- PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
- __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
- if (sizeof(type) == 16) { \
- MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
- MICRO_MMA_LOAD_ONE_RHS(peel1) \
- MICRO_MMA_LOAD_ONE_RHS(peel2) \
- } \
- MICRO_MMA_UNROLL(funcl2) \
- MICRO_MMA_WORK(funcw2, type, peel1) \
- MICRO_MMA_WORK(funcw2, type, peel2) \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
- MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
+#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
+ if (PEEL_MMA > peel2) { \
+ PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
+ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7; \
+ if (sizeof(type) == 16) { \
+ MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+ MICRO_MMA_LOAD_ONE_RHS(peel1) \
+ MICRO_MMA_LOAD_ONE_RHS(peel2) \
+ } \
+ MICRO_MMA_UNROLL(funcl2) \
+ MICRO_MMA_WORK(funcw2, type, peel1) \
+ MICRO_MMA_WORK(funcw2, type, peel2) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+ MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
}
-#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
+#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
- __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
- MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
- MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3) \
- MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,4,5) \
- MICRO_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,6,7)
+ __vector_pair prhsV0, prhsV2, prhsV4, prhsV6; \
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1) \
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3) \
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 4, 5) \
+ MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 6, 7)
#endif
#define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
- type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
- MICRO_MMA_TYPE_PEEL(funcw,funcl,type,0)
+ type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1]; \
+ MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)
-#define MICRO_MMA_UPDATE_RHS1(size, right) \
- rhs_ptr##right += (accRows * size);
+#define MICRO_MMA_UPDATE_RHS1(size, right) rhs_ptr##right += (accRows * size);
-#define MICRO_MMA_UPDATE_RHS(size) \
- MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
+#define MICRO_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
-#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
+#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size) \
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
MICRO_MMA_UPDATE_RHS(size)
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
#else
-#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
+#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size) \
MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
MICRO_MMA_UPDATE_RHS(size)
@@ -311,10 +292,10 @@
#define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
-#define MICRO_MMA_DST_PTR_ONE(iter) \
- if (unroll_factor * accItr > iter) { \
- bsetzeroMMA(&accZero##iter); \
- } else { \
+#define MICRO_MMA_DST_PTR_ONE(iter) \
+ if (unroll_factor * accItr > iter) { \
+ bsetzeroMMA(&accZero##iter); \
+ } else { \
EIGEN_UNUSED_VARIABLE(accZero##iter); \
}
@@ -324,50 +305,40 @@
#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
-#define MICRO_MMA_STORE_ONE(iter, left, right) \
- if (unroll_factor > left) { \
- storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left*accCols, res##right, pAlpha, accCols2, &accZero##iter); \
+#define MICRO_MMA_STORE_ONE(iter, left, right) \
+ if (unroll_factor > left) { \
+ storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left * accCols, res##right, pAlpha, \
+ accCols2, &accZero##iter); \
}
-#define MICRO_MMA_ITER_UNROLL(func) \
- if (accItr == 1) { \
- func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
- func(4,4,0) func(5,5,0) func(6,6,0) func(7,7,0) \
- } else if (accItr == 2) { \
- func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
- func(4,2,0) func(5,2,1) func(6,3,0) func(7,3,1) \
- } else { \
- func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
- func(4,1,0) func(5,1,1) func(6,1,2) func(7,1,3) \
+#define MICRO_MMA_ITER_UNROLL(func) \
+ if (accItr == 1) { \
+ func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) func(4, 4, 0) func(5, 5, 0) func(6, 6, 0) func(7, 7, 0) \
+ } else if (accItr == 2) { \
+ func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) func(4, 2, 0) func(5, 2, 1) func(6, 3, 0) func(7, 3, 1) \
+ } else { \
+ func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) func(4, 1, 0) func(5, 1, 1) func(6, 1, 2) func(7, 1, 3) \
}
#define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
-#define MICRO_MMA_EXTRA_ROWS(right) \
- gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3##right, blockA, rhs_base + right*accRows*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlpha, pMask);
+#define MICRO_MMA_EXTRA_ROWS(right) \
+ gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>( \
+ res3##right, blockA, rhs_base + right * accRows * strideB, depth, strideA, offsetA, strideB, row, rows, \
+ remaining_rows, pAlpha, pMask);
-#define MICRO_MMA_EXTRA_ROWS1(val, right) \
- MICRO_MMA_EXTRA_ROWS(right);
+#define MICRO_MMA_EXTRA_ROWS1(val, right) MICRO_MMA_EXTRA_ROWS(right);
-template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool full, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
- const DataMapper& res0,
- const DataMapper& res1,
- const DataMapper& res2,
- const DataMapper& res3,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index strideB,
- Index offsetA,
- Index& row,
- const Packet& pAlpha,
- Index accCols2
- )
-{
- const Scalar* rhs_ptr0 = rhs_base, * rhs_ptr1 = NULL, * rhs_ptr2 = NULL, * rhs_ptr3 = NULL;
- const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
+template <int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper,
+ const Index accRows, const Index accCols, bool full, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+ const DataMapper& res2, const DataMapper& res3,
+ const Scalar* lhs_base, const Scalar* rhs_base, Index depth,
+ Index strideA, Index strideB, Index offsetA, Index& row,
+ const Packet& pAlpha, Index accCols2) {
+ const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL, *rhs_ptr3 = NULL;
+ const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+ *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
__vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
if (accItr > 1) {
@@ -391,14 +362,12 @@
MICRO_MMA_DST_PTR
Index k = 0, depth2 = depth - PEEL_MMA;
- for(; k <= depth2; k += PEEL_MMA)
- {
+ for (; k <= depth2; k += PEEL_MMA) {
EIGEN_POWER_PREFETCH(rhs_ptr);
MICRO_MMA_PREFETCH
MICRO_MMA_ONE_PEEL
}
- for(; k < depth; k++)
- {
+ for (; k < depth; k++) {
MICRO_MMA_ONE
}
MICRO_MMA_STORE
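
The k-loop just above is the classic peeled form: the main loop consumes PEEL_MMA depth steps per trip behind a prefetch (MICRO_MMA_ONE_PEEL), and a scalar remainder loop finishes the tail (MICRO_MMA_ONE). The control flow with the intrinsics stripped out (kPeel is an arbitrary stand-in for PEEL_MMA):

```cpp
#include <cstdio>

constexpr int kPeel = 4;  // stand-in for PEEL_MMA; the real value is target-tuned

void step_one(int k) { std::printf("k=%d\n", k); }  // one depth step

void micro_kernel(int depth) {
  int k = 0;
  // Main loop: kPeel depth steps per trip, giving the compiler a long
  // straight-line body to schedule (MICRO_MMA_ONE_PEEL).
  for (int depth2 = depth - kPeel; k <= depth2; k += kPeel)
    for (int p = 0; p < kPeel; ++p) step_one(k + p);
  // Remainder loop: one step at a time (MICRO_MMA_ONE).
  for (; k < depth; ++k) step_one(k);
}

int main() { micro_kernel(10); }  // 4 + 4 peeled, then 2 remainder steps
```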
@@ -406,38 +375,29 @@
MICRO_UPDATE
}
-#define MICRO_MMA_UNROLL_ITER2(N, M) \
- gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, M ? remaining_rows : accCols); \
+#define MICRO_MMA_UNROLL_ITER2(N, M) \
+ gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>( \
+ res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha, \
+ M ? remaining_rows : accCols); \
if (M) return;
-#define MICRO_MMA_ROWS(n) \
- while(row + n*accCols <= rows) { \
- MICRO_MMA_UNROLL_ITER2(n, 0); \
+#define MICRO_MMA_ROWS(n) \
+ while (row + n * accCols <= rows) { \
+ MICRO_MMA_UNROLL_ITER2(n, 0); \
}
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemmMMA_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index remaining_rows,
- const Packet& pAlpha,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+ const Index accCols, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+ Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+ Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
const DataMapper res30 = res.getSubMapper(0, col);
- const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
- const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
- const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
- const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
- const Scalar* lhs_base = blockA + accCols*offsetA;
+ const Scalar* rhs_base = blockB + col * strideB + accRows * offsetB;
+ const Scalar* lhs_base = blockA + accCols * offsetA;
Index row = 0;
#define MAX_MMA_UNROLL 7
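
gemmMMA_cols first consumes rows in the largest unroll available (MICRO_MMA_ROWS, bounded by MAX_MMA_UNROLL and scaled down when several accumulators run per iteration), then the switch in the next hunk dispatches the remaining whole accCols blocks to a fully unrolled kernel. The shape of that driver in miniature (sizes and names illustrative):

```cpp
#include <cstdio>

constexpr int accCols = 4;  // rows per accumulator block (illustrative)

template <int N>  // N accumulator blocks handled per kernel call
void tile(int row) { std::printf("rows [%d, %d) at unroll %d\n", row, row + N * accCols, N); }

void rows_driver(int rows) {
  constexpr int maxUnroll = 3;  // stand-in for MAX_MMA_UNROLL
  int row = 0;
  while (row + maxUnroll * accCols <= rows) {  // MICRO_MMA_ROWS(maxUnroll)
    tile<maxUnroll>(row);
    row += maxUnroll * accCols;
  }
  switch ((rows - row) / accCols) {  // whole blocks still left over
    case 2: tile<2>(row); row += 2 * accCols; break;
    case 1: tile<1>(row); row += 1 * accCols; break;
    default: break;  // only masked remainder rows (if any) are left
  }
  // rows - row is now remaining_rows, handled by the gemm_extra_row path.
}

int main() { rows_driver(30); }  // 12 + 12 + 4, with 2 remainder rows
```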
@@ -455,7 +415,7 @@
} else {
MICRO_MMA_ROWS(2);
}
- switch( (rows-row)/accCols ) {
+ switch ((rows - row) / accCols) {
#if MAX_MMA_UNROLL > 7
case 7:
if (accItr == 1) {
@@ -508,42 +468,42 @@
}
#undef MAX_MMA_UNROLL
- if(remaining_rows > 0)
- {
+ if (remaining_rows > 0) {
MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
}
}
-#define MICRO_MMA_COLS(n) \
- for(; col + n*accRows <= cols; col += n*accRows) \
- { \
- gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
+#define MICRO_MMA_COLS(n) \
+ for (; col + n * accRows <= cols; col += n * accRows) { \
+ gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>( \
+ res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
}
-template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
-void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
- const Index remaining_rows = rows % accCols;
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+ const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
+ Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index remaining_rows = rows % accCols;
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
- const Packet pAlpha = pset1<Packet>(alpha);
- const Packet pMask = bmask<Packet>(remaining_rows);
+ const Packet pAlpha = pset1<Packet>(alpha);
+ const Packet pMask = bmask<Packet>(remaining_rows);
- typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+ typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
- Index col = 0;
+ Index col = 0;
#ifdef GEMM_MULTIPLE_COLS
- MICRO_MMA_COLS(4);
- MICRO_MMA_COLS(2);
+ MICRO_MMA_COLS(4);
+ MICRO_MMA_COLS(2);
#endif
- MICRO_MMA_COLS(1);
+ MICRO_MMA_COLS(1);
- if (col != cols)
- {
- gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
- }
+ if (col != cols) {
+ gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+ col, rows, cols, remaining_rows, pAlpha, pMask);
+ }
}
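
gemmMMA's column driver is greedy: it runs the four-column-block kernel while it fits, then two, then one (MICRO_MMA_COLS(4)/(2)/(1)), and hands any leftover partial columns to gemm_extra_cols. A compact sketch of that loop structure (names and sizes illustrative):

```cpp
#include <cstdio>

constexpr int accRows = 4;  // columns per packet-width column block

template <int N>  // N column blocks of accRows each per kernel call
void cols_kernel(int col) { std::printf("cols [%d, %d)\n", col, col + N * accRows); }

void cols_driver(int cols) {
  int col = 0;
  // Widest first, as in MICRO_MMA_COLS(4), (2), (1).
  for (; col + 4 * accRows <= cols; col += 4 * accRows) cols_kernel<4>(col);
  for (; col + 2 * accRows <= cols; col += 2 * accRows) cols_kernel<2>(col);
  for (; col + 1 * accRows <= cols; col += 1 * accRows) cols_kernel<1>(col);
  if (col != cols)  // gemm_extra_cols handles the partial tail
    std::printf("extra cols [%d, %d)\n", col, cols);
}

int main() { cols_driver(23); }  // 16 + 4, then 3 leftover columns
```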
#define advanceRows ((LhsIsReal) ? 1 : 2)
@@ -556,133 +516,137 @@
#define PEEL_COMPLEX_MMA 3
#endif
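
advanceRows and advanceCols are 2 on the complex side because the packed complex panels store all real parts of a block followed by all imaginary parts; inside gemm_complex_unrolled_MMA_iteration the offset between the two halves is imag_delta = accCols * strideA. A tiny sketch of that split layout (values illustrative):

```cpp
#include <cstdio>

// Split-complex packing: a block of accCols rows stores its real parts
// first and its imaginary parts imag_delta floats later, so packed[i]
// and packed[i + imag_delta] are the two halves of one complex element.
int main() {
  constexpr int accCols = 4, strideA = 6;
  constexpr int imag_delta = accCols * strideA;  // as in the MMA kernel
  float packed[2 * imag_delta] = {};
  packed[5] = 1.5f;                // real part of some element
  packed[5 + imag_delta] = -2.5f;  // its imaginary part
  std::printf("(%g, %g)\n", packed[5], packed[5 + imag_delta]);
}
```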
-#define MICRO_COMPLEX_MMA_UNROLL(func) \
- func(0) func(1) func(2) func(3)
+#define MICRO_COMPLEX_MMA_UNROLL(func) func(0) func(1) func(2) func(3)
-#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
- if (accItr == 1) { \
- func(0,type,peel,0,0) func(1,type,peel,1,0) func(2,type,peel,2,0) func(3,type,peel,3,0) \
- } else if (accItr == 2) { \
- func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,1,0) func(3,type,peel,1,1) \
- } else { \
- func(0,type,peel,0,0) func(1,type,peel,0,1) func(2,type,peel,0,2) func(3,type,peel,0,3) \
+#define MICRO_COMPLEX_MMA_WORK(func, type, peel) \
+ if (accItr == 1) { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
+ } else if (accItr == 2) { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
+ } else { \
+ func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
}
-#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
- if (unroll_factor > left) { \
- pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+ &accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
}
#ifdef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
- if (unroll_factor > left) { \
- pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], rhsV##right[peel], rhsVi##right[peel]); \
+#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right) \
+ if (unroll_factor > left) { \
+ pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+ &accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], \
+ rhsV##right[peel], rhsVi##right[peel]); \
}
-#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
- if (!LhsIsReal && (unroll_factor > left)) { \
- if (MICRO_NORMAL(left)) { \
- ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
+#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left) \
+ if (!LhsIsReal && (unroll_factor > left)) { \
+ if (MICRO_NORMAL(left)) { \
+ ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left); \
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
- } else { \
- lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
- lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
- EIGEN_UNUSED_VARIABLE(plhsVi##left); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
- EIGEN_UNUSED_VARIABLE(plhsVi##left); \
- } \
+ } else { \
+ lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2); \
+ lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2); \
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(lhsVi2##left); \
+ EIGEN_UNUSED_VARIABLE(plhsVi##left); \
+ } \
MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
#define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
#endif
-#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
- ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
- if (!RhsIsReal) { \
+#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right) \
+ ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]); \
+ if (!RhsIsReal) { \
ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
}
-#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
- MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
+#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
#define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
- if (PEEL_COMPLEX_MMA > peel) { \
- Packet lhsV0, lhsV1, lhsV2, lhsV3; \
- Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
- MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
- MICRO_COMPLEX_MMA_UNROLL(funcl) \
- MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
+ if (PEEL_COMPLEX_MMA > peel) { \
+ Packet lhsV0, lhsV1, lhsV2, lhsV3; \
+ Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) \
+ MICRO_COMPLEX_MMA_UNROLL(funcl) \
+ MICRO_COMPLEX_MMA_WORK(funcw, type, peel) \
}
#ifndef VECTOR_PAIR_LOADS_LHS
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
- type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
- MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,1) \
- MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,3)
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type) \
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+ rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0) \
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 1) \
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 3)
#else
-#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
- ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
- if(!RhsIsReal) { \
+#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right) \
+ ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1); \
+ if (!RhsIsReal) { \
ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
}
#define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
- if (PEEL_COMPLEX_MMA > peel2) { \
- PacketBlock<Packet,2> lhsV20, lhsV21, lhsV22, lhsV23; \
- PacketBlock<Packet,2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
- __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
- __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
- if (sizeof(type) == 16) { \
- MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
- EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
- MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
- MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
- } \
- MICRO_COMPLEX_MMA_UNROLL(funcl2) \
- MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
- MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
- } else { \
- EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
- EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
- MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
+ if (PEEL_COMPLEX_MMA > peel2) { \
+ PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23; \
+ PacketBlock<Packet, 2> lhsVi20, lhsVi21, lhsVi22, lhsVi23; \
+ __vector_pair plhsV0, plhsV1, plhsV2, plhsV3; \
+ __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3; \
+ if (sizeof(type) == 16) { \
+ MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1); \
+ MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2); \
+ } \
+ MICRO_COMPLEX_MMA_UNROLL(funcl2) \
+ MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1) \
+ MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2) \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(prhsV##peel1); \
+ EIGEN_UNUSED_VARIABLE(prhsVi##peel1); \
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1) \
}
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
- type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
- __vector_pair prhsV0, prhsV2; \
- __vector_pair prhsVi0, prhsVi2; \
- MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,0,1) \
- MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1,funcl1,funcw2,funcl2,type,2,3)
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type) \
+ type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+ rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1]; \
+ __vector_pair prhsV0, prhsV2; \
+ __vector_pair prhsVi0, prhsVi2; \
+ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1) \
+ MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)
#endif
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
- MICRO_COMPLEX_MMA_TYPE_PEEL(funcw,funcl,type,0)
+ MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)
#define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
- rhs_ptr_real##right += (accRows * size); \
- if(!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
+ rhs_ptr_real##right += (accRows * size); \
+ if (!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
-#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) \
- MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
+#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size) \
MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
MICRO_COMPLEX_MMA_UPDATE_RHS(size);
#ifndef VECTOR_PAIR_LOADS_LHS
#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
#else
-#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
- MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size) \
+ MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, \
+ MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket) \
MICRO_COMPLEX_MMA_UPDATE_RHS(size);
#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
@@ -691,12 +655,12 @@
#define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
- if (unroll_factor * accItr > iter) { \
- bsetzeroMMA(&accReal##iter); \
- bsetzeroMMA(&accImag##iter); \
- } else { \
- EIGEN_UNUSED_VARIABLE(accReal##iter); \
- EIGEN_UNUSED_VARIABLE(accImag##iter); \
+ if (unroll_factor * accItr > iter) { \
+ bsetzeroMMA(&accReal##iter); \
+ bsetzeroMMA(&accImag##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(accReal##iter); \
+ EIGEN_UNUSED_VARIABLE(accImag##iter); \
}
#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
@@ -705,61 +669,56 @@
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
-#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
- if (unroll_factor > left) { \
- storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>(row + left*accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right) \
+ if (unroll_factor > left) { \
+ storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>( \
+ row + left * accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter); \
}
-#define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
- if (accItr == 1) { \
- func(0,0,0) func(1,1,0) func(2,2,0) func(3,3,0) \
- } else if (accItr == 2) { \
- func(0,0,0) func(1,0,1) func(2,1,0) func(3,1,1) \
- } else { \
- func(0,0,0) func(1,0,1) func(2,0,2) func(3,0,3) \
+#define MICRO_COMPLEX_MMA_ITER_UNROLL(func) \
+ if (accItr == 1) { \
+ func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) \
+ } else if (accItr == 2) { \
+ func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) \
+ } else { \
+ func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) \
}
#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
-#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
- gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3##right, blockA, rhs_base + right*accRows*(RhsIsReal ? 1 : 2)*strideB, depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask);
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right) \
+ gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+ RhsIsReal>(res3##right, blockA, rhs_base + right * accRows * (RhsIsReal ? 1 : 2) * strideB, \
+ depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal, \
+ pAlphaImag, pMask);
-#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) \
- MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
-template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
- const DataMapper& res0,
- const DataMapper& res1,
- const DataMapper& res2,
- const DataMapper& res3,
- const Scalar* lhs_base,
- const Scalar* rhs_base,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index& row,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
- const Scalar* rhs_ptr_real0 = rhs_base, * rhs_ptr_real1 = NULL, * rhs_ptr_real2 = NULL, * rhs_ptr_real3 = NULL;
- const Scalar* rhs_ptr_imag0 = NULL, * rhs_ptr_imag1 = NULL, * rhs_ptr_imag2 = NULL, * rhs_ptr_imag3 = NULL;
- const Index imag_delta = accCols*strideA;
- const Index imag_delta2 = accCols2*strideA;
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket,
+ typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs,
+ bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+ const DataMapper& res2, const DataMapper& res3,
+ const Scalar* lhs_base, const Scalar* rhs_base,
+ Index depth, Index strideA, Index offsetA, Index strideB,
+ Index& row, const Packet& pAlphaReal,
+ const Packet& pAlphaImag, const Packet& pMask) {
+ const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL, *rhs_ptr_real3 = NULL;
+ const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL, *rhs_ptr_imag3 = NULL;
+ const Index imag_delta = accCols * strideA;
+ const Index imag_delta2 = accCols2 * strideA;
- if(!RhsIsReal) {
- rhs_ptr_imag0 = rhs_base + accRows*strideB;
+ if (!RhsIsReal) {
+ rhs_ptr_imag0 = rhs_base + accRows * strideB;
} else {
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
}
if (accItr > 1) {
- if(!RhsIsReal) {
- rhs_ptr_real1 = rhs_base + (2*accRows*strideB);
- rhs_ptr_imag1 = rhs_base + (3*accRows*strideB);
+ if (!RhsIsReal) {
+ rhs_ptr_real1 = rhs_base + (2 * accRows * strideB);
+ rhs_ptr_imag1 = rhs_base + (3 * accRows * strideB);
} else {
- rhs_ptr_real1 = rhs_base + accRows*strideB;
+ rhs_ptr_real1 = rhs_base + accRows * strideB;
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
}
} else {
@@ -768,14 +727,14 @@
EIGEN_UNUSED_VARIABLE(res1);
}
if (accItr > 2) {
- if(!RhsIsReal) {
- rhs_ptr_real2 = rhs_base + (4*accRows*strideB);
- rhs_ptr_imag2 = rhs_base + (5*accRows*strideB);
- rhs_ptr_real3 = rhs_base + (6*accRows*strideB);
- rhs_ptr_imag3 = rhs_base + (7*accRows*strideB);
+ if (!RhsIsReal) {
+ rhs_ptr_real2 = rhs_base + (4 * accRows * strideB);
+ rhs_ptr_imag2 = rhs_base + (5 * accRows * strideB);
+ rhs_ptr_real3 = rhs_base + (6 * accRows * strideB);
+ rhs_ptr_imag3 = rhs_base + (7 * accRows * strideB);
} else {
- rhs_ptr_real2 = rhs_base + (2*accRows*strideB);
- rhs_ptr_real3 = rhs_base + (3*accRows*strideB);
+ rhs_ptr_real2 = rhs_base + (2 * accRows * strideB);
+ rhs_ptr_real3 = rhs_base + (3 * accRows * strideB);
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
}
@@ -787,25 +746,23 @@
EIGEN_UNUSED_VARIABLE(res2);
EIGEN_UNUSED_VARIABLE(res3);
}
- const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
- const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
+ const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+ const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
__vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
MICRO_COMPLEX_MMA_SRC_PTR
MICRO_COMPLEX_MMA_DST_PTR
Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
- for(; k <= depth2; k += PEEL_COMPLEX_MMA)
- {
+ for (; k <= depth2; k += PEEL_COMPLEX_MMA) {
EIGEN_POWER_PREFETCH(rhs_ptr_real);
- if(!RhsIsReal) {
+ if (!RhsIsReal) {
EIGEN_POWER_PREFETCH(rhs_ptr_imag);
}
MICRO_COMPLEX_MMA_PREFETCH
MICRO_COMPLEX_MMA_ONE_PEEL
}
- for(; k < depth; k++)
- {
+ for (; k < depth; k++) {
MICRO_COMPLEX_MMA_ONE
}
MICRO_COMPLEX_MMA_STORE
@@ -813,39 +770,32 @@
MICRO_COMPLEX_UPDATE
}
-#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
- gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M) \
+ gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows, \
+ accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, \
+ accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, \
+ strideB, row, pAlphaReal, pAlphaImag, pMask); \
if (M) return;
-#define MICRO_COMPLEX_MMA_ROWS(n) \
- while(row + n*accCols <= rows) { \
+#define MICRO_COMPLEX_MMA_ROWS(n) \
+ while (row + n * accCols <= rows) { \
MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
}
-template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
-EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
- const DataMapper& res,
- const Scalar* blockA,
- const Scalar* blockB,
- Index depth,
- Index strideA,
- Index offsetA,
- Index strideB,
- Index offsetB,
- Index col,
- Index rows,
- Index remaining_rows,
- const Packet& pAlphaReal,
- const Packet& pAlphaImag,
- const Packet& pMask)
-{
+template <typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper,
+ const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+ bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+ Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+ Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+ const Packet& pAlphaImag, const Packet& pMask) {
const DataMapper res30 = res.getSubMapper(0, col);
- const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows*1) : res30;
- const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows*2) : res30;
- const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows*3) : res30;
+ const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+ const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+ const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
- const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
- const Scalar* lhs_base = blockA + accCols*offsetA;
+ const Scalar* rhs_base = blockB + advanceCols * col * strideB + accRows * offsetB;
+ const Scalar* lhs_base = blockA + accCols * offsetA;
Index row = 0;
#define MAX_COMPLEX_MMA_UNROLL 4
@@ -863,7 +813,7 @@
} else {
MICRO_COMPLEX_MMA_ROWS(1);
}
- switch( (rows-row)/accCols ) {
+ switch ((rows - row) / accCols) {
#if MAX_COMPLEX_MMA_UNROLL > 3
case 3:
if (accItr == 1) {
@@ -890,59 +840,62 @@
}
#undef MAX_COMPLEX_MMA_UNROLL
- if(remaining_rows > 0)
- {
+ if (remaining_rows > 0) {
MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)
}
}
-#define MICRO_COMPLEX_MMA_COLS(n) \
- for(; col + n*accRows <= cols; col += n*accRows) \
- { \
- gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlphaReal, pAlphaImag, pMask); \
+#define MICRO_COMPLEX_MMA_COLS(n) \
+ for (; col + n * accRows <= cols; col += n * accRows) { \
+ gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs, \
+ ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, \
+ offsetB, col, rows, remaining_rows, pAlphaReal, \
+ pAlphaImag, pMask); \
}
-template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
- const Index remaining_rows = rows % accCols;
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+ typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+ bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth,
+ Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+ const Index remaining_rows = rows % accCols;
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
- const Packet pAlphaReal = pset1<Packet>(alpha.real());
- const Packet pAlphaImag = pset1<Packet>(alpha.imag());
- const Packet pMask = bmask<Packet>(remaining_rows);
+ const Packet pAlphaReal = pset1<Packet>(alpha.real());
+ const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+ const Packet pMask = bmask<Packet>(remaining_rows);
- const Scalar* blockA = (Scalar *) blockAc;
- const Scalar* blockB = (Scalar *) blockBc;
+ const Scalar* blockA = (Scalar*)blockAc;
+ const Scalar* blockB = (Scalar*)blockBc;
- typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+ typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
- Index col = 0;
+ Index col = 0;
#ifdef GEMM_MULTIPLE_COLS
- MICRO_COMPLEX_MMA_COLS(4);
- MICRO_COMPLEX_MMA_COLS(2);
+ MICRO_COMPLEX_MMA_COLS(4);
+ MICRO_COMPLEX_MMA_COLS(2);
#endif
- MICRO_COMPLEX_MMA_COLS(1);
+ MICRO_COMPLEX_MMA_COLS(1);
- if (col != cols)
- {
- gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
- }
+ if (col != cols) {
+ gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+ RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+ remaining_rows, pAlphaReal, pAlphaImag, pMask);
+ }
}
#undef accColsC
#undef advanceRows
#undef advanceCols
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
#pragma GCC pop_options
#endif
-#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
-
+#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
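For orientation, every MMA kernel in these files follows the same accumulator lifecycle: zero a __vector_quad, feed it outer-product updates across the k loop, then disassemble it into four vector registers before scaling by alpha and storing. Below is a minimal standalone sketch of that lifecycle for a single 4x4 f32 rank-1 update, assuming a POWER10 toolchain with MMA enabled; the helper name mma_ger_4x4 is illustrative and not part of the patch.

#include <altivec.h>

// Sketch only: the zero / accumulate / disassemble pattern used by
// zeroAccumulators, pger_vecMMA_acc and disassembleAccumulators above.
static inline void mma_ger_4x4(const float* a, const float* b, float out[4][4]) {
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);     // zero the 512-bit accumulator
  __vector float va = vec_xl(0, a);  // four lhs values
  __vector float vb = vec_xl(0, b);  // four rhs values
  // acc[i][j] += va[i] * vb[j] (rank-1 outer-product update)
  __builtin_mma_xvf32gerpp(&acc, (__vector unsigned char)va, (__vector unsigned char)vb);
  __builtin_mma_disassemble_acc((void*)out, &acc);  // spill four rows of four floats
}

The bfloat16 kernels in the next file use the same lifecycle, substituting __builtin_mma_xvbf16ger2pp, which multiplies pairs of bf16 lanes and accumulates into f32.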
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
index 5094118..6ecec0e 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -11,11 +11,10 @@
namespace internal {
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA)
-{
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA) {
Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
- if(zero){
+ if (zero) {
Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
return vec_mergeh(lhs1.m_val, lhs2.m_val);
} else {
@@ -23,239 +22,243 @@
}
}
-template<bool zero>
-EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i)
-{
- return loadBfloat16<zero>(blockB + strideB*i);
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i) {
+ return loadBfloat16<zero>(blockB + strideB * i);
}
-template<Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
-EIGEN_ALWAYS_INLINE void KLoop
-(
- const bfloat16* indexA,
- const bfloat16* indexB,
- __vector_quad (&quad_acc)[num_acc],
- Index strideB,
- Index k,
- Index offsetB,
- Index extra_cols,
- Index extra_rows
-)
-{
+template <Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs,
+ Index num_lhs>
+EIGEN_ALWAYS_INLINE void KLoop(const bfloat16* indexA, const bfloat16* indexB, __vector_quad (&quad_acc)[num_acc],
+ Index strideB, Index k, Index offsetB, Index extra_cols, Index extra_rows) {
Packet8bf lhs[num_lhs], rhs[num_rhs];
BFLOAT16_UNROLL
- for(Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++){
- rhs[i] = loadRhsBfloat16<zero>(indexB + k*4, strideB, i);
+ for (Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++) {
+ rhs[i] = loadRhsBfloat16<zero>(indexB + k * 4, strideB, i);
}
- if(rhsExtraCols) {
- rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k*extra_cols - offsetB, strideB, num_rhs - 1);
+ if (rhsExtraCols) {
+ rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k * extra_cols - offsetB, strideB, num_rhs - 1);
}
- indexA += k*(lhsExtraRows ? extra_rows : num_packets);
+ indexA += k * (lhsExtraRows ? extra_rows : num_packets);
if (num_lhs == 1) {
lhs[0] = loadBfloat16<zero>(indexA);
} else {
BFLOAT16_UNROLL
- for(Index j = 0; j < num_lhs; j += 2) {
- Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0)*(zero ? 4 : 8));
+ for (Index j = 0; j < num_lhs; j += 2) {
+ Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0) * (zero ? 4 : 8));
if (zero) {
Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
lhs[j + 0] = vec_mergeh(lhs1.m_val, lhs2.m_val);
lhs[j + 1] = vec_mergel(lhs1.m_val, lhs2.m_val);
} else {
lhs[j + 0] = lhs1;
- lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1)*8);
+ lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1) * 8);
}
}
}
BFLOAT16_UNROLL
- for(Index i = 0, x = 0; i < num_rhs; i++) {
+ for (Index i = 0, x = 0; i < num_rhs; i++) {
BFLOAT16_UNROLL
- for(Index j = 0; j < num_lhs; j++, x++) {
- __builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val), reinterpret_cast<Packet16uc>(lhs[j].m_val));
+ for (Index j = 0; j < num_lhs; j++, x++) {
+ __builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val),
+ reinterpret_cast<Packet16uc>(lhs[j].m_val));
}
}
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc]) {
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k++)
- __builtin_mma_xxsetaccz(&(quad_acc[k]));
+ for (Index k = 0; k < num_acc; k++) __builtin_mma_xxsetaccz(&(quad_acc[k]));
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4]) {
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k++)
- __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
+ for (Index k = 0; k < num_acc; k++) __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
}
-template<Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
-EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result, const Index extra_cols, Index extra_rows)
-{
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
+EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+ const Index extra_cols, Index extra_rows) {
BFLOAT16_UNROLL
- for(Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4*rows){
+ for (Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4 * rows) {
BFLOAT16_UNROLL
- for(Index j = 0; j < num_lhs; j++, k++) {
- storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j*4, extra_cols, extra_rows);
+ for (Index j = 0; j < num_lhs; j++, k++) {
+ storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j * 4, extra_cols, extra_rows);
}
}
- if(rhsExtraCols) {
+ if (rhsExtraCols) {
storeResults<rhsExtraCols, lhsExtraRows>(acc[num_acc - 1], rows, pAlpha, result, extra_cols, extra_rows);
}
}
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
-EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* indexB, Index strideB, Index offsetB, float* result, const Index extra_cols, const Index extra_rows)
-{
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
+EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+ const bfloat16* indexB, Index strideB, Index offsetB, float* result,
+ const Index extra_cols, const Index extra_rows) {
constexpr Index num_lhs = multiIter ? (num_packets / 4) : 1;
constexpr Index num_rhs = (num_acc + num_lhs - 1) / num_lhs;
- for(Index offset_row = 0; offset_row < num_packets; offset_row += 4, indexA += (multiIter ? 0 : 8), indexB += (multiIter ? (num_rhs*strideB) : 0), result += (multiIter ? (4*rows*num_rhs) : 4)) {
+ for (Index offset_row = 0; offset_row < num_packets; offset_row += 4, indexA += (multiIter ? 0 : 8),
+ indexB += (multiIter ? (num_rhs * strideB) : 0), result += (multiIter ? (4 * rows * num_rhs) : 4)) {
Packet4f acc[num_acc][4];
__vector_quad quad_acc[num_acc];
zeroAccumulators<num_acc>(quad_acc);
Index k;
- for(k = 0; k + 2 <= depth; k += 2){
- KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+ for (k = 0; k + 2 <= depth; k += 2) {
+ KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+ indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
}
- if(depth&1){
- KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(indexA - (multiIter ? 0 : offset_row), indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+ if (depth & 1) {
+ KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+ indexA - (multiIter ? 0 : offset_row), indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
}
disassembleAccumulators<num_acc>(quad_acc, acc);
- outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols, extra_rows);
+ outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols,
+ extra_rows);
}
}
-#define MAX_BFLOAT16_ACC 8
+#define MAX_BFLOAT16_ACC 8
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* indexB, Index strideB, Index offsetB, float* result)
-{
- constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+ const bfloat16* indexB, Index strideB, Index offsetB, float* result) {
+ constexpr Index step = (num_acc * 4); // each accumulator has 4 elements
const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC);
constexpr bool normIters = multiIters && ((num_acc % (num_packets / 4)) == 0);
- do{
- colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
+ do {
+ colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(
+ depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
- indexB += strideB*num_acc;
- result += rows*step;
- } while(multiIters && (step <= cols - (col += step)));
+ indexB += strideB * num_acc;
+ result += rows * step;
+ } while (multiIters && (step <= cols - (col += step)));
}
-template<const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+ const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB,
+ float* result) {
if (MAX_BFLOAT16_ACC > num_acc) {
- colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+ colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols, lhsExtraRows>(
+ col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
}
}
-template<const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
-void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+ const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
switch ((cols - col) >> 2) {
- case 7:
- colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 6:
- colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 5:
- colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 4:
- colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 3:
- colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 2:
- colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- case 1:
- colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- break;
- default:
- if (rhsExtraCols) {
- colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
- }
- break;
+ case 7:
+ colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 6:
+ colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 5:
+ colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 4:
+ colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 3:
+ colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 2:
+ colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ case 1:
+ colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, offsetB, result);
+ break;
+ default:
+ if (rhsExtraCols) {
+ colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+ offsetB, result);
+ }
+ break;
}
}
-template<const Index num_packets, bool lhsExtraRows = false>
-EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB, float* result)
-{
+template <const Index num_packets, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+ const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
Index col = 0;
if (cols >= (MAX_BFLOAT16_ACC * 4)) {
- colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result);
- blockB += (strideB >> 2)*col;
- result += rows*col;
+ colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+ strideB, 0, result);
+ blockB += (strideB >> 2) * col;
+ result += rows * col;
}
if (cols & 3) {
- colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+ colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+ result);
} else {
- colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0, result);
+ colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0,
+ result);
}
}
-EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float *res)
-{
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float* res) {
Packet16uc fp16[2];
- __vector_pair fp16_vp = *reinterpret_cast<__vector_pair *>(const_cast<float *>(res));
+ __vector_pair fp16_vp = *reinterpret_cast<__vector_pair*>(const_cast<float*>(res));
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(fp16), &fp16_vp);
fp16[0] = __builtin_vsx_xvcvspbf16(fp16[0]);
fp16[1] = __builtin_vsx_xvcvspbf16(fp16[1]);
return vec_pack(reinterpret_cast<Packet4ui>(fp16[0]), reinterpret_cast<Packet4ui>(fp16[1]));
}
-template<typename DataMapper, const Index size>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float *result, Index col, Index rows, const DataMapper& res)
-{
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float* result, Index col, Index rows, const DataMapper& res) {
const DataMapper res2 = res.getSubMapper(0, col);
Index row;
- float *result2 = result + col*rows;
- for(row = 0; row + 8 <= rows; row += 8, result2 += 8){
+ float* result2 = result + col * rows;
+ for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
// get and save block
- PacketBlock<Packet8bf,size> block;
+ PacketBlock<Packet8bf, size> block;
BFLOAT16_UNROLL
- for(Index j = 0; j < size; j++){
- block.packet[j] = convertF32toBF16(result2 + j*rows);
+ for (Index j = 0; j < size; j++) {
+ block.packet[j] = convertF32toBF16(result2 + j * rows);
}
- res2.template storePacketBlock<Packet8bf,size>(row, 0, block);
+ res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
}
// extra rows
- if(row < rows){
+ if (row < rows) {
BFLOAT16_UNROLL
- for(Index j = 0; j < size; j++){
- Packet8bf fp16 = convertF32toBF16(result2 + j*rows);
+ for (Index j = 0; j < size; j++) {
+ Packet8bf fp16 = convertF32toBF16(result2 + j * rows);
res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
}
}
}
-template<const Index size, bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc = 1)
-{
+template <const Index size, bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst,
+ Index resInc = 1) {
constexpr Index extra = ((size < 8) ? 8 : size);
- while (i + size <= rows){
- PacketBlock<Packet8bf,(size+7)/8> r32;
- r32.packet[0] = convertF32toBF16(result + i + 0);
+ while (i + size <= rows) {
+ PacketBlock<Packet8bf, (size + 7) / 8> r32;
+ r32.packet[0] = convertF32toBF16(result + i + 0);
if (size >= 16) {
- r32.packet[1] = convertF32toBF16(result + i + 8);
+ r32.packet[1] = convertF32toBF16(result + i + 8);
}
if (size >= 32) {
r32.packet[2] = convertF32toBF16(result + i + 16);
@@ -269,64 +272,64 @@
storeBF16fromResult<size, non_unit_stride, 16>(dst, r32.packet[2], resInc);
storeBF16fromResult<size, non_unit_stride, 24>(dst, r32.packet[3], resInc);
}
- i += extra; dst += extra*resInc;
+ i += extra;
+ dst += extra * resInc;
if (size != 32) break;
}
}
-template<bool non_unit_stride = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float *result, Index rows, bfloat16* dst, Index resInc = 1)
-{
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
Index i = 0;
- convertPointerF32toBF16<32,non_unit_stride>(i, result, rows, dst, resInc);
- convertPointerF32toBF16<16,non_unit_stride>(i, result, rows, dst, resInc);
- convertPointerF32toBF16<8,non_unit_stride>(i, result, rows, dst, resInc);
- convertPointerF32toBF16<1,non_unit_stride>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16<32, non_unit_stride>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16<16, non_unit_stride>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16<8, non_unit_stride>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16<1, non_unit_stride>(i, result, rows, dst, resInc);
}
-template<typename DataMapper>
-EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float *result, Index cols, Index rows, const DataMapper& res)
-{
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float* result, Index cols, Index rows, const DataMapper& res) {
Index col;
- for(col = 0; col + 4 <= cols; col += 4){
- convertArrayF32toBF16Col<DataMapper,4>(result, col, rows, res);
+ for (col = 0; col + 4 <= cols; col += 4) {
+ convertArrayF32toBF16Col<DataMapper, 4>(result, col, rows, res);
}
// extra cols
switch (cols - col) {
- case 1:
- convertArrayF32toBF16Col<DataMapper,1>(result, col, rows, res);
- break;
- case 2:
- convertArrayF32toBF16Col<DataMapper,2>(result, col, rows, res);
- break;
- case 3:
- convertArrayF32toBF16Col<DataMapper,3>(result, col, rows, res);
- break;
+ case 1:
+ convertArrayF32toBF16Col<DataMapper, 1>(result, col, rows, res);
+ break;
+ case 2:
+ convertArrayF32toBF16Col<DataMapper, 2>(result, col, rows, res);
+ break;
+ case 3:
+ convertArrayF32toBF16Col<DataMapper, 3>(result, col, rows, res);
+ break;
}
}
-template<Index size>
-EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA, Index offsetB, Index bigSuffix, float *result)
-{
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows,
+ const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA,
+ Index offsetB, Index bigSuffix, float* result) {
if ((size == 16) || (rows & size)) {
- indexA += size*offsetA;
+ indexA += size * offsetA;
colLoops<size>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
row += size;
- indexA += bigSuffix*size/16;
+ indexA += bigSuffix * size / 16;
}
}
-template<typename DataMapper>
-void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth, Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
-{
+template <typename DataMapper>
+void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+ Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
const Packet4f pAlpha = pset1<Packet4f>(falpha);
- ei_declare_aligned_stack_constructed_variable(float, result, cols*rows, 0);
+ ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
- if( strideA == -1 ) strideA = depth;
- if( strideB == -1 ) strideB = depth;
+ if (strideA == -1) strideA = depth;
+ if (strideB == -1) strideB = depth;
// Packing is done in blocks.
  // There are 4 possible sizes of blocks
// Blocks of 8 columns with 16 elements (8x16)
@@ -335,13 +338,13 @@
  // Blocks of 8 columns with < 4 elements. This happens when there are fewer than 4 remaining rows
// Loop for LHS standard block (8x16)
- Index bigSuffix = (2*8) * (strideA-offsetA);
- indexB += 4*offsetB;
+ Index bigSuffix = (2 * 8) * (strideA - offsetA);
+ indexB += 4 * offsetB;
strideB *= 4;
offsetB *= 3;
Index row = 0;
- while(row + 16 <= rows){
+ while (row + 16 <= rows) {
calcColLoops<16>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
}
// LHS (8x8) block
@@ -349,7 +352,7 @@
// LHS (8x4) block
calcColLoops<4>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
// extra rows
- if(rows & 3){
+ if (rows & 3) {
// This index is the beginning of remaining block.
colLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
}
@@ -361,12 +364,11 @@
#undef MAX_BFLOAT16_ACC
#if !EIGEN_ALTIVEC_DISABLE_MMA
-template<Index num_acc, typename LhsMapper, bool zero>
-EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1)
-{
- a0[k + 0] = lhs.template loadPacket<Packet8bf>(k*4, 0);
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1) {
+ a0[k + 0] = lhs.template loadPacket<Packet8bf>(k * 4, 0);
if (!zero) {
- b1 = lhs.template loadPacket<Packet8bf>(k*4, 1);
+ b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
}
if (num_acc > (k + 1)) {
a0[k + 1] = vec_mergel(a0[k + 0].m_val, b1.m_val);
@@ -374,18 +376,17 @@
a0[k + 0] = vec_mergeh(a0[k + 0].m_val, b1.m_val);
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0) {
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k++) {
- __builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val), reinterpret_cast<Packet16uc>(a0[k].m_val));
+ for (Index k = 0; k < num_acc; k++) {
+ __builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val),
+ reinterpret_cast<Packet16uc>(a0[k].m_val));
}
}
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
-EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc])
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc]) {
Packet8bf a0[num_acc];
Packet8bf b1 = pset1<Packet8bf>(Eigen::bfloat16(0));
Packet8bf b0 = loadColData<RhsMapper, linear>(rhs, j);
@@ -398,23 +399,23 @@
LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k += 2) {
+ for (Index k = 0; k < num_acc; k += 2) {
loadVecLoop<num_acc, LhsSubMapper, zero>(k, lhs2, a0, b1);
}
multVec<num_acc>(quad_acc, a0, b0);
}
-#define MAX_BFLOAT16_VEC_ACC 8
+#define MAX_BFLOAT16_VEC_ACC 8
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
constexpr Index step = (num_acc * 4);
const Index extra_rows = (extraRows) ? (rows & 3) : 0;
constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC);
- do{
+ do {
Packet4f acc[num_acc][4];
__vector_quad quad_acc[num_acc];
@@ -423,7 +424,7 @@
using LhsSubMapper = typename LhsMapper::SubMapper;
LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
- for(Index j = 0; j + 2 <= cend; j += 2) {
+ for (Index j = 0; j + 2 <= cend; j += 2) {
vecColLoop<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, quad_acc);
}
if (cend & 1) {
@@ -435,56 +436,58 @@
outputVecColResults<num_acc, extraRows>(acc, result, pAlpha, extra_rows);
result += step;
- } while(multiIters && (step <= rows - (row += step)));
+ } while (multiIters && (step <= rows - (row += step)));
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
if (MAX_BFLOAT16_VEC_ACC > num_acc) {
- colVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ colVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs,
+ pAlpha, result);
}
}
-template<typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
switch ((rows - row) >> 2) {
- case 7:
- colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 6:
- colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 5:
- colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 4:
- colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 3:
- colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 2:
- colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 1:
- colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- default:
- if (extraRows) {
- colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- }
- break;
+ case 7:
+ colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 6:
+ colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 5:
+ colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 4:
+ colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 3:
+ colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 2:
+ colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 1:
+ colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ default:
+ if (extraRows) {
+ colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ }
+ break;
}
}
-template<typename LhsMapper, typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
Index row = 0;
if (rows >= (MAX_BFLOAT16_VEC_ACC * 4)) {
- colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha,
+ result);
result += row;
}
if (rows & 3) {
@@ -494,10 +497,10 @@
}
}
-template<typename RhsMapper, typename LhsMapper, typename = void>
+template <typename RhsMapper, typename LhsMapper, typename = void>
struct UseMMAStride : std::false_type {
- static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
- {
+ static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+ float* result) {
using RhsSubMapper = typename RhsMapper::SubMapper;
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -505,11 +508,12 @@
}
};
-template<typename RhsMapper, typename LhsMapper>
-struct UseMMAStride<RhsMapper, LhsMapper, std::enable_if_t<std::is_member_function_pointer<
- decltype(&RhsMapper::stride)>::value>> : std::true_type {
- static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
- {
+template <typename RhsMapper, typename LhsMapper>
+struct UseMMAStride<RhsMapper, LhsMapper,
+ std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+ : std::true_type {
+ static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+ float* result) {
using RhsSubMapper = typename RhsMapper::SubMapper;
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -521,14 +525,9 @@
}
};
-template<typename LhsMapper, typename RhsMapper>
-void gemvMMA_bfloat16_col(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- bfloat16* res, Index resIncr,
- bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+void gemvMMA_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+ Index resIncr, bfloat16 alpha) {
EIGEN_UNUSED_VARIABLE(resIncr);
eigen_internal_assert(resIncr == 1);
@@ -548,8 +547,7 @@
convertArrayPointerBF16toF32(result, 1, rows, res);
- for (Index j2 = 0; j2 < cols; j2 += block_cols)
- {
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
Index jend = numext::mini(j2 + block_cols, cols);
using LhsSubMapper = typename LhsMapper::SubMapper;
@@ -561,11 +559,11 @@
convertArrayPointerF32toBF16(result, rows, res);
}
-static Packet16uc p16uc_ELEMENT_VEC3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
+static Packet16uc p16uc_ELEMENT_VEC3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k) {
if (num_acc > (k + 1)) {
acc[k][0] = vec_mergeh(acc[k][0], acc[k + 1][0]);
acc[k][1] = vec_mergeo(acc[k][1], acc[k + 1][1]);
@@ -584,22 +582,22 @@
}
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4])
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4]) {
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k += 4) {
+ for (Index k = 0; k < num_acc; k += 4) {
preduxVecResults2<num_acc>(acc, k + 0);
if (num_acc > (k + 2)) {
preduxVecResults2<num_acc>(acc, k + 2);
- acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+ acc[k + 0][0] = reinterpret_cast<Packet4f>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
}
}
}
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
-EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j, Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+ Index extra_cols) {
Packet8bf a0[num_acc], b0;
if (extra) {
@@ -610,7 +608,7 @@
const LhsMapper lhs2 = lhs.getSubMapper(0, j);
BFLOAT16_UNROLL
- for(Index k = 0; k < num_acc; k++) {
+ for (Index k = 0; k < num_acc; k++) {
if (extra) {
a0[k] = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
} else {
@@ -621,11 +619,11 @@
multVec<num_acc>(quad_acc, a0, b0);
}
-template<Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc], Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc],
+ Index extra_cols) {
Index j = 0;
- for(; j + 8 <= cols; j += 8){
+ for (; j + 8 <= cols; j += 8) {
multVecLoop<num_acc, LhsMapper, RhsMapper, false>(quad_acc, lhs, rhs, j, extra_cols);
}
@@ -634,13 +632,13 @@
}
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC);
const Index extra_cols = (cols & 7);
- do{
+ do {
Packet4f acc[num_acc][4];
__vector_quad quad_acc[num_acc];
@@ -656,48 +654,48 @@
outputVecResults<num_acc>(acc, result, pAlpha);
result += num_acc;
- } while(multiIters && (num_acc <= rows - (row += num_acc)));
+ } while (multiIters && (num_acc <= rows - (row += num_acc)));
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
if (MAX_BFLOAT16_VEC_ACC > num_acc) {
colVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
}
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
switch (rows - row) {
- case 7:
- colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 6:
- colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 5:
- colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 4:
- colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 3:
- colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 2:
- colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 1:
- colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
+ case 7:
+ colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 6:
+ colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 5:
+ colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 4:
+ colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 3:
+ colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 2:
+ colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 1:
+ colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
}
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
Index row = 0;
if (rows >= MAX_BFLOAT16_VEC_ACC) {
colVecLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
@@ -706,14 +704,9 @@
colVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- bfloat16* res, Index resIncr,
- bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+ bfloat16* res, Index resIncr, bfloat16 alpha) {
typedef typename RhsMapper::LinearMapper LinearMapper;
// The following copy tells the compiler that lhs's attributes are not modified outside this function
@@ -744,6 +737,6 @@
#undef MAX_BFLOAT16_VEC_ACC
#undef BFLOAT16_UNROLL
-}
-}
-#endif //EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+} // namespace internal
+} // namespace Eigen
+#endif // EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
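The convertF32toBF16 helper above narrows eight floats to bfloat16 by converting each 4-float vector with the VSX xvcvspbf16 builtin (the bf16 result lands in the low halfword of each word) and then packing the two halves. A standalone sketch of the same conversion, assuming a POWER10 toolchain; the name f32x8_to_bf16 is hypothetical and not part of the patch:

#include <altivec.h>

// Sketch of the conversion done by convertF32toBF16; illustrative only.
static inline __vector unsigned short f32x8_to_bf16(const float* src) {
  __vector unsigned char lo = (__vector unsigned char)vec_xl(0, src);   // floats 0..3
  __vector unsigned char hi = (__vector unsigned char)vec_xl(16, src);  // floats 4..7
  lo = __builtin_vsx_xvcvspbf16(lo);  // round each f32 lane to bf16 (low halfword)
  hi = __builtin_vsx_xvcvspbf16(hi);
  // vec_pack keeps the low halfword of each word: 2x4 words -> 8 bf16 values
  return vec_pack((__vector unsigned int)lo, (__vector unsigned int)hi);
}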
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
index 66e1088..90c0d39 100644
--- a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
+++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h
@@ -24,11 +24,12 @@
#endif
#endif
-//#define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved)
+// #define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is
+// improved)
-//#define EIGEN_POWER_USE_GEMV_PREFETCH
+// #define EIGEN_POWER_USE_GEMV_PREFETCH
#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
+#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
#else
#define EIGEN_POWER_GEMV_PREFETCH(p)
#endif
@@ -61,58 +62,50 @@
#endif
#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16))
-#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
-#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
-#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
+#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
+#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
+#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
/** \internal multiply and add and store results */
-template<typename ResPacket, typename ResScalar>
-EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data)
-{
- pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
+template <typename ResPacket, typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) {
+ pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
}
-template<typename ResScalar>
-EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data)
-{
- *res += (alpha * data);
+template <typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) {
+ *res += (alpha * data);
}
-#define GEMV_UNROLL(func, N) \
- func(0, N) func(1, N) func(2, N) func(3, N) \
- func(4, N) func(5, N) func(6, N) func(7, N)
+#define GEMV_UNROLL(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
-#define GEMV_UNROLL_HALF(func, N) \
- func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+#define GEMV_UNROLL_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2)
-#define GEMV_LOADPACKET_COL(iter) \
- lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
+#define GEMV_LOADPACKET_COL(iter) lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
#ifdef USE_GEMV_MMA
-#define GEMV_UNROLL3(func, N, which) \
- func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
- func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
+#define GEMV_UNROLL3(func, N, which) \
+ func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+ func(6, N, which) func(7, N, which)
#define GEMV_UNUSED_VAR(iter, N, which) \
- if (GEMV_GETN(N) <= iter) { \
+ if (GEMV_GETN(N) <= iter) { \
EIGEN_UNUSED_VARIABLE(which##iter); \
}
#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \
- if (N <= iter) { \
- EIGEN_UNUSED_VARIABLE(which##iter); \
+ if (N <= iter) { \
+ EIGEN_UNUSED_VARIABLE(which##iter); \
}
-#define GEMV_UNUSED_EXTRA(N, which) \
- GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
+#define GEMV_UNUSED_EXTRA(N, which) GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
-#define GEMV_UNUSED(N, which) \
- GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
+#define GEMV_UNUSED(N, which) GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
-#define GEMV_INIT_MMA(iter, N) \
- if (GEMV_GETN(N) > iter) { \
+#define GEMV_INIT_MMA(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
__builtin_mma_xxsetaccz(&e##iter); \
}
@@ -120,354 +113,336 @@
#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1));
#else
-#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
+#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \
- b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src##iter1));
+ b##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src##iter1));
#endif
-#define GEMV_LOAD1A_COL_MMA(iter, N) \
- if (GEMV_GETN(N) > iter) { \
- if (GEMV_IS_FLOAT) { \
- g##iter = GEMV_LOADPACKET_COL(iter); \
- EIGEN_UNUSED_VARIABLE(b##iter); \
- } else { \
+#define GEMV_LOAD1A_COL_MMA(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
+ if (GEMV_IS_FLOAT) { \
+ g##iter = GEMV_LOADPACKET_COL(iter); \
+ EIGEN_UNUSED_VARIABLE(b##iter); \
+ } else { \
GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \
- EIGEN_UNUSED_VARIABLE(g##iter); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(b##iter); \
- EIGEN_UNUSED_VARIABLE(g##iter); \
+ EIGEN_UNUSED_VARIABLE(g##iter); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(b##iter); \
+ EIGEN_UNUSED_VARIABLE(g##iter); \
}
-#define GEMV_WORK1A_COL_MMA(iter, N) \
- if (GEMV_GETN(N) > iter) { \
- if (GEMV_IS_FLOAT) { \
+#define GEMV_WORK1A_COL_MMA(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
+ if (GEMV_IS_FLOAT) { \
pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, a0, g##iter); \
- } else { \
+ } else { \
pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, b##iter, a0); \
- } \
+ } \
}
#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN(N) > iter1) { \
- if (GEMV_IS_FLOAT) { \
- GEMV_LOADPAIR_COL_MMA(iter2, iter2) \
- EIGEN_UNUSED_VARIABLE(b##iter3); \
- } else { \
- GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \
- GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(b##iter2); \
- EIGEN_UNUSED_VARIABLE(b##iter3); \
- } \
- EIGEN_UNUSED_VARIABLE(g##iter2); \
+ if (GEMV_GETN(N) > iter1) { \
+ if (GEMV_IS_FLOAT) { \
+ GEMV_LOADPAIR_COL_MMA(iter2, iter2) \
+ EIGEN_UNUSED_VARIABLE(b##iter3); \
+ } else { \
+ GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \
+ GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(b##iter2); \
+ EIGEN_UNUSED_VARIABLE(b##iter3); \
+ } \
+ EIGEN_UNUSED_VARIABLE(g##iter2); \
EIGEN_UNUSED_VARIABLE(g##iter3);
-#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN(N) > iter1) { \
- if (GEMV_IS_FLOAT) { \
- LhsPacket h[2]; \
+#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \
+ if (GEMV_GETN(N) > iter1) { \
+ if (GEMV_IS_FLOAT) { \
+ LhsPacket h[2]; \
__builtin_vsx_disassemble_pair(reinterpret_cast<void*>(h), &b##iter2); \
- pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]); \
- pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]); \
- } else { \
- pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0); \
- pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0); \
- } \
+ pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]); \
+ pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]); \
+ } else { \
+ pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0); \
+ pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0); \
+ } \
}
#if EIGEN_COMP_LLVM
-#define GEMV_LOAD_COL_MMA(N) \
- if (GEMV_GETN(N) > 1) { \
+#define GEMV_LOAD_COL_MMA(N) \
+ if (GEMV_GETN(N) > 1) { \
GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \
- } else { \
- GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \
+ } else { \
+ GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \
}
-#define GEMV_WORK_COL_MMA(N) \
- if (GEMV_GETN(N) > 1) { \
+#define GEMV_WORK_COL_MMA(N) \
+ if (GEMV_GETN(N) > 1) { \
GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \
- } else { \
- GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \
+ } else { \
+ GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \
}
#else
-#define GEMV_LOAD_COL_MMA(N) \
- GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
+#define GEMV_LOAD_COL_MMA(N) GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
-#define GEMV_WORK_COL_MMA(N) \
- GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
+#define GEMV_WORK_COL_MMA(N) GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
#endif
-#define GEMV_DISASSEMBLE_MMA(iter, N) \
- if (GEMV_GETN(N) > iter) { \
+#define GEMV_DISASSEMBLE_MMA(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
__builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \
- if (!GEMV_IS_FLOAT) { \
- result##iter.packet[0][1] = result##iter.packet[1][0]; \
- result##iter.packet[2][1] = result##iter.packet[3][0]; \
- } \
+ if (!GEMV_IS_FLOAT) { \
+ result##iter.packet[0][1] = result##iter.packet[1][0]; \
+ result##iter.packet[2][1] = result##iter.packet[3][0]; \
+ } \
}
#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
- b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize));
+ b##iter1 = *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize));
#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN(N) > iter1) { \
- if (GEMV_IS_FLOAT) { \
- GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \
- EIGEN_UNUSED_VARIABLE(b##iter3); \
- } else { \
- GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \
- GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(b##iter2); \
- EIGEN_UNUSED_VARIABLE(b##iter3); \
+ if (GEMV_GETN(N) > iter1) { \
+ if (GEMV_IS_FLOAT) { \
+ GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \
+ EIGEN_UNUSED_VARIABLE(b##iter3); \
+ } else { \
+ GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \
+ GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(b##iter2); \
+ EIGEN_UNUSED_VARIABLE(b##iter3); \
}
#if EIGEN_COMP_LLVM
-#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
- ResPacket f##iter2[2]; \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
- f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
+ ResPacket f##iter2[2]; \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2); \
+ f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \
f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
#else
-#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
- if (GEMV_IS_FLOAT) { \
- __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
- } else { \
- __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \
+ if (GEMV_IS_FLOAT) { \
+ __asm__("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" \
+ : "+&d"(b##iter2) \
+ : "wa"(result##iter3.packet[0]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
+ } else { \
+ __asm__("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" \
+ : "+&d"(b##iter2) \
+ : "wa"(result##iter2.packet[2]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
}
#endif
-#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN(N) > iter1) { \
- if (GEMV_IS_FLOAT) { \
- GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \
- } else { \
+#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \
+ if (GEMV_GETN(N) > iter1) { \
+ if (GEMV_IS_FLOAT) { \
+ GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \
+ } else { \
GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \
GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \
- } \
+ } \
}
#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \
- *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
+ *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
-#define GEMV_STORE_COL_MMA(iter, N) \
- if (GEMV_GETN(N) > iter) { \
- if (GEMV_IS_FLOAT) { \
+#define GEMV_STORE_COL_MMA(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
+ if (GEMV_IS_FLOAT) { \
storeMaddData<ResPacket, ResScalar>(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \
- } else { \
- GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \
- GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \
- GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \
- } \
+ } else { \
+ GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \
+ GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \
+ GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \
+ } \
}
#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN(N) > iter1) { \
- if (GEMV_IS_FLOAT) { \
- GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \
- } else { \
- GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \
- GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \
- } \
+ if (GEMV_GETN(N) > iter1) { \
+ if (GEMV_IS_FLOAT) { \
+ GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \
+ } else { \
+ GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \
+ GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \
+ } \
}
-#define GEMV_PROCESS_COL_ONE_MMA(N) \
- GEMV_UNROLL(GEMV_INIT_MMA, N) \
- Index j = j2; \
- __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \
- do { \
- LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \
- RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
- GEMV_UNROLL(GEMV_PREFETCH, N) \
- GEMV_LOAD_COL_MMA(N) \
- GEMV_WORK_COL_MMA(N) \
- } while (++j < jend); \
- GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \
- if (GEMV_GETN(N) <= 1) { \
- GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \
- } else { \
- GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \
- GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \
+#define GEMV_PROCESS_COL_ONE_MMA(N) \
+ GEMV_UNROLL(GEMV_INIT_MMA, N) \
+ Index j = j2; \
+ __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \
+ do { \
+ LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \
+ RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
+ GEMV_UNROLL(GEMV_PREFETCH, N) \
+ GEMV_LOAD_COL_MMA(N) \
+ GEMV_WORK_COL_MMA(N) \
+ } while (++j < jend); \
+ GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \
+ if (GEMV_GETN(N) <= 1) { \
+ GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \
+ } else { \
+ GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \
+ GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \
GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \
- } \
+ } \
i += (ResPacketSize * N);
#endif
-#define GEMV_INIT(iter, N) \
- if (N > iter) { \
+#define GEMV_INIT(iter, N) \
+ if (N > iter) { \
c##iter = pset1<ResPacket>(ResScalar(0)); \
- } else { \
- EIGEN_UNUSED_VARIABLE(c##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(c##iter); \
}
#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
-#define GEMV_PREFETCH(iter, N) \
- if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \
+#define GEMV_PREFETCH(iter, N) \
+ if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \
lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \
}
#else
#define GEMV_PREFETCH(iter, N)
#endif
-#define GEMV_WORK_COL(iter, N) \
- if (N > iter) { \
+#define GEMV_WORK_COL(iter, N) \
+ if (N > iter) { \
c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \
}
-#define GEMV_STORE_COL(iter, N) \
- if (N > iter) { \
- pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
+#define GEMV_STORE_COL(iter, N) \
+ if (N > iter) { \
+ pstoreu(res + i + (iter * ResPacketSize), \
+ pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
}
/** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */
-#define GEMV_PROCESS_COL_ONE(N) \
- GEMV_UNROLL(GEMV_INIT, N) \
- Index j = j2; \
- do { \
+#define GEMV_PROCESS_COL_ONE(N) \
+ GEMV_UNROLL(GEMV_INIT, N) \
+ Index j = j2; \
+ do { \
RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
- GEMV_UNROLL(GEMV_PREFETCH, N) \
- GEMV_UNROLL(GEMV_WORK_COL, N) \
- } while (++j < jend); \
- GEMV_UNROLL(GEMV_STORE_COL, N) \
+ GEMV_UNROLL(GEMV_PREFETCH, N) \
+ GEMV_UNROLL(GEMV_WORK_COL, N) \
+ } while (++j < jend); \
+ GEMV_UNROLL(GEMV_STORE_COL, N) \
i += (ResPacketSize * N);
#ifdef USE_GEMV_MMA
-#define GEMV_PROCESS_COL(N) \
- GEMV_PROCESS_COL_ONE_MMA(N)
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE_MMA(N)
#else
-#define GEMV_PROCESS_COL(N) \
- GEMV_PROCESS_COL_ONE(N)
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE(N)
#endif
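// What one GEMV_PROCESS_COL_ONE(N) step computes, as a scalar sketch (illustrative
// only -- conjugation via pcj and the packet loads are elided; the names come from
// the enclosing gemv_col below):
//   for (Index r = i; r < i + N * ResPacketSize; ++r) {   // the N-way unrolled row block
//     ResScalar acc = ResScalar(0);
//     for (Index j = j2; j < jend; ++j)
//       acc += lhs(r, j) * rhs2(j, 0);                    // GEMV_WORK_COL
//     res[r] += alpha * acc;                              // GEMV_STORE_COL
//   }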
/** \internal perform a matrix multiply and accumulate of packet a and packet b */
#ifdef USE_GEMV_MMA
-template<typename LhsPacket, typename RhsPacket, bool accumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b)
-{
- if (accumulate)
- {
- __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- }
- else
- {
- __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- }
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+ if (accumulate) {
+ __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ } else {
+ __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ }
}
/** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool accumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b)
-{
- if (accumulate)
- {
- __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
- }
- else
- {
- __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
- }
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) {
+ if (accumulate) {
+ __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
+ } else {
+ __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
+ }
}
#endif
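// Minimal usage sketch for the MMA builtins wrapped above (the builtins are the
// real GCC/Clang PowerPC interfaces; the vectors a and b are illustrative
// placeholders, not names from this file):
//   __vector_quad acc;
//   __builtin_mma_xvf32ger(&acc, a, b);    // acc  = outer(a, b)  (accumulate == false)
//   __builtin_mma_xvf32gerpp(&acc, a, b);  // acc += outer(a, b)  (accumulate == true)
//   float out[4][4];
//   __builtin_mma_disassemble_acc(reinterpret_cast<void*>(out), &acc);  // read back rows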
-template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_col(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- ResScalar* res, Index resIncr,
- ResScalar alpha)
-{
- typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+ Index resIncr, ResScalar alpha) {
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
- typedef typename Traits::LhsPacket LhsPacket;
- typedef typename Traits::RhsPacket RhsPacket;
- typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
- EIGEN_UNUSED_VARIABLE(resIncr);
- eigen_internal_assert(resIncr == 1);
+ EIGEN_UNUSED_VARIABLE(resIncr);
+ eigen_internal_assert(resIncr == 1);
- // The following copy tells the compiler that lhs's attributes are not modified outside this function
- // This helps GCC to generate proper code.
- LhsMapper lhs(alhs);
- RhsMapper rhs2(rhs);
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
+ // This helps GCC to generate proper code.
+ LhsMapper lhs(alhs);
+ RhsMapper rhs2(rhs);
- conj_helper<LhsScalar, RhsScalar, false, false> cj;
- conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+ conj_helper<LhsScalar, RhsScalar, false, false> cj;
+ conj_helper<LhsPacket, RhsPacket, false, false> pcj;
- const Index lhsStride = lhs.stride();
- // TODO: for padded aligned inputs, we could enable aligned reads
- enum {
- LhsAlignment = Unaligned,
- ResPacketSize = Traits::ResPacketSize,
- LhsPacketSize = Traits::LhsPacketSize,
- RhsPacketSize = Traits::RhsPacketSize,
- };
+ const Index lhsStride = lhs.stride();
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum {
+ LhsAlignment = Unaligned,
+ ResPacketSize = Traits::ResPacketSize,
+ LhsPacketSize = Traits::LhsPacketSize,
+ RhsPacketSize = Traits::RhsPacketSize,
+ };
#ifndef GCC_ONE_VECTORPAIR_BUG
- const Index n8 = rows - 8 * ResPacketSize + 1;
- const Index n4 = rows - 4 * ResPacketSize + 1;
- const Index n2 = rows - 2 * ResPacketSize + 1;
+ const Index n8 = rows - 8 * ResPacketSize + 1;
+ const Index n4 = rows - 4 * ResPacketSize + 1;
+ const Index n2 = rows - 2 * ResPacketSize + 1;
#endif
- const Index n1 = rows - 1 * ResPacketSize + 1;
+ const Index n1 = rows - 1 * ResPacketSize + 1;
#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
- const Index prefetch_dist = 64 * LhsPacketSize;
+ const Index prefetch_dist = 64 * LhsPacketSize;
#endif
- // TODO: improve the following heuristic:
- const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
- ResPacket palpha = pset1<ResPacket>(alpha);
+ // TODO: improve the following heuristic:
+ const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+ ResPacket palpha = pset1<ResPacket>(alpha);
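// Worked example of the block_cols heuristic above (illustrative numbers): for a
// float lhs with lhsStride == 3000, 3000 * sizeof(float) == 12000 < 16000, so
// block_cols == 16; at lhsStride == 5000 the product reaches 20000 and block_cols
// drops to 8, presumably to keep each column block of the lhs cache-resident.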
- for (Index j2 = 0; j2 < cols; j2 += block_cols)
- {
- Index jend = numext::mini(j2 + block_cols, cols);
- Index i = 0;
- ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+ Index jend = numext::mini(j2 + block_cols, cols);
+ Index i = 0;
+ ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
#ifdef USE_GEMV_MMA
- __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
- PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
- GEMV_UNUSED(8, e)
- GEMV_UNUSED(8, result)
- GEMV_UNUSED_EXTRA(1, c)
+ __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
+ PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
+ GEMV_UNUSED(8, e)
+ GEMV_UNUSED(8, result)
+ GEMV_UNUSED_EXTRA(1, c)
#endif
#ifndef GCC_ONE_VECTORPAIR_BUG
- while (i < n8)
- {
- GEMV_PROCESS_COL(8)
- }
- if (i < n4)
- {
- GEMV_PROCESS_COL(4)
- }
- if (i < n2)
- {
- GEMV_PROCESS_COL(2)
- }
- if (i < n1)
-#else
- while (i < n1)
-#endif
- {
- GEMV_PROCESS_COL_ONE(1)
- }
- for (;i < rows;++i)
- {
- ResScalar d0(0);
- Index j = j2;
- do {
- d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
- } while (++j < jend);
- res[i] += alpha * d0;
- }
+ while (i < n8) {
+ GEMV_PROCESS_COL(8)
}
+ if (i < n4) {
+ GEMV_PROCESS_COL(4)
+ }
+ if (i < n2) {
+ GEMV_PROCESS_COL(2)
+ }
+ if (i < n1)
+#else
+ while (i < n1)
+#endif
+ {
+ GEMV_PROCESS_COL_ONE(1)
+ }
+ for (; i < rows; ++i) {
+ ResScalar d0(0);
+ Index j = j2;
+ do {
+ d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+ } while (++j < jend);
+ res[i] += alpha * d0;
+ }
+ }
}
-template<bool extraRows>
-EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float *result, Packet4f pAlpha, Index extra_rows)
-{
+template <bool extraRows>
+EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float* result, Packet4f pAlpha, Index extra_rows) {
Packet4f d0 = ploadu<Packet4f>(result);
d0 = pmadd(acc, pAlpha, d0);
if (extraRows) {
@@ -477,28 +452,27 @@
}
}
-template<Index num_acc, bool extraRows, Index size>
-EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha, Index extra_rows)
-{
+template <Index num_acc, bool extraRows, Index size>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+ Index extra_rows) {
constexpr Index real_acc = (num_acc - (extraRows ? 1 : 0));
- for(Index k = 0; k < real_acc; k++) {
- outputVecCol<false>(acc[k][0], result + k*4, pAlpha, extra_rows);
+ for (Index k = 0; k < real_acc; k++) {
+ outputVecCol<false>(acc[k][0], result + k * 4, pAlpha, extra_rows);
}
if (extraRows) {
- outputVecCol<true>(acc[real_acc][0], result + real_acc*4, pAlpha, extra_rows);
+ outputVecCol<true>(acc[real_acc][0], result + real_acc * 4, pAlpha, extra_rows);
}
}
-static Packet16uc p16uc_MERGE16_32_V1 = { 0, 1, 16,17, 0, 1, 16,17, 0, 1, 16,17, 0, 1, 16,17 };
-static Packet16uc p16uc_MERGE16_32_V2 = { 2, 3, 18,19, 2, 3, 18,19, 2, 3, 18,19, 2, 3, 18,19 };
+static Packet16uc p16uc_MERGE16_32_V1 = {0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17};
+static Packet16uc p16uc_MERGE16_32_V2 = {2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19};
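// The merge masks above are consistent with bf16 -> f32 widening; at scalar level
// that conversion is just a shift into the high half of a float32. A self-contained
// sketch (not part of this patch):
#include <cstdint>
#include <cstring>
inline float bf16_to_f32(uint16_t b) {
  uint32_t u = uint32_t(b) << 16;  // bfloat16 keeps the top 16 bits of an IEEE-754 float
  float f;
  std::memcpy(&f, &u, sizeof f);  // bit-exact reinterpretation
  return f;
}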
-template<Index num_acc, typename LhsMapper, bool zero>
-EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2])
-{
- Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k*4, 0);
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2]) {
+ Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k * 4, 0);
Packet8bf b1;
if (!zero) {
- b1 = lhs.template loadPacket<Packet8bf>(k*4, 1);
+ b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
a0[k + 0][1] = oneConvertBF16Hi(b1.m_val);
}
@@ -512,22 +486,19 @@
}
}
-template<Index num_acc, bool zero>
-EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2])
-{
- for(Index k = 0; k < num_acc; k++) {
- for(Index i = 0; i < (zero ? 1 : 2); i++) {
+template <Index num_acc, bool zero>
+EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2]) {
+ for (Index k = 0; k < num_acc; k++) {
+ for (Index i = 0; i < (zero ? 1 : 2); i++) {
acc[k][i] = pmadd(b0[i], a0[k][i], acc[k][i]);
}
}
}
-template<typename RhsMapper, bool linear>
-struct loadColData_impl
-{
+template <typename RhsMapper, bool linear>
+struct loadColData_impl {
// linear == false
- static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
- {
+ static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
const Index n = unpacket_traits<Packet8bf>::size;
EIGEN_ALIGN16 bfloat16 to[n];
LOAD_STORE_UNROLL_16
@@ -538,25 +509,21 @@
}
};
-template<typename RhsMapper>
-struct loadColData_impl<RhsMapper, true>
-{
+template <typename RhsMapper>
+struct loadColData_impl<RhsMapper, true> {
// linear == true
- static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j)
- {
+ static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
return rhs.template loadPacket<Packet8bf>(j + 0, 0);
}
};
-template<typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j)
-{
+template <typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j) {
return loadColData_impl<RhsMapper, linear>::run(rhs, j);
}
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
-EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2])
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2]) {
Packet4f a0[num_acc][2], b0[2];
Packet8bf b2 = loadColData<RhsMapper, linear>(rhs, j);
@@ -568,32 +535,31 @@
using LhsSubMapper = typename LhsMapper::SubMapper;
LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
- for(Index k = 0; k < num_acc; k += 2) {
+ for (Index k = 0; k < num_acc; k += 2) {
loadVecLoopVSX<num_acc, LhsSubMapper, zero>(k, lhs2, a0);
}
multVecVSX<num_acc, zero>(acc, a0, b0);
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2])
-{
- for(Index i = 0; i < num_acc; i++) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2]) {
+ for (Index i = 0; i < num_acc; i++) {
acc[i][0] = acc[i][0] + acc[i][1];
}
}
// Uses 2X the accumulators or 4X the number of VSX registers
-#define MAX_BFLOAT16_VEC_ACC_VSX 8
+#define MAX_BFLOAT16_VEC_ACC_VSX 8
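// Rough register arithmetic behind the cap (a sketch of the reasoning, not stated
// in the source): each of the MAX_BFLOAT16_VEC_ACC_VSX == 8 accumulators is a
// Packet4f pair, so acc alone occupies 16 VSX registers and the matching a0
// operands in multVecVSX occupy 16 more -- essentially the whole 32-register VSX
// file, which is why the unroll stops at 8.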
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
constexpr Index step = (num_acc * 4);
const Index extra_rows = (extraRows) ? (rows & 3) : 0;
constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
- do{
+ do {
Packet4f acc[num_acc][2];
zeroAccumulators<num_acc, 2>(acc);
@@ -601,7 +567,7 @@
using LhsSubMapper = typename LhsMapper::SubMapper;
LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
- for(Index j = 0; j + 2 <= cend; j += 2) {
+ for (Index j = 0; j + 2 <= cend; j += 2) {
vecColLoopVSX<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, acc);
}
if (cend & 1) {
@@ -613,56 +579,58 @@
outputVecColResults<num_acc, extraRows, 2>(acc, result, pAlpha, extra_rows);
result += step;
- } while(multiIters && (step <= rows - (row += step)));
+ } while (multiIters && (step <= rows - (row += step)));
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
- colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs,
+ rhs, pAlpha, result);
}
}
-template<typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
-EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
switch ((rows - row) >> 2) {
- case 7:
- colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 6:
- colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 5:
- colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 4:
- colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 3:
- colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 2:
- colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- case 1:
- colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- break;
- default:
- if (extraRows) {
- colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
- }
- break;
+ case 7:
+ colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 6:
+ colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 5:
+ colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 4:
+ colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 3:
+ colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 2:
+ colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 1:
+ colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ break;
+ default:
+ if (extraRows) {
+ colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ }
+ break;
}
}
-template<typename LhsMapper, typename RhsMapper, bool linear>
-EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
Index row = 0;
if (rows >= (MAX_BFLOAT16_VEC_ACC_VSX * 4)) {
- colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+ colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs,
+ pAlpha, result);
result += row;
}
if (rows & 3) {
@@ -672,14 +640,13 @@
}
}
-template<const Index size, bool inc, Index delta>
-EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra)
-{
+template <const Index size, bool inc, Index delta>
+EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra) {
if (inc) {
if (size < 8) {
- pscatter_partial(dst + delta*resInc, data, resInc, extra);
+ pscatter_partial(dst + delta * resInc, data, resInc, extra);
} else {
- pscatter(dst + delta*resInc, data, resInc);
+ pscatter(dst + delta * resInc, data, resInc);
}
} else {
if (size < 8) {
@@ -690,15 +657,15 @@
}
}
-template<const Index size, bool inc = false>
-EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst, Index resInc = 1)
-{
+template <const Index size, bool inc = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst,
+ Index resInc = 1) {
constexpr Index extra = ((size < 8) ? 8 : size);
while (i + size <= rows) {
- PacketBlock<Packet8bf,(size+7)/8> r32;
- r32.packet[0] = convertF32toBF16VSX(result + i + 0);
+ PacketBlock<Packet8bf, (size + 7) / 8> r32;
+ r32.packet[0] = convertF32toBF16VSX(result + i + 0);
if (size >= 16) {
- r32.packet[1] = convertF32toBF16VSX(result + i + 8);
+ r32.packet[1] = convertF32toBF16VSX(result + i + 8);
}
if (size >= 32) {
r32.packet[2] = convertF32toBF16VSX(result + i + 16);
@@ -712,25 +679,25 @@
storeBF16fromResult<size, inc, 16>(dst, r32.packet[2], resInc);
storeBF16fromResult<size, inc, 24>(dst, r32.packet[3], resInc);
}
- i += extra; dst += extra*resInc;
+ i += extra;
+ dst += extra * resInc;
if (size != 32) break;
}
}
-template<bool inc = false>
-EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float *result, Index rows, bfloat16* dst, Index resInc = 1)
-{
+template <bool inc = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
Index i = 0;
- convertPointerF32toBF16VSX<32,inc>(i, result, rows, dst, resInc);
- convertPointerF32toBF16VSX<16,inc>(i, result, rows, dst, resInc);
- convertPointerF32toBF16VSX<8,inc>(i, result, rows, dst, resInc);
- convertPointerF32toBF16VSX<1,inc>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16VSX<32, inc>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16VSX<16, inc>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16VSX<8, inc>(i, result, rows, dst, resInc);
+ convertPointerF32toBF16VSX<1, inc>(i, result, rows, dst, resInc);
}
-template<typename RhsMapper, typename LhsMapper, typename = void>
+template <typename RhsMapper, typename LhsMapper, typename = void>
struct UseStride : std::false_type {
- static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
- {
+ static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+ float* result) {
using RhsSubMapper = typename RhsMapper::SubMapper;
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -738,11 +705,12 @@
}
};
-template<typename RhsMapper, typename LhsMapper>
-struct UseStride<RhsMapper, LhsMapper, std::enable_if_t<std::is_member_function_pointer<
- decltype(&RhsMapper::stride)>::value>> : std::true_type {
- static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha, float *result)
- {
+template <typename RhsMapper, typename LhsMapper>
+struct UseStride<RhsMapper, LhsMapper,
+ std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+ : std::true_type {
+ static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+ float* result) {
using RhsSubMapper = typename RhsMapper::SubMapper;
RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
@@ -754,14 +722,9 @@
}
};
-template<typename LhsMapper, typename RhsMapper>
-void gemv_bfloat16_col(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- bfloat16* res, Index resIncr,
- bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+void gemv_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+ Index resIncr, bfloat16 alpha) {
EIGEN_UNUSED_VARIABLE(resIncr);
eigen_internal_assert(resIncr == 1);
@@ -781,8 +744,7 @@
convertArrayPointerBF16toF32(result, 1, rows, res);
- for (Index j2 = 0; j2 < cols; j2 += block_cols)
- {
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
Index jend = numext::mini(j2 + block_cols, cols);
using LhsSubMapper = typename LhsMapper::SubMapper;
@@ -794,12 +756,11 @@
convertArrayPointerF32toBF16VSX(result, rows, res);
}
-template<Index num_acc, Index size>
-EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float *result, Packet4f pAlpha)
-{
+template <Index num_acc, Index size>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha) {
constexpr Index extra = num_acc & 3;
- for(Index k = 0; k < num_acc; k += 4) {
+ for (Index k = 0; k < num_acc; k += 4) {
Packet4f d0 = ploadu<Packet4f>(result + k);
d0 = pmadd(acc[k + 0][0], pAlpha, d0);
@@ -809,15 +770,14 @@
if (extra == 3) {
pstoreu_partial(result + k, d0, extra);
} else {
- memcpy((void *)(result + k), (void *)(&d0), sizeof(float) * extra);
+ memcpy((void*)(result + k), (void*)(&d0), sizeof(float) * extra);
}
}
}
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k)
-{
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k) {
if (num_acc > (k + 1)) {
acc[k][1] = vec_mergel(acc[k + 0][0], acc[k + 1][0]);
acc[k][0] = vec_mergeh(acc[k + 0][0], acc[k + 1][0]);
@@ -833,25 +793,24 @@
}
}
-template<Index num_acc>
-EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2])
-{
- for(Index k = 0; k < num_acc; k += 4) {
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2]) {
+ for (Index k = 0; k < num_acc; k += 4) {
preduxVecResults2VSX<num_acc>(acc, k + 0);
if (num_acc > (k + 2)) {
preduxVecResults2VSX<num_acc>(acc, k + 2);
#ifdef EIGEN_VECTORIZE_VSX
- acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+ acc[k + 0][0] = reinterpret_cast<Packet4f>(
+ vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
#else
- acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0],acc[k + 2][0],p16uc_TRANSPOSE64_HI));
+ acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0], acc[k + 2][0], p16uc_TRANSPOSE64_HI));
#endif
}
}
}
#ifndef _ARCH_PWR9
-EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols)
-{
+EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols) {
Packet16uc shift = pset1<Packet16uc>(8 * 2 * (8 - extra_cols));
#ifdef _BIG_ENDIAN
return reinterpret_cast<Packet8us>(vec_slo(vec_sro(reinterpret_cast<Packet16uc>(data), shift), shift));
@@ -861,9 +820,9 @@
}
#endif
-template<Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
-EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j, Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+ Index extra_cols) {
Packet4f a0[num_acc][2], b0[2];
Packet8bf a1, b1;
@@ -879,7 +838,7 @@
b0[1] = oneConvertBF16Lo(b1.m_val);
const LhsMapper lhs2 = lhs.getSubMapper(0, j);
- for(Index k = 0; k < num_acc; k++) {
+ for (Index k = 0; k < num_acc; k++) {
if (extra) {
a1 = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
#ifndef _ARCH_PWR9
@@ -895,11 +854,11 @@
multVecVSX<num_acc, false>(acc, a0, b0);
}
-template<Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2], Index extra_cols)
-{
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2],
+ Index extra_cols) {
Index j = 0;
- for(; j + 8 <= cols; j += 8){
+ for (; j + 8 <= cols; j += 8) {
multVSXVecLoop<num_acc, LhsMapper, RhsMapper, false>(acc, lhs, rhs, j, extra_cols);
}
@@ -908,13 +867,13 @@
}
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
const Index extra_cols = (cols & 7);
- do{
+ do {
Packet4f acc[num_acc][2];
zeroAccumulators<num_acc, 2>(acc);
@@ -929,48 +888,48 @@
outputVecResults<num_acc, 2>(acc, result, pAlpha);
result += num_acc;
- } while(multiIters && (num_acc <= rows - (row += num_acc)));
+ } while (multiIters && (num_acc <= rows - (row += num_acc)));
}
-template<const Index num_acc, typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
colVSXVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
}
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+ const Packet4f pAlpha, float* result) {
switch (rows - row) {
- case 7:
- colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 6:
- colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 5:
- colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 4:
- colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 3:
- colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 2:
- colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
- case 1:
- colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
- break;
+ case 7:
+ colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 6:
+ colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 5:
+ colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 4:
+ colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 3:
+ colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 2:
+ colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
+ case 1:
+ colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+ break;
}
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha, float *result)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+ float* result) {
Index row = 0;
if (rows >= MAX_BFLOAT16_VEC_ACC_VSX) {
colVSXVecLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
@@ -979,14 +938,9 @@
colVSXVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
}
-template<typename LhsMapper, typename RhsMapper>
-EIGEN_STRONG_INLINE void gemv_bfloat16_row(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- bfloat16* res, Index resIncr,
- bfloat16 alpha)
-{
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemv_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+ bfloat16* res, Index resIncr, bfloat16 alpha) {
typedef typename RhsMapper::LinearMapper LinearMapper;
// The following copy tells the compiler that lhs's attributes are not modified outside this function
@@ -1015,51 +969,65 @@
#undef MAX_BFLOAT16_VEC_ACC_VSX
-const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb };
-const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 };
+const Packet16uc p16uc_COMPLEX32_XORFLIP = {0x44, 0x55, 0x66, 0x77, 0x00, 0x11, 0x22, 0x33,
+ 0xcc, 0xdd, 0xee, 0xff, 0x88, 0x99, 0xaa, 0xbb};
+const Packet16uc p16uc_COMPLEX64_XORFLIP = {0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+ 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
#ifdef _BIG_ENDIAN
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
#else
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
-const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 };
-const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 };
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
#endif
#ifdef _BIG_ENDIAN
-#define COMPLEX_DELTA 0
+#define COMPLEX_DELTA 0
#else
-#define COMPLEX_DELTA 2
+#define COMPLEX_DELTA 2
#endif
/** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */
EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) {
- return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
+ return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
}
EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) {
- return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
+ return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
}
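// The CONJ_XOR byte masks place 0x80 over the sign byte of each imaginary lane, so
// conjugation is a pure bitwise operation. A self-contained scalar sketch (not part
// of this patch):
#include <cstdint>
#include <cstring>
inline float flip_sign(float x) {
  uint32_t u;
  std::memcpy(&u, &x, sizeof u);
  u ^= 0x80000000u;  // IEEE-754 sign bit; the masks apply this to imaginary lanes only
  std::memcpy(&x, &u, sizeof x);
  return x;
}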
/** \internal packet conjugate with real & imaginary operation inverted */
EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) {
#ifdef __POWER8_VECTOR__
- return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
+ return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
#else
- return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
+ return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
#endif
}
EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) {
- return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
+ return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
}
#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12)
@@ -1067,883 +1035,773 @@
#endif
/** \internal flip the real & imaginary results and packet conjugate */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) {
#ifdef PERMXOR_GOOD
- return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
+ return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
#else
- return pcplxflip(pconj2(a));
+ return pcplxflip(pconj2(a));
#endif
}
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) {
#ifdef PERMXOR_GOOD
- return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
+ return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
#else
- return pcplxflip(pconj2(a));
+ return pcplxflip(pconj2(a));
#endif
}
/** \internal packet conjugate and flip the real & imaginary results */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) {
#ifdef PERMXOR_GOOD
- return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
+ return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
#else
- return pconj2(pcplxflip(a));
+ return pconj2(pcplxflip(a));
#endif
}
-EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) {
#ifdef PERMXOR_GOOD
- return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
+ return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
#else
- return pconj2(pcplxflip(a));
+ return pconj2(pcplxflip(a));
#endif
}
/** \internal packet negate */
-EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) {
#ifdef __POWER8_VECTOR__
- return Packet2cf(vec_neg(a.v));
+ return Packet2cf(vec_neg(a.v));
#else
- return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
+ return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
#endif
}
-EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) {
#ifdef __POWER8_VECTOR__
- return Packet1cd(vec_neg(a.v));
+ return Packet1cd(vec_neg(a.v));
#else
- return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
+ return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
#endif
}
/** \internal flip the real & imaginary results and negate */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a)
-{
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) {
#ifdef PERMXOR_GOOD
- return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
+ return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
#else
- return pcplxflip(pnegate2(a));
+ return pcplxflip(pnegate2(a));
#endif
}
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) {
#ifdef PERMXOR_GOOD
- return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
+ return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
#else
- return pcplxflip(pnegate2(a));
+ return pcplxflip(pnegate2(a));
#endif
}
/** \internal flip the real & imaginary results */
-EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a)
-{
- return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) {
+ return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
}
-EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a)
-{
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) {
#ifdef EIGEN_VECTORIZE_VSX
- return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
+ return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
#else
- return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
+ return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
#endif
}
/** \internal load half a vector with one complex value */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src)
-{
- Packet4f t;
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src) {
+ Packet4f t;
#ifdef EIGEN_VECTORIZE_VSX
- // Load float64/two float32 (doubleword alignment)
- __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src));
+ // Load float64/two float32 (doubleword alignment)
+ __asm__("lxsdx %x0,%y1" : "=wa"(t) : "Z"(*src));
#else
- *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
+ *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
#endif
- return t;
+ return t;
}
/** \internal load two vectors from the real and imaginary portions of a complex value */
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i)
-{
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) {
#ifdef _ARCH_PWR9
- __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<float*>(src) + 0)));
- __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<float*>(src) + 1)));
+ __asm__("lxvwsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<float*>(src) + 0)));
+ __asm__("lxvwsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<float*>(src) + 1)));
#else
- Packet4f t = pload_complex_half(src);
- r = vec_splat(t, COMPLEX_DELTA + 0);
- i = vec_splat(t, COMPLEX_DELTA + 1);
+ Packet4f t = pload_complex_half(src);
+ r = vec_splat(t, COMPLEX_DELTA + 0);
+ i = vec_splat(t, COMPLEX_DELTA + 1);
#endif
}
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i)
-{
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) {
#ifdef EIGEN_VECTORIZE_VSX
- __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
- __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast<double*>(src) + 1)));
+ __asm__("lxvdsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+ __asm__("lxvdsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<double*>(src) + 1)));
#else
- Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
- r = vec_splat(t, 0);
- i = vec_splat(t, 1);
+ Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
+ r = vec_splat(t, 0);
+ i = vec_splat(t, 1);
#endif
}
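// How the split r/i loads above get used (a sketch of the math): with
// r = splat(Re(s)), i = splat(Im(s)) and an interleaved vector v = [x.re, x.im, ...],
// a complex scale becomes two fused multiply-adds,
//   s * x = r * v + i * flipconj(v),   where flipconj(v) = [-x.im, x.re, ...]
// i.e. per lane (s*x).re = Re(s)*x.re - Im(s)*x.im and
//      (s*x).im = Re(s)*x.im + Im(s)*x.re,
// which is the recombination performed later by pmadd_complex.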
#ifndef __POWER8_VECTOR__
-const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B };
+const Packet16uc p16uc_MERGEE = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+ 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B};
-const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F };
+const Packet16uc p16uc_MERGEO = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+ 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F};
#endif
/** \internal load two vectors from the interleaved real & imaginary values of src */
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i)
-{
- Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) {
+ Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
#ifdef __POWER8_VECTOR__
- r = vec_mergee(t, t);
- i = vec_mergeo(t, t);
+ r = vec_mergee(t, t);
+ i = vec_mergeo(t, t);
#else
- r = vec_perm(t, t, p16uc_MERGEE);
- i = vec_perm(t, t, p16uc_MERGEO);
+ r = vec_perm(t, t, p16uc_MERGEE);
+ i = vec_perm(t, t, p16uc_MERGEO);
#endif
}
-template<typename RhsScalar>
-EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i)
-{
- return pload_realimag(src, r, i);
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) {
+ return pload_realimag(src, r, i);
}
/** \internal load and splat a complex value into a vector - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src)
-{
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src) {
#ifdef EIGEN_VECTORIZE_VSX
- Packet4f ret;
- __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast<double*>(src) + 0)));
- return ret;
+ Packet4f ret;
+ __asm__("lxvdsx %x0,%y1" : "=wa"(ret) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+ return ret;
#else
- return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
+ return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
#endif
}
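// Note on the VSX path above (sketch): lxvdsx loads one 8-byte doubleword -- here
// the re/im pair of a single std::complex<float> -- and replicates it across both
// halves of the register, yielding [re, im, re, im], i.e. the column-wise
// load-and-splat the doc comment describes.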
-EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src)
-{
- return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
/** \internal load a complex value into a vector - row-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src)
-{
- return ploadu<Packet2cf>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
-EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src)
-{
- return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
/** \internal load a scalar or a vector from complex location */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src)
-{
- if (GEMV_IS_SCALAR) {
- return pload_complex_half(src);
- }
- else
- {
- return ploadu<Packet4f>(reinterpret_cast<float*>(src));
- }
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src) {
+ if (GEMV_IS_SCALAR) {
+ return pload_complex_half(src);
+ } else {
+ return ploadu<Packet4f>(reinterpret_cast<float*>(src));
+ }
}
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src)
-{
- return ploadu<Packet2d>(reinterpret_cast<double*>(src));
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src) {
+ return ploadu<Packet2d>(reinterpret_cast<double*>(src));
}
/** \internal load from a complex vector and convert to a real vector */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src)
-{
- return src->v;
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) {
+ return src->v;
}
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src)
-{
- return src->v;
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) {
+ return src->v;
}
/** \internal load a full vector from complex location - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src)
-{
- return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double *>(src)));
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src) {
+ return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
}
-EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src)
-{
- return ploadu<Packet1cd>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
/** \internal load a full vector from complex location - row-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src)
-{
- return ploadu<Packet2cf>(src).v;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
-EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src)
-{
- return pload_complex_full(src);
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src) { return pload_complex_full(src); }
/** \internal load a vector from a real-only scalar location - column-wise */
-EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src)
-{
- return pset1<Packet4f>(*src);
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) { return pset1<Packet4f>(*src); }
-EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src)
-{
- return pset1<Packet2d>(*src);
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) { return pset1<Packet2d>(*src); }
-EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src)
-{
- return src;
-}
+EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) { return src; }
-EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src)
-{
- return src;
-}
+EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) { return src; }
/** \internal load a vector from a real-only vector location */
-EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src)
-{
- Packet4f ret = ploadu<Packet4f>(src);
- return vec_mergeh(ret, ret);
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) {
+ Packet4f ret = ploadu<Packet4f>(src);
+ return vec_mergeh(ret, ret);
}
-EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src)
-{
- return pload_real(src);
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) { return pload_real(src); }
+
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src) {
+ return pload_complex_full(src); // Just for compilation
}
-EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src)
-{
- return pload_complex_full(src); // Just for compilation
-}
-
-EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src)
-{
- return pload_complex_full(src); // Just for compilation
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src) {
+ return pload_complex_full(src); // Just for compilation
}
/** \internal load a vector from a real-only scalar location - row-wise */
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src)
-{
- if (GEMV_IS_SCALAR) {
- return pload_real_full(src);
- }
- else {
- return ploadu<Packet4f>(src);
- }
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) {
+ if (GEMV_IS_SCALAR) {
+ return pload_real_full(src);
+ } else {
+ return ploadu<Packet4f>(src);
+ }
}
-template<typename ResPacket>
-EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src)
-{
- return pload_real(src);
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) {
+ return pload_real(src);
}
-EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b)
-{
- EIGEN_UNUSED_VARIABLE(b);
- return a; // Just for compilation
+EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b) {
+ EIGEN_UNUSED_VARIABLE(b);
+ return a; // Just for compilation
}
-EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b)
-{
- EIGEN_UNUSED_VARIABLE(b);
- return a; // Just for compilation
+EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b) {
+ EIGEN_UNUSED_VARIABLE(b);
+ return a; // Just for compilation
}
/** \internal set a scalar from complex location */
-template<typename Scalar, typename ResScalar>
-EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj)
-{
- return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
+template <typename Scalar, typename ResScalar>
+EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) {
+ return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
}
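// Encoding used by the pset1_complex callers below (spelled out for clarity, as
// derived from the code above): bits 0/1 of `which` select real (set) vs. imaginary
// (clear) for the first/second lane, and bits 2/3 negate the corresponding lane.
// Hence alpha_store's which == 0x3 produces [Re(alpha), Re(alpha)] and
// which == 0x0 produces [Im(alpha), Im(alpha)].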
/** \internal set a vector from complex location */
-template<typename Scalar, typename ResScalar, typename ResPacket, int which>
-EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha)
-{
- Packet2cf ret;
- ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
- ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
- ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
- ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
- return ret;
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha) {
+ Packet2cf ret;
+ ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+ ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+ ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
+ ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
+ return ret;
}
-template<typename Scalar, typename ResScalar, typename ResPacket, int which>
-EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha)
-{
- Packet1cd ret;
- ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
- ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
- return ret;
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha) {
+ Packet1cd ret;
+ ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+ ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+ return ret;
}
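// For illustration: in pset1_realimag, a set bit 0x01 (0x02) in `which` selects
// alpha.real() for the first (second) lane and a clear bit selects alpha.imag();
// bits 0x04 (0x08) negate the corresponding lane. So for alpha = 2+3i,
// which = 0x3 gives the real broadcast [2, 2] and which = 0x0 gives the
// imaginary broadcast [3, 3].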
/** \internal zero out a vector for real or complex forms */
-template<typename Packet>
-EIGEN_ALWAYS_INLINE Packet pset_zero()
-{
- return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pset_zero() {
+ return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
}
-template<>
-EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>()
-{
- return Packet2cf(pset1<Packet4f>(float(0)));
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>() {
+ return Packet2cf(pset1<Packet4f>(float(0)));
}
-template<>
-EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>()
-{
- return Packet1cd(pset1<Packet2d>(double(0)));
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>() {
+ return Packet1cd(pset1<Packet2d>(double(0)));
}
/** \internal initialize a vector from another vector */
-template<typename Packet, typename LhsPacket, typename RhsPacket>
-EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1)
-{
- if (GEMV_IS_COMPLEX_COMPLEX) {
- EIGEN_UNUSED_VARIABLE(c1);
- return pset_zero<Packet>();
- }
- else
- {
- return c1; // Intentionally left uninitialized
- }
+template <typename Packet, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) {
+ if (GEMV_IS_COMPLEX_COMPLEX) {
+ EIGEN_UNUSED_VARIABLE(c1);
+ return pset_zero<Packet>();
+ } else {
+ return c1; // Intentionally left uninitialized
+ }
}
-template<typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
-struct alpha_store
-{
- alpha_store(ResScalar& alpha) {
- separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
- separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
- }
- struct ri {
- PResPacket r;
- PResPacket i;
- } separate;
+template <typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
+struct alpha_store {
+ alpha_store(ResScalar& alpha) {
+ separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
+ separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
+ }
+ struct ri {
+ PResPacket r;
+ PResPacket i;
+ } separate;
};
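// alpha_store thus caches the two broadcasts the complex FMA path below needs:
// separate.r = [ar, ar, ...] and separate.i = [ai, ai, ...] for
// alpha = ar + i*ai (masks 0x3 and 0x0 select the unnegated real and imaginary
// parts, per pset1_realimag above).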
/** \internal multiply and add for complex math */
-template<typename ScalarPacket, typename AlphaData>
-EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0)
-{
- return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
+template <typename ScalarPacket, typename AlphaData>
+EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) {
+ return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
}
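// Roughly, this realizes res += alpha * x as two vector FMAs: callers pass
// c0 = [xr, xi, ...] and c2 = pcplxflipconj(c0), which is [-xi, xr, ...] up to
// the sign handling inside that helper, so
//   c4 + c0 * [ar, ar] + c2 * [ai, ai]
//     = [yr + ar*xr - ai*xi, yi + ar*xi + ai*xr],
// i.e. the usual real/imaginary decomposition of a complex multiply-accumulate.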
/** \internal store and madd for complex math */
-template<typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData>
-EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res)
-{
- PResPacket c2 = pcplxflipconj(c0);
- if (GEMV_IS_SCALAR) {
- ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
- ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
- pstoreu(reinterpret_cast<Scalar*>(res), c3);
- } else {
- ScalarPacket c4 = pload_complex<ResPacket>(res);
- PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
- pstoreu(res, c3);
- }
+template <typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar,
+ typename AlphaData>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) {
+ PResPacket c2 = pcplxflipconj(c0);
+ if (GEMV_IS_SCALAR) {
+ ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
+ ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
+ pstoreu(reinterpret_cast<Scalar*>(res), c3);
+ } else {
+ ScalarPacket c4 = pload_complex<ResPacket>(res);
+ PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+ pstoreu(res, c3);
+ }
}
-template<typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData, Index ResPacketSize, Index iter2>
-EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res)
-{
- PResPacket c2 = pcplxflipconj(c0);
- PResPacket c3 = pcplxflipconj(c1);
+template <typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData,
+ Index ResPacketSize, Index iter2>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) {
+ PResPacket c2 = pcplxflipconj(c0);
+ PResPacket c3 = pcplxflipconj(c1);
#if !defined(_ARCH_PWR10)
- ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
- ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
- PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
- PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
- pstoreu(res + (iter2 * ResPacketSize), c6);
- pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
+ ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
+ ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
+ PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+ PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
+ pstoreu(res + (iter2 * ResPacketSize), c6);
+ pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
#else
- __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize));
+ __vector_pair a = *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize));
#if EIGEN_COMP_LLVM
- PResPacket c6[2];
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
- c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
- c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
- GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
+ PResPacket c6[2];
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
+ c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
+ c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
+ GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
#else
- if (GEMV_IS_COMPLEX_FLOAT) {
- __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
- __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
- } else {
- __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v));
- __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v));
- }
+ if (GEMV_IS_COMPLEX_FLOAT) {
+ __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+ __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+ } else {
+ __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+ __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+ }
#endif
- *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a;
+ *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize)) = a;
#endif
}
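// On Power10 (_ARCH_PWR10), the two result packets are accessed through a
// single __vector_pair, which lets the compiler use the paired lxvp/stxvp
// load/store instructions; the GCC path keeps both halves of the pair live in
// the accumulating xvmaddasp/xvmaddadp instructions via inline asm.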
/** \internal load lhs packet */
-template<typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
-EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j)
-{
- if (sizeof(Scalar) == sizeof(LhsScalar)) {
- const LhsScalar& src = lhs(i + 0, j);
- return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
- }
- return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
+template <typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
+EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) {
+ if (sizeof(Scalar) == sizeof(LhsScalar)) {
+ const LhsScalar& src = lhs(i + 0, j);
+ return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
+ }
+ return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
}
/** \internal madd for complex times complex */
-template<typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c)
-{
- if (ConjugateLhs && ConjugateRhs) {
- return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
- }
- else if (Negate && !ConjugateLhs && ConjugateRhs) {
- return vec_nmsub(a, b, c);
- }
- else {
- return vec_madd(a, b, c);
- }
+template <typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) {
+ if (ConjugateLhs && ConjugateRhs) {
+ return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+ } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+ return vec_nmsub(a, b, c);
+ } else {
+ return vec_madd(a, b, c);
+ }
}
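// The Negate path relies on vec_nmsub(a, b, c) = c - a*b: it contributes the
// negated product term that multiplication by a conjugated rhs requires, with
// the remaining sign bookkeeping done by pconj2 and pcplxflipconj in the
// callers.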
/** \internal madd for complex times real */
-template<typename ComplexPacket, typename RealPacket, bool Conjugate>
-EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c)
-{
- if (Conjugate) {
- return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
- }
- else {
- return vec_madd(a, b, c);
- }
+template <typename ComplexPacket, typename RealPacket, bool Conjugate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) {
+ if (Conjugate) {
+ return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+ } else {
+ return vec_madd(a, b, c);
+ }
}
-template<typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
- conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
- RhsPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pset1<RhsPacket>(*b);
- }
- else {
- b0 = ploadu<RhsPacket>(b);
- }
- c0 = pcj.pmadd(a0, b0, c0);
+template <typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs,
+ bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+ RhsPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pset1<RhsPacket>(*b);
+ } else {
+ b0 = ploadu<RhsPacket>(b);
+ }
+ c0 = pcj.pmadd(a0, b0, c0);
}
/** \internal core multiply operation for vectors - complex times complex */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1)
-{
- ScalarPacket br, bi;
- if (StorageOrder == ColMajor) {
- pload_realimag<RhsScalar>(b, br, bi);
- }
- else {
- pload_realimag_row<RhsScalar>(b, br, bi);
- }
- if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
- LhsPacket a1 = pcplxflipconj(a0);
- ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
- ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
- c1 = ResPacket(ci);
- c0 = PResPacket(cr);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) {
+ ScalarPacket br, bi;
+ if (StorageOrder == ColMajor) {
+ pload_realimag<RhsScalar>(b, br, bi);
+ } else {
+ pload_realimag_row<RhsScalar>(b, br, bi);
+ }
+ if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
+ LhsPacket a1 = pcplxflipconj(a0);
+ ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
+ ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
+ c1 = ResPacket(ci);
+ c0 = PResPacket(cr);
}
/** \internal core multiply operation for vectors - real times complex */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
- ScalarPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pload_complex_full(b);
- }
- else {
- b0 = pload_complex_full_row(b);
- }
- ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
- c0 = PResPacket(cri);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+ ScalarPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pload_complex_full(b);
+ } else {
+ b0 = pload_complex_full_row(b);
+ }
+ ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
+ c0 = PResPacket(cri);
}
/** \internal core multiply operation for vectors - complex times real */
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0)
-{
- ScalarPacket a1 = pload_complex<ResPacket>(&a0);
- ScalarPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pload_real(b);
- }
- else {
- b0 = pload_real_row<ResPacket>(b);
- }
- ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
- c0 = PResPacket(cri);
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+ ScalarPacket a1 = pload_complex<ResPacket>(&a0);
+ ScalarPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pload_real(b);
+ } else {
+ b0 = pload_real_row<ResPacket>(b);
+ }
+ ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
+ c0 = PResPacket(cri);
}
-#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \
-{ \
- gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0, c1); \
-}
+#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \
+ template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) { \
+ gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+ ConjugateRhs, StorageOrder>(a0, b, c0, c1); \
+ }
-GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
+GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex<double>, Packet1cd)
-#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \
-{ \
- gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \
+ template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) { \
+ gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+ ConjugateRhs, StorageOrder>(a0, b, c0); \
+ }
-GEMV_MULT_REAL_COMPLEX(float, std::complex<float>, Packet2cf)
-GEMV_MULT_REAL_COMPLEX(double, std::complex<double>, Packet1cd)
-GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
+GEMV_MULT_REAL_COMPLEX(float, std::complex<float>, Packet2cf)
+GEMV_MULT_REAL_COMPLEX(double, std::complex<double>, Packet1cd)
+GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex<double>, Packet1cd)
-#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \
-template<typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \
-{ \
- gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \
+ template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+ typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) { \
+ gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+ ConjugateRhs, StorageOrder>(a0, b, c0); \
+ }
-GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex<float>)
-GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex<double>)
-GEMV_MULT_COMPLEX_REAL(std::complex<float>, float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex<double>)
+GEMV_MULT_COMPLEX_REAL(std::complex<float>, float, Packet2cf, std::complex<float>)
GEMV_MULT_COMPLEX_REAL(std::complex<double>, double, Packet1cd, std::complex<double>)
#ifdef USE_GEMV_MMA
/** \internal convert packet to real form */
-template<typename T>
-EIGEN_ALWAYS_INLINE T convertReal(T a)
-{
- return a;
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertReal(T a) {
+ return a;
}
-EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a)
-{
- return a.v;
-}
+EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) { return a.v; }
-EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a)
-{
- return a.v;
-}
+EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) { return a.v; }
/** \internal convert packet to complex form */
-template<typename T>
-EIGEN_ALWAYS_INLINE T convertComplex(T a)
-{
- return a;
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertComplex(T a) {
+ return a;
}
-EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a)
-{
- return Packet2cf(a);
-}
+EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) { return Packet2cf(a); }
-EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a)
-{
- return Packet1cd(a);
-}
+EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) { return Packet1cd(a); }
/** \internal load a vector from a complex location (for MMA version) */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
-EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a)
-{
- a = SLhsPacket(pload_complex<ResPacket>(&a));
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) {
+ a = SLhsPacket(pload_complex<ResPacket>(&a));
}
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
-EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&)
-{
- // Pass thru
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) {
+ // Pass through
}
/** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b)
-{
- if (NegativeAccumulate)
- {
- __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- }
- else {
- __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
- }
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) {
+ if (NegativeAccumulate) {
+ __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ } else {
+ __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+ }
}
/** \internal perform a matrix multiply and accumulate (positive and negative) of __vector_pair a and packet b */
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b)
-{
- if (NegativeAccumulate)
- {
- __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
- }
- else {
- __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
- }
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) {
+ if (NegativeAccumulate) {
+ __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+ } else {
+ __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+ }
}
-template<typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
-EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&)
-{
- // Just for compilation
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) {
+ // Just for compilation
}
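// These wrap the Power10 MMA rank-1 update builtins: xvf32ger(pp|np)
// accumulates (or subtracts) the outer product of two vectors of 4 floats into
// a __vector_quad 4x4 accumulator, while xvf64ger(pp|np) takes a __vector_pair
// holding 4 doubles plus a vector of 2 doubles for a 4x2 update; the Packet4f
// overload is a stub so non-MMA instantiations still compile.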
/** \internal madd for complex times complex (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
-{
- if (ConjugateLhs && ConjugateRhs) {
- RealPacket b2 = pconj2(convertComplex(b)).v;
- return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
- }
- else if (Negate && !ConjugateLhs && ConjugateRhs) {
- return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
- }
- else {
- return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
- }
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+ if (ConjugateLhs && ConjugateRhs) {
+ RealPacket b2 = pconj2(convertComplex(b)).v;
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
+ } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+ return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
+ } else {
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
+ }
}
-template<typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
-EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
-{
- if (ConjugateLhs && ConjugateRhs) {
- RealPacket b2 = pconj2(convertComplex(b)).v;
- return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
- }
- else if (Negate && !ConjugateLhs && ConjugateRhs) {
- return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
- }
- else {
- return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
- }
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+ if (ConjugateLhs && ConjugateRhs) {
+ RealPacket b2 = pconj2(convertComplex(b)).v;
+ return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+ } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+ return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
+ } else {
+ return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+ }
}
/** \internal madd for complex times real (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
-EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c)
-{
- RealPacket a2 = convertReal(a);
- if (Conjugate) {
- RealPacket b2 = pconj2(convertComplex(b)).v;
- if (StorageOrder == ColMajor) {
- return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
- } else {
- return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
- }
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+ RealPacket a2 = convertReal(a);
+ if (Conjugate) {
+ RealPacket b2 = pconj2(convertComplex(b)).v;
+ if (StorageOrder == ColMajor) {
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
+ } else {
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
}
- else {
- if (StorageOrder == ColMajor) {
- return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
- } else {
- return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
- }
+ } else {
+ if (StorageOrder == ColMajor) {
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
+ } else {
+ return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
}
+ }
}
/** \internal madd for real times complex (MMA version) */
-template<typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
-EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c)
-{
- if (Conjugate) {
- RealPacket b2 = pconj2(convertComplex(b)).v;
- return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
- }
- else {
- return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
- }
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+ if (Conjugate) {
+ RealPacket b2 = pconj2(convertComplex(b)).v;
+ return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+ } else {
+ return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+ }
}
/** \internal core multiply operation for vectors (MMA version) - complex times complex */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
- ScalarPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pload_realimag_combine(b);
- } else {
- b0 = pload_realimag_combine_row(b);
- }
- pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+ bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+ ScalarPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pload_realimag_combine(b);
+ } else {
+ b0 = pload_realimag_combine_row(b);
+ }
+ pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
}
/** \internal core multiply operation for vectors (MMA version) - complex times real */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
- pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
- ScalarPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pload_real(b);
- }
- else {
- b0 = pload_real_row<ResPacket>(b);
- }
- pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+ bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+ pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
+ ScalarPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pload_real(b);
+ } else {
+ b0 = pload_real_row<ResPacket>(b);
+ }
+ pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
}
/** \internal core multiply operation for vectors (MMA version) - real times complex */
-template<typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0)
-{
- ScalarPacket b0;
- if (StorageOrder == ColMajor) {
- b0 = pload_complex_full(b);
- }
- else {
- b0 = pload_complex_full_row(b);
- }
- pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs, (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+ bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+ ScalarPacket b0;
+ if (StorageOrder == ColMajor) {
+ b0 = pload_complex_full(b);
+ } else {
+ b0 = pload_complex_full_row(b);
+ }
+ pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs,
+ (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
}
-#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
- gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \
+ template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, \
+ typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) { \
+ gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, \
+ ConjugateRhs, StorageOrder>(a0, b, c0); \
+ }
-GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex<float>)
-GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
/** \internal core multiply operation for vectors (MMA version) - complex times complex, or real times complex when the lhs is real (dispatched on sizeof(LhsScalar)) */
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0)
-{
- if (sizeof(LhsScalar) == 16) {
- gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
- }
- else {
- gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0);
- }
+template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,
+ typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0) {
+ if (sizeof(LhsScalar) == 16) {
+ gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+ StorageOrder>(a0, b, c0);
+ } else {
+ gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+ StorageOrder>(a0, b, c0);
+ }
}
-#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
- gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \
+ template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, \
+ typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) { \
+ gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+ StorageOrder>(a0, b, c0); \
+ }
GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex<float>)
GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex<double>)
-#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \
-template<typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
-EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \
-{ \
- gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, StorageOrder>(a0, b, c0); \
-}
+#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \
+ template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, \
+ typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder> \
+ EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) { \
+ gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+ StorageOrder>(a0, b, c0); \
+ }
-GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
-GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float)
GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double)
/** \internal disassemble MMA accumulator results into packets */
-template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
- __builtin_mma_disassemble_acc(&result0.packet, c0);
- if (sizeof(LhsPacket) == 16) {
- if (sizeof(RhsPacket) == 16) {
- ScalarPacket tmp0, tmp2;
- tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
- tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
- result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
- result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
- result0.packet[2] = tmp2;
- result0.packet[0] = tmp0;
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+ bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+ __builtin_mma_disassemble_acc(&result0.packet, c0);
+ if (sizeof(LhsPacket) == 16) {
+ if (sizeof(RhsPacket) == 16) {
+ ScalarPacket tmp0, tmp2;
+ tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
+ tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
+ result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
+ result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
+ result0.packet[2] = tmp2;
+ result0.packet[0] = tmp0;
- if (ConjugateLhs) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
- } else if (ConjugateRhs) {
- result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
- result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
- } else {
- result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
- result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
- }
- result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
- result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
- } else {
- result0.packet[0][1] = result0.packet[1][1];
- result0.packet[2][1] = result0.packet[3][1];
- }
+ if (ConjugateLhs) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
+ } else if (ConjugateRhs) {
+ result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+ result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
+ } else {
+ result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+ result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+ }
+ result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+ result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
+ } else {
+ result0.packet[0][1] = result0.packet[1][1];
+ result0.packet[2][1] = result0.packet[3][1];
}
+ }
}
-template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
- __builtin_mma_disassemble_acc(&result0.packet, c0);
- if (GEMV_IS_COMPLEX_COMPLEX) {
- if (ConjugateLhs) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
- } else {
- if (ConjugateRhs) {
- result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
- } else {
- result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
- }
- }
- result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
- } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
- if (ConjugateLhs) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- }
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+ bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+ __builtin_mma_disassemble_acc(&result0.packet, c0);
+ if (GEMV_IS_COMPLEX_COMPLEX) {
+ if (ConjugateLhs) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
} else {
- result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
+ if (ConjugateRhs) {
+ result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
+ } else {
+ result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
+ }
}
+ result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+ } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
+ if (ConjugateLhs) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ }
+ } else {
+ result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
+ }
}
-template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0)
-{
- if (!GEMV_IS_COMPLEX_FLOAT) {
- disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
- } else {
- disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
- }
+template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket,
+ bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+ if (!GEMV_IS_COMPLEX_FLOAT) {
+ disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+ } else {
+ disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+ }
}
#endif
@@ -1952,194 +1810,207 @@
#define GEMV_LOADPACKET_COL_COMPLEX(iter) \
loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
-#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \
- convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
+#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
#ifdef USE_GEMV_MMA
#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- __builtin_mma_xxsetaccz(&e0##iter); \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ __builtin_mma_xxsetaccz(&e0##iter); \
}
#if EIGEN_COMP_LLVM
-#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
- GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
+ GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), \
+ GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \
EIGEN_UNUSED_VARIABLE(f##iter1);
#else
-#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
- if (sizeof(LhsPacket) == 16) { \
- const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \
- a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast<LhsScalar *>(&src)); \
- EIGEN_UNUSED_VARIABLE(f##iter1); \
- } else { \
- f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j); \
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \
+ if (sizeof(LhsPacket) == 16) { \
+ const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \
+ a##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src)); \
+ EIGEN_UNUSED_VARIABLE(f##iter1); \
+ } else { \
+ f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j); \
GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \
}
#endif
-#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
- EIGEN_UNUSED_VARIABLE(a##iter); \
- } else { \
+#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
+ EIGEN_UNUSED_VARIABLE(a##iter); \
+ } else { \
GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(a##iter); \
- EIGEN_UNUSED_VARIABLE(f##iter); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(a##iter); \
+ EIGEN_UNUSED_VARIABLE(f##iter); \
}
-#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter); \
- } else { \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter); \
- } \
+#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter); \
+ } else { \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter); \
+ } \
}
#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN_COMPLEX(N) > iter1) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \
- EIGEN_UNUSED_VARIABLE(a##iter3) \
- } else { \
- GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \
- GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(a##iter2); \
- EIGEN_UNUSED_VARIABLE(a##iter3); \
- } \
- EIGEN_UNUSED_VARIABLE(f##iter2); \
+ if (GEMV_GETN_COMPLEX(N) > iter1) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \
+ EIGEN_UNUSED_VARIABLE(a##iter3) \
+ } else { \
+ GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \
+ GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(a##iter2); \
+ EIGEN_UNUSED_VARIABLE(a##iter3); \
+ } \
+ EIGEN_UNUSED_VARIABLE(f##iter2); \
EIGEN_UNUSED_VARIABLE(f##iter3);
-#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN_COMPLEX(N) > iter1) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- PLhsPacket g[2]; \
- __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2); \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2); \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3); \
- } else { \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2); \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3); \
- } \
+#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter1) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ PLhsPacket g[2]; \
+ __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2); \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2); \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3); \
+ } else { \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2); \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3); \
+ } \
}
#if EIGEN_COMP_LLVM
-#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
- if (GEMV_GETN_COMPLEX(N) > 1) { \
+#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
+ if (GEMV_GETN_COMPLEX(N) > 1) { \
GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \
- } else { \
- GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \
+ } else { \
+ GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \
}
-#define GEMV_WORK_COL_COMPLEX_MMA(N) \
- if (GEMV_GETN_COMPLEX(N) > 1) { \
+#define GEMV_WORK_COL_COMPLEX_MMA(N) \
+ if (GEMV_GETN_COMPLEX(N) > 1) { \
GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \
- } else { \
- GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \
+ } else { \
+ GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \
}
#else
-#define GEMV_LOAD_COL_COMPLEX_MMA(N) \
- GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
+#define GEMV_LOAD_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
-#define GEMV_WORK_COL_COMPLEX_MMA(N) \
- GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
+#define GEMV_WORK_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
#endif
-#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
- disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter, result0##iter);
+#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \
+ disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>( \
+ &e0##iter, result0##iter);
-#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \
- c0##iter = PResPacket(result0##iter.packet[0]); \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
- } else { \
- pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \
- c0##iter = PResPacket(result0##iter.packet[2]); \
- pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \
- } \
+#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \
+ c0##iter = PResPacket(result0##iter.packet[0]); \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+ c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
+ } else { \
+ pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+ c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \
+ c0##iter = PResPacket(result0##iter.packet[2]); \
+ pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+ c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \
+ } \
}
-#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
- if (GEMV_GETN_COMPLEX(N) > iter1) { \
- GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \
- GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \
- c0##iter2 = PResPacket(result0##iter2.packet[0]); \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- c0##iter3 = PResPacket(result0##iter3.packet[0]); \
- pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(c0##iter2, c0##iter3, alpha_data, res + i); \
- } else { \
- c0##iter3 = PResPacket(result0##iter2.packet[2]); \
- pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
- c0##iter2 = PResPacket(result0##iter3.packet[0]); \
- c0##iter3 = PResPacket(result0##iter3.packet[2]); \
- pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>(c0##iter2, c0##iter3, alpha_data, res + i); \
- } \
+#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter1) { \
+ GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \
+ GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \
+ c0##iter2 = PResPacket(result0##iter2.packet[0]); \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ c0##iter3 = PResPacket(result0##iter3.packet[0]); \
+ pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>( \
+ c0##iter2, c0##iter3, alpha_data, res + i); \
+ } else { \
+ c0##iter3 = PResPacket(result0##iter2.packet[2]); \
+ pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>( \
+ c0##iter2, c0##iter3, alpha_data, res + i); \
+ c0##iter2 = PResPacket(result0##iter3.packet[0]); \
+ c0##iter3 = PResPacket(result0##iter3.packet[2]); \
+ pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>( \
+ c0##iter2, c0##iter3, alpha_data, res + i); \
+ } \
}
-#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
- GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \
- Index j = j2; \
- do { \
- const RhsScalar& b1 = rhs2(j, 0); \
- RhsScalar* b = const_cast<RhsScalar *>(&b1); \
- GEMV_UNROLL(GEMV_PREFETCH, N) \
- GEMV_LOAD_COL_COMPLEX_MMA(N) \
- GEMV_WORK_COL_COMPLEX_MMA(N) \
- } while (++j < jend); \
- if (GEMV_GETN(N) <= 2) { \
- GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \
- } else { \
+#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
+ GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \
+ Index j = j2; \
+ do { \
+ const RhsScalar& b1 = rhs2(j, 0); \
+ RhsScalar* b = const_cast<RhsScalar*>(&b1); \
+ GEMV_UNROLL(GEMV_PREFETCH, N) \
+ GEMV_LOAD_COL_COMPLEX_MMA(N) \
+ GEMV_WORK_COL_COMPLEX_MMA(N) \
+ } while (++j < jend); \
+ if (GEMV_GETN(N) <= 2) { \
+ GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \
+ } else { \
GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \
- } \
+ } \
i += (ResPacketSize * N);
#endif
-#define GEMV_INIT_COMPLEX(iter, N) \
- if (N > iter) { \
- c0##iter = pset_zero<PResPacket>(); \
+#define GEMV_INIT_COMPLEX(iter, N) \
+ if (N > iter) { \
+ c0##iter = pset_zero<PResPacket>(); \
c1##iter = pset_init<ResPacket, LhsPacket, RhsPacket>(c1##iter); \
- } else { \
- EIGEN_UNUSED_VARIABLE(c0##iter); \
- EIGEN_UNUSED_VARIABLE(c1##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(c0##iter); \
+ EIGEN_UNUSED_VARIABLE(c1##iter); \
}
-#define GEMV_WORK_COL_COMPLEX(iter, N) \
- if (N > iter) { \
- f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
- gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter); \
- } else { \
- EIGEN_UNUSED_VARIABLE(f##iter); \
+#define GEMV_WORK_COL_COMPLEX(iter, N) \
+ if (N > iter) { \
+ f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \
+ gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+ ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(f##iter); \
}
-#define GEMV_STORE_COL_COMPLEX(iter, N) \
- if (N > iter) { \
- if (GEMV_IS_COMPLEX_COMPLEX) { \
- c0##iter = padd(c0##iter, c1##iter); \
- } \
- pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
+#define GEMV_STORE_COL_COMPLEX(iter, N) \
+ if (N > iter) { \
+ if (GEMV_IS_COMPLEX_COMPLEX) { \
+ c0##iter = padd(c0##iter, c1##iter); \
+ } \
+ pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+ c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \
}
/** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */
-#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \
- GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \
- Index j = j2; \
- do { \
- const RhsScalar& b1 = rhs2(j, 0); \
- RhsScalar* b = const_cast<RhsScalar *>(&b1); \
- GEMV_UNROLL(GEMV_PREFETCH, N) \
- GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \
- } while (++j < jend); \
- GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \
+#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \
+ GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \
+ Index j = j2; \
+ do { \
+ const RhsScalar& b1 = rhs2(j, 0); \
+ RhsScalar* b = const_cast<RhsScalar*>(&b1); \
+ GEMV_UNROLL(GEMV_PREFETCH, N) \
+ GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \
+ } while (++j < jend); \
+ GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \
i += (ResPacketSize * N);
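Stripped of packets and unrolling, the column path above computes an alpha-scaled, column-blocked mat-vec. A plain scalar model (hypothetical helper; column-major lhs, resIncr == 1 as the kernel asserts):

#include <algorithm>
#include <vector>

template <typename Scalar>
void gemv_col_model(int rows, int cols, const std::vector<Scalar>& lhs,  // column-major, rows x cols
                    const std::vector<Scalar>& rhs, std::vector<Scalar>& res, Scalar alpha) {
  const int block_cols = cols < 128 ? cols : 16;  // same shape as the block_cols heuristic below
  for (int j2 = 0; j2 < cols; j2 += block_cols) {
    const int jend = std::min(j2 + block_cols, cols);
    for (int i = 0; i < rows; ++i) {  // the macros step i by 8/4/2/1 packets instead
      Scalar d0(0);
      for (int j = j2; j < jend; ++j) d0 += lhs[i + j * rows] * rhs[j];
      res[i] += alpha * d0;  // the alpha-madd done by the GEMV_STORE_* macros
    }
  }
}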
#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA))
@@ -2147,465 +2018,440 @@
#endif
#ifdef USE_GEMV_COL_COMPLEX_MMA
-#define GEMV_PROCESS_COL_COMPLEX(N) \
- GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
#else
#if defined(USE_GEMV_MMA) && (__GNUC__ > 10)
-#define GEMV_PROCESS_COL_COMPLEX(N) \
+#define GEMV_PROCESS_COL_COMPLEX(N) \
if (sizeof(Scalar) != sizeof(LhsPacket)) { \
- GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
- } else { \
- GEMV_PROCESS_COL_COMPLEX_ONE(N) \
+ GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \
+ } else { \
+ GEMV_PROCESS_COL_COMPLEX_ONE(N) \
}
#else
-#define GEMV_PROCESS_COL_COMPLEX(N) \
- GEMV_PROCESS_COL_COMPLEX_ONE(N)
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE(N)
#endif
#endif
-template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_complex_col(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- ResScalar* res, Index resIncr,
- ResScalar alpha)
-{
- typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+ typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+ ResScalar* res, Index resIncr, ResScalar alpha) {
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
- typedef typename Traits::LhsPacket LhsPacket;
- typedef typename Traits::RhsPacket RhsPacket;
- typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
- typedef typename packet_traits<Scalar>::type ScalarPacket;
- typedef typename packet_traits<LhsScalar>::type PLhsPacket;
- typedef typename packet_traits<ResScalar>::type PResPacket;
- typedef gemv_traits<ResPacket, ResPacket> PTraits;
+ typedef typename packet_traits<Scalar>::type ScalarPacket;
+ typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+ typedef typename packet_traits<ResScalar>::type PResPacket;
+ typedef gemv_traits<ResPacket, ResPacket> PTraits;
- EIGEN_UNUSED_VARIABLE(resIncr);
- eigen_internal_assert(resIncr == 1);
+ EIGEN_UNUSED_VARIABLE(resIncr);
+ eigen_internal_assert(resIncr == 1);
- // The following copy tells the compiler that lhs's attributes are not modified outside this function
- // This helps GCC to generate proper code.
- LhsMapper lhs(alhs);
- RhsMapper rhs2(rhs);
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
+ // This helps GCC to generate proper code.
+ LhsMapper lhs(alhs);
+ RhsMapper rhs2(rhs);
- conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
- const Index lhsStride = lhs.stride();
- // TODO: for padded aligned inputs, we could enable aligned reads
- enum {
- LhsAlignment = Unaligned,
- ResPacketSize = PTraits::ResPacketSize,
- LhsPacketSize = PTraits::LhsPacketSize,
- RhsPacketSize = PTraits::RhsPacketSize,
- };
+ const Index lhsStride = lhs.stride();
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum {
+ LhsAlignment = Unaligned,
+ ResPacketSize = PTraits::ResPacketSize,
+ LhsPacketSize = PTraits::LhsPacketSize,
+ RhsPacketSize = PTraits::RhsPacketSize,
+ };
#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
- const Index prefetch_dist = 64 * LhsPacketSize;
+ const Index prefetch_dist = 64 * LhsPacketSize;
#endif
#ifndef GCC_ONE_VECTORPAIR_BUG
- const Index n8 = rows - 8 * ResPacketSize + 1;
- const Index n4 = rows - 4 * ResPacketSize + 1;
- const Index n2 = rows - 2 * ResPacketSize + 1;
+ const Index n8 = rows - 8 * ResPacketSize + 1;
+ const Index n4 = rows - 4 * ResPacketSize + 1;
+ const Index n2 = rows - 2 * ResPacketSize + 1;
#endif
- const Index n1 = rows - 1 * ResPacketSize + 1;
+ const Index n1 = rows - 1 * ResPacketSize + 1;
- // TODO: improve the following heuristic:
- const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+ // TODO: improve the following heuristic:
+ const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
- typedef alpha_store<PResPacket, ResPacket, ResScalar, Scalar> AlphaData;
- AlphaData alpha_data(alpha);
+ typedef alpha_store<PResPacket, ResPacket, ResScalar, Scalar> AlphaData;
+ AlphaData alpha_data(alpha);
- for (Index j2 = 0; j2 < cols; j2 += block_cols)
- {
- Index jend = numext::mini(j2 + block_cols, cols);
- Index i = 0;
- PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
- ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
- PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
+ for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+ Index jend = numext::mini(j2 + block_cols, cols);
+ Index i = 0;
+ PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+ ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+ PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
#ifdef USE_GEMV_MMA
- __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
- __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
- PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
- GEMV_UNUSED(8, e0)
- GEMV_UNUSED(8, result0)
- GEMV_UNUSED(8, a)
- GEMV_UNUSED(8, f)
+ __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+ __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
+ PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
+ GEMV_UNUSED(8, e0)
+ GEMV_UNUSED(8, result0)
+ GEMV_UNUSED(8, a)
+ GEMV_UNUSED(8, f)
#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA)
- if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
+ if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
#endif
#endif
#ifndef GCC_ONE_VECTORPAIR_BUG
- {
- while (i < n8)
- {
- GEMV_PROCESS_COL_COMPLEX(8)
- }
- }
- while (i < n4)
- {
- GEMV_PROCESS_COL_COMPLEX(4)
- }
- if (i < n2)
- {
- GEMV_PROCESS_COL_COMPLEX(2)
- }
- if (i < n1)
-#else
- while (i < n1)
-#endif
- {
- GEMV_PROCESS_COL_COMPLEX_ONE(1)
- }
- for (;i < rows;++i)
- {
- ResScalar d0(0);
- Index j = j2;
- do {
- d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
- } while (++j < jend);
- res[i] += alpha * d0;
- }
+ {
+ while (i < n8) {
+ GEMV_PROCESS_COL_COMPLEX(8)
+ }
}
+ while (i < n4) {
+ GEMV_PROCESS_COL_COMPLEX(4)
+ }
+ if (i < n2) {
+ GEMV_PROCESS_COL_COMPLEX(2)
+ }
+ if (i < n1)
+#else
+ while (i < n1)
+#endif
+ {
+ GEMV_PROCESS_COL_COMPLEX_ONE(1)
+ }
+ for (; i < rows; ++i) {
+ ResScalar d0(0);
+ Index j = j2;
+ do {
+ d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+ } while (++j < jend);
+ res[i] += alpha * d0;
+ }
+ }
}
-template <typename Scalar, int N> struct ScalarBlock {
- Scalar scalar[N];
+template <typename Scalar, int N>
+struct ScalarBlock {
+ Scalar scalar[N];
};
#ifdef USE_GEMV_MMA
-static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f };
+static Packet16uc p16uc_ELEMENT_3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
/** \internal predux (add elements of a vector) from a MMA accumulator - real results */
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1)
-{
- PacketBlock<ResPacket, 4> result0, result1;
- __builtin_mma_disassemble_acc(&result0.packet, acc0);
- __builtin_mma_disassemble_acc(&result1.packet, acc1);
- result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
- result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
- result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
- result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
- result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
- return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1) {
+ PacketBlock<ResPacket, 4> result0, result1;
+ __builtin_mma_disassemble_acc(&result0.packet, acc0);
+ __builtin_mma_disassemble_acc(&result1.packet, acc1);
+ result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
+ result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
+ result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
+ result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
+ result0.packet[0] =
+ vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
+ return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
}
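The merge/perm sequence above is a diagonal extraction: after the rank-1 updates, lane k of packet k holds the completed partial sum, so reducing an accumulator amounts to summing the 4x4 tile's diagonal (the Packet2d specialization below does the 2x2 analogue). A scalar model of what the shuffles compute (hypothetical; tiles passed as plain arrays):

#include <array>

template <typename T>
std::array<T, 2> predux_real_model(const std::array<std::array<T, 4>, 4>& acc0,
                                   const std::array<std::array<T, 4>, 4>& acc1) {
  std::array<T, 2> out{};
  for (int k = 0; k < 4; ++k) {
    out[0] += acc0[k][k];  // vec_mergeh/vec_mergeo/vec_mergel/vec_perm gather these lanes
    out[1] += acc1[k][k];  // pairwise with acc0's, so one vec_add tree yields both sums
  }
  return out;
}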
-template<>
-EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1)
-{
- PacketBlock<Packet2d, 4> result0, result1;
- __builtin_mma_disassemble_acc(&result0.packet, acc0);
- __builtin_mma_disassemble_acc(&result1.packet, acc1);
- result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
- return *reinterpret_cast<ScalarBlock<double, 2> *>(&result0.packet[0]);
+template <>
+EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1) {
+ PacketBlock<Packet2d, 4> result0, result1;
+ __builtin_mma_disassemble_acc(&result0.packet, acc0);
+ __builtin_mma_disassemble_acc(&result1.packet, acc1);
+ result0.packet[0] =
+ vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
+ return *reinterpret_cast<ScalarBlock<double, 2>*>(&result0.packet[0]);
}
/** \internal add complex results together */
-template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0, PacketBlock<Packet4f, 4>& result1)
-{
- ScalarBlock<std::complex<float>, 2> cc0;
- result0.packet[0] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
- result0.packet[2] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
- result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
- if (GEMV_IS_COMPLEX_COMPLEX) {
- result0.packet[1] = reinterpret_cast<Packet4f>(vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
- result0.packet[3] = reinterpret_cast<Packet4f>(vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
- result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
- if (ConjugateLhs) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
- } else if (ConjugateRhs) {
- result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
- } else {
- result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
- }
- result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0,
+ PacketBlock<Packet4f, 4>& result1) {
+ ScalarBlock<std::complex<float>, 2> cc0;
+ result0.packet[0] = reinterpret_cast<Packet4f>(
+ vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
+ result0.packet[2] = reinterpret_cast<Packet4f>(
+ vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
+ result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
+ if (GEMV_IS_COMPLEX_COMPLEX) {
+ result0.packet[1] = reinterpret_cast<Packet4f>(
+ vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
+ result0.packet[3] = reinterpret_cast<Packet4f>(
+ vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
+ result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
+ if (ConjugateLhs) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
+ } else if (ConjugateRhs) {
+ result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
} else {
- if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- }
+ result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
}
- cc0.scalar[0].real(result0.packet[0][0]);
- cc0.scalar[0].imag(result0.packet[0][1]);
- cc0.scalar[1].real(result0.packet[0][2]);
- cc0.scalar[1].imag(result0.packet[0][3]);
- return cc0;
+ result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+ } else {
+ if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ }
+ }
+ cc0.scalar[0].real(result0.packet[0][0]);
+ cc0.scalar[0].imag(result0.packet[0][1]);
+ cc0.scalar[1].real(result0.packet[0][2]);
+ cc0.scalar[1].imag(result0.packet[0][3]);
+ return cc0;
}
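The pconj2/pcplxflip* fix-ups above pick between the three complex-product variants. As a scalar reference for which signs each variant needs (an assumption based on the generic conj_helper semantics, not code from this patch):

#include <complex>

template <bool ConjugateLhs, bool ConjugateRhs, typename T>
std::complex<T> conj_mul_model(std::complex<T> a, std::complex<T> b) {
  if (ConjugateLhs) a = std::conj(a);  // (ar*br + ai*bi) + i*(ar*bi - ai*br)
  if (ConjugateRhs) b = std::conj(b);  // (ar*br + ai*bi) + i*(ai*br - ar*bi)
  return a * b;                        // neither: (ar*br - ai*bi) + i*(ar*bi + ai*br)
}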
-template<typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&, PacketBlock<Packet2d, 4>&)
-{
- ScalarBlock<std::complex<double>, 2> cc0;
- EIGEN_UNUSED_VARIABLE(cc0);
- return cc0; // Just for compilation
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&,
+ PacketBlock<Packet2d, 4>&) {
+ ScalarBlock<std::complex<double>, 2> cc0;
+ EIGEN_UNUSED_VARIABLE(cc0);
+ return cc0; // Just for compilation
}
/** \internal predux (add elements of a vector) from a MMA accumulator - complex results */
-template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1)
-{
- PacketBlock<ResPacket, 4> result0, result1;
- __builtin_mma_disassemble_acc(&result0.packet, acc0);
- __builtin_mma_disassemble_acc(&result1.packet, acc1);
- return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+ bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1) {
+ PacketBlock<ResPacket, 4> result0, result1;
+ __builtin_mma_disassemble_acc(&result0.packet, acc0);
+ __builtin_mma_disassemble_acc(&result1.packet, acc1);
+ return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
}
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0)
-{
- PacketBlock<ResPacket, 4> result0;
- __builtin_mma_disassemble_acc(&result0.packet, acc0);
- result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
- return *reinterpret_cast<ScalarBlock<ResScalar, 2> *>(&result0.packet[0]);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0) {
+ PacketBlock<ResPacket, 4> result0;
+ __builtin_mma_disassemble_acc(&result0.packet, acc0);
+ result0.packet[0] =
+ vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
+ return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
}
-template<typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0)
-{
- ScalarBlock<ResScalar, 2> cc0;
- PacketBlock<ResPacket, 4> result0;
- __builtin_mma_disassemble_acc(&result0.packet, acc0);
- if (GEMV_IS_COMPLEX_COMPLEX) {
- if (ConjugateLhs) {
- result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
- result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
- } else if (ConjugateRhs) {
- result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
- result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
- } else {
- result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
- result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
- }
- result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
- result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+ bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0) {
+ ScalarBlock<ResScalar, 2> cc0;
+ PacketBlock<ResPacket, 4> result0;
+ __builtin_mma_disassemble_acc(&result0.packet, acc0);
+ if (GEMV_IS_COMPLEX_COMPLEX) {
+ if (ConjugateLhs) {
+ result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+ result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+ } else if (ConjugateRhs) {
+ result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+ result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
} else {
- result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
- result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
+ result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+ result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
}
- cc0.scalar[0].real(result0.packet[0][0]);
- cc0.scalar[0].imag(result0.packet[0][1]);
- cc0.scalar[1].real(result0.packet[2][0]);
- cc0.scalar[1].imag(result0.packet[2][1]);
- return cc0;
+ result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
+ result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
+ } else {
+ result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
+ result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
+ }
+ cc0.scalar[0].real(result0.packet[0][0]);
+ cc0.scalar[0].imag(result0.packet[0][1]);
+ cc0.scalar[1].real(result0.packet[2][0]);
+ cc0.scalar[1].imag(result0.packet[2][1]);
+ return cc0;
}
#endif
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b)
-{
- ScalarBlock<ResScalar, 2> cc0;
- cc0.scalar[0] = predux(a);
- cc0.scalar[1] = predux(b);
- return cc0;
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b) {
+ ScalarBlock<ResScalar, 2> cc0;
+ cc0.scalar[0] = predux(a);
+ cc0.scalar[1] = predux(b);
+ return cc0;
}
-template<typename ResScalar, typename ResPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b)
-{
- return predux_real<ResScalar, ResPacket>(a, b);
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b) {
+ return predux_real<ResScalar, ResPacket>(a, b);
}
-#define GEMV_UNROLL_ROW(func, N) \
- func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
+#define GEMV_UNROLL_ROW(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
-#define GEMV_UNROLL_ROW_HALF(func, N) \
- func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+#define GEMV_UNROLL_ROW_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
-#define GEMV_LOADPACKET_ROW(iter) \
- lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
+#define GEMV_LOADPACKET_ROW(iter) lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
#ifdef USE_GEMV_MMA
-#define GEMV_UNROLL3_ROW(func, N, which) \
- func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \
- func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which)
+#define GEMV_UNROLL3_ROW(func, N, which) \
+ func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+ func(6, N, which) func(7, N, which)
-#define GEMV_UNUSED_ROW(N, which) \
- GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
+#define GEMV_UNUSED_ROW(N, which) GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
-#define GEMV_INIT_ROW(iter, N) \
- if (GEMV_GETN(N) > iter) { \
+#define GEMV_INIT_ROW(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
__builtin_mma_xxsetaccz(&c##iter); \
}
#define GEMV_LOADPAIR_ROW(iter1, iter2) \
GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1));
-#define GEMV_WORK_ROW(iter, N) \
- if (GEMV_GETN(N) > iter) { \
- if (GEMV_IS_FLOAT) { \
+#define GEMV_WORK_ROW(iter, N) \
+ if (GEMV_GETN(N) > iter) { \
+ if (GEMV_IS_FLOAT) { \
pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \
- } else { \
- __vector_pair b##iter; \
- GEMV_LOADPAIR_ROW(iter, iter << 1) \
- pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0); \
- } \
+ } else { \
+ __vector_pair b##iter; \
+ GEMV_LOADPAIR_ROW(iter, iter << 1) \
+ pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0); \
+ } \
}
-#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
- if (N > iter1) { \
- if (GEMV_IS_FLOAT) { \
+#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
+ if (GEMV_IS_FLOAT) { \
cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter2, &c##iter3); \
- } else { \
- cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(cc##iter1); \
+ } else { \
+ cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(cc##iter1); \
}
#else
-#define GEMV_INIT_ROW(iter, N) \
- if (N > iter) { \
+#define GEMV_INIT_ROW(iter, N) \
+ if (N > iter) { \
c##iter = pset1<ResPacket>(ResScalar(0)); \
- } else { \
- EIGEN_UNUSED_VARIABLE(c##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(c##iter); \
}
-#define GEMV_WORK_ROW(iter, N) \
- if (N > iter) { \
+#define GEMV_WORK_ROW(iter, N) \
+ if (N > iter) { \
c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \
}
-#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
- if (N > iter1) { \
+#define GEMV_PREDUX2(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
cc##iter1 = predux_real<ResScalar, ResPacket>(c##iter2, c##iter3); \
- } else { \
- EIGEN_UNUSED_VARIABLE(cc##iter1); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(cc##iter1); \
}
#endif
-#define GEMV_MULT(iter1, iter2, iter3, N) \
- if (N > iter1) { \
+#define GEMV_MULT(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \
cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \
}
-#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \
- if (N > iter1) { \
+#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
}
/** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */
-#define GEMV_PROCESS_ROW(N) \
- for (; i < n##N; i += N) { \
- GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
- Index j = 0; \
- for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
+#define GEMV_PROCESS_ROW(N) \
+ for (; i < n##N; i += N) { \
+ GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \
+ Index j = 0; \
+ for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
- GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
- } \
- GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
- for (; j < cols; ++j) { \
- RhsScalar a0 = rhs2(j); \
- GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
- } \
- GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
+ GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \
+ } \
+ GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \
+ for (; j < cols; ++j) { \
+ RhsScalar a0 = rhs2(j); \
+ GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \
+ } \
+ GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \
}
-template<typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_row(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- ResScalar* res, Index resIncr,
- ResScalar alpha)
-{
- typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+ Index resIncr, ResScalar alpha) {
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
- typedef typename Traits::LhsPacket LhsPacket;
- typedef typename Traits::RhsPacket RhsPacket;
- typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
- // The following copy tells the compiler that lhs's attributes are not modified outside this function
- // This helps GCC to generate proper code.
- LhsMapper lhs(alhs);
- typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
+ // This helps GCC to generate proper code.
+ LhsMapper lhs(alhs);
+ typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
- eigen_internal_assert(rhs.stride() == 1);
- conj_helper<LhsScalar, RhsScalar, false, false> cj;
- conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+ eigen_internal_assert(rhs.stride() == 1);
+ conj_helper<LhsScalar, RhsScalar, false, false> cj;
+ conj_helper<LhsPacket, RhsPacket, false, false> pcj;
- // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
- // processing 8 rows at once might be counter productive wrt cache.
+ // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+ // processing 8 rows at once might be counter productive wrt cache.
#ifndef GCC_ONE_VECTORPAIR_BUG
- const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
- const Index n4 = rows - 3;
- const Index n2 = rows - 1;
+ const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+ const Index n4 = rows - 3;
+ const Index n2 = rows - 1;
#endif
- // TODO: for padded aligned inputs, we could enable aligned reads
- enum {
- LhsAlignment = Unaligned,
- ResPacketSize = Traits::ResPacketSize,
- LhsPacketSize = Traits::LhsPacketSize,
- RhsPacketSize = Traits::RhsPacketSize,
- };
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum {
+ LhsAlignment = Unaligned,
+ ResPacketSize = Traits::ResPacketSize,
+ LhsPacketSize = Traits::LhsPacketSize,
+ RhsPacketSize = Traits::RhsPacketSize,
+ };
- Index i = 0;
+ Index i = 0;
#ifdef USE_GEMV_MMA
- __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
- GEMV_UNUSED_ROW(8, c)
+ __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
+ GEMV_UNUSED_ROW(8, c)
#else
- ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+ ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
#endif
#ifndef GCC_ONE_VECTORPAIR_BUG
- ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
- GEMV_PROCESS_ROW(8)
- GEMV_PROCESS_ROW(4)
- GEMV_PROCESS_ROW(2)
+ ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+ GEMV_PROCESS_ROW(8)
+ GEMV_PROCESS_ROW(4)
+ GEMV_PROCESS_ROW(2)
#endif
- for (; i < rows; ++i)
- {
- ResPacket d0 = pset1<ResPacket>(ResScalar(0));
- Index j = 0;
- for (; j + LhsPacketSize <= cols; j += LhsPacketSize)
- {
- RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
+ for (; i < rows; ++i) {
+ ResPacket d0 = pset1<ResPacket>(ResScalar(0));
+ Index j = 0;
+ for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {
+ RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
- d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
- }
- ResScalar dd0 = predux(d0);
- for (; j < cols; ++j)
- {
- dd0 += cj.pmul(lhs(i, j), rhs2(j));
- }
- res[i * resIncr] += alpha * dd0;
+ d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
}
+ ResScalar dd0 = predux(d0);
+ for (; j < cols; ++j) {
+ dd0 += cj.pmul(lhs(i, j), rhs2(j));
+ }
+ res[i * resIncr] += alpha * dd0;
+ }
}
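Stripped of packets, GEMV_PROCESS_ROW(8/4/2) plus the scalar remainder above implement the following shape: one accumulator per row in the group, all fed from a single rhs load per step. A hypothetical scalar model (row-major lhs, resIncr folded to 1):

#include <vector>

template <typename T>
void gemv_row_model(int rows, int cols, const std::vector<T>& lhs,  // row-major, rows x cols
                    const std::vector<T>& rhs, std::vector<T>& res, T alpha) {
  int i = 0;
  for (; i + 8 <= rows; i += 8) {  // GEMV_PROCESS_ROW(8); the 4- and 2-row groups look the same
    T c[8] = {};                   // GEMV_INIT_ROW
    for (int j = 0; j < cols; ++j)
      for (int r = 0; r < 8; ++r) c[r] += lhs[(i + r) * cols + j] * rhs[j];  // GEMV_WORK_ROW
    for (int r = 0; r < 8; ++r) res[i + r] += alpha * c[r];  // GEMV_STORE_ROW
  }
  for (; i < rows; ++i) {  // leftover rows, one at a time
    T d0(0);
    for (int j = 0; j < cols; ++j) d0 += lhs[i * cols + j] * rhs[j];
    res[i] += alpha * d0;
  }
}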
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
-{ \
- typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
-\
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- ResScalar* res, Index resIncr, \
- ResScalar alpha) { \
- gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper, \
+ ConjugateRhs, Version> { \
+ typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, ResScalar* res, Index resIncr, \
+ ResScalar alpha) { \
+ gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+ } \
+ };
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper, ConjugateRhs, Version> \
-{ \
- typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
-\
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- ResScalar* res, Index resIncr, \
- ResScalar alpha) { \
- gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper, \
+ ConjugateRhs, Version> { \
+ typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar; \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, ResScalar* res, Index resIncr, \
+ ResScalar alpha) { \
+ gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+ } \
+ };
EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float)
EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double)
@@ -2613,378 +2459,360 @@
EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double)
#ifdef USE_GEMV_MMA
-#define gemv_bf16_col gemvMMA_bfloat16_col
-#define gemv_bf16_row gemvMMA_bfloat16_row
+#define gemv_bf16_col gemvMMA_bfloat16_col
+#define gemv_bf16_row gemvMMA_bfloat16_row
#else
-#define gemv_bf16_col gemv_bfloat16_col
-#define gemv_bf16_row gemv_bfloat16_row
+#define gemv_bf16_col gemv_bfloat16_col
+#define gemv_bf16_row gemv_bfloat16_row
#endif
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16() \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
-{ \
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- bfloat16* res, Index resIncr, \
- bfloat16 alpha) { \
- gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16() \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper, \
+ ConjugateRhs, Version> { \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, bfloat16* res, Index resIncr, \
+ bfloat16 alpha) { \
+ gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+ } \
+ };
-#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16() \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper, ConjugateRhs, Version> \
-{ \
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- bfloat16* res, Index resIncr, \
- bfloat16 alpha) { \
- gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16() \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper, \
+ ConjugateRhs, Version> { \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, bfloat16* res, Index resIncr, \
+ bfloat16 alpha) { \
+ gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha); \
+ } \
+ };
EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()
EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()
-template<typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
-EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1)
-{
- if (GEMV_IS_COMPLEX_COMPLEX) {
- a0 = padd(a0, a1);
- b0 = padd(b0, b1);
- }
- return predux_complex<ResScalar, PResPacket>(a0, b0);
+template <typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1,
+ ResPacket& b1) {
+ if (GEMV_IS_COMPLEX_COMPLEX) {
+ a0 = padd(a0, a1);
+ b0 = padd(b0, b1);
+ }
+ return predux_complex<ResScalar, PResPacket>(a0, b0);
}
-#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \
- loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
+#define GEMV_LOADPACKET_ROW_COMPLEX(iter) loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
-#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \
- convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
+#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
- j = 0; \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \
+ j = 0; \
for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
- const RhsScalar& b1 = rhs2(j); \
- RhsScalar* b = const_cast<RhsScalar *>(&b1); \
- GEMV_UNROLL_ROW(which, N) \
+ const RhsScalar& b1 = rhs2(j); \
+ RhsScalar* b = const_cast<RhsScalar*>(&b1); \
+ GEMV_UNROLL_ROW(which, N) \
}
-#define GEMV_PROCESS_END_ROW_COMPLEX(N) \
- for (; j < cols; ++j) { \
- RhsScalar b0 = rhs2(j); \
+#define GEMV_PROCESS_END_ROW_COMPLEX(N) \
+ for (; j < cols; ++j) { \
+ RhsScalar b0 = rhs2(j); \
GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
- } \
+ } \
GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
#ifdef USE_GEMV_MMA
#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- __builtin_mma_xxsetaccz(&e0##iter); \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ __builtin_mma_xxsetaccz(&e0##iter); \
}
#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \
GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1));
-#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \
- if (GEMV_GETN_COMPLEX(N) > iter) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
- } else { \
- __vector_pair a##iter; \
- GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \
- gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
- } \
+#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \
+ if (GEMV_GETN_COMPLEX(N) > iter) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
+ } else { \
+ __vector_pair a##iter; \
+ GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \
+ gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+ ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter); \
+ } \
}
-#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \
- if (N > iter1) { \
- if (GEMV_IS_COMPLEX_FLOAT) { \
- cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter2, &e0##iter3); \
- } else { \
- cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
- } \
- } else { \
- EIGEN_UNUSED_VARIABLE(cc##iter1); \
+#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
+ if (GEMV_IS_COMPLEX_FLOAT) { \
+ cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>( \
+ &e0##iter2, &e0##iter3); \
+ } else { \
+ cc##iter1 = \
+ predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
+ } \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(cc##iter1); \
}
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \
GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N)
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \
- for (; i < n##N; i += N) { \
- GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \
+ for (; i < n##N; i += N) { \
+ GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \
GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \
- GEMV_PROCESS_END_ROW_COMPLEX(N); \
+ GEMV_PROCESS_END_ROW_COMPLEX(N); \
}
#endif
-#define GEMV_WORK_ROW_COMPLEX(iter, N) \
- if (N > iter) { \
- PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
- gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter); \
+#define GEMV_WORK_ROW_COMPLEX(iter, N) \
+ if (N > iter) { \
+ PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \
+ gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+ ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter); \
}
-#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
- if (N > iter1) { \
- cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \
- } else { \
- EIGEN_UNUSED_VARIABLE(cc##iter1); \
+#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
+ cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3, \
+ c1##iter2, c1##iter3); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(cc##iter1); \
}
-#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
- if (N > iter1) { \
+#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \
cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \
}
-#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \
- if (N > iter1) { \
+#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \
+ if (N > iter1) { \
storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
}
#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
- GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \
+ GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \
GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N)
-/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store results */
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
- for (; i < n##N; i += N) { \
- GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
+/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store
+ * results */
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
+ for (; i < n##N; i += N) { \
+ GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \
- GEMV_PROCESS_END_ROW_COMPLEX(N); \
+ GEMV_PROCESS_END_ROW_COMPLEX(N); \
}
#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
- if (GEMV_IS_COMPLEX_COMPLEX) { \
- c0##iter = padd(c0##iter, c1##iter); \
- } \
+ if (GEMV_IS_COMPLEX_COMPLEX) { \
+ c0##iter = padd(c0##iter, c1##iter); \
+ } \
dd0 = predux(c0##iter);
#if EIGEN_COMP_LLVM
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
- GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
-#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
- GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
-#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
- GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
#else
// gcc seems to be reading and writing registers unnecessarily to memory.
// Use the old way for complex double until it is fixed.
-#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \
- lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
+#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
#define GEMV_INIT_COMPLEX_OLD(iter, N) \
- EIGEN_UNUSED_VARIABLE(c0##iter); \
- if (N > iter) { \
+ EIGEN_UNUSED_VARIABLE(c0##iter); \
+ if (N > iter) { \
c1##iter = pset_zero<ResPacket>(); \
- } else { \
- EIGEN_UNUSED_VARIABLE(c1##iter); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(c1##iter); \
}
-#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
- if (N > iter) { \
+#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \
+ if (N > iter) { \
LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \
- c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \
+ c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \
}
#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \
- if (N > iter1) { \
- cc##iter1.scalar[0] = predux(c1##iter2); \
- cc##iter1.scalar[1] = predux(c1##iter3); \
- } else { \
- EIGEN_UNUSED_VARIABLE(cc##iter1); \
+ if (N > iter1) { \
+ cc##iter1.scalar[0] = predux(c1##iter2); \
+ cc##iter1.scalar[1] = predux(c1##iter3); \
+ } else { \
+ EIGEN_UNUSED_VARIABLE(cc##iter1); \
}
-#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
- GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
- j = 0; \
- for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
+ GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \
+ j = 0; \
+ for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
- GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
+ GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \
}
-#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
- for (; i < n##N; i += N) { \
- GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
+ for (; i < n##N; i += N) { \
+ GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \
- GEMV_PROCESS_END_ROW_COMPLEX(N) \
+ GEMV_PROCESS_END_ROW_COMPLEX(N) \
}
-#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
- dd0 = predux(c1##iter);
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) dd0 = predux(c1##iter);
#if (__GNUC__ > 10)
-#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
#else
-#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \
- (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
#endif
#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
- if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
+ if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
- } else { \
+ } else { \
GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
}
-#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \
if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
- GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
- } else { \
- GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
+ GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \
+ } else { \
+ GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \
}
#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
- if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
+ if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
- } else { \
+ } else { \
GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
}
#endif
#ifdef USE_GEMV_MMA
-#define GEMV_PROCESS_ROW_COMPLEX(N) \
- GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
#else
-#define GEMV_PROCESS_ROW_COMPLEX(N) \
- GEMV_PROCESS_ROW_COMPLEX_ONE(N)
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE(N)
#endif
-template<typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
-EIGEN_STRONG_INLINE void gemv_complex_row(
- Index rows, Index cols,
- const LhsMapper& alhs,
- const RhsMapper& rhs,
- ResScalar* res, Index resIncr,
- ResScalar alpha)
-{
- typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+ typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+ ResScalar* res, Index resIncr, ResScalar alpha) {
+ typedef gemv_traits<LhsScalar, RhsScalar> Traits;
- typedef typename Traits::LhsPacket LhsPacket;
- typedef typename Traits::RhsPacket RhsPacket;
- typedef typename Traits::ResPacket ResPacket;
+ typedef typename Traits::LhsPacket LhsPacket;
+ typedef typename Traits::RhsPacket RhsPacket;
+ typedef typename Traits::ResPacket ResPacket;
- typedef typename packet_traits<Scalar>::type ScalarPacket;
- typedef typename packet_traits<LhsScalar>::type PLhsPacket;
- typedef typename packet_traits<ResScalar>::type PResPacket;
- typedef gemv_traits<ResPacket, ResPacket> PTraits;
+ typedef typename packet_traits<Scalar>::type ScalarPacket;
+ typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+ typedef typename packet_traits<ResScalar>::type PResPacket;
+ typedef gemv_traits<ResPacket, ResPacket> PTraits;
- // The following copy tells the compiler that lhs's attributes are not modified outside this function
- // This helps GCC to generate proper code.
- LhsMapper lhs(alhs);
- typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+ // The following copy tells the compiler that lhs's attributes are not modified outside this function
+ // This helps GCC to generate proper code.
+ LhsMapper lhs(alhs);
+ typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
- eigen_internal_assert(rhs.stride() == 1);
- conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+ eigen_internal_assert(rhs.stride() == 1);
+ conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
#if !EIGEN_COMP_LLVM
- conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+ conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
#endif
- // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
- // processing 8 rows at once might be counter productive wrt cache.
+ // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+ // processing 8 rows at once might be counter productive wrt cache.
#ifndef GCC_ONE_VECTORPAIR_BUG
- const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
- const Index n4 = rows - 3;
- const Index n2 = rows - 1;
+ const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+ const Index n4 = rows - 3;
+ const Index n2 = rows - 1;
#endif
- // TODO: for padded aligned inputs, we could enable aligned reads
- enum {
- LhsAlignment = Unaligned,
- ResPacketSize = PTraits::ResPacketSize,
- LhsPacketSize = PTraits::LhsPacketSize,
- RhsPacketSize = PTraits::RhsPacketSize,
- };
+ // TODO: for padded aligned inputs, we could enable aligned reads
+ enum {
+ LhsAlignment = Unaligned,
+ ResPacketSize = PTraits::ResPacketSize,
+ LhsPacketSize = PTraits::LhsPacketSize,
+ RhsPacketSize = PTraits::RhsPacketSize,
+ };
- Index i = 0, j;
- PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
- ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+ Index i = 0, j;
+ PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+ ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
#ifdef USE_GEMV_MMA
- __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
- GEMV_UNUSED_ROW(8, e0)
- GEMV_UNUSED_EXTRA(1, c0)
- GEMV_UNUSED_EXTRA(1, c1)
+ __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+ GEMV_UNUSED_ROW(8, e0)
+ GEMV_UNUSED_EXTRA(1, c0)
+ GEMV_UNUSED_EXTRA(1, c1)
#endif
- ResScalar dd0;
+ ResScalar dd0;
#ifndef GCC_ONE_VECTORPAIR_BUG
- ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+ ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
#ifdef USE_GEMV_MMA
- if (!GEMV_IS_COMPLEX_COMPLEX)
+ if (!GEMV_IS_COMPLEX_COMPLEX)
#endif
- {
- GEMV_PROCESS_ROW_COMPLEX(8)
- }
- GEMV_PROCESS_ROW_COMPLEX(4)
- GEMV_PROCESS_ROW_COMPLEX(2)
+ {
+ GEMV_PROCESS_ROW_COMPLEX(8)
+ }
+ GEMV_PROCESS_ROW_COMPLEX(4)
+ GEMV_PROCESS_ROW_COMPLEX(2)
#endif
- for (; i < rows; ++i)
- {
- GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
- GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
- for (; j < cols; ++j)
- {
- dd0 += cj.pmul(lhs(i, j), rhs2(j));
- }
- res[i * resIncr] += alpha * dd0;
+ for (; i < rows; ++i) {
+ GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
+ GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
+ for (; j < cols; ++j) {
+ dd0 += cj.pmul(lhs(i, j), rhs2(j));
}
+ res[i * resIncr] += alpha * dd0;
+ }
}
-#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
-{ \
- typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
-\
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- ResScalar* res, Index resIncr, \
- ResScalar alpha) { \
- gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, \
+ ConjugateRhs, Version> { \
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, ResScalar* res, Index resIncr, \
+ ResScalar alpha) { \
+ gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, \
+ RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, \
+ res, resIncr, alpha); \
+ } \
+ };
-#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \
-template<typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
-struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs, Version> \
-{ \
- typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
-\
- EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \
- Index rows, Index cols, \
- const LhsMapper& lhs, \
- const RhsMapper& rhs, \
- ResScalar* res, Index resIncr, \
- ResScalar alpha) { \
- gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha); \
- } \
-};
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \
+ template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+ struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, \
+ ConjugateRhs, Version> { \
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs, \
+ const RhsMapper& rhs, ResScalar* res, Index resIncr, \
+ ResScalar alpha) { \
+ gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar, \
+ RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs, \
+ res, resIncr, alpha); \
+ } \
+ };
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, double)
EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, std::complex<double>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
-EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, double)
EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, std::complex<double>)
-#endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
-
+#endif  // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
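
Note: the EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_{COL,ROW} macros above stamp out
explicit specializations of general_matrix_vector_product for every real/complex
scalar pairing, each forwarding to a shared gemv_complex_col / gemv_complex_row
kernel; the sizeof comparisons encode at compile time which operands are complex.
A minimal sketch of this macro-stamped dispatch pattern (all names illustrative,
not the Eigen API):

    #include <iostream>

    template <typename Lhs, typename Rhs>
    struct product_impl {  // generic fallback
      static void run() { std::cout << "generic\n"; }
    };

    // One macro stamps out a family of specializations that all forward
    // to a shared kernel, parameterized by the scalar combination.
    #define SPECIALIZE_PRODUCT(Lhs, Rhs, Kernel)          \
      template <>                                         \
      struct product_impl<Lhs, Rhs> {                     \
        static void run() { std::cout << Kernel "\n"; }   \
      };

    SPECIALIZE_PRODUCT(float, float, "real*real kernel")
    SPECIALIZE_PRODUCT(float, double, "mixed kernel")
    #undef SPECIALIZE_PRODUCT

    int main() {
      product_impl<float, float>::run();   // real*real kernel
      product_impl<float, double>::run();  // mixed kernel
      product_impl<int, int>::run();       // generic
    }
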
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index b945b33..414f05c 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -27,127 +27,132 @@
// NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
-typedef __vector float Packet4f;
-typedef __vector int Packet4i;
-typedef __vector unsigned int Packet4ui;
-typedef __vector __bool int Packet4bi;
-typedef __vector short int Packet8s;
-typedef __vector unsigned short int Packet8us;
-typedef __vector __bool short Packet8bi;
-typedef __vector signed char Packet16c;
-typedef __vector unsigned char Packet16uc;
-typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf;
+typedef __vector float Packet4f;
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8s;
+typedef __vector unsigned short int Packet8us;
+typedef __vector __bool short Packet8bi;
+typedef __vector signed char Packet16c;
+typedef __vector unsigned char Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
-#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
-#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = vec_splat_s32(X)
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
-#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
- Packet4ui p4ui_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
-#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
- Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
+#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
-#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
+#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
-#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
#define DST_CHAN 1
#define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
-#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
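
Note: the EIGEN_DECLARE_CONST_* family relies on ## token pasting to declare a
splatted vector constant, named after its argument, in the enclosing scope. A
scalar sketch of the pattern, with a plain array standing in for a vector
register (names hypothetical):

    #include <cstdio>

    typedef float Packet4f_model[4];  // stand-in for a 4-lane register

    // Token pasting builds the identifier p4f_<NAME> from the macro argument.
    #define DECLARE_CONST_PACKET4F(NAME, X) \
      const Packet4f_model p4f_##NAME = {X, X, X, X}

    DECLARE_CONST_PACKET4F(ONE, 1.0f);   // declares p4f_ONE
    DECLARE_CONST_PACKET4F(HALF, 0.5f);  // declares p4f_HALF

    int main() {
      std::printf("%g %g\n", p4f_ONE[0], p4f_HALF[3]);  // prints: 1 0.5
    }
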
// These constants are endian-agnostic
-static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16); //{ -16, -16, -16, -16}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
-static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
-static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static Packet4f p4f_MZERO =
+ (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
#ifndef __VSX__
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
#endif
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
-static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
-static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
+static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
-static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15};
-static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
- 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
#ifndef _ARCH_PWR9
-static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
+static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
#endif
#ifdef _BIG_ENDIAN
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
#endif
-static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 };
-static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 };
+static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
+static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
-static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
+static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
-static Packet16uc p16uc_MERGEE16 = { 0,1, 16,17, 4,5, 20,21, 8,9, 24,25, 12,13, 28,29 };
-static Packet16uc p16uc_MERGEO16 = { 2,3, 18,19, 6,7, 22,23, 10,11, 26,27, 14,15, 30,31 };
+static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
#ifdef _BIG_ENDIAN
-static Packet16uc p16uc_MERGEH16 = { 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 };
+static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
#else
-static Packet16uc p16uc_MERGEL16 = { 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 };
+static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
#endif
// Handle endianness properly while loading constants
// Define global static constants:
#ifdef _BIG_ENDIAN
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+static Packet16uc p16uc_PSET32_WODD =
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
#else
static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
-#endif // _BIG_ENDIAN
+static Packet16uc p16uc_PSET32_WODD =
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN =
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
+ 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif  // _BIG_ENDIAN
-static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI =
+ p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO =
+ p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV =
+ vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
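
Note: the p16uc_* constants above are byte-index tables for vec_perm, which
builds each result byte from the 32-byte concatenation of its two inputs; the
_BIG_ENDIAN branches exist because lane-to-byte numbering flips between
endiannesses. A portable scalar model of the permute (for illustration only,
not the intrinsic itself):

    #include <cstdint>

    // Indices 0..15 select from a, 16..31 from b. With idx = p16uc_REVERSE32
    // = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}, the four 32-bit words of
    // a come out in reverse order.
    void perm_model(const uint8_t a[16], const uint8_t b[16],
                    const uint8_t idx[16], uint8_t out[16]) {
      for (int i = 0; i < 16; ++i) {
        uint8_t k = idx[i] & 31;
        out[i] = (k < 16) ? a[k] : b[k - 16];
      }
    }
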
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
- #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
- #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#define EIGEN_PPC_PREFETCH(ADDR) asm(" dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
#endif
#if EIGEN_COMP_LLVM
@@ -256,14 +261,14 @@
AlignedOnScalar = 1,
size = 4,
- HasAdd = 1,
- HasSub = 1,
+ HasAdd = 1,
+ HasSub = 1,
HasShift = 1,
- HasMul = 1,
-#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
- HasDiv = 1,
+ HasMul = 1,
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+ HasDiv = 1,
#else
- HasDiv = 0,
+ HasDiv = 0,
#endif
HasBlend = 1,
HasCmp = 1
@@ -279,10 +284,10 @@
AlignedOnScalar = 1,
size = 8,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 0,
HasBlend = 1,
HasCmp = 1
};
@@ -297,10 +302,10 @@
AlignedOnScalar = 1,
size = 8,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 0,
HasBlend = 1,
HasCmp = 1
};
@@ -315,10 +320,10 @@
AlignedOnScalar = 1,
size = 16,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 0,
HasBlend = 1,
HasCmp = 1
};
@@ -333,88 +338,125 @@
AlignedOnScalar = 1,
size = 16,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 0,
HasBlend = 1,
HasCmp = 1
};
};
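
Note: these packet_traits enums advertise which primitives have a vectorized
path; e.g. integer HasDiv is enabled only on Power10 with a compiler that
exposes vec_div for integers. A hedged sketch of how such a flag can select an
implementation at compile time (hypothetical names, not the actual Eigen call
site):

    #include <iostream>

    template <typename Scalar>
    struct traits_model { enum { HasDiv = 0 }; };  // default: no vector divide
    template <>
    struct traits_model<float> { enum { HasDiv = 1 }; };

    template <bool HasDiv>
    struct div_impl {  // scalar fallback
      static const char* name() { return "scalar loop"; }
    };
    template <>
    struct div_impl<true> {  // vectorized path
      static const char* name() { return "vec_div"; }
    };

    int main() {
      std::cout << div_impl<traits_model<float>::HasDiv != 0>::name() << "\n";  // vec_div
      std::cout << div_impl<traits_model<int>::HasDiv != 0>::name() << "\n";    // scalar loop
    }
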
-template<> struct unpacket_traits<Packet4f>
-{
- typedef float type;
- typedef Packet4f half;
- typedef Packet4i integer_packet;
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet4i>
-{
- typedef int type;
- typedef Packet4i half;
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4i> {
+ typedef int type;
+ typedef Packet4i half;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet8s>
-{
+template <>
+struct unpacket_traits<Packet8s> {
typedef short int type;
- typedef Packet8s half;
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ typedef Packet8s half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet8us>
-{
+template <>
+struct unpacket_traits<Packet8us> {
typedef unsigned short int type;
- typedef Packet8us half;
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ typedef Packet8us half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet16c>
-{
+template <>
+struct unpacket_traits<Packet16c> {
typedef signed char type;
- typedef Packet16c half;
- enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ typedef Packet16c half;
+ enum {
+ size = 16,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet16uc>
-{
+template <>
+struct unpacket_traits<Packet16uc> {
typedef unsigned char type;
- typedef Packet16uc half;
- enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ typedef Packet16uc half;
+ enum {
+ size = 16,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet8bf>
-{
+template <>
+struct unpacket_traits<Packet8bf> {
typedef bfloat16 type;
- typedef Packet8bf half;
- enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ typedef Packet8bf half;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet16c& v) {
union {
- Packet16c v;
+ Packet16c v;
signed char n[16];
} vt;
vt.v = v;
- for (int i=0; i< 16; i++)
- s << vt.n[i] << ", ";
+ for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet16uc& v) {
union {
- Packet16uc v;
+ Packet16uc v;
unsigned char n[16];
} vt;
vt.v = v;
- for (int i=0; i< 16; i++)
- s << vt.n[i] << ", ";
+ for (int i = 0; i < 16; i++) s << vt.n[i] << ", ";
return s;
}
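
Note: these operator<< overloads reinterpret a vector register as scalar lanes
through a union, a type-punning idiom that GCC and Clang (the compilers
targeted here) explicitly support. A portable model:

    #include <iostream>

    struct Vec4fModel { float lanes[4]; };  // stand-in for a vector register

    inline std::ostream& operator<<(std::ostream& s, const Vec4fModel& v) {
      union {
        Vec4fModel v;
        float n[4];
      } vt;
      vt.v = v;  // write through one member, read lanes through the other
      for (int i = 0; i < 4; i++) s << vt.n[i] << (i < 3 ? ", " : "");
      return s;
    }

    int main() {
      Vec4fModel x = {{1, 2, 3, 4}};
      std::cout << x << "\n";  // prints: 1, 2, 3, 4
    }
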
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
union {
- Packet4f v;
+ Packet4f v;
float n[4];
} vt;
vt.v = v;
@@ -422,10 +464,9 @@
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
union {
- Packet4i v;
+ Packet4i v;
int n[4];
} vt;
vt.v = v;
@@ -433,10 +474,9 @@
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
union {
- Packet4ui v;
+ Packet4ui v;
unsigned int n[4];
} vt;
vt.v = v;
@@ -445,8 +485,7 @@
}
template <typename Packet>
-EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
-{
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
@@ -459,52 +498,51 @@
}
// Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
return pload_common<Packet4f>(from);
}
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
return pload_common<Packet4i>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
return pload_common<Packet8s>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
return pload_common<Packet8us>(from);
}
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
return pload_common<Packet16c>(from);
}
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
return pload_common<Packet16uc>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
template <typename Packet>
-EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet)* from)
-{
+EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(from);
EIGEN_DEBUG_ALIGNED_LOAD
// Ignore partial input memory initialized
#if !EIGEN_COMP_LLVM
- #pragma GCC diagnostic push
- #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
#ifdef EIGEN_VECTORIZE_VSX
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
@@ -512,18 +550,18 @@
return vec_ld(0, from);
#endif
#if !EIGEN_COMP_LLVM
- #pragma GCC diagnostic pop
+#pragma GCC diagnostic pop
#endif
}
-template<> EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
template <typename Packet>
-EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
-{
+EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+ const Index offset) {
// some versions of GCC throw "unused-but-set-parameter".
// ignoring these warnings for now.
const Index packet_size = unpacket_traits<Packet>::size;
@@ -546,13 +584,13 @@
#else
if (n) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
- unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
- unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
Index n2 = n * size;
if (16 <= n2) {
pstoreu(load2, ploadu<Packet16uc>(from2));
} else {
- memcpy((void *)load2, (void *)from2, n2);
+ memcpy((void*)load2, (void*)from2, n2);
}
return pload_ignore<Packet>(load);
} else {
@@ -561,43 +599,44 @@
#endif
}
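
Note: pload_partial_common loads fewer than a full packet by staging the n
valid elements in an aligned stack buffer and then issuing one full-width
load; the lanes outside [offset, offset + n) stay uninitialized, which is why
pload_ignore silences -Wmaybe-uninitialized above. A scalar sketch of the
staging strategy (the 16-byte fast path is omitted):

    #include <cstring>

    struct Packet4fModel { float lanes[4]; };

    Packet4fModel pload_partial_model(const float* from, int n, int offset) {
      alignas(16) float load[4];                    // EIGEN_ALIGN16 equivalent
      std::memcpy(load + offset, from, n * sizeof(float));
      Packet4fModel p;                              // full-width "aligned load";
      std::memcpy(p.lanes, load, sizeof(p.lanes));  // tail lanes are indeterminate,
      return p;                                     // mirroring the real code
    }
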
-template<> EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
return pload_partial_common<Packet4f>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
return pload_partial_common<Packet4i>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
return pload_partial_common<Packet8s>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
+ const Index offset) {
return pload_partial_common<Packet8us>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
return pload_partial_common<Packet16c>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
return pload_partial_common<Packet16uc>(from, n, offset);
}
template <typename Packet>
-EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
// some versions of GCC throw "unused-but-set-parameter" (float *to).
// ignoring these warnings for now.
EIGEN_UNUSED_VARIABLE(to);
@@ -609,43 +648,44 @@
#endif
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
pstore_common<Packet4f>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
pstore_common<Packet4i>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
pstore_common<Packet8s>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
pstore_common<Packet8us>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}
-template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
pstore_common<Packet16c>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
pstore_common<Packet16uc>(to, from);
}
-template<typename Packet> EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+ const Index offset) {
// some versions of GCC throw "unused-but-set-parameter" (float *to).
// ignoring these warnings for now.
const Index packet_size = unpacket_traits<Packet>::size;
@@ -669,110 +709,119 @@
if (n) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
pstore(store, from);
- unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
- unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
Index n2 = n * size;
if (16 <= n2) {
pstore(to2, ploadu<Packet16uc>(store2));
} else {
- memcpy((void *)to2, (void *)store2, n2);
+ memcpy((void*)to2, (void*)store2, n2);
}
}
#endif
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
pstore_partial_common<Packet4f>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
pstore_partial_common<Packet4i>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
+ const Index offset) {
pstore_partial_common<Packet8s>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+ const Index n, const Index offset) {
pstore_partial_common<Packet8us>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+ const Index offset) {
pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+ const Index offset) {
pstore_partial_common<Packet16c>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+ const Index offset) {
pstore_partial_common<Packet16uc>(to, from, n, offset);
}
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
Packet v = {from, from, from, from};
return v;
}
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
Packet v = {from, from, from, from, from, from, from, from};
return v;
}
-template<typename Packet>
-EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
return v;
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
return pset1_size4<Packet4f>(from);
}
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
return pset1_size4<Packet4i>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
return pset1_size8<Packet8s>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
return pset1_size8<Packet8us>(from);
}
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
return pset1_size16<Packet16c>(from);
}
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
return pset1_size16<Packet16uc>(from);
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
}
-template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
}
-template<typename Packet> EIGEN_STRONG_INLINE void
-pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a,
- Packet& a0, Packet& a1, Packet& a2, Packet& a3)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
+ Packet& a3) {
a3 = pload<Packet>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
@@ -780,21 +829,18 @@
a3 = vec_splat(a3, 3);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
}
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride, const Index n = unpacket_traits<Packet>::size)
-{
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
+ const Index n = unpacket_traits<Packet>::size) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
if (stride == 1) {
@@ -806,85 +852,97 @@
} else {
LOAD_STORE_UNROLL_16
for (Index i = 0; i < n; i++) {
- a[i] = from[i*stride];
+ a[i] = from[i * stride];
}
// Leave rest of the array uninitialized
return pload_ignore<Packet>(a);
}
}
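
Note: AltiVec/VSX has no gather instruction, so pgather_common goes through
memory: stride 1 defers to a (partial) load, while any other stride runs a
scalar loop that fills an aligned buffer, which is then loaded whole. Scalar
sketch (tail lanes zeroed here for clarity; the real code leaves them
uninitialized):

    struct Packet4iModel { int lanes[4]; };

    Packet4iModel pgather_model(const int* from, long stride, int n = 4) {
      alignas(16) int a[4] = {0, 0, 0, 0};
      for (int i = 0; i < n; ++i) a[i] = from[i * stride];  // strided scalar reads
      Packet4iModel p;
      for (int i = 0; i < 4; ++i) p.lanes[i] = a[i];        // "full-width load"
      return p;
    }
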
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
return pgather_common<Packet4f>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
return pgather_common<Packet4i>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
return pgather_common<Packet8s>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
+ Index stride) {
return pgather_common<Packet8us>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
return pgather_common<Packet8bf>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
return pgather_common<Packet16c>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
+ Index stride) {
return pgather_common<Packet16uc>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
+ const Index n) {
return pgather_common<Packet4f>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
+ const Index n) {
return pgather_common<Packet4i>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
+ const Index n) {
return pgather_common<Packet8s>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
+pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
return pgather_common<Packet8us>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
+ const Index n) {
return pgather_common<Packet8bf>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
+ Index stride, const Index n) {
return pgather_common<Packet16c>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
+ Index stride,
+ const Index n) {
return pgather_common<Packet16uc>(from, stride, n);
}
-template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride, const Index n = unpacket_traits<Packet>::size)
-{
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
+ Index stride,
+ const Index n = unpacket_traits<Packet>::size) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
if (stride == 1) {
@@ -897,129 +955,203 @@
pstore<__UNPACK_TYPE__(Packet)>(a, from);
LOAD_STORE_UNROLL_16
for (Index i = 0; i < n; i++) {
- to[i*stride] = a[i];
+ to[i * stride] = a[i];
}
}
}
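
Note: pscatter_common is the mirror image: the packet is first stored to an
aligned staging buffer, then a scalar loop writes a[i] out to to[i * stride].
Sketch under the same simplifications:

    struct Packet4fM { float lanes[4]; };

    void pscatter_model(float* to, const Packet4fM& from, long stride, int n = 4) {
      alignas(16) float a[4];
      for (int i = 0; i < 4; ++i) a[i] = from.lanes[i];   // "pstore" to the buffer
      for (int i = 0; i < n; ++i) to[i * stride] = a[i];  // strided scalar writes
    }
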
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
pscatter_common<Packet4f>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
pscatter_common<Packet4i>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
+ Index stride) {
pscatter_common<Packet8s>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
+ const Packet8us& from,
+ Index stride) {
pscatter_common<Packet8us>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+ Index stride) {
pscatter_common<Packet8bf>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
+ Index stride) {
pscatter_common<Packet16c>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
+ const Packet16uc& from, Index stride) {
pscatter_common<Packet16uc>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
+ Index stride, const Index n) {
pscatter_common<Packet4f>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
+ const Index n) {
pscatter_common<Packet4i>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
+ Index stride, const Index n) {
pscatter_common<Packet8s>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
+ const Packet8us& from,
+ Index stride,
+ const Index n) {
pscatter_common<Packet8us>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+ Index stride, const Index n) {
pscatter_common<Packet8bf>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
+ const Packet16c& from, Index stride,
+ const Index n) {
pscatter_common<Packet16c>(to, from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
+ const Packet16uc& from,
+ Index stride, const Index n) {
pscatter_common<Packet16uc>(to, from, stride, n);
}
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ return pset1<Packet4f>(a) + p4f_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+ return pset1<Packet4i>(a) + p4i_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
+ return pset1<Packet8s>(a) + p8s_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
+ return pset1<Packet8us>(a) + p8us_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
+ return pset1<Packet16c>(a) + p16c_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
+ return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
+}
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return a + b;
+}
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return a - b;
+}
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
return vec_xor(a, p4f_MZERO);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
@@ -1027,19 +1159,42 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; }
-template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_madd(a, b, p4f_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return a * b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vec_mul(a, b);
+}
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifndef __VSX__ // VSX actually provides a div instruction
Packet4f t, y_0, y_1;
@@ -1047,7 +1202,7 @@
y_0 = vec_re(b);
// Do one Newton-Raphson iteration to get the needed accuracy
- t = vec_nmsub(y_0, b, p4f_ONE);
+ t = vec_nmsub(y_0, b, p4f_ONE);
y_1 = vec_madd(y_0, t, y_0);
return vec_madd(a, y_1, p4f_MZERO);
@@ -1056,9 +1211,9 @@
#endif
}
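
Note: without VSX there is no vector float divide, so pdiv<Packet4f> refines
the coarse reciprocal estimate produced by vec_re with one Newton-Raphson
step, y1 = y0 * (2 - b * y0), expressed via fused multiply-adds; the quotient
is then a * y1. Scalar model of the refinement:

    // One Newton-Raphson step roughly doubles the number of correct bits
    // in the vec_re estimate y0 of 1/b.
    float pdiv_model(float a, float b, float y0 /* coarse 1/b estimate */) {
      float t = 1.0f - y0 * b;  // vec_nmsub(y_0, b, p4f_ONE) = 1 - y0*b
      float y1 = y0 * t + y0;   // vec_madd(y_0, t, y_0) = y0*(2 - b*y0)
      return a * y1;            // vec_madd(a, y_1, p4f_MZERO)
    }
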
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11,0,0))
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
return vec_div(a, b);
#else
EIGEN_UNUSED_VARIABLE(a);
@@ -1069,154 +1224,302 @@
}
// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; }
-template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ return a * b + c;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+ return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+ return vec_madd(a, b, c);
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_msub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmsub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_nmadd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vec_nmadd(a, b, c);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- #ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet4f ret;
- __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
return ret;
- #else
+#else
return vec_min(a, b);
- #endif
+#endif
}
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vec_min(a, b);
+}
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
- #ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet4f ret;
- __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
return ret;
- #else
+#else
return vec_max(a, b);
- #endif
+#endif
}
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vec_max(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+ return reinterpret_cast<Packet4f>(vec_cmple(a, b));
+}
// Works around a bug with vec_cmplt on older versions
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+ return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
- Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
- return vec_nor(c,c);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+ return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+ Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
+ return vec_nor(c, c);
}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+ return reinterpret_cast<Packet4i>(vec_cmple(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+ return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+ return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
+ return reinterpret_cast<Packet8s>(vec_cmple(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
+ return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
+ return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
+ return reinterpret_cast<Packet8us>(vec_cmple(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
+ return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
+ return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
+ return reinterpret_cast<Packet16c>(vec_cmple(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
+ return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
+ return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
+ return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
+ return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
+ return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
+}
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return pand<Packet8us>(a, b);
}
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return por<Packet8us>(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
return pxor<Packet8us>(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_andc(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_andc(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
}
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
- Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
- Packet4f res;
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
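+  // Round half away from zero: OR the sign of a onto the largest float below
+  // 0.5 (p4ui_PREV0DOT5), add it to a, then truncate toward zero below; using
+  // the value just below 0.5 keeps inputs slightly under 0.5 from rounding up.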
+ Packet4f t = vec_add(
+ reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+ Packet4f res;
#ifdef EIGEN_VECTORIZE_VSX
- __asm__("xvrspiz %x0, %x1\n\t"
- : "=&wa" (res)
- : "wa" (t));
+ __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
#else
- __asm__("vrfiz %0, %1\n\t"
- : "=v" (res)
- : "v" (t));
+ __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
#endif
- return res;
+ return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ return vec_floor(a);
+}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
-{
- Packet4f res;
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+ Packet4f res;
- __asm__("xvrspic %x0, %x1\n\t"
- : "=&wa" (res)
- : "wa" (a));
+ __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
- return res;
+ return res;
}
#endif
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
EIGEN_DEBUG_ALIGNED_LOAD
#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
EIGEN_DEBUG_UNALIGNED_LOAD
@@ -1224,45 +1527,46 @@
#else
Packet16uc MSQ, LSQ;
Packet16uc mask;
- MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
- mask = vec_lvsl(0, from); // create the permute mask
- //TODO: Add static_cast here
- return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
+ MSQ = vec_ld(0, (unsigned char*)from); // most significant quadword
+ LSQ = vec_ld(15, (unsigned char*)from); // least significant quadword
+ mask = vec_lvsl(0, from); // create the permute mask
+ // TODO: Add static_cast here
+ return (Packet)vec_perm(MSQ, LSQ, mask); // align the data
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
return ploadu_common<Packet4f>(from);
}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
return ploadu_common<Packet4i>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
return ploadu_common<Packet8s>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
return ploadu_common<Packet8us>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
return ploadu_common<Packet16c>(from);
}
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
return ploadu_common<Packet16uc>(from);
}
-template<typename Packet> EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet)* from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+ const Index offset) {
const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
@@ -1283,13 +1587,13 @@
#else
if (n) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
- unsigned char* load2 = reinterpret_cast<unsigned char *>(load + offset);
- unsigned char* from2 = reinterpret_cast<unsigned char *>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+ unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+ unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
Index n2 = n * size;
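      // Since n + offset <= packet_size, n2 reaches 16 bytes only for a full
      // packet; shorter tails are copied bytewise.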
if (16 <= n2) {
pstoreu(load2, ploadu<Packet16uc>(from2));
} else {
- memcpy((void *)load2, (void *)from2, n2);
+ memcpy((void*)load2, (void*)from2, n2);
}
return pload_ignore<Packet>(load);
} else {
@@ -1298,106 +1602,122 @@
#endif
}
-template<> EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet4f>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet4i>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet8s>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
+ const Index offset) {
return ploadu_partial_common<Packet8us>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet16c>(from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
+ const Index offset) {
return ploadu_partial_common<Packet16uc>(from, n, offset);
}
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
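+  // Load (aligned when possible), then duplicate: vec_mergeh(p, p) interleaves
+  // the head of p with itself, repeating each of the first half of the
+  // elements.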
Packet p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from);
- else p = ploadu<Packet>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet>(from);
+ else
+ p = ploadu<Packet>(from);
return vec_mergeh(p, p);
}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
return ploaddup_common<Packet4f>(from);
}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
return ploaddup_common<Packet4i>(from);
}
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
Packet8s p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
- else p = ploadu<Packet8s>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet8s>(from);
+ else
+ p = ploadu<Packet8s>(from);
return vec_mergeh(p, p);
}
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
Packet8us p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
- else p = ploadu<Packet8us>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet8us>(from);
+ else
+ p = ploadu<Packet8us>(from);
return vec_mergeh(p, p);
}
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
Packet8s p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from);
- else p = ploadu<Packet8s>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet8s>(from);
+ else
+ p = ploadu<Packet8s>(from);
return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
Packet8us p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from);
- else p = ploadu<Packet8us>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet8us>(from);
+ else
+ p = ploadu<Packet8us>(from);
return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
}
-template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
Packet16c p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from);
- else p = ploadu<Packet16c>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet16c>(from);
+ else
+ p = ploadu<Packet16c>(from);
return vec_mergeh(p, p);
}
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
Packet16uc p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from);
- else p = ploadu<Packet16uc>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet16uc>(from);
+ else
+ p = ploadu<Packet16uc>(from);
return vec_mergeh(p, p);
}
-template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
EIGEN_DEBUG_UNALIGNED_STORE
#if defined(EIGEN_VECTORIZE_VSX) || !defined(_BIG_ENDIAN)
vec_xst(from, 0, to);
@@ -1407,48 +1727,49 @@
Packet16uc MSQ, LSQ, edges;
Packet16uc edgeAlign, align;
- MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword
- LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword
- edgeAlign = vec_lvsl(0, to); // permute map to extract edges
- edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges
- align = vec_lvsr( 0, to ); // permute map to misalign data
- MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ)
- LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
- vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
- vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
+ MSQ = vec_ld(0, (unsigned char*)to); // most significant quadword
+ LSQ = vec_ld(15, (unsigned char*)to); // least significant quadword
+ edgeAlign = vec_lvsl(0, to); // permute map to extract edges
+ edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges
+ align = vec_lvsr(0, to); // permute map to misalign data
+ MSQ = vec_perm(edges, (Packet16uc)from, align); // misalign the data (MSQ)
+ LSQ = vec_perm((Packet16uc)from, edges, align); // misalign the data (LSQ)
+ vec_st(LSQ, 15, (unsigned char*)to); // Store the LSQ part first
+ vec_st(MSQ, 0, (unsigned char*)to); // Store the MSQ part second
#endif
}
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
pstoreu_common<Packet4f>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
pstoreu_common<Packet4i>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
pstoreu_common<Packet8s>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
pstoreu_common<Packet8us>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
pstoreu_common<Packet16c>(to, from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
pstoreu_common<Packet16uc>(to, from);
}
-template<typename Packet> EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet)* to, const Packet& from, const Index n, const Index offset)
-{
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+ const Index offset) {
const Index packet_size = unpacket_traits<Packet>::size;
eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
const Index size = sizeof(__UNPACK_TYPE__(Packet));
@@ -1469,181 +1790,237 @@
if (n) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
pstore(store, from);
- unsigned char* store2 = reinterpret_cast<unsigned char *>(store + offset);
- unsigned char* to2 = reinterpret_cast<unsigned char *>(to);
+ unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+ unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
Index n2 = n * size;
if (16 <= n2) {
pstoreu(to2, ploadu<Packet16uc>(store2));
} else {
- memcpy((void *)to2, (void *)store2, n2);
+ memcpy((void*)to2, (void*)store2, n2);
}
}
#endif
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
pstoreu_partial_common<Packet4f>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
pstoreu_partial_common<Packet4i>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
+ const Index offset) {
pstoreu_partial_common<Packet8s>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+ const Index n, const Index offset) {
pstoreu_partial_common<Packet8us>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+ const Index offset) {
pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+ const Index offset) {
pstoreu_partial_common<Packet16c>(to, from, n, offset);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+ const Index offset) {
pstoreu_partial_common<Packet16uc>(to, from, n, offset);
}
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ EIGEN_PPC_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+ EIGEN_PPC_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ EIGEN_ALIGN16 float x;
+ vec_ste(a, 0, &x);
+ return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+ EIGEN_ALIGN16 int x;
+ vec_ste(a, 0, &x);
+ return x;
+}
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
vec_ste(a, 0, &x);
return x;
}
-template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+template <>
+EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
return pfirst_common<Packet8s>(a);
}
-template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+template <>
+EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
return pfirst_common<Packet8us>(a);
}
-template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
return pfirst_common<Packet16c>(a);
}
-template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
return pfirst_common<Packet16uc>(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
- return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+ return reinterpret_cast<Packet4f>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+ return reinterpret_cast<Packet4i>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
- return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+ return reinterpret_cast<Packet8s>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
- return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+ return reinterpret_cast<Packet8us>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
}
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
#ifdef _ARCH_PWR9
return vec_revb(a);
#else
return vec_perm(a, a, p16uc_REVERSE8);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
#ifdef _ARCH_PWR9
return vec_revb(a);
#else
return vec_perm(a, a, p16uc_REVERSE8);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
return preverse<Packet8us>(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
- EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF);
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+ EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
return pand<Packet8us>(p8us_abs_mask, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) { return vec_sra(a.m_val, vec_splat_u16(15)); }
-template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31))); }
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
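+  // Arithmetic shift right by (lane width - 1) smears the sign bit across the
+  // lane: all-ones for negative values, all-zeros otherwise.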
+ return vec_sra(a.m_val, vec_splat_u16(15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+ return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
+}
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a)
-{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a)
-{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a)
-{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+ return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+ return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+ return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
return reinterpret_cast<Packet4f>(r);
}
-template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
return reinterpret_cast<Packet4f>(r);
}
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
return vec_sr(a, p4ui_mask);
}
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
return vec_sl(a, p4ui_mask);
}
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
return vec_sl(a, p8us_mask);
}
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
return vec_sr(a, p8us_mask);
}
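+// A bfloat16 is the high half of an IEEE float32, so widening needs no
+// arithmetic: even lanes are shifted left into a float's top 16 bits, and odd
+// lanes only need their low 16 bits masked off.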
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
}
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
- return pand<Packet4f>(
- reinterpret_cast<Packet4f>(bf.m_val),
- reinterpret_cast<Packet4f>(p4ui_high_mask)
- );
+ return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
}
EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
@@ -1660,20 +2037,20 @@
return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
}
-//#define SUPPORT_BF16_SUBNORMALS
+// #define SUPPORT_BF16_SUBNORMALS
#ifndef __VEC_CLASS_FP_NAN
-#define __VEC_CLASS_FP_NAN (1<<6)
+#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
-#define __VEC_CLASS_FP_SUBNORMAL_P (1<<1)
-#define __VEC_CLASS_FP_SUBNORMAL_N (1<<0)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
#endif
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
#ifdef _ARCH_PWR10
return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
#else
@@ -1681,7 +2058,7 @@
Packet4ui lsb = plogical_shift_right<16>(input);
lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
- EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu);
+ EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
input = padd<Packet4ui>(input, rounding_bias);
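  // Round to nearest even: the bias is 0x7FFF plus the lowest bit that will be
  // kept, so exact ties round toward an even bfloat16 mantissa before the top
  // 16 bits are extracted.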
@@ -1696,7 +2073,7 @@
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
- //Test NaN and Subnormal
+ // Test NaN and Subnormal
const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
@@ -1706,22 +2083,18 @@
Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
- Packet4ui nan_selector = pandnot<Packet4ui>(
- reinterpret_cast<Packet4ui>(is_max_exp),
- reinterpret_cast<Packet4ui>(is_mant_zero)
- );
+ Packet4ui nan_selector =
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
- Packet4ui subnormal_selector = pandnot<Packet4ui>(
- reinterpret_cast<Packet4ui>(is_zero_exp),
- reinterpret_cast<Packet4ui>(is_mant_zero)
- );
+ Packet4ui subnormal_selector =
+ pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
input = vec_sel(input, p4ui_nan, nan_selector);
input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
#else
- //Test only NaN
+ // Test only NaN
Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
input = vec_sel(p4ui_nan, input, nan_selector);
@@ -1739,9 +2112,8 @@
*
* @param lohi whether to expect a low & high or an odd & even order
*/
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
if (lohi) {
return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
} else {
@@ -1754,9 +2126,8 @@
*
* @param lohi whether to expect a low & high or an odd & even order
*/
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
if (lohi) {
return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
} else {
@@ -1764,9 +2135,8 @@
}
}
#else
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
if (lohi) {
return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
} else {
@@ -1774,9 +2144,8 @@
}
}
-template<bool lohi>
-EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo)
-{
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
if (lohi) {
return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
} else {
@@ -1790,14 +2159,13 @@
*
* @param lohi whether to expect a low & high or an odd & even order
*/
-template<bool lohi = true>
-EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi)
-{
+template <bool lohi = true>
+EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
- EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS,0x7FFFu);
+ EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
lsb = padd<Packet8us>(lsb, p8us_BIAS);
lsb = padd<Packet8us>(lsb, p4f2);
@@ -1807,20 +2175,22 @@
#ifdef _ARCH_PWR9
Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
- Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+ Packet8us nan_selector =
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
input = vec_sel(input, p8us_BIAS, nan_selector);
#ifdef SUPPORT_BF16_SUBNORMALS
Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
- Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo), reinterpret_cast<Packet4f>(subnormal_selector_hi));
+ Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
+ reinterpret_cast<Packet4f>(subnormal_selector_hi));
input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#endif
#else
#ifdef SUPPORT_BF16_SUBNORMALS
- //Test NaN and Subnormal
+ // Test NaN and Subnormal
const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
@@ -1830,26 +2200,23 @@
Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
- Packet8us nan_selector = pandnot<Packet8us>(
- reinterpret_cast<Packet8us>(is_max_exp),
- reinterpret_cast<Packet8us>(is_mant_zero)
- );
+ Packet8us nan_selector =
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
- Packet8us subnormal_selector = pandnot<Packet8us>(
- reinterpret_cast<Packet8us>(is_zero_exp),
- reinterpret_cast<Packet8us>(is_mant_zero)
- );
+ Packet8us subnormal_selector =
+ pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
// Using BIAS as NaN (since any or all of the last 7 bits can be set)
input = vec_sel(input, p8us_BIAS, nan_selector);
input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
#else
- //Test only NaN
+ // Test only NaN
Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
- Packet8us nan_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+ Packet8us nan_selector =
+ Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
input = vec_sel(p8us_BIAS, input, nan_selector);
#endif
@@ -1861,8 +2228,7 @@
/**
* Convert and pack two float Packets into one bfloat16 Packet - low & high order
*/
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi)
-{
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
#ifdef _ARCH_PWR10
Packet8bf fp16_0 = F32ToBf16(lo);
Packet8bf fp16_1 = F32ToBf16(hi);
@@ -1875,7 +2241,7 @@
/**
* Convert and pack two float Packets into one bfloat16 Packet - odd & even order
*/
-EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
#ifdef _ARCH_PWR10
return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
#else
@@ -1883,66 +2249,76 @@
#endif
}
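// No native bfloat16 arithmetic is available here, so each bfloat16 op widens
// the even/odd lanes to float, applies the float kernel, and repacks the
// results.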
#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
- Packet4f a_even = Bf16ToF32Even(A);\
- Packet4f a_odd = Bf16ToF32Odd(A);\
- Packet4f op_even = OP(a_even);\
- Packet4f op_odd = OP(a_odd);\
- return F32ToBf16(op_even, op_odd);\
+ Packet4f a_even = Bf16ToF32Even(A); \
+ Packet4f a_odd = Bf16ToF32Odd(A); \
+ Packet4f op_even = OP(a_even); \
+ Packet4f op_odd = OP(a_odd); \
+ return F32ToBf16(op_even, op_odd);
#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
- Packet4f a_even = Bf16ToF32Even(A);\
- Packet4f a_odd = Bf16ToF32Odd(A);\
- Packet4f b_even = Bf16ToF32Even(B);\
- Packet4f b_odd = Bf16ToF32Odd(B);\
- Packet4f op_even = OP(a_even, b_even);\
- Packet4f op_odd = OP(a_odd, b_odd);\
- return F32ToBf16(op_even, op_odd);\
+ Packet4f a_even = Bf16ToF32Even(A); \
+ Packet4f a_odd = Bf16ToF32Odd(A); \
+ Packet4f b_even = Bf16ToF32Even(B); \
+ Packet4f b_odd = Bf16ToF32Odd(B); \
+ Packet4f op_even = OP(a_even, b_even); \
+ Packet4f op_odd = OP(a_odd, b_odd); \
+ return F32ToBf16(op_even, op_odd);
#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
- Packet4f a_even = Bf16ToF32Even(A);\
- Packet4f a_odd = Bf16ToF32Odd(A);\
- Packet4f b_even = Bf16ToF32Even(B);\
- Packet4f b_odd = Bf16ToF32Odd(B);\
- Packet4f op_even = OP(a_even, b_even);\
- Packet4f op_odd = OP(a_odd, b_odd);\
- return F32ToBf16Bool(op_even, op_odd);\
+ Packet4f a_even = Bf16ToF32Even(A); \
+ Packet4f a_odd = Bf16ToF32Odd(A); \
+ Packet4f b_even = Bf16ToF32Even(B); \
+ Packet4f b_odd = Bf16ToF32Odd(B); \
+ Packet4f op_even = OP(a_even, b_even); \
+ Packet4f op_odd = OP(a_odd, b_odd); \
+ return F32ToBf16Bool(op_even, op_odd);
-template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
- EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask,0x8000);
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+ EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
return pxor<Packet8us>(p8us_neg_mask, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
}
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
- return pldexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf> (const Packet8bf& a, const Packet8bf& exponent){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
- return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf> (const Packet8bf& a, Packet8bf& e){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
Packet4f a_even = Bf16ToF32Even(a);
Packet4f a_odd = Bf16ToF32Odd(a);
Packet4f e_even;
@@ -1953,30 +2329,38 @@
return F32ToBf16(op_even, op_odd);
}
-template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
}
#ifdef EIGEN_VECTORIZE_VSX
-template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
Packet4f a_even = Bf16ToF32Even(a);
Packet4f a_odd = Bf16ToF32Odd(a);
Packet4f b_even = Bf16ToF32Even(b);
@@ -1988,54 +2372,62 @@
return F32ToBf16(pmadd_even, pmadd_odd);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
}
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
}
-template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
- bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
- bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) };
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+ bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+ bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
Packet4f b, sum;
- b = vec_sld(a, a, 8);
+ b = vec_sld(a, a, 8);
sum = a + b;
- b = vec_sld(sum, sum, 4);
+ b = vec_sld(sum, sum, 4);
sum += b;
return pfirst(sum);
}
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i sum;
sum = vec_sums(a, p4i_ZERO);
#ifdef _BIG_ENDIAN
@@ -2046,89 +2438,89 @@
return pfirst(sum);
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
- float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
+ float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
float f32_result = redux_even + redux_odd;
return bfloat16(f32_result);
}
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a)
-{
- union{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
+ union {
Packet v;
__UNPACK_TYPE__(Packet) n[8];
} vt;
vt.v = a;
- EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
- EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
- Packet4i first_half = pload<Packet4i>(first_loader);
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+ Packet4i first_half = pload<Packet4i>(first_loader);
Packet4i second_half = pload<Packet4i>(second_loader);
return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
}
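// Scalar model of predux_size8 above: the eight 16-bit lanes are widened to
// int through a union, reloaded as two 4-int vectors, and reduced with the
// existing predux<Packet4i>. Sketch with hypothetical names.
#include <cstdint>

static int predux4_model(const int v[4]) { return v[0] + v[1] + v[2] + v[3]; }

static short predux_size8_model(const short n[8]) {
  int first[4] = {n[0], n[1], n[2], n[3]};   // mirrors first_loader
  int second[4] = {n[4], n[5], n[6], n[7]};  // mirrors second_loader
  return static_cast<short>(predux4_model(first) + predux4_model(second));
}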
-template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
return predux_size8<Packet8s>(a);
}
-template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
return predux_size8<Packet8us>(a);
}
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a)
-{
- union{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
+ union {
Packet v;
__UNPACK_TYPE__(Packet) n[16];
} vt;
vt.v = a;
- EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
- EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
- EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
- EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
+ EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+ EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+ EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
+ EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
Packet4i first_quarter = pload<Packet4i>(first_loader);
Packet4i second_quarter = pload<Packet4i>(second_loader);
Packet4i third_quarter = pload<Packet4i>(third_loader);
Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
- return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter)
- + predux(third_quarter) + predux(fourth_quarter));
+ return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
+ predux(fourth_quarter));
}
-template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
return predux_size16<Packet16c>(a);
}
-template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
return predux_size16<Packet16uc>(a);
}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
Packet4f prod;
prod = pmul(a, vec_sld(a, a, 8));
return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
return aux[0] * aux[1] * aux[2] * aux[3];
}
-template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
Packet8s pair, quad, octo;
pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2138,8 +2530,8 @@
return pfirst(octo);
}
-template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
Packet8us pair, quad, octo;
pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2149,17 +2541,16 @@
return pfirst(octo);
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
- float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+ float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
float f32_result = redux_even * redux_odd;
return bfloat16(f32_result);
}
-
-template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
Packet16c pair, quad, octo, result;
pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2170,8 +2561,8 @@
return pfirst(result);
}
-template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
Packet16uc pair, quad, octo, result;
pair = vec_mul(a, vec_sld(a, a, 8));
@@ -2183,66 +2574,64 @@
}
// min
-template<typename Packet> EIGEN_STRONG_INLINE
-__UNPACK_TYPE__(Packet) predux_min4(const Packet& a)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
Packet b, res;
b = vec_min(a, vec_sld(a, a, 8));
res = vec_min(b, vec_sld(b, b, 4));
return pfirst(res);
}
-
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
return predux_min4<Packet4f>(a);
}
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
return predux_min4<Packet4i>(a);
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
- float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
+ float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
float f32_result = (std::min)(redux_even, redux_odd);
return bfloat16(f32_result);
}
-template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
Packet8s pair, quad, octo;
-
- //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
- pair = vec_min(a, vec_sld(a, a, 8));
- //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+ pair = vec_min(a, vec_sld(a, a, 8));
+
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
quad = vec_min(pair, vec_sld(pair, pair, 4));
- //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
octo = vec_min(quad, vec_sld(quad, quad, 2));
return pfirst(octo);
}
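// Scalar model of the pair/quad/octo reduction above: vec_sld rotates the
// register by 8, 4, then 2 bytes (4, 2, 1 sixteen-bit lanes), so each
// vec_min halves the number of surviving candidates and three steps cover
// all eight lanes. Sketch with hypothetical names.
#include <algorithm>

static short predux_min8_model(const short a[8]) {
  short v[8];
  std::copy(a, a + 8, v);
  for (int lanes = 4; lanes >= 1; lanes /= 2) {  // rotation distance in lanes
    short rot[8];
    for (int i = 0; i < 8; ++i) rot[i] = v[(i + lanes) % 8];   // vec_sld rotation
    for (int i = 0; i < 8; ++i) v[i] = std::min(v[i], rot[i]); // vec_min
  }
  return v[0];  // pfirst
}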
-template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
Packet8us pair, quad, octo;
-
- //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
- pair = vec_min(a, vec_sld(a, a, 8));
- //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+ // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+ pair = vec_min(a, vec_sld(a, a, 8));
+
+ // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
quad = vec_min(pair, vec_sld(pair, pair, 4));
- //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+ // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
octo = vec_min(quad, vec_sld(quad, quad, 2));
return pfirst(octo);
}
-template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
Packet16c pair, quad, octo, result;
pair = vec_min(a, vec_sld(a, a, 8));
@@ -2253,8 +2642,8 @@
return pfirst(result);
}
-template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
Packet16uc pair, quad, octo, result;
pair = vec_min(a, vec_sld(a, a, 8));
@@ -2265,64 +2654,64 @@
return pfirst(result);
}
// max
-template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a)
-{
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
Packet b, res;
b = vec_max(a, vec_sld(a, a, 8));
res = vec_max(b, vec_sld(b, b, 4));
return pfirst(res);
}
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
return predux_max4<Packet4f>(a);
}
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
return predux_max4<Packet4i>(a);
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
- float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
+ float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
float f32_result = (std::max)(redux_even, redux_odd);
return bfloat16(f32_result);
}
-template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
Packet8s pair, quad, octo;
-
- //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
- pair = vec_max(a, vec_sld(a, a, 8));
- //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+ pair = vec_max(a, vec_sld(a, a, 8));
+
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
quad = vec_max(pair, vec_sld(pair, pair, 4));
- //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
octo = vec_max(quad, vec_sld(quad, quad, 2));
return pfirst(octo);
}
-template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
Packet8us pair, quad, octo;
-
- //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
- pair = vec_max(a, vec_sld(a, a, 8));
- //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+ // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+ pair = vec_max(a, vec_sld(a, a, 8));
+
+ // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
quad = vec_max(pair, vec_sld(pair, pair, 4));
- //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+ // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
octo = vec_max(quad, vec_sld(quad, quad, 2));
return pfirst(octo);
}
-template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
Packet16c pair, quad, octo, result;
pair = vec_max(a, vec_sld(a, a, 8));
@@ -2333,8 +2722,8 @@
return pfirst(result);
}
-template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
Packet16uc pair, quad, octo, result;
pair = vec_max(a, vec_sld(a, a, 8));
@@ -2345,13 +2734,13 @@
return pfirst(result);
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
return vec_any_ne(x, pzero(x));
}
-template <typename T> EIGEN_DEVICE_FUNC inline void
-ptranpose_common(PacketBlock<T,4>& kernel){
+template <typename T>
+EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
T t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2363,18 +2752,11 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
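// Scalar model of the merge-based 4x4 transpose above: vec_mergeh/vec_mergel
// interleave two rows' low/high halves, and two interleave rounds land every
// element at its transposed position. Sketch with hypothetical names.
#include <array>

using Lane4 = std::array<int, 4>;
static Lane4 mergeh_model(const Lane4& a, const Lane4& b) { return {a[0], b[0], a[1], b[1]}; }
static Lane4 mergel_model(const Lane4& a, const Lane4& b) { return {a[2], b[2], a[3], b[3]}; }

static void transpose4x4_model(Lane4 r[4]) {
  Lane4 t0 = mergeh_model(r[0], r[2]), t1 = mergel_model(r[0], r[2]);
  Lane4 t2 = mergeh_model(r[1], r[3]), t3 = mergel_model(r[1], r[3]);
  r[0] = mergeh_model(t0, t2);  // {r00, r10, r20, r30}
  r[1] = mergel_model(t0, t2);  // {r01, r11, r21, r31}
  r[2] = mergeh_model(t1, t3);  // {r02, r12, r22, r32}
  r[3] = mergel_model(t1, t3);  // {r03, r13, r23, r33}
}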
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- ptranpose_common<Packet4f>(kernel);
-}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
- ptranpose_common<Packet4i>(kernel);
-}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
Packet8s t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2386,8 +2768,7 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
Packet8us t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2399,9 +2780,7 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8bf,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
Packet8us t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
@@ -2414,8 +2793,7 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
Packet16c t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2427,9 +2805,7 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
Packet16uc t0, t1, t2, t3;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
@@ -2441,8 +2817,7 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
Packet8s v[8], sum[8];
v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -2472,8 +2847,7 @@
kernel.packet[7] = vec_mergel(sum[3], sum[7]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
Packet8us v[8], sum[8];
v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
@@ -2503,8 +2877,7 @@
kernel.packet[7] = vec_mergel(sum[3], sum[7]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8bf,8>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
Packet8bf v[8], sum[8];
v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
@@ -2534,8 +2907,7 @@
kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
Packet16c step1[16], step2[16], step3[16];
step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -2555,16 +2927,16 @@
step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
- step2[0] = vec_mergeh(step1[0], step1[8]);
- step2[1] = vec_mergel(step1[0], step1[8]);
- step2[2] = vec_mergeh(step1[1], step1[9]);
- step2[3] = vec_mergel(step1[1], step1[9]);
- step2[4] = vec_mergeh(step1[2], step1[10]);
- step2[5] = vec_mergel(step1[2], step1[10]);
- step2[6] = vec_mergeh(step1[3], step1[11]);
- step2[7] = vec_mergel(step1[3], step1[11]);
- step2[8] = vec_mergeh(step1[4], step1[12]);
- step2[9] = vec_mergel(step1[4], step1[12]);
+ step2[0] = vec_mergeh(step1[0], step1[8]);
+ step2[1] = vec_mergel(step1[0], step1[8]);
+ step2[2] = vec_mergeh(step1[1], step1[9]);
+ step2[3] = vec_mergel(step1[1], step1[9]);
+ step2[4] = vec_mergeh(step1[2], step1[10]);
+ step2[5] = vec_mergel(step1[2], step1[10]);
+ step2[6] = vec_mergeh(step1[3], step1[11]);
+ step2[7] = vec_mergel(step1[3], step1[11]);
+ step2[8] = vec_mergeh(step1[4], step1[12]);
+ step2[9] = vec_mergel(step1[4], step1[12]);
step2[10] = vec_mergeh(step1[5], step1[13]);
step2[11] = vec_mergel(step1[5], step1[13]);
step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -2572,16 +2944,16 @@
step2[14] = vec_mergeh(step1[7], step1[15]);
step2[15] = vec_mergel(step1[7], step1[15]);
- step3[0] = vec_mergeh(step2[0], step2[8]);
- step3[1] = vec_mergel(step2[0], step2[8]);
- step3[2] = vec_mergeh(step2[1], step2[9]);
- step3[3] = vec_mergel(step2[1], step2[9]);
- step3[4] = vec_mergeh(step2[2], step2[10]);
- step3[5] = vec_mergel(step2[2], step2[10]);
- step3[6] = vec_mergeh(step2[3], step2[11]);
- step3[7] = vec_mergel(step2[3], step2[11]);
- step3[8] = vec_mergeh(step2[4], step2[12]);
- step3[9] = vec_mergel(step2[4], step2[12]);
+ step3[0] = vec_mergeh(step2[0], step2[8]);
+ step3[1] = vec_mergel(step2[0], step2[8]);
+ step3[2] = vec_mergeh(step2[1], step2[9]);
+ step3[3] = vec_mergel(step2[1], step2[9]);
+ step3[4] = vec_mergeh(step2[2], step2[10]);
+ step3[5] = vec_mergel(step2[2], step2[10]);
+ step3[6] = vec_mergeh(step2[3], step2[11]);
+ step3[7] = vec_mergel(step2[3], step2[11]);
+ step3[8] = vec_mergeh(step2[4], step2[12]);
+ step3[9] = vec_mergel(step2[4], step2[12]);
step3[10] = vec_mergeh(step2[5], step2[13]);
step3[11] = vec_mergel(step2[5], step2[13]);
step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2589,16 +2961,16 @@
step3[14] = vec_mergeh(step2[7], step2[15]);
step3[15] = vec_mergel(step2[7], step2[15]);
- kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
- kernel.packet[1] = vec_mergel(step3[0], step3[8]);
- kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
- kernel.packet[3] = vec_mergel(step3[1], step3[9]);
- kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
- kernel.packet[5] = vec_mergel(step3[2], step3[10]);
- kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
- kernel.packet[7] = vec_mergel(step3[3], step3[11]);
- kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
- kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
kernel.packet[11] = vec_mergel(step3[5], step3[13]);
kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2607,8 +2979,7 @@
kernel.packet[15] = vec_mergel(step3[7], step3[15]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
Packet16uc step1[16], step2[16], step3[16];
step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
@@ -2628,16 +2999,16 @@
step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
- step2[0] = vec_mergeh(step1[0], step1[8]);
- step2[1] = vec_mergel(step1[0], step1[8]);
- step2[2] = vec_mergeh(step1[1], step1[9]);
- step2[3] = vec_mergel(step1[1], step1[9]);
- step2[4] = vec_mergeh(step1[2], step1[10]);
- step2[5] = vec_mergel(step1[2], step1[10]);
- step2[6] = vec_mergeh(step1[3], step1[11]);
- step2[7] = vec_mergel(step1[3], step1[11]);
- step2[8] = vec_mergeh(step1[4], step1[12]);
- step2[9] = vec_mergel(step1[4], step1[12]);
+ step2[0] = vec_mergeh(step1[0], step1[8]);
+ step2[1] = vec_mergel(step1[0], step1[8]);
+ step2[2] = vec_mergeh(step1[1], step1[9]);
+ step2[3] = vec_mergel(step1[1], step1[9]);
+ step2[4] = vec_mergeh(step1[2], step1[10]);
+ step2[5] = vec_mergel(step1[2], step1[10]);
+ step2[6] = vec_mergeh(step1[3], step1[11]);
+ step2[7] = vec_mergel(step1[3], step1[11]);
+ step2[8] = vec_mergeh(step1[4], step1[12]);
+ step2[9] = vec_mergel(step1[4], step1[12]);
step2[10] = vec_mergeh(step1[5], step1[13]);
step2[11] = vec_mergel(step1[5], step1[13]);
step2[12] = vec_mergeh(step1[6], step1[14]);
@@ -2645,16 +3016,16 @@
step2[14] = vec_mergeh(step1[7], step1[15]);
step2[15] = vec_mergel(step1[7], step1[15]);
- step3[0] = vec_mergeh(step2[0], step2[8]);
- step3[1] = vec_mergel(step2[0], step2[8]);
- step3[2] = vec_mergeh(step2[1], step2[9]);
- step3[3] = vec_mergel(step2[1], step2[9]);
- step3[4] = vec_mergeh(step2[2], step2[10]);
- step3[5] = vec_mergel(step2[2], step2[10]);
- step3[6] = vec_mergeh(step2[3], step2[11]);
- step3[7] = vec_mergel(step2[3], step2[11]);
- step3[8] = vec_mergeh(step2[4], step2[12]);
- step3[9] = vec_mergel(step2[4], step2[12]);
+ step3[0] = vec_mergeh(step2[0], step2[8]);
+ step3[1] = vec_mergel(step2[0], step2[8]);
+ step3[2] = vec_mergeh(step2[1], step2[9]);
+ step3[3] = vec_mergel(step2[1], step2[9]);
+ step3[4] = vec_mergeh(step2[2], step2[10]);
+ step3[5] = vec_mergel(step2[2], step2[10]);
+ step3[6] = vec_mergeh(step2[3], step2[11]);
+ step3[7] = vec_mergel(step2[3], step2[11]);
+ step3[8] = vec_mergeh(step2[4], step2[12]);
+ step3[9] = vec_mergel(step2[4], step2[12]);
step3[10] = vec_mergeh(step2[5], step2[13]);
step3[11] = vec_mergel(step2[5], step2[13]);
step3[12] = vec_mergeh(step2[6], step2[14]);
@@ -2662,16 +3033,16 @@
step3[14] = vec_mergeh(step2[7], step2[15]);
step3[15] = vec_mergel(step2[7], step2[15]);
- kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
- kernel.packet[1] = vec_mergel(step3[0], step3[8]);
- kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
- kernel.packet[3] = vec_mergel(step3[1], step3[9]);
- kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
- kernel.packet[5] = vec_mergel(step3[2], step3[10]);
- kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
- kernel.packet[7] = vec_mergel(step3[3], step3[11]);
- kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
- kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+ kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+ kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+ kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+ kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+ kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+ kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+ kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+ kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+ kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+ kernel.packet[9] = vec_mergel(step3[4], step3[12]);
kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
kernel.packet[11] = vec_mergel(step3[5], step3[13]);
kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
@@ -2680,112 +3051,127 @@
kernel.packet[15] = vec_mergel(step3[7], step3[15]);
}
-template<typename Packet> EIGEN_STRONG_INLINE
-Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
return vec_sel(elsePacket, thenPacket, mask);
}
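// Scalar model of the pblend mask construction above: each Selector entry is
// 0 or 1, pnegate turns 1 into -1 (all bits set), and vec_sel then performs a
// bitwise per-lane choice between thenPacket and elsePacket. Sketch with a
// hypothetical name.
#include <cstdint>

static uint32_t blend_lane_model(uint32_t sel01, uint32_t thenBits, uint32_t elseBits) {
  uint32_t mask = static_cast<uint32_t>(-static_cast<int32_t>(sel01));  // 0 -> 0x0, 1 -> 0xFFFFFFFF
  return (thenBits & mask) | (elseBits & ~mask);                        // vec_sel
}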
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+ const Packet4i& elsePacket) {
return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
}
-template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
- Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+template <>
+EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
+ const Packet8s& elsePacket) {
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
Packet8s result = vec_sel(elsePacket, thenPacket, mask);
return result;
}
-template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
- Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
+template <>
+EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
+ const Packet8us& elsePacket) {
+ Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
return vec_sel(elsePacket, thenPacket, mask);
}
-template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
+ const Packet8bf& elsePacket) {
return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
}
-template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
- Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
- ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
- ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+template <>
+EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
+ const Packet16c& elsePacket) {
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
return vec_sel(elsePacket, thenPacket, mask);
}
-template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
- Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
- ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
- ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
- ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
+template <>
+EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
+ const Packet16uc& elsePacket) {
+ Packet16uc select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+ ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
+ ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
+ ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
return vec_sel(elsePacket, thenPacket, mask);
}
-
//---------- double ----------
#ifdef EIGEN_VECTORIZE_VSX
-typedef __vector double Packet2d;
-typedef __vector unsigned long long Packet2ul;
-typedef __vector long long Packet2l;
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
#if EIGEN_COMP_CLANG
-typedef Packet2ul Packet2bl;
+typedef Packet2ul Packet2bl;
#else
-typedef __vector __bool long Packet2bl;
+typedef __vector __bool long Packet2bl;
#endif
-static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
-static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull };
-static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull };
-static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
-static Packet2d p2d_MZERO = { numext::bit_cast<double>(0x8000000000000000ull),
- numext::bit_cast<double>(0x8000000000000000ull) };
+static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
+static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
+ numext::bit_cast<double>(0x8000000000000000ull)};
#ifdef _BIG_ENDIAN
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+static Packet2d p2d_COUNTDOWN =
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
#else
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+static Packet2d p2d_COUNTDOWN =
+ reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
#endif
-template<int index> Packet2d vec_splat_dbl(Packet2d& a)
-{
+template <int index>
+Packet2d vec_splat_dbl(Packet2d& a) {
return vec_splat(a, index);
}
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef Packet2d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=2,
+ size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasMin = 1,
- HasMax = 1,
- HasAbs = 1,
- HasSin = 0,
- HasCos = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasAbs = 1,
+ HasSin = 0,
+ HasCos = 0,
HasATan = 0,
- HasLog = 0,
- HasExp = 1,
+ HasLog = 0,
+ HasExp = 1,
HasSqrt = 1,
#if !EIGEN_COMP_CLANG
HasRsqrt = 1,
@@ -2801,12 +3187,22 @@
};
};
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template <>
+struct unpacket_traits<Packet2d> {
+ typedef double type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet2d half;
+};
-inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
union {
- Packet2l v;
+ Packet2l v;
int64_t n[2];
} vt;
vt.v = v;
@@ -2814,10 +3210,9 @@
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
union {
- Packet2d v;
+ Packet2d v;
double n[2];
} vt;
vt.v = v;
@@ -2826,74 +3221,86 @@
}
// Need to define them first or we get specialization after instantiation errors
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
EIGEN_DEBUG_ALIGNED_LOAD
- return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
+ return vec_xl(0, const_cast<double*>(from)); // cast needed by Clang
}
-template<> EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
return pload_partial_common<Packet2d>(from, n, offset);
}
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_ALIGNED_STORE
vec_xst(from, 0, to);
}
-template<> EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
pstore_partial_common<Packet2d>(to, from, n, offset);
}
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
Packet2d v = {from, from};
return v;
}
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
return reinterpret_cast<Packet2d>(v);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
- //This way is faster than vec_splat (at least for doubles in Power 9)
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+ Packet2d& a3) {
+ // This way is faster than vec_splat (at least for doubles in Power 9)
a0 = pset1<Packet2d>(a[0]);
a1 = pset1<Packet2d>(a[1]);
a2 = pset1<Packet2d>(a[2]);
a3 = pset1<Packet2d>(a[3]);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
return pgather_common<Packet2d>(from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
+ const Index n) {
return pgather_common<Packet2d>(from, stride, n);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
pscatter_common<Packet2d>(to, from, stride);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from, Index stride, const Index n)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
+ Index stride, const Index n) {
pscatter_common<Packet2d>(to, from, stride, n);
}
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; }
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ return pset1<Packet2d>(a) + p2d_COUNTDOWN;
+}
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; }
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return a + b;
+}
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return a - b;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
#ifdef __POWER8_VECTOR__
return vec_neg(a);
#else
@@ -2901,150 +3308,214 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_madd(a, b, p2d_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_div(a, b);
+}
// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_msub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmsub(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_nmadd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vec_nmadd(a, b, c);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
Packet2d ret;
- __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
return ret;
- }
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
Packet2d ret;
- __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
+ __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
return ret;
}
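// Scalar model of the NaN behaviour the NOTEs above refer to: the compare
// fails whenever either operand is NaN, and xxsel then falls back to the
// first operand, matching the std::min/std::max convention of propagating
// the first argument when a NaN is involved rather than vec_min/vec_max's
// handling. Sketch with hypothetical names.
static double pmin_model(double a, double b) {
  return (a >= b) ? b : a;  // any NaN makes the compare false, so 'a' comes back
}
static double pmax_model(double a, double b) {
  return (b > a) ? b : a;  // same fallback-to-'a' behaviour for NaN
}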
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
- Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
- return vec_nor(c,c);
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+ return reinterpret_cast<Packet2d>(vec_cmple(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+ return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+ return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+ Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
+ return vec_nor(c, c);
}
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
-{
- Packet2d t = vec_add(reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
- Packet2d res;
-
- __asm__("xvrdpiz %x0, %x1\n\t"
- : "=&wa" (res)
- : "wa" (t));
-
- return res;
-}
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
-{
- Packet2d res;
-
- __asm__("xvrdpic %x0, %x1\n\t"
- : "=&wa" (res)
- : "wa" (a));
-
- return res;
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_and(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_or(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_xor(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_and(a, vec_nor(b, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+ Packet2d t = vec_add(
+ reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+ Packet2d res;
+
+ __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
+
+ return res;
+}
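// Why p2ul_PREV0DOT5 (0x3FDFFFFFFFFFFFFF, the largest double below 0.5) is
// used above instead of 0.5: for inputs such as 0.5 - 2^-54, a naive x + 0.5
// rounds up to 1.0 before the truncation (xvrdpiz), producing 1 instead of 0,
// while adding prev(0.5) with the sign of x avoids that double rounding and
// still carries exact halves over. Scalar sketch with a hypothetical name.
#include <cmath>

static double pround_model(double a) {
  const double prev_half = 0.49999999999999994;        // nextafter(0.5, 0.0) = p2ul_PREV0DOT5
  return std::trunc(a + std::copysign(prev_half, a));  // round half away from zero
}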
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+ Packet2d res;
+
+ __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
+
+ return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return vec_xl(0, const_cast<double*>(from));
}
-template<> EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
return ploadu_partial_common<Packet2d>(from, n, offset);
}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
Packet2d p;
- if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from);
- else p = ploadu<Packet2d>(from);
+ if ((std::ptrdiff_t(from) % 16) == 0)
+ p = pload<Packet2d>(from);
+ else
+ p = ploadu<Packet2d>(from);
return vec_splat_dbl<0>(p);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
EIGEN_DEBUG_UNALIGNED_STORE
vec_xst(from, 0, to);
}
-template<> EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset)
-{
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
pstoreu_partial_common<Packet2d>(to, from, n, offset);
}
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ EIGEN_PPC_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ EIGEN_ALIGN16 double x[2];
+ pstore<double>(x, a);
+ return x[0];
+}
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
return vec_sld(a, a, 8);
}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+ return vec_abs(a);
+}
#ifdef __POWER8_VECTOR__
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+ return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
+}
#else
#ifdef _BIG_ENDIAN
-static Packet16uc p16uc_DUPSIGN = { 0,0,0,0, 0,0,0,0, 8,8,8,8, 8,8,8,8 };
+static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
#else
-static Packet16uc p16uc_DUPSIGN = { 7,7,7,7, 7,7,7,7, 15,15,15,15, 15,15,15,15 };
+static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
#endif
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
}
#endif
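// Scalar model of the pre-POWER8 psignbit above: an arithmetic byte shift by
// 7 smears each byte's sign bit through the byte, and the p16uc_DUPSIGN
// permute broadcasts the byte holding the double's sign across its whole
// 64-bit lane, yielding all-ones for negative values and zero otherwise.
// Sketch with a hypothetical name.
#include <cstdint>
#include <cstring>

static uint64_t psignbit_model(double a) {
  uint64_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  int8_t sign_byte = static_cast<int8_t>(bits >> 56);      // byte containing the sign
  uint8_t smeared = static_cast<uint8_t>(sign_byte >> 7);  // arithmetic shift: 0x00 or 0xFF
  uint64_t lane = 0;
  for (int i = 0; i < 8; ++i) lane |= static_cast<uint64_t>(smeared) << (8 * i);  // vec_perm broadcast
  return lane;
}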
-template<> inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
+template <>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
-template<> inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
+template <>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
// Packet2l shifts.
-// For POWER8 we simply use vec_sr/l.
+// For POWER8 we simply use vec_sr/l.
//
// Things are more complicated for POWER7. There is actually a
// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
// So we need to shift by N % 32 and rearrange bytes.
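// Scalar model of the POWER7 workaround described above, for a left shift by
// N in [1, 31]: each 64-bit lane is treated as two 32-bit halves, both halves
// are shifted by N, and the bits that cross the half boundary are recovered
// from the low half and merged in (the vector code does the equivalent with
// vec_sl/vec_sr plus the byte permutes below). Sketch with a hypothetical name.
#include <cstdint>

static uint64_t shl64_model(uint64_t x, unsigned n) {  // n in [1, 31]
  uint32_t hi = static_cast<uint32_t>(x >> 32);
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t carried = lo >> (32 - n);  // bits crossing the boundary
  return (static_cast<uint64_t>((hi << n) | carried) << 32) | (lo << n);
}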
#ifdef __POWER8_VECTOR__
-template<int N>
+template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
- const Packet2ul shift = { N, N };
- return vec_sl(a, shift);
+ const Packet2ul shift = {N, N};
+ return vec_sl(a, shift);
}
-template<int N>
+template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
- const Packet2ul shift = { N, N };
- return vec_sr(a, shift);
+ const Packet2ul shift = {N, N};
+ return vec_sr(a, shift);
}
#else
@@ -3052,34 +3523,32 @@
// Shifts [A, B, C, D] to [B, 0, D, 0].
// Used to implement left shifts for Packet2l.
EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
- static const Packet16uc perm = {
- 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
- 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
- #ifdef _BIG_ENDIAN
- return vec_perm(p4i_ZERO, a, perm);
- #else
- return vec_perm(a, p4i_ZERO, perm);
- #endif
+ static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+ 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
+#ifdef _BIG_ENDIAN
+ return vec_perm(p4i_ZERO, a, perm);
+#else
+ return vec_perm(a, p4i_ZERO, perm);
+#endif
}
// Shifts [A, B, C, D] to [0, A, 0, C].
// Used to implement right shifts for Packet2l.
EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
- static const Packet16uc perm = {
- 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
- 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b };
- #ifdef _BIG_ENDIAN
- return vec_perm(p4i_ZERO, a, perm);
- #else
- return vec_perm(a, p4i_ZERO, perm);
- #endif
+ static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+ 0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
+#ifdef _BIG_ENDIAN
+ return vec_perm(p4i_ZERO, a, perm);
+#else
+ return vec_perm(a, p4i_ZERO, perm);
+#endif
}
-template<int N, typename EnableIf = void>
+template <int N, typename EnableIf = void>
struct plogical_shift_left_impl;
-template<int N>
-struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
static const unsigned n = static_cast<unsigned>(N);
const Packet4ui shift = {n, n, n, n};
@@ -3092,8 +3561,8 @@
}
};
-template<int N>
-struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)>> {
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
static const unsigned m = static_cast<unsigned>(N - 32);
const Packet4ui shift = {m, m, m, m};
@@ -3102,16 +3571,16 @@
}
};
-template<int N>
+template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
- return plogical_shift_left_impl<N>::run(a);
+ return plogical_shift_left_impl<N>::run(a);
}
-template<int N, typename EnableIf = void>
+template <int N, typename EnableIf = void>
struct plogical_shift_right_impl;
-template<int N>
-struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)>> {
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
static const unsigned n = static_cast<unsigned>(N);
const Packet4ui shift = {n, n, n, n};
@@ -3124,8 +3593,8 @@
}
};
-template<int N>
-struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)>> {
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
static const unsigned m = static_cast<unsigned>(N - 32);
const Packet4ui shift = {m, m, m, m};
@@ -3134,69 +3603,71 @@
}
};
-template<int N>
+template <int N>
EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
- return plogical_shift_right_impl<N>::run(a);
+ return plogical_shift_right_impl<N>::run(a);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
// Clamp exponent to [-2099, 2099]
const Packet2d max_exponent = pset1<Packet2d>(2099.0);
const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
// Split 2^e into four factors and multiply:
- const Packet2l bias = { 1023, 1023 };
+ const Packet2l bias = {1023, 1023};
Packet2l b = plogical_shift_right<2>(e); // floor(e/4)
Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
- Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
- b = psub(psub(psub(e, b), b), b); // e - 3b
- c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
- out = pmul(out, c); // a * 2^e
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+ b = psub(psub(psub(e, b), b), b); // e - 3b
+ c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias)); // 2^(e - 3b)
+ out = pmul(out, c); // a * 2^e
return out;
}
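// Scalar model of the four-factor trick above: a single 2^e constructed by
// bit-shifting the biased exponent only covers |e| <= 1023, but the clamped
// range here is [-2099, 2099]. Splitting e as 3b + (e - 3b) with b = floor(e/4)
// keeps every factor representable while their product still equals 2^e.
// Sketch with hypothetical names; assumes arithmetic right shift for floor(e/4).
#include <cstdint>
#include <cstring>

static double two_pow_model(int64_t e) {  // valid only for normal exponents
  uint64_t bits = static_cast<uint64_t>(e + 1023) << 52;  // bias and place in exponent field
  double d;
  std::memcpy(&d, &bits, sizeof(d));
  return d;
}

static double pldexp_model(double a, int64_t e) {  // e already clamped to [-2099, 2099]
  int64_t b = e >> 2;                              // floor(e/4), as plogical_shift_right<2>
  double c = two_pow_model(b);                     // 2^b, at most 2^524
  double out = ((a * c) * c) * c;                  // a * 2^(3b)
  return out * two_pow_model(e - 3 * b);           // a * 2^e
}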
-
// Extract exponent without existence of Packet2l.
-template<>
-EIGEN_STRONG_INLINE
-Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
}
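// Scalar model of the extraction above: after clearing the sign (pabs),
// shifting the IEEE-754 bit pattern right by 52 leaves exactly the 11-bit
// biased exponent, which pfrexp_generic converts back to double and unbiases.
// Sketch with a hypothetical name.
#include <cstdint>
#include <cstring>

static int64_t biased_exponent_model(double a) {
  uint64_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  return static_cast<int64_t>((bits & 0x7FFFFFFFFFFFFFFFull) >> 52);  // |a| bits >> 52
}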
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d> (const Packet2d& a, Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
return pfrexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
Packet2d b, sum;
- b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
sum = a + b;
return pfirst<Packet2d>(sum);
}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ return pfirst(
+ pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
// min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ return pfirst(
+ pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ return pfirst(
+ pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
Packet2d t0, t1;
t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
@@ -3204,16 +3675,17 @@
kernel.packet[1] = t1;
}
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- Packet2l select = { ifPacket.select[0], ifPacket.select[1] };
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+ const Packet2d& elsePacket) {
+ Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
return vec_sel(elsePacket, thenPacket, mask);
}
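
pblend's semantics reduce to a per-lane ternary; a scalar model for the two-lane case (illustrative only, not an Eigen API):

static void blend2(const long long select[2], const double thenv[2], const double elsev[2], double out[2]) {
  // pnegate turns each 0/1 selector into an all-zeros/all-ones lane mask,
  // which vec_sel then uses to pick between the two inputs.
  for (int i = 0; i < 2; ++i) out[i] = (select[i] != 0) ? thenv[i] : elsev[i];
}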
+#endif // __VSX__
+} // end namespace internal
-#endif // __VSX__
-} // end namespace internal
+} // end namespace Eigen
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_ALTIVEC_H
+#endif // EIGEN_PACKET_MATH_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
index 361c69f..fdabeb9 100644
--- a/Eigen/src/Core/arch/AltiVec/TypeCasting.h
+++ b/Eigen/src/Core/arch/AltiVec/TypeCasting.h
@@ -19,57 +19,46 @@
namespace internal {
template <>
struct type_casting_traits<float, int> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
struct type_casting_traits<int, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
struct type_casting_traits<bfloat16, unsigned short int> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
template <>
struct type_casting_traits<unsigned short int, bfloat16> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
};
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
- return vec_cts(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+ return vec_cts(a, 0);
}
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
- return vec_ctu(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+ return vec_ctu(a, 0);
}
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
- return vec_ctf(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+ return vec_ctf(a, 0);
}
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
- return vec_ctf(a,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+ return vec_ctf(a, 0);
}
-template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
Packet4f float_even = Bf16ToF32Even(a);
Packet4f float_odd = Bf16ToF32Odd(a);
Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
@@ -78,13 +67,13 @@
Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
- //Check values that are bigger than USHRT_MAX (0xFFFF)
+ // Check values that are bigger than USHRT_MAX (0xFFFF)
Packet4bi overflow_selector;
- if(vec_any_gt(int_even, p4ui_low_mask)){
+ if (vec_any_gt(int_even, p4ui_low_mask)) {
overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
}
- if(vec_any_gt(int_odd, p4ui_low_mask)){
+ if (vec_any_gt(int_odd, p4ui_low_mask)) {
overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
    low_odd = vec_sel(low_odd, p4ui_low_mask, overflow_selector);
}
@@ -92,8 +81,9 @@
return pmerge(low_even, low_odd);
}
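
The overflow handling above amounts to a per-lane clamp before the even and odd halves are merged; the scalar equivalent (sketch):

static unsigned short saturate_u16(unsigned int v) {
  return static_cast<unsigned short>(v > 0xFFFFu ? 0xFFFFu : v);  // clamp to USHRT_MAX
}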
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
- //short -> int -> float -> bfloat16
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+ // short -> int -> float -> bfloat16
const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
@@ -105,14 +95,11 @@
template <>
struct type_casting_traits<bfloat16, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 2
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
};
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet8bf, Packet4f>(const Packet8bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8bf, Packet4f>(const Packet8bf& a) {
Packet8us z = pset1<Packet8us>(0);
#ifdef _BIG_ENDIAN
return reinterpret_cast<Packet4f>(vec_mergeh(a.m_val, z));
@@ -123,22 +110,21 @@
template <>
struct type_casting_traits<float, bfloat16> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 2,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
};
-template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet4f, Packet8bf>(const Packet4f& a, const Packet4f &b) {
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet4f, Packet8bf>(const Packet4f& a, const Packet4f& b) {
return F32ToBf16Both(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
return reinterpret_cast<Packet4i>(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
return reinterpret_cast<Packet4f>(a);
}
@@ -149,31 +135,29 @@
// a slow version that works with older compilers.
// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-template<>
+template <>
inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
-#if EIGEN_GNUC_STRICT_AT_LEAST(7,1,0)
- return vec_cts(x, 0); // TODO: check clang version.
+#if EIGEN_GNUC_STRICT_AT_LEAST(7, 1, 0)
+ return vec_cts(x, 0); // TODO: check clang version.
#else
double tmp[2];
memcpy(tmp, &x, sizeof(tmp));
- Packet2l l = { static_cast<long long>(tmp[0]),
- static_cast<long long>(tmp[1]) };
+ Packet2l l = {static_cast<long long>(tmp[0]), static_cast<long long>(tmp[1])};
return l;
#endif
}
-template<>
+template <>
inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
unsigned long long tmp[2];
memcpy(tmp, &x, sizeof(tmp));
- Packet2d d = { static_cast<double>(tmp[0]),
- static_cast<double>(tmp[1]) };
+ Packet2d d = {static_cast<double>(tmp[0]), static_cast<double>(tmp[1])};
return d;
}
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_ALTIVEC_H
+#endif // EIGEN_TYPE_CASTING_ALTIVEC_H
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 93e8714..68b48f9 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -26,16 +26,16 @@
// As a consequence, we get compile failures when compiling Eigen with
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
// Eigen with GPU support
- #pragma push_macro("EIGEN_CONSTEXPR")
- #undef EIGEN_CONSTEXPR
- #define EIGEN_CONSTEXPR
+#pragma push_macro("EIGEN_CONSTEXPR")
+#undef EIGEN_CONSTEXPR
+#define EIGEN_CONSTEXPR
#endif
-#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \
- template <> \
- EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED \
- PACKET_BF16 METHOD<PACKET_BF16>(const PACKET_BF16& _x) { \
- return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x))); \
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD) \
+ template <> \
+ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED PACKET_BF16 METHOD<PACKET_BF16>( \
+ const PACKET_BF16& _x) { \
+ return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x))); \
}
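
For context, each invocation of this macro stamps out one bfloat16 packet op that round-trips through float. With names borrowed from elsewhere in Eigen (the particular pairing is an assumption here):

// BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
// expands to a psin<Packet8bf> specialization returning
// F32ToBf16(psin<Packet8f>(Bf16ToF32(_x))),
// i.e. the op is evaluated in float precision and rounded back to bfloat16.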
// Only use HIP GPU bf16 in kernels
@@ -77,7 +77,7 @@
unsigned short value;
};
-#endif // defined(EIGEN_USE_HIP_BF16)
+#endif // defined(EIGEN_USE_HIP_BF16)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
template <bool AssumeArgumentIsNormalOrInfinityOrZero>
@@ -95,11 +95,10 @@
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
};
-} // namespace bfloat16_impl
+} // namespace bfloat16_impl
// Class definition.
struct bfloat16 : public bfloat16_impl::bfloat16_base {
-
typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
@@ -109,16 +108,17 @@
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
: bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
- template<class T>
+ template <class T>
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
- : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
+ : bfloat16_impl::bfloat16_base(
+ bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
explicit EIGEN_DEVICE_FUNC bfloat16(float f)
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
// Following the convention of numpy, converting between complex and
  // float will discard the imaginary part.
- template<typename RealScalar>
+ template <typename RealScalar>
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
: bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
@@ -160,62 +160,64 @@
// detect tininess in the same way for all operations in radix two"
static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
- static EIGEN_CONSTEXPR Eigen::bfloat16 (min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+ static EIGEN_CONSTEXPR Eigen::bfloat16(min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
- static EIGEN_CONSTEXPR Eigen::bfloat16 (max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+ static EIGEN_CONSTEXPR Eigen::bfloat16(max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
static EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
- static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0); }
+ static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() {
+ return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0);
+ }
static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
};
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_specialized;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_signed;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_integer;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_exact;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_infinity;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_quiet_NaN;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_signaling_NaN;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl<T>::has_denorm;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_denorm_loss;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl<T>::round_style;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_iec559;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_bounded;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_modulo;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_digits10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::radix;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::traps;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::tinyness_before;
} // end namespace bfloat16_impl
} // end namespace Eigen
@@ -225,13 +227,13 @@
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
-template<>
+template <>
class numeric_limits<Eigen::bfloat16> : public Eigen::bfloat16_impl::numeric_limits_bfloat16_impl<> {};
-template<>
+template <>
class numeric_limits<const Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
-template<>
+template <>
class numeric_limits<volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
-template<>
+template <>
class numeric_limits<const volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
} // end namespace std
@@ -242,7 +244,7 @@
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
-#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for bfloat16 floats
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
// We need to provide emulated *host-side* BF16 operators for clang.
@@ -250,7 +252,7 @@
#undef EIGEN_DEVICE_FUNC
#if (defined(EIGEN_HAS_GPU_BF16) && defined(EIGEN_HAS_NATIVE_BF16))
#define EIGEN_DEVICE_FUNC __host__
-#else // both host and device need emulated ops.
+#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
#endif
#endif
@@ -258,41 +260,41 @@
// Definitions for CPUs, mostly working through conversion
// to/from fp32.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
return bfloat16(float(a) + float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const bfloat16& a, const int& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const int& b) {
return bfloat16(float(a) + static_cast<float>(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator + (const int& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const int& a, const bfloat16& b) {
return bfloat16(static_cast<float>(a) + float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator * (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
return bfloat16(float(a) * float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
return bfloat16(float(a) - float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
return bfloat16(float(a) / float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator - (const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a) {
numext::uint16_t x = numext::bit_cast<uint16_t>(a) ^ 0x8000;
return numext::bit_cast<bfloat16>(x);
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator += (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator+=(bfloat16& a, const bfloat16& b) {
a = bfloat16(float(a) + float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator *= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator*=(bfloat16& a, const bfloat16& b) {
a = bfloat16(float(a) * float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator -= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator-=(bfloat16& a, const bfloat16& b) {
a = bfloat16(float(a) - float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator /= (bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator/=(bfloat16& a, const bfloat16& b) {
a = bfloat16(float(a) / float(b));
return a;
}
@@ -314,22 +316,22 @@
--a;
return original_value;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const bfloat16& a, const bfloat16& b) {
- return numext::equal_strict(float(a),float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const bfloat16& a, const bfloat16& b) {
+ return numext::equal_strict(float(a), float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const bfloat16& a, const bfloat16& b) {
return numext::not_equal_strict(float(a), float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const bfloat16& a, const bfloat16& b) {
return float(a) < float(b);
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const bfloat16& a, const bfloat16& b) {
return float(a) <= float(b);
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const bfloat16& a, const bfloat16& b) {
return float(a) > float(b);
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const bfloat16& a, const bfloat16& b) {
return float(a) >= float(b);
}
@@ -340,7 +342,7 @@
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to bfloat16.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator / (const bfloat16& a, Index b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, Index b) {
return bfloat16(static_cast<float>(a) / static_cast<float>(b));
}
@@ -350,7 +352,7 @@
#else
__bfloat16_raw output;
if (numext::isnan EIGEN_NOT_A_MACRO(v)) {
- output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
+ output.value = std::signbit(v) ? 0xFFC0 : 0x7FC0;
return output;
}
output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
@@ -368,7 +370,8 @@
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(const __bfloat16_raw& bf) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(
+ const __bfloat16_raw& bf) {
#if defined(EIGEN_USE_HIP_BF16)
return bf.data;
#else
@@ -391,7 +394,7 @@
//
// qNaN magic: All exponent bits set + most significant bit of fraction
// set.
- output.value = std::signbit(ff) ? 0xFFC0: 0x7FC0;
+ output.value = std::signbit(ff) ? 0xFFC0 : 0x7FC0;
} else {
// Fast rounding algorithm that rounds a half value to nearest even. This
// reduces expected error when we convert a large number of floats. Here
@@ -555,140 +558,96 @@
template <>
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
#if defined(EIGEN_USE_HIP_BF16)
- return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
+ return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
#else
- numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
- __bfloat16_raw output;
+ numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
+ __bfloat16_raw output;
- // Least significant bit of resulting bfloat.
- numext::uint32_t lsb = (input >> 16) & 1;
- numext::uint32_t rounding_bias = 0x7fff + lsb;
- input += rounding_bias;
- output.value = static_cast<numext::uint16_t>(input >> 16);
- return output;
+ // Least significant bit of resulting bfloat.
+ numext::uint32_t lsb = (input >> 16) & 1;
+ numext::uint32_t rounding_bias = 0x7fff + lsb;
+ input += rounding_bias;
+ output.value = static_cast<numext::uint16_t>(input >> 16);
+ return output;
#endif
}
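
The rounding bias is easiest to see on concrete bit patterns (worked example, plain integer arithmetic):

// For float bits 0x3f808000 -- exactly halfway between the bfloat16 values
// 0x3f80 and 0x3f81 -- lsb is 0, the bias is 0x7fff, and
// (0x3f808000 + 0x7fff) >> 16 == 0x3f80: the tie rounds to the even value.
// For 0x3f818000, lsb is 1, the bias is 0x8000, and the result is the even
// 0x3f82.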
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
#if defined(EIGEN_USE_HIP_BF16)
- return static_cast<float>(h);
+ return static_cast<float>(h);
#else
- return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
+ return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
#endif
}
// --- standard functions ---
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const bfloat16& a) {
EIGEN_USING_STD(isinf);
#if defined(EIGEN_USE_HIP_BF16)
- return (isinf)(a); // Uses HIP hip_bfloat16 isinf operator
+ return (isinf)(a); // Uses HIP hip_bfloat16 isinf operator
#else
return (isinf)(float(a));
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const bfloat16& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const bfloat16& a) {
EIGEN_USING_STD(isnan);
#if defined(EIGEN_USE_HIP_BF16)
- return (isnan)(a); // Uses HIP hip_bfloat16 isnan operator
+ return (isnan)(a); // Uses HIP hip_bfloat16 isnan operator
#else
return (isnan)(float(a));
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const bfloat16& a) {
- return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const bfloat16& a) {
+ return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
numext::uint16_t x = numext::bit_cast<numext::uint16_t>(a) & 0x7FFF;
return numext::bit_cast<bfloat16>(x);
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) {
- return bfloat16(::expf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) {
- return bfloat16(numext::expm1(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) {
- return bfloat16(::logf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) {
- return bfloat16(numext::log1p(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) {
- return bfloat16(::log10f(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { return bfloat16(::expf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { return bfloat16(numext::expm1(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { return bfloat16(::logf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { return bfloat16(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { return bfloat16(::log10f(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) {
- return bfloat16(::sqrtf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { return bfloat16(::sqrtf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
return bfloat16(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan2(const bfloat16& a, const bfloat16& b) {
return bfloat16(::atan2f(float(a), float(b)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) {
- return bfloat16(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) {
- return bfloat16(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) {
- return bfloat16(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) {
- return bfloat16(::asinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) {
- return bfloat16(::acosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) {
- return bfloat16(::atanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) {
- return bfloat16(::sinhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) {
- return bfloat16(::coshf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) {
- return bfloat16(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) {
- return bfloat16(::asinhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) {
- return bfloat16(::acoshf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) {
- return bfloat16(::atanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) {
- return bfloat16(::floorf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) {
- return bfloat16(::ceilf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) {
- return bfloat16(::rintf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) {
- return bfloat16(::roundf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { return bfloat16(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { return bfloat16(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { return bfloat16(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { return bfloat16(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { return bfloat16(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { return bfloat16(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { return bfloat16(::sinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { return bfloat16(::coshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { return bfloat16(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { return bfloat16(::asinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { return bfloat16(::acoshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { return bfloat16(::atanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { return bfloat16(::floorf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { return bfloat16(::ceilf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { return bfloat16(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { return bfloat16(::roundf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
return bfloat16(::fmodf(float(a), float(b)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (min)(const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(min)(const bfloat16& a, const bfloat16& b) {
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f2 < f1 ? b : a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 (max)(const bfloat16& a, const bfloat16& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(max)(const bfloat16& a, const bfloat16& b) {
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f1 < f2 ? b : a;
@@ -707,42 +666,34 @@
}
#ifndef EIGEN_NO_IO
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const bfloat16& v) {
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
os << static_cast<float>(v);
return os;
}
#endif
-} // namespace bfloat16_impl
+} // namespace bfloat16_impl
namespace internal {
-template<>
-struct random_default_impl<bfloat16, false, false>
-{
- static inline bfloat16 run(const bfloat16& x, const bfloat16& y)
- {
- return x + (y-x) * bfloat16(float(std::rand()) / float(RAND_MAX));
+template <>
+struct random_default_impl<bfloat16, false, false> {
+ static inline bfloat16 run(const bfloat16& x, const bfloat16& y) {
+ return x + (y - x) * bfloat16(float(std::rand()) / float(RAND_MAX));
}
- static inline bfloat16 run()
- {
- return run(bfloat16(-1.f), bfloat16(1.f));
- }
+ static inline bfloat16 run() { return run(bfloat16(-1.f), bfloat16(1.f)); }
};
-template<> struct is_arithmetic<bfloat16> { enum { value = true }; };
+template <>
+struct is_arithmetic<bfloat16> {
+ enum { value = true };
+};
-} // namespace internal
+} // namespace internal
-template<> struct NumTraits<Eigen::bfloat16>
- : GenericNumTraits<Eigen::bfloat16>
-{
- enum {
- IsSigned = true,
- IsInteger = false,
- IsComplex = false,
- RequireInitialization = false
- };
+template <>
+struct NumTraits<Eigen::bfloat16> : GenericNumTraits<Eigen::bfloat16> {
+ enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
@@ -764,31 +715,27 @@
}
};
-} // namespace Eigen
-
+} // namespace Eigen
#if defined(EIGEN_HAS_HIP_BF16)
- #pragma pop_macro("EIGEN_CONSTEXPR")
+#pragma pop_macro("EIGEN_CONSTEXPR")
#endif
namespace Eigen {
namespace numext {
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isnan)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::bfloat16& h) {
return (bfloat16_impl::isnan)(h);
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isinf)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::bfloat16& h) {
return (bfloat16_impl::isinf)(h);
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-bool (isfinite)(const Eigen::bfloat16& h) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::bfloat16& h) {
return (bfloat16_impl::isfinite)(h);
}
@@ -813,7 +760,7 @@
return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
}
};
-} // namespace std
+} // namespace std
#endif
// Add the missing shfl* intrinsics.
@@ -831,34 +778,39 @@
#if defined(EIGEN_HAS_HIP_BF16)
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta,
+ int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta,
+ int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
- return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+ return Eigen::numext::bit_cast<Eigen::bfloat16>(
+ static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
- return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+ return Eigen::numext::bit_cast<Eigen::bfloat16>(
+ static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
}
-#endif // HIP
+#endif // HIP
-#endif // __shfl*
+#endif // __shfl*
#if defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::bfloat16 __ldg(const Eigen::bfloat16* ptr) {
- return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(__ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
+ return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(
+ __ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
}
-#endif // __ldg
+#endif // __ldg
-#endif // EIGEN_BFLOAT16_H
+#endif // EIGEN_BFLOAT16_H
diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h
index 84da47f..fd7923e 100644
--- a/Eigen/src/Core/arch/Default/ConjHelper.h
+++ b/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -11,31 +11,25 @@
#ifndef EIGEN_ARCH_CONJ_HELPER_H
#define EIGEN_ARCH_CONJ_HELPER_H
-#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
- template <> \
- struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> { \
- EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \
- const PACKET_CPLX& y, \
- const PACKET_CPLX& c) const { \
- return padd(c, this->pmul(x, y)); \
- } \
- EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \
- const PACKET_CPLX& y) const { \
- return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); \
- } \
- }; \
- \
- template <> \
- struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> { \
- EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \
- const PACKET_REAL& y, \
- const PACKET_CPLX& c) const { \
- return padd(c, this->pmul(x, y)); \
- } \
- EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \
- const PACKET_REAL& y) const { \
- return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); \
- } \
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \
+ template <> \
+ struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
+ return padd(c, this->pmul(x, y)); \
+ } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const { \
+ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v)); \
+ } \
+ }; \
+ \
+ template <> \
+ struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> { \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
+ return padd(c, this->pmul(x, y)); \
+ } \
+ EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const { \
+ return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y)); \
+ } \
};
// IWYU pragma: private
@@ -44,74 +38,88 @@
namespace Eigen {
namespace internal {
-template<bool Conjugate> struct conj_if;
+template <bool Conjugate>
+struct conj_if;
-template<> struct conj_if<true> {
- template<typename T>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); }
- template<typename T>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); }
+template <>
+struct conj_if<true> {
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+ return numext::conj(x);
+ }
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const {
+ return internal::pconj(x);
+ }
};
-template<> struct conj_if<false> {
- template<typename T>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; }
- template<typename T>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; }
+template <>
+struct conj_if<false> {
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const {
+ return x;
+ }
+ template <typename T>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const {
+ return x;
+ }
};
// Generic Implementation, assume scalars since the packet-version is
// specialized below.
-template<typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+template <typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
struct conj_helper {
typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
- pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const
- { return this->pmul(x, y) + c; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y,
+ const ResultType& c) const {
+ return this->pmul(x, y) + c;
+ }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
- pmul(const LhsType& x, const RhsType& y) const
- { return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
+ return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
+ }
};
-template<typename LhsScalar, typename RhsScalar>
+template <typename LhsScalar, typename RhsScalar>
struct conj_helper<LhsScalar, RhsScalar, true, true> {
- typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar>::ReturnType ResultType;
+ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResultType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
- pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const
- { return this->pmul(x, y) + c; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsScalar& x, const RhsScalar& y,
+ const ResultType& c) const {
+ return this->pmul(x, y) + c;
+ }
  // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType
- pmul(const LhsScalar& x, const RhsScalar& y) const
- { return numext::conj(x * y); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsScalar& x, const RhsScalar& y) const {
+ return numext::conj(x * y);
+ }
};
// Implementation with equal type, use packet operations.
-template<typename Packet, bool ConjLhs, bool ConjRhs>
-struct conj_helper<Packet, Packet, ConjLhs, ConjRhs>
-{
+template <typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs> {
typedef Packet ResultType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
- { return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+ return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
+ }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
- { return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y)); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+ return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
+ }
};
-template<typename Packet>
-struct conj_helper<Packet, Packet, true, true>
-{
+template <typename Packet>
+struct conj_helper<Packet, Packet, true, true> {
typedef Packet ResultType;
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const
- { return Eigen::internal::pmadd(pconj(x), pconj(y), c); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+ return Eigen::internal::pmadd(pconj(x), pconj(y), c);
+ }
  // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b).
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const
- { return pconj(Eigen::internal::pmul(x, y)); }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+ return pconj(Eigen::internal::pmul(x, y));
+ }
};
} // namespace internal
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 3d4a2a5..8fb5b68 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -23,14 +23,27 @@
namespace internal {
// Creates a Scalar integer type with same bit-width.
-template<typename T> struct make_integer;
-template<> struct make_integer<float> { typedef numext::int32_t type; };
-template<> struct make_integer<double> { typedef numext::int64_t type; };
-template<> struct make_integer<half> { typedef numext::int16_t type; };
-template<> struct make_integer<bfloat16> { typedef numext::int16_t type; };
+template <typename T>
+struct make_integer;
+template <>
+struct make_integer<float> {
+ typedef numext::int32_t type;
+};
+template <>
+struct make_integer<double> {
+ typedef numext::int64_t type;
+};
+template <>
+struct make_integer<half> {
+ typedef numext::int16_t type;
+};
+template <>
+struct make_integer<bfloat16> {
+ typedef numext::int16_t type;
+};
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
@@ -39,34 +52,32 @@
// Safely applies frexp, correctly handles denormals.
// Assumes IEEE floating point format.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
- static constexpr int
- TotalBits = sizeof(Scalar) * CHAR_BIT,
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
- ExponentBits = TotalBits - MantissaBits - 1;
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+ ExponentBits = TotalBits - MantissaBits - 1;
EIGEN_CONSTEXPR ScalarUI scalar_sign_mantissa_mask =
- ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
+ ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits); // ~0x7f800000
const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
const Packet half = pset1<Packet>(Scalar(0.5));
const Packet zero = pzero(a);
- const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
+ const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)()); // Minimum normal value, 2^-126
// To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
- EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
+ EIGEN_CONSTEXPR ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1); // 24
// The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
- const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
+ const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset)); // 2^24
const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
// Determine exponent offset: -126 if normal, -126-24 if denormal
- const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1)<<(ExponentBits-1)) - ScalarUI(2)); // -126
+ const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2)); // -126
Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
- const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
+ const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset)); // -24
exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
// Determine exponent and mantissa from normalized_a.
@@ -83,8 +94,8 @@
// Safely applies ldexp, correctly handles overflows, underflows and denormals.
// Assumes IEEE floating point format.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
// We want to return a * 2^exponent, allowing for all possible integer
// exponents without overflowing or underflowing in intermediate
// computations.
@@ -93,7 +104,7 @@
// to consider for a float is:
// -255-23 -> 255+23
// Below -278 any finite float 'a' will become zero, and above +278 any
- // finite float will become inf, including when 'a' is the smallest possible
+ // finite float will become inf, including when 'a' is the smallest possible
// denormal.
//
// Unfortunately, 2^(278) cannot be represented using either one or two
@@ -110,19 +121,17 @@
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename unpacket_traits<PacketI>::type ScalarI;
- static constexpr int
- TotalBits = sizeof(Scalar) * CHAR_BIT,
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
- ExponentBits = TotalBits - MantissaBits - 1;
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+ ExponentBits = TotalBits - MantissaBits - 1;
- const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) + ScalarI(MantissaBits - 1))); // 278
- const PacketI bias = pset1<PacketI>((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1)); // 127
+ const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1))); // 278
+ const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)); // 127
const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
- PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
+ PacketI b = parithmetic_shift_right<2>(e); // floor(e/4);
Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^b
- Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
- b = psub(psub(psub(e, b), b), b); // e - 3b
- c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
+ Packet out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+ b = psub(psub(psub(e, b), b), b); // e - 3b
+ c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias))); // 2^(e-3*b)
out = pmul(out, c);
return out;
}
@@ -136,22 +145,19 @@
// if 2^e doesn't fit into a normal floating-point Scalar.
//
// Assumes IEEE floating point format
-template<typename Packet>
+template <typename Packet>
struct pldexp_fast_impl {
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename unpacket_traits<PacketI>::type ScalarI;
- static constexpr int
- TotalBits = sizeof(Scalar) * CHAR_BIT,
- MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
- ExponentBits = TotalBits - MantissaBits - 1;
+ static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+ ExponentBits = TotalBits - MantissaBits - 1;
- static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
- Packet run(const Packet& a, const Packet& exponent) {
- const Packet bias = pset1<Packet>(Scalar((ScalarI(1)<<(ExponentBits-1)) - ScalarI(1))); // 127
- const Packet limit = pset1<Packet>(Scalar((ScalarI(1)<<ExponentBits) - ScalarI(1))); // 255
+ static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet run(const Packet& a, const Packet& exponent) {
+ const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1))); // 127
+ const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1))); // 255
// restrict biased exponent between 0 and 255 for float.
- const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
+ const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit)); // exponent + 127
// return a * (2^e)
return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
}
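
The fast path reduces to a clamp plus one bit-pattern multiply; a scalar sketch for float (not an Eigen API, and like the packet version it does not handle a denormal 'a'):

#include <cstdint>
#include <cstring>

static float fast_ldexpf(float a, int e) {
  int biased = e + 127;            // bias the exponent
  if (biased < 0) biased = 0;      // exponent field 0: 2^e flushes to zero
  if (biased > 255) biased = 255;  // exponent field 255: 2^e saturates to inf
  const uint32_t bits = static_cast<uint32_t>(biased) << 23;
  float c;
  std::memcpy(&c, &bits, sizeof(c));
  return a * c;  // a * 2^e
}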
@@ -164,17 +170,15 @@
// TODO(gonnet): Further reduce the interval allowing for lower-degree
// polynomial interpolants -> ... -> profit!
template <typename Packet, bool base2>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_impl_float(const Packet _x)
-{
- const Packet cst_1 = pset1<Packet>(1.0f);
- const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
- const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
+ const Packet cst_1 = pset1<Packet>(1.0f);
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
Packet e, x;
  // extract significand in the range [0.5,1) and exponent
- x = pfrexp(_x,e);
+ x = pfrexp(_x, e);
// part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
@@ -216,27 +220,22 @@
}
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
- Packet iszero_mask = pcmp_eq(_x,pzero(_x));
- Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
// Filter out invalid inputs, i.e.:
// - negative arg will be NAN
// - 0 will be -INF
// - +INF will be +INF
- return pselect(iszero_mask, cst_minus_inf,
- por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
}
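
The range reduction is the heart of this routine; a scalar sketch with std::log1p standing in for the polynomial (illustrative only):

#include <cmath>

static float log_via_frexp(float x) {
  int e;
  float m = std::frexp(x, &e);  // x = m * 2^e, m in [0.5, 1)
  if (m < 0.70710678f) {        // fold m into [sqrt(1/2), sqrt(2))
    m += m;
    --e;
  }
  // log(x) = log(m) + e*log(2), with the polynomial argument m - 1
  // centered around zero.
  return std::log1p(m - 1.0f) + static_cast<float>(e) * 0.693147181f;
}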
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_float(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
return plog_impl_float<Packet, /* base2 */ false>(_x);
}
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_float(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
return plog_impl_float<Packet, /* base2 */ true>(_x);
}
@@ -250,19 +249,16 @@
* for more detail see: http://www.netlib.org/cephes/
*/
template <typename Packet, bool base2>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_impl_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
Packet x = _x;
- const Packet cst_1 = pset1<Packet>(1.0);
- const Packet cst_neg_half = pset1<Packet>(-0.5);
- const Packet cst_minus_inf = pset1frombits<Packet>( static_cast<uint64_t>(0xfff0000000000000ull));
- const Packet cst_pos_inf = pset1frombits<Packet>( static_cast<uint64_t>(0x7ff0000000000000ull));
+ const Packet cst_1 = pset1<Packet>(1.0);
+ const Packet cst_neg_half = pset1<Packet>(-0.5);
+ const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
+ const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
-
- // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
- // 1/sqrt(2) <= x < sqrt(2)
+ // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+ // 1/sqrt(2) <= x < sqrt(2)
const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
@@ -280,8 +276,8 @@
Packet e;
  // extract the significand in the range [0.5,1) and the exponent
- x = pfrexp(x,e);
-
+ x = pfrexp(x, e);
+
// Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
// and shift by -1. The values are then centered around 0, which improves
// the stability of the polynomial evaluation.
@@ -301,20 +297,20 @@
  // Evaluate the polynomial approximant, probably to improve instruction-level parallelism.
  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 );
Packet y, y1, y_;
- y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+ y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
- y = pmadd(y, x, cst_cephes_log_p2);
+ y = pmadd(y, x, cst_cephes_log_p2);
y1 = pmadd(y1, x, cst_cephes_log_p5);
y_ = pmadd(y, x3, y1);
- y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+ y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
- y = pmadd(y, x, cst_cephes_log_q2);
+ y = pmadd(y, x, cst_cephes_log_q2);
y1 = pmadd(y1, x, cst_cephes_log_q5);
- y = pmadd(y, x3, y1);
+ y = pmadd(y, x3, y1);
y_ = pmul(y_, x3);
- y = pdiv(y_, y);
+ y = pdiv(y_, y);
y = pmadd(cst_neg_half, x2, y);
x = padd(x, y);
@@ -329,36 +325,30 @@
}
Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
- Packet iszero_mask = pcmp_eq(_x,pzero(_x));
- Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
+ Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+ Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
// Filter out invalid inputs, i.e.:
// - negative arg will be NAN
// - 0 will be -INF
// - +INF will be +INF
- return pselect(iszero_mask, cst_minus_inf,
- por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
+ return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
}
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
return plog_impl_double<Packet, /* base2 */ false>(_x);
}
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
return plog_impl_double<Packet, /* base2 */ true>(_x);
}
/** \internal \returns log(1 + x) computed using W. Kahan's formula.
See: http://www.plunk.org/~hatch/rightway.php
*/
-template<typename Packet>
-Packet generic_plog1p(const Packet& x)
-{
+template <typename Packet>
+Packet generic_plog1p(const Packet& x) {
typedef typename unpacket_traits<Packet>::type ScalarType;
const Packet one = pset1<Packet>(ScalarType(1));
Packet xp1 = padd(x, one);
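In scalar form, Kahan's formula referenced above amounts to the following (a sketch, not the packet implementation):

```cpp
#include <cmath>

// W. Kahan's log1p trick: u = 1 + x loses the low bits of x, but the
// factor x / (u - 1) compensates exactly for that rounding error.
double log1p_kahan(double x) {
  const double u = 1.0 + x;
  if (u == 1.0) return x;              // x below rounding: log(1+x) ~= x
  return std::log(u) * x / (u - 1.0);  // rescale log(u) by the lost ratio
}
```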
@@ -372,9 +362,8 @@
/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
See: http://www.plunk.org/~hatch/rightway.php
*/
-template<typename Packet>
-Packet generic_expm1(const Packet& x)
-{
+template <typename Packet>
+Packet generic_expm1(const Packet& x) {
typedef typename unpacket_traits<Packet>::type ScalarType;
const Packet one = pset1<Packet>(ScalarType(1));
const Packet neg_one = pset1<Packet>(ScalarType(-1));
@@ -390,25 +379,18 @@
Packet pos_inf_mask = pcmp_eq(logu, u);
Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
expm1 = pselect(pos_inf_mask, u, expm1);
- return pselect(one_mask,
- x,
- pselect(neg_one_mask,
- neg_one,
- expm1));
+ return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
}
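The scalar form of the same Kahan formula, with the three special-case masks written out as branches (sketch only):

```cpp
#include <cmath>

// generic_expm1 in scalar form: u = exp(x) is corrected by the factor
// x / log(u), with the same special cases as the masks above.
double expm1_kahan(double x) {
  const double u = std::exp(x);
  if (u == 1.0) return x;        // one_mask: exp(x) rounded to 1
  const double um1 = u - 1.0;
  if (um1 == -1.0) return -1.0;  // neg_one_mask: large negative x
  const double logu = std::log(u);
  if (logu == u) return u;       // pos_inf_mask: overflow, expm1(x) = inf
  return um1 * x / logu;         // Kahan's correction
}
```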
-
// Exponential function. Works by writing "x = m*log(2) + r" where
// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
// exp(r) is computed using a 6th order minimax polynomial approximation.
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_float(const Packet _x)
-{
- const Packet cst_zero = pset1<Packet>(0.0f);
- const Packet cst_one = pset1<Packet>(1.0f);
- const Packet cst_half = pset1<Packet>(0.5f);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
+ const Packet cst_zero = pset1<Packet>(0.0f);
+ const Packet cst_one = pset1<Packet>(1.0f);
+ const Packet cst_half = pset1<Packet>(0.5f);
const Packet cst_exp_hi = pset1<Packet>(88.723f);
const Packet cst_exp_lo = pset1<Packet>(-104.f);
@@ -447,13 +429,11 @@
// Return 2^m * exp(r).
  // TODO: replace pldexp with a faster implementation since y is in [-1, 1).
- return pselect(zero_mask, cst_zero, pmax(pldexp(y,m), _x));
+ return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
}
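A scalar sketch of the decomposition described above (std::exp stands in for the 6th-order minimax polynomial; the two-piece log(2) constants follow the Cephes convention used here):

```cpp
#include <cmath>

// pexp_float's scheme in scalar form: x = m*ln(2) + r, exp(x) = 2^m * exp(r).
float exp_sketch(float x) {
  const float m = std::floor(x * 1.44269504f + 0.5f);   // round(x / ln(2))
  float r = x - m * 0.693359375f;                       // subtract m*ln(2) in two
  r -= m * -2.12194440e-4f;                             // pieces for extra accuracy
  return std::ldexp(std::exp(r), static_cast<int>(m));  // scale by 2^m
}
```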
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_double(const Packet _x)
-{
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
Packet x = _x;
const Packet cst_zero = pset1<Packet>(0.0);
const Packet cst_1 = pset1<Packet>(1.0);
@@ -516,7 +496,7 @@
// Construct the result 2^n * exp(g) = e * x. The max is used to catch
// non-finite values in the input.
  // TODO: replace pldexp with a faster implementation since x is in [-1, 1).
- return pselect(zero_mask, cst_zero, pmax(pldexp(x,fx), _x));
+ return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
}
// The following code is inspired by the following stack-overflow answer:
@@ -528,29 +508,22 @@
// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
// - Avoid a branch in rounding and extraction of the remaining fractional part.
// Overall, I measured a speedup of more than 2x on x86-64.
-inline float trig_reduce_huge (float xf, Eigen::numext::int32_t *quadrant)
-{
+inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
using Eigen::numext::int32_t;
- using Eigen::numext::uint32_t;
using Eigen::numext::int64_t;
+ using Eigen::numext::uint32_t;
using Eigen::numext::uint64_t;
- const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
- const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
+ const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62
+ const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point format
// 192 bits of 2/pi for Payne-Hanek reduction
// Bits are introduced in packets of 8 to enable aligned reads.
- static const uint32_t two_over_pi [] =
- {
- 0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
- 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
- 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
- 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
- 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
- 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
- 0x10e41000, 0xe4100000
- };
-
+ static const uint32_t two_over_pi[] = {
+ 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
+ 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
+ 0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
+
uint32_t xi = numext::bit_cast<uint32_t>(xf);
// Below, -118 = -126 + 8.
// -126 is to get the exponent,
@@ -558,12 +531,12 @@
// This is possible because the fractional part of x has only 24 meaningful bits.
uint32_t e = (xi >> 23) - 118;
// Extract the mantissa and shift it to align it wrt the exponent
- xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
+ xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
uint32_t i = e >> 3;
- uint32_t twoopi_1 = two_over_pi[i-1];
- uint32_t twoopi_2 = two_over_pi[i+3];
- uint32_t twoopi_3 = two_over_pi[i+7];
+ uint32_t twoopi_1 = two_over_pi[i - 1];
+ uint32_t twoopi_2 = two_over_pi[i + 3];
+ uint32_t twoopi_3 = two_over_pi[i + 7];
// Compute x * 2/pi in 2.62-bit fixed-point format.
uint64_t p;
@@ -578,23 +551,23 @@
// since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
// r = (p-q)*pi/2,
// where the product can be carried out with sufficient accuracy using double precision.
- p -= q<<62;
+ p -= q << 62;
return float(double(int64_t(p)) * pio2_62);
}
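The last steps above, isolated as a scalar sketch: given the fixed-point product p = x * (2/pi) in 2.62 format, the quadrant and the radian remainder fall out of integer arithmetic (hypothetical helper; the Payne-Hanek multiplication that produces p is omitted):

```cpp
#include <cstdint>

// Branch-free rounding and remainder extraction in 2.62 fixed point.
float quadrant_and_remainder(std::uint64_t p, int* quadrant) {
  const double pio2_62 = 3.4061215800865545e-19;      // pi/2 * 2^-62
  const std::uint64_t half = std::uint64_t(1) << 61;  // 0.5 in 2.62 format
  const std::uint64_t q = (p + half) >> 62;           // nearest integer to p
  *quadrant = static_cast<int>(q & 3);
  const std::int64_t frac = static_cast<std::int64_t>(p - (q << 62));  // p - q, signed
  return static_cast<float>(double(frac) * pio2_62);  // r = (p - q) * pi/2
}
```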
-template<bool ComputeSine,typename Packet>
+template <bool ComputeSine, typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
#if EIGEN_COMP_GNUC_STRICT
-__attribute__((optimize("-fno-unsafe-math-optimizations")))
+ __attribute__((optimize("-fno-unsafe-math-optimizations")))
#endif
-Packet psincos_float(const Packet& _x)
-{
+ Packet
+ psincos_float(const Packet& _x) {
typedef typename unpacket_traits<Packet>::integer_packet PacketI;
- const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
- const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
- const PacketI csti_1 = pset1<PacketI>(1);
- const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
+ const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f); // 2/PI
+ const Packet cst_rounding_magic = pset1<Packet>(12582912); // 2^23 for rounding
+ const PacketI csti_1 = pset1<PacketI>(1);
+ const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
Packet x = pabs(_x);
@@ -604,19 +577,19 @@
// Rounding trick to find nearest integer:
Packet y_round = padd(y, cst_rounding_magic);
EIGEN_OPTIMIZATION_BARRIER(y_round)
- PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
- y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
+  PacketI y_int = preinterpret<PacketI>(y_round);  // last 23 bits represent the integer (if abs(x)<2^24)
+ y = psub(y_round, cst_rounding_magic); // nearest integer to x * (2/pi)
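A scalar demonstration of this rounding-magic trick (it must be compiled without -ffast-math, which is what the EIGEN_OPTIMIZATION_BARRIER above guards against):

```cpp
#include <cstdint>
#include <cstring>

// Adding 1.5 * 2^23 forces y to round to the nearest integer; that integer
// (modulo 2^23) then sits in the low mantissa bits, so the quadrant logic
// can read it without a float-to-int conversion.
void round_magic(float y, std::int32_t* low_bits, float* nearest) {
  const float magic = 12582912.0f;              // 1.5 * 2^23
  const float biased = y + magic;               // rounds y to an integer
  std::uint32_t bits;
  std::memcpy(&bits, &biased, sizeof(bits));    // like preinterpret<PacketI>
  *low_bits = static_cast<std::int32_t>(bits);  // low 23 bits: round(y) mod 2^23
  *nearest = biased - magic;                    // round(y) as a float
}
```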
- // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
- // using "Extended precision modular arithmetic"
- #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
+// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
+// using "Extended precision modular arithmetic"
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
// This version requires true FMA for high accuracy
  // It provides a max error of 1 ULP up to the thresholds below (with absolute error < 5.9605e-08):
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
- #else
+#else
  // Without true FMA, the previous set of coefficients maintains 1ULP accuracy
// up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
// We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
@@ -624,29 +597,28 @@
  // The following set of coefficients maintains 1ULP up to 9.43 and 14.16 for sin and cos, respectively.
// and 2 ULP up to:
const float huge_th = ComputeSine ? 25966.f : 18838.f;
- x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
+ x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
EIGEN_OPTIMIZATION_BARRIER(x)
- x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
+ x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
EIGEN_OPTIMIZATION_BARRIER(x)
- x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
- x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
+ x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
+ x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
- // For the record, the following set of coefficients maintain 2ULP up
- // to a slightly larger range:
- // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
- // but it slightly fails to maintain 1ULP for two values of sin below pi.
- // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
- // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
- // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
- // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+// For the record, the following set of coefficients maintains 2ULP up
+// to a slightly larger range:
+// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+// but it slightly fails to maintain 1ULP for two values of sin below pi.
+// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
- // For the record, with only 3 iterations it is possible to maintain
- // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
- // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
- #endif
+// For the record, with only 3 iterations it is possible to maintain
+// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+#endif
- if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
- {
+ if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
const int PacketSize = unpacket_traits<Packet>::size;
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
@@ -654,11 +626,9 @@
pstoreu(vals, pabs(_x));
pstoreu(x_cpy, x);
pstoreu(y_int2, y_int);
- for(int k=0; k<PacketSize;++k)
- {
+ for (int k = 0; k < PacketSize; ++k) {
float val = vals[k];
- if(val>=huge_th && (numext::isfinite)(val))
- x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
+ if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
}
x = ploadu<Packet>(x_cpy);
y_int = ploadu<PacketI>(y_int2);
@@ -668,19 +638,19 @@
// sin: sign = second_bit(y_int) xor signbit(_x)
// cos: sign = second_bit(y_int+1)
Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
- : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
- sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
+ : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but the leftmost bit
// Get the polynomial selection mask from the second bit of y_int
// We'll calculate both (sin and cos) polynomials and then select from the two.
Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
- Packet x2 = pmul(x,x);
+ Packet x2 = pmul(x, x);
// Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
- Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
- y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f ));
- y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f ));
+ Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+ y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
+ y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
y1 = pmadd(y1, x2, pset1<Packet>(1.f));
@@ -692,38 +662,32 @@
// c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
// printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
//
- Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
- y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
+ Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+ y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
y2 = pmul(y2, x2);
y2 = pmadd(y2, x, x);
// Select the correct result from the two polynomials.
- y = ComputeSine ? pselect(poly_mask,y2,y1)
- : pselect(poly_mask,y1,y2);
+ y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
// Update the sign and filter huge inputs
return pxor(y, sign_bit);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin_float(const Packet& x)
-{
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
return psincos_float<true>(x);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos_float(const Packet& x)
-{
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
return psincos_float<false>(x);
}
// Generic implementation of acos(x).
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
@@ -747,7 +711,7 @@
// P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
// We evaluate even and odd terms independently to increase
// instruction level parallelism.
- Packet x2 = pmul(x_in,x_in);
+ Packet x2 = pmul(x_in, x_in);
Packet p_even = pmadd(p6, x2, p4);
Packet p_odd = pmadd(p5, x2, p3);
p_even = pmadd(p_even, x2, p2);
@@ -765,9 +729,8 @@
}
// Generic implementation of asin(x).
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
@@ -817,9 +780,8 @@
}
// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_reduced_float(const Packet& x) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_float(const Packet& x) {
const Packet q0 = pset1<Packet>(-0.3333314359188079833984375f);
const Packet q2 = pset1<Packet>(0.19993579387664794921875f);
const Packet q4 = pset1<Packet>(-0.14209578931331634521484375f);
@@ -849,9 +811,8 @@
return pmadd(q, pmul(x, x2), x);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_float(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x_in) {
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
@@ -879,28 +840,17 @@
// Computes elementwise atan(x) for x in [-tan(pi/8):tan(pi/8)]
// with 2 ulp accuracy.
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet
-patan_reduced_double(const Packet& x) {
- const Packet q0 =
- pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
- const Packet q2 =
- pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
- const Packet q4 =
- pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
- const Packet q6 =
- pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
- const Packet q8 =
- pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
- const Packet q10 =
- pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
- const Packet q12 =
- pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
- const Packet q14 =
- pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
- const Packet q16 =
- pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
- const Packet q18 =
- pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced_double(const Packet& x) {
+ const Packet q0 = pset1<Packet>(-0.33333333333330028569463365784031338989734649658203);
+ const Packet q2 = pset1<Packet>(0.199999999990664090177006073645316064357757568359375);
+ const Packet q4 = pset1<Packet>(-0.142857141937123677255527809393242932856082916259766);
+ const Packet q6 = pset1<Packet>(0.111111065991039953404495577160560060292482376098633);
+ const Packet q8 = pset1<Packet>(-9.0907812986129224452902519715280504897236824035645e-2);
+ const Packet q10 = pset1<Packet>(7.6900542950704739442180368769186316058039665222168e-2);
+ const Packet q12 = pset1<Packet>(-6.6410112986494976294871150912513257935643196105957e-2);
+ const Packet q14 = pset1<Packet>(5.6920144995467943094258345126945641823112964630127e-2);
+ const Packet q16 = pset1<Packet>(-4.3577020814990513608577771265117917209863662719727e-2);
+ const Packet q18 = pset1<Packet>(2.1244050233624342527427586446719942614436149597168e-2);
// Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
// P(x) = x + x^3 * Q(x^2),
@@ -922,9 +872,8 @@
return pmadd(p, pmul(x, x2), x);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_double(const Packet& x_in) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x_in) {
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
@@ -968,9 +917,8 @@
return pxor(p, x_signmask);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patanh_float(const Packet& x) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
typedef typename unpacket_traits<Packet>::type Scalar;
static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
const Packet half = pset1<Packet>(0.5f);
@@ -982,12 +930,12 @@
const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
- const Packet x2 = pmul(x,x);
+ const Packet x2 = pmul(x, x);
Packet p = pmadd(C11, x2, C9);
p = pmadd(x2, p, C7);
p = pmadd(x2, p, C5);
p = pmadd(x2, p, C3);
- p = pmadd(pmul(x,x2), p, x);
+ p = pmadd(pmul(x, x2), p, x);
// For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
const Packet one = pset1<Packet>(1.0f);
@@ -996,19 +944,18 @@
return pselect(x_gt_half, r, p);
}
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdiv_complex(const Packet& x, const Packet& y) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
typedef typename unpacket_traits<Packet>::as_real RealPacket;
// In the following we annotate the code for the case where the inputs
  // are a pair of length-2 SIMD vectors representing a single pair of complex
// numbers x = a + i*b, y = c + i*d.
- const RealPacket y_abs = pabs(y.v); // |c|, |d|
- const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
- const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
- const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
+ const RealPacket y_abs = pabs(y.v); // |c|, |d|
+ const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
+ const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
+ const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
// Compute scaled denominator.
- const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
+ const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c
// Divide elementwise by denom.
@@ -1017,9 +964,8 @@
return Packet(pdiv(result_scaled.v, y_max));
}
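The same scaling written out for a single scalar complex division (a sketch of the idea for finite, nonzero y, not the packet code):

```cpp
#include <cmath>
#include <complex>

// Dividing the denominator by max(|c|, |d|) first keeps c'^2 + d'^2 well
// scaled, avoiding the overflow/underflow of the textbook formula.
std::complex<float> cdiv_scaled(std::complex<float> x, std::complex<float> y) {
  const float m = std::max(std::abs(y.real()), std::abs(y.imag()));
  const float c = y.real() / m, d = y.imag() / m;          // y_scaled
  const float denom = c * c + d * d;
  const float re = (x.real() * c + x.imag() * d) / denom;  // x * conj(y_scaled)
  const float im = (x.imag() * c - x.real() * d) / denom;
  return std::complex<float>(re / m, im / m);              // undo the scaling
}
```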
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt_complex(const Packet& a) {
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
typedef typename unpacket_traits<Packet>::type Scalar;
typedef typename Scalar::value_type RealScalar;
typedef typename unpacket_traits<Packet>::as_real RealPacket;
@@ -1060,14 +1006,14 @@
// l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
// where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
- RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
- RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
+ RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|]
+ RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|]
RealPacket a_max = pmax(a_abs, a_abs_flip);
RealPacket a_min = pmin(a_abs, a_abs_flip);
RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
RealPacket r = pdiv(a_min, a_max);
- const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
+ const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // [l0, l0, l1, l1]
// Set l to a_max if a_min is zero.
l = pselect(a_min_zero_mask, a_max, l);
@@ -1090,8 +1036,7 @@
// Step 4. Compute solution for inputs with negative real part:
// [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
- const RealPacket cst_imag_sign_mask =
- pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
+ const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
Packet negative_real_result;
  // Notice that rho is positive, so taking its absolute value is a no-op.
@@ -1131,7 +1076,6 @@
return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
}
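For reference, the branch-free steps above correspond to this scalar algorithm (branches reinstated; non-finite handling omitted):

```cpp
#include <cmath>
#include <complex>

// sqrt(x + i*y): l = |a| computed stably, rho = sqrt((l + |x|)/2),
// eta = y / (2*rho); a negative real part swaps the roles of rho and eta.
std::complex<float> csqrt_sketch(std::complex<float> a) {
  const float x = a.real(), y = a.imag();
  const float l = std::hypot(x, y);  // plays the role of the min/max trick above
  const float rho = std::sqrt(0.5f * (l + std::abs(x)));
  const float eta = (rho == 0.0f) ? 0.0f : y / (2.0f * rho);
  if (x < 0.0f)  // result must lie in the closed right half-plane
    return std::complex<float>(std::abs(eta), std::copysign(rho, y));
  return std::complex<float>(rho, eta);
}
```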
-
template <typename Packet>
struct psign_impl<Packet, std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
!NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
@@ -1222,18 +1166,16 @@
// This function splits x into the nearest integer n and fractional part r,
// such that x = n + r holds exactly.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void absolute_split(const Packet& x, Packet& n, Packet& r) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
n = pround(x);
r = psub(x, n);
}
// This function computes the pair {s_hi, s_lo} such that x + y = s_hi + s_lo
// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
s_hi = padd(x, y);
const Packet t = psub(s_hi, x);
s_lo = psub(y, t);
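As a concrete illustration (compile without -ffast-math, which would defeat the cancellation), the scalar version recovers exactly the bits that rounding dropped:

```cpp
// fast_twosum in scalar form; requires |x| >= |y|. For x = 1.0 and
// y = 1e-20, s_hi == 1.0 and s_lo == 1e-20, so s_hi + s_lo carries the
// exact sum even though fl(x + y) alone loses y entirely.
void fast_twosum_scalar(double x, double y, double& s_hi, double& s_lo) {
  s_hi = x + y;               // rounded sum
  const double t = s_hi - x;  // the part of y absorbed into s_hi
  s_lo = y - t;               // the part of y that was rounded away
}
```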
@@ -1244,10 +1186,8 @@
// a pair of floating point numbers. Given {x, y}, it computes the pair
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
// p_hi = fl(x * y).
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x, const Packet& y,
- Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
p_hi = pmul(x, y);
p_lo = pmsub(x, y, p_hi);
}
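In scalar form, the FMA variant above is simply:

```cpp
#include <cmath>

// One fused multiply-add computes x*y - fl(x*y) without intermediate
// rounding, i.e. exactly the rounding error of the product.
void twoprod_scalar(double x, double y, double& p_hi, double& p_lo) {
  p_hi = x * y;                  // rounded product, fl(x * y)
  p_lo = std::fma(x, y, -p_hi);  // exact residual x*y - p_hi
}
```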
@@ -1259,9 +1199,8 @@
// exactly and that half of the significand of x fits in x_hi.
// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
// 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
typedef typename unpacket_traits<Packet>::type Scalar;
EIGEN_CONSTEXPR int shift = (NumTraits<Scalar>::digits() + 1) / 2;
const Scalar shift_scale = Scalar(uint64_t(1) << shift); // Scalar constructor not necessarily constexpr.
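For double (digits() == 53, so shift == 27), the splitting reduces to this classic scalar sequence (a sketch; it relies on each operation rounding to nearest, so it must not be fused or reassociated):

```cpp
#include <cstdint>

// Dekker/Veltkamp split: x == x_hi + x_lo exactly, with each half using
// at most ~26 significand bits, so products of halves are exact.
void veltkamp_scalar(double x, double& x_hi, double& x_lo) {
  const double scale = double((std::uint64_t(1) << 27) + 1);  // 2^27 + 1
  const double c = scale * x;
  x_hi = c - (c - x);  // high half of the significand
  x_lo = x - x_hi;     // low half, exact by construction
}
```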
@@ -1275,10 +1214,8 @@
// Given floating point numbers {x, y} computes the pair
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
// p_hi = fl(x * y).
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x, const Packet& y,
- Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
Packet x_hi, x_lo, y_hi, y_lo;
veltkamp_splitting(x, x_hi, x_lo);
veltkamp_splitting(y, y_hi, y_lo);
@@ -1292,23 +1229,20 @@
#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-
// This function implements Dekker's algorithm for the addition
// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
// It returns the result as a pair {s_hi, s_lo} such that
// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
// 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
- void twosum(const Packet& x_hi, const Packet& x_lo,
- const Packet& y_hi, const Packet& y_lo,
- Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+ Packet& s_hi, Packet& s_lo) {
const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
Packet r_hi_1, r_lo_1;
- fast_twosum(x_hi, y_hi,r_hi_1, r_lo_1);
+ fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
Packet r_hi_2, r_lo_2;
- fast_twosum(y_hi, x_hi,r_hi_2, r_lo_2);
+ fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
@@ -1320,11 +1254,9 @@
// This is a version of twosum for double word numbers,
// which assumes that |x_hi| >= |y_hi|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
- void fast_twosum(const Packet& x_hi, const Packet& x_lo,
- const Packet& y_hi, const Packet& y_lo,
- Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+ Packet& s_hi, Packet& s_lo) {
Packet r_hi, r_lo;
fast_twosum(x_hi, y_hi, r_hi, r_lo);
const Packet s = padd(padd(y_lo, r_lo), x_lo);
@@ -1334,11 +1266,9 @@
// This is a version of twosum for adding a floating point number x to
// the double word number {y_hi, y_lo}, with the assumption
// that |x| >= |y_hi|.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void fast_twosum(const Packet& x,
- const Packet& y_hi, const Packet& y_lo,
- Packet& s_hi, Packet& s_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo, Packet& s_hi,
+ Packet& s_lo) {
Packet r_hi, r_lo;
fast_twosum(x, y_hi, r_hi, r_lo);
const Packet s = padd(y_lo, r_lo);
@@ -1353,10 +1283,8 @@
// in the floating point type.
// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
// 3rd edition, Birkh\"auser, 2016.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
- Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& p_hi, Packet& p_lo) {
Packet c_hi, c_lo1;
twoprod(x_hi, y, c_hi, c_lo1);
const Packet c_lo2 = pmul(x_lo, y);
@@ -1372,11 +1300,9 @@
// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
// of less than 2*2^{-2p}, where p is the number of significand bits
// in the floating point type.
-template<typename Packet>
-EIGEN_STRONG_INLINE
-void twoprod(const Packet& x_hi, const Packet& x_lo,
- const Packet& y_hi, const Packet& y_lo,
- Packet& p_hi, Packet& p_lo) {
+template <typename Packet>
+EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi, const Packet& y_lo,
+ Packet& p_hi, Packet& p_lo) {
Packet p_hi_hi, p_hi_lo;
twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
Packet p_lo_hi, p_lo_lo;
@@ -1389,8 +1315,7 @@
// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
// 2017. https://hal.archives-ouvertes.fr/hal-01351529
template <typename Packet>
-void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
- Packet& z_hi, Packet& z_lo) {
+void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y, Packet& z_hi, Packet& z_lo) {
const Packet t_hi = pdiv(x_hi, y);
Packet pi_hi, pi_lo;
twoprod(t_hi, y, pi_hi, pi_lo);
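A self-contained scalar sketch of the full recipe from the cited Joldes, Muller & Popescu paper (assuming an FMA-based twoprod, as in the sketch earlier):

```cpp
#include <cmath>

// Double-word / float division: one quotient guess t_hi, an exact
// residual via twoprod, and a correction folded back with fast_twosum.
void dw_div_fp(double x_hi, double x_lo, double y, double& z_hi, double& z_lo) {
  const double t_hi = x_hi / y;                        // quotient guess
  const double p_hi = t_hi * y;
  const double p_lo = std::fma(t_hi, y, -p_hi);        // t_hi*y = p_hi + p_lo exactly
  const double delta = (x_hi - p_hi) + (x_lo - p_lo);  // residual x - t_hi*y
  const double t_lo = delta / y;                       // correction term
  z_hi = t_hi + t_lo;                                  // fast_twosum(t_hi, t_lo)
  z_lo = t_lo - (z_hi - t_hi);
}
```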
@@ -1405,8 +1330,7 @@
template <typename Scalar>
struct accurate_log2 {
template <typename Packet>
- EIGEN_STRONG_INLINE
- void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+ EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
log2_x_hi = plog2(x);
log2_x_lo = pzero(x);
}
@@ -1421,8 +1345,7 @@
template <>
struct accurate_log2<float> {
template <typename Packet>
- EIGEN_STRONG_INLINE
- void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+ EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
    // The function log2(1+x)/x is approximated in the interval
// [1/sqrt(2)-1;sqrt(2)-1] by a degree 10 polynomial of the form
// Q(x) = (C0 + x * (C1 + x * (C2 + x * (C3 + x * P(x))))),
@@ -1437,14 +1360,14 @@
// > f = log2(1+x)/x;
// > interval = [sqrt(0.5)-1;sqrt(2)-1];
// > p = fpminimax(f,n,[|double,double,double,double,single...|],interval,relative,floating);
-
- const Packet p6 = pset1<Packet>( 9.703654795885e-2f);
+
+ const Packet p6 = pset1<Packet>(9.703654795885e-2f);
const Packet p5 = pset1<Packet>(-0.1690667718648f);
- const Packet p4 = pset1<Packet>( 0.1720575392246f);
+ const Packet p4 = pset1<Packet>(0.1720575392246f);
const Packet p3 = pset1<Packet>(-0.1789081543684f);
- const Packet p2 = pset1<Packet>( 0.2050433009862f);
+ const Packet p2 = pset1<Packet>(0.2050433009862f);
const Packet p1 = pset1<Packet>(-0.2404672354459f);
- const Packet p0 = pset1<Packet>( 0.2885761857032f);
+ const Packet p0 = pset1<Packet>(0.2885761857032f);
const Packet C3_hi = pset1<Packet>(-0.360674142838f);
const Packet C3_lo = pset1<Packet>(-6.13283912543e-09f);
@@ -1460,7 +1383,7 @@
// Evaluate P(x) in working precision.
// We evaluate it in multiple parts to improve instruction level
// parallelism.
- Packet x2 = pmul(x,x);
+ Packet x2 = pmul(x, x);
Packet p_even = pmadd(p6, x2, p4);
p_even = pmadd(p_even, x2, p2);
p_even = pmadd(p_even, x2, p0);
@@ -1502,8 +1425,7 @@
template <>
struct accurate_log2<double> {
template <typename Packet>
- EIGEN_STRONG_INLINE
- void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+ EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
// We use a transformation of variables:
// r = c * (x-1) / (x+1),
// such that
@@ -1588,8 +1510,7 @@
template <typename Scalar>
struct fast_accurate_exp2 {
template <typename Packet>
- EIGEN_STRONG_INLINE
- Packet operator()(const Packet& x) {
+ EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
// TODO(rmlarsen): Add a pexp2 packetop.
return pexp(pmul(pset1<Packet>(Scalar(EIGEN_LN2)), x));
}
@@ -1602,8 +1523,7 @@
template <>
struct fast_accurate_exp2<float> {
template <typename Packet>
- EIGEN_STRONG_INLINE
- Packet operator()(const Packet& x) {
+ EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
// This function approximates exp2(x) by a degree 6 polynomial of the form
// Q(x) = 1 + x * (C + x * P(x)), where the degree 4 polynomial P(x) is evaluated in
// single precision, and the remaining steps are evaluated with extra precision using
@@ -1628,7 +1548,7 @@
// Evaluate P(x) in working precision.
// We evaluate even and odd parts of the polynomial separately
// to gain some instruction level parallelism.
- Packet x2 = pmul(x,x);
+ Packet x2 = pmul(x, x);
Packet p_even = pmadd(p4, x2, p2);
Packet p_odd = pmadd(p3, x2, p1);
p_even = pmadd(p_even, x2, p0);
@@ -1660,8 +1580,7 @@
template <>
struct fast_accurate_exp2<double> {
template <typename Packet>
- EIGEN_STRONG_INLINE
- Packet operator()(const Packet& x) {
+ EIGEN_STRONG_INLINE Packet operator()(const Packet& x) {
// This function approximates exp2(x) by a degree 10 polynomial of the form
// Q(x) = 1 + x * (C + x * P(x)), where the degree 8 polynomial P(x) is evaluated in
// single precision, and the remaining steps are evaluated with extra precision using
@@ -1683,14 +1602,14 @@
const Packet p2 = pset1<Packet>(9.618129107593478832e-3);
const Packet p1 = pset1<Packet>(5.550410866481961247e-2);
const Packet p0 = pset1<Packet>(0.240226506959101332);
- const Packet C_hi = pset1<Packet>(0.693147180559945286);
+ const Packet C_hi = pset1<Packet>(0.693147180559945286);
const Packet C_lo = pset1<Packet>(4.81927865669806721e-17);
const Packet one = pset1<Packet>(1.0);
// Evaluate P(x) in working precision.
// We evaluate even and odd parts of the polynomial separately
// to gain some instruction level parallelism.
- Packet x2 = pmul(x,x);
+ Packet x2 = pmul(x, x);
Packet p_even = pmadd(p8, x2, p6);
Packet p_odd = pmadd(p9, x2, p7);
p_even = pmadd(p_even, x2, p4);
@@ -1885,15 +1804,17 @@
*/
template <typename Packet, int N>
struct ppolevl {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+ const typename unpacket_traits<Packet>::type coeff[]) {
EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
- return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+ return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
}
};
template <typename Packet>
struct ppolevl<Packet, 0> {
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+ const typename unpacket_traits<Packet>::type coeff[]) {
EIGEN_UNUSED_VARIABLE(x);
return pset1<Packet>(coeff[0]);
}
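The recursion unrolls Horner's rule at compile time; stripped of the packet machinery, it looks like this (scalar sketch):

```cpp
// polevl_scalar<T, N>::run(x, c) evaluates c[0]*x^N + ... + c[N] by
// Horner's rule; the template recursion unrolls fully at compile time.
template <typename T, int N>
struct polevl_scalar {
  static T run(T x, const T coeff[]) {
    return polevl_scalar<T, N - 1>::run(x, coeff) * x + coeff[N];
  }
};
template <typename T>
struct polevl_scalar<T, 0> {
  static T run(T, const T coeff[]) { return coeff[0]; }
};
// e.g. polevl_scalar<double, 2>::run(x, c) == (c[0]*x + c[1])*x + c[2]
```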
@@ -1953,8 +1874,8 @@
template <typename Packet, int N>
struct pchebevl {
- EIGEN_DEVICE_FUNC
- static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+ const typename unpacket_traits<Packet>::type coef[]) {
typedef typename unpacket_traits<Packet>::type Scalar;
Packet b0 = pset1<Packet>(coef[0]);
Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
@@ -2052,14 +1973,14 @@
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet gen_pow(const Packet& x,
- const typename unpacket_traits<Packet>::type& exponent) {
+ const typename unpacket_traits<Packet>::type& exponent) {
const Packet exponent_packet = pset1<Packet>(exponent);
return generic_pow_impl(x, exponent_packet);
}
template <typename Packet, typename ScalarExponent>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
- const ScalarExponent& exponent) {
+ const ScalarExponent& exponent) {
using Scalar = typename unpacket_traits<Packet>::type;
// non-integer base and exponent case
@@ -2153,7 +2074,6 @@
return pand(x_is_one, x);
}
-
} // end namespace unary_pow
template <typename Packet, typename ScalarExponent,
@@ -2205,7 +2125,7 @@
}
};
-} // end namespace internal
-} // end namespace Eigen
+} // end namespace internal
+} // end namespace Eigen
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 9e038ab..ade9f3f 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -22,110 +22,96 @@
/***************************************************************************
* Some generic implementations to be used by implementors
-***************************************************************************/
+ ***************************************************************************/
/** Default implementation of pfrexp.
- * It is expected to be called by implementers of template<> pfrexp.
- */
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic(const Packet& a, Packet& exponent);
+ * It is expected to be called by implementers of template<> pfrexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent);
// Extracts the biased exponent value from Packet p, and casts the results to
// a floating-point Packet type. Used by pfrexp_generic. Override this if
// there is no unpacket_traits<Packet>::integer_packet.
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pfrexp_generic_get_biased_exponent(const Packet& p);
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& p);
/** Default implementation of pldexp.
- * It is expected to be called by implementers of template<> pldexp.
- */
-template<typename Packet> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet pldexp_generic(const Packet& a, const Packet& exponent);
+ * It is expected to be called by implementers of template<> pldexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
/** \internal \returns log(x) for single precision float */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
/** \internal \returns log2(x) for single precision float */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x);
/** \internal \returns log(x) for double precision real numbers */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x);
/** \internal \returns log2(x) for double precision real numbers */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog2_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x);
/** \internal \returns log(1 + x) */
-template<typename Packet>
+template <typename Packet>
Packet generic_plog1p(const Packet& x);
/** \internal \returns exp(x)-1 */
-template<typename Packet>
+template <typename Packet>
Packet generic_expm1(const Packet& x);
/** \internal \returns exp(x) for single precision float */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_float(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
/** \internal \returns exp(x) for double precision real numbers */
template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp_double(const Packet _x);
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x);
/** \internal \returns sin(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x);
/** \internal \returns cos(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
/** \internal \returns asin(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
/** \internal \returns acos(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x);
/** \internal \returns atan(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_float(const Packet& x);
/** \internal \returns atan(x) for double precision real numbers */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan_double(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_double(const Packet& x);
/** \internal \returns atanh(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patanh_float(const Packet& x);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x);
/** \internal \returns sqrt(x) for complex types */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt_complex(const Packet& a);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a);
/** \internal \returns x / y for complex types */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pdiv_complex(const Packet& x, const Packet& y);
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y);
-template <typename Packet, int N> struct ppolevl;
+template <typename Packet, int N>
+struct ppolevl;
// Macros for instantiating these generic functions for different backends.
#define EIGEN_PACKET_FUNCTION(METHOD, SCALAR, PACKET) \
@@ -166,7 +152,7 @@
EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET) \
EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)
-} // end namespace internal
-} // end namespace Eigen
+} // end namespace internal
+} // end namespace Eigen
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index c652318..92516c7 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -24,7 +24,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
@@ -32,7 +31,6 @@
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
-
#ifndef EIGEN_HALF_H
#define EIGEN_HALF_H
@@ -46,16 +44,15 @@
// As a consequence, we get compile failures when compiling Eigen with
// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
// Eigen with GPU support
- #pragma push_macro("EIGEN_CONSTEXPR")
- #undef EIGEN_CONSTEXPR
- #define EIGEN_CONSTEXPR
+#pragma push_macro("EIGEN_CONSTEXPR")
+#undef EIGEN_CONSTEXPR
+#define EIGEN_CONSTEXPR
#endif
-#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \
- template <> \
- EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED \
- PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
- return float2half(METHOD<PACKET_F>(half2float(_x))); \
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD) \
+ template <> \
+ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+ return float2half(METHOD<PACKET_F>(half2float(_x))); \
}
namespace Eigen {
@@ -97,8 +94,7 @@
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw() : x(0) {}
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {
- }
+ explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
__fp16 x;
#else
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw(numext::uint16_t raw) : x(raw) {}
@@ -107,15 +103,15 @@
};
#elif defined(EIGEN_HAS_HIP_FP16)
- // Nothing to do here
- // HIP fp16 header file has a definition for __half_raw
+// Nothing to do here
+// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
- #if EIGEN_CUDA_SDK_VER < 90000
- // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
- typedef __half __half_raw;
- #endif // defined(EIGEN_HAS_CUDA_FP16)
+#if EIGEN_CUDA_SDK_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
+#endif // defined(EIGEN_HAS_CUDA_FP16)
#elif defined(SYCL_DEVICE_ONLY)
- typedef cl::sycl::half __half_raw;
+typedef cl::sycl::half __half_raw;
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
@@ -127,21 +123,20 @@
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
- #elif defined(EIGEN_HAS_CUDA_FP16)
- #if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
- #endif
- #endif
+#endif
+#endif
#endif
};
-} // namespace half_impl
+} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
-
// Writing this out as separate #if-else blocks to make the code easier to follow
// The same applies to most #if-else blocks in this file
#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
@@ -153,12 +148,12 @@
// Nothing to do here
// HIP fp16 header file has a definition for __half_raw
#elif defined(EIGEN_HAS_CUDA_FP16)
- // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
- // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
- // #if defined(EIGEN_HAS_CUDA_FP16) is needed
- #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
- typedef half_impl::__half_raw __half_raw;
- #endif
+// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
+// #if defined(EIGEN_HAS_CUDA_FP16) is needed
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+ typedef half_impl::__half_raw __half_raw;
+#endif
#endif
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half() {}
@@ -166,31 +161,29 @@
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
- #elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
- #endif
- #endif
#endif
-
+#endif
+#endif
explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
- template<class T>
+ template <class T>
explicit EIGEN_DEVICE_FUNC half(T val)
: half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
- explicit EIGEN_DEVICE_FUNC half(float f)
- : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+ explicit EIGEN_DEVICE_FUNC half(float f) : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
// Following the convention of numpy, converting between complex and
  // float will lead to loss of the imaginary part.
- template<typename RealScalar>
+ template <typename RealScalar>
explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
: half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
- EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless.
+ EIGEN_DEVICE_FUNC operator float() const { // NOLINT: Allow implicit conversion to float, because it is lossless.
return half_impl::half_to_float(*this);
}
@@ -224,8 +217,10 @@
static EIGEN_CONSTEXPR const bool is_bounded = true;
static EIGEN_CONSTEXPR const bool is_modulo = false;
static EIGEN_CONSTEXPR const int digits = 11;
- static EIGEN_CONSTEXPR const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
- static EIGEN_CONSTEXPR const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static EIGEN_CONSTEXPR const int digits10 =
+ 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+ static EIGEN_CONSTEXPR const int max_digits10 =
+ 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
static EIGEN_CONSTEXPR const int min_exponent = -13;
static EIGEN_CONSTEXPR const int min_exponent10 = -4;
@@ -236,9 +231,9 @@
// detect tininess in the same way for all operations in radix two"
static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
- static EIGEN_CONSTEXPR Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+ static EIGEN_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
static EIGEN_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
- static EIGEN_CONSTEXPR Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+ static EIGEN_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static EIGEN_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
static EIGEN_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
static EIGEN_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
@@ -247,51 +242,51 @@
static EIGEN_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
};
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_specialized;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_signed;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_integer;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_exact;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_infinity;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_quiet_NaN;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_signaling_NaN;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::has_denorm_loss;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const std::float_round_style numeric_limits_half_impl<T>::round_style;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_iec559;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_bounded;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::is_modulo;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::digits10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_digits10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::radix;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::min_exponent10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const int numeric_limits_half_impl<T>::max_exponent10;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::traps;
-template<typename T>
+template <typename T>
EIGEN_CONSTEXPR const bool numeric_limits_half_impl<T>::tinyness_before;
} // end namespace half_impl
} // end namespace Eigen
@@ -301,13 +296,13 @@
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
-template<>
+template <>
class numeric_limits<Eigen::half> : public Eigen::half_impl::numeric_limits_half_impl<> {};
-template<>
+template <>
class numeric_limits<const Eigen::half> : public numeric_limits<Eigen::half> {};
-template<>
+template <>
class numeric_limits<volatile Eigen::half> : public numeric_limits<Eigen::half> {};
-template<>
+template <>
class numeric_limits<const volatile Eigen::half> : public numeric_limits<Eigen::half> {};
} // end namespace std
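
As the StackOverflow link above notes, user code may query any cv-qualified form of the type, so all four specializations must exist. A quick compile-time spot check, assuming Eigen/Core is on the include path and a C++11 host build:

#include <limits>
#include <Eigen/Core>

static_assert(std::numeric_limits<Eigen::half>::is_specialized, "plain");
static_assert(std::numeric_limits<const Eigen::half>::is_specialized, "const");
static_assert(std::numeric_limits<volatile Eigen::half>::is_specialized, "volatile");
// All four variants report identical properties, e.g. the 11 mantissa digits above:
static_assert(std::numeric_limits<const volatile Eigen::half>::digits == 11, "cv");
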
@@ -315,8 +310,7 @@
namespace half_impl {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
- EIGEN_CUDA_ARCH >= 530) || \
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
// Note: We deliberately do *not* define this to 1 even if we have Arm's native
// fp16 type since GPU halfs are rather different from native CPU halfs.
@@ -330,20 +324,16 @@
// conversion steps back and forth.
#if defined(EIGEN_HAS_NATIVE_FP16)
-EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hadd(::__half(a), ::__half(b));
#else
return __hadd(a, b);
#endif
}
-EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
- return __hmul(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
- return __hsub(a, b);
-}
-EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
return __hdiv(a, b);
#else
@@ -352,99 +342,63 @@
return __float2half(num / denom);
#endif
}
-EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
- return __hneg(a);
-}
-EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
+EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
a = a + b;
return a;
}
-EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
a = a * b;
return a;
}
-EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
a = a - b;
return a;
}
-EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
a = a / b;
return a;
}
-EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
- return __heq(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
- return __hne(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
- return __hlt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
- return __hle(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
- return __hgt(a, b);
-}
-EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
- return __hge(a, b);
-}
+EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { return __heq(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { return __hne(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { return __hlt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
#endif
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
- return half(vaddh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
- return half(vmulh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
- return half(vsubh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
- return half(vdivh_f16(a.x, b.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
- return half(vnegh_f16(a.x));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(vmulh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(vsubh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(vdivh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(vnegh_f16(a.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
a = half(vaddh_f16(a.x, b.x));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
a = half(vmulh_f16(a.x, b.x));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
a = half(vsubh_f16(a.x, b.x));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(vdivh_f16(a.x, b.x));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
- return vceqh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
- return !vceqh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
- return vclth_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
- return vcleh_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
- return vcgth_f16(a.x, b.x);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
- return vcgeh_f16(a.x, b.x);
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return vclth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
// of the functions, while the latter can only deal with one of them.
-#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
+#elif !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
// We need to provide emulated *host-side* FP16 operators for clang.
@@ -452,64 +406,48 @@
#undef EIGEN_DEVICE_FUNC
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
#define EIGEN_DEVICE_FUNC __host__
-#else // both host and device need emulated ops.
+#else // both host and device need emulated ops.
#define EIGEN_DEVICE_FUNC __host__ __device__
#endif
#endif
// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
// to/from fp32.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
- return half(float(a) + float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
- return half(float(a) * float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
- return half(float(a) - float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
- return half(float(a) / float(b));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(float(a) + float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(float(a) * float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(float(a) - float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(float(a) / float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
half result;
result.x = a.x ^ 0x8000;
return result;
}
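
The emulated negation above flips only the sign bit, so it is exact for every value, including plus/minus zero, infinities, and NaN payloads. The same trick on raw bits, for reference:

#include <cstdint>

// Negate a binary16 value without any float round-trip: toggle bit 15.
inline uint16_t half_negate_bits(uint16_t x) { return x ^ 0x8000; }
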
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
a = half(float(a) + float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
a = half(float(a) * float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
a = half(float(a) - float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
- return numext::equal_strict(float(a),float(b));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
+ return numext::equal_strict(float(a), float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) {
return numext::not_equal_strict(float(a), float(b));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
- return float(a) < float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
- return float(a) <= float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
- return float(a) > float(b);
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
- return float(a) >= float(b);
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return float(a) < float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return float(a) <= float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return float(a) > float(b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return float(a) >= float(b); }
#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
#pragma pop_macro("EIGEN_DEVICE_FUNC")
@@ -518,7 +456,7 @@
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
return half(static_cast<float>(a) / static_cast<float>(b));
}
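
A sketch of what this overload buys, assuming a host build: half represents nothing above 65504, so routing the Index denominator through half first would saturate large counts to +Inf, while a float keeps integers exact up to 2^24 and rounds only once, at the final half conversion.

#include <cstdio>
#include <Eigen/Core>

int main() {
  Eigen::half a(1.0f);
  Eigen::Index n = 100000;  // larger than the biggest finite half, 65504
  Eigen::half good = a / n;                                  // overload above: ~1e-05
  Eigen::half bad = a / Eigen::half(static_cast<float>(n));  // denominator -> +Inf, result 0
  std::printf("%g vs %g\n", static_cast<float>(good), static_cast<float>(bad));
}
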
@@ -557,8 +495,8 @@
// Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
// of this catch22 by having separate bodies for GPU / non GPU
#if defined(EIGEN_HAS_GPU_FP16)
- __half_raw h;
- h.x = x;
+ __half_raw h;
+ h.x = x;
return h;
#else
return __half_raw(x);
@@ -585,18 +523,18 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
__half tmp_ff = __float2half(ff);
return *(__half_raw*)&tmp_ff;
#elif defined(EIGEN_HAS_FP16_C)
__half_raw h;
- #if EIGEN_COMP_MSVC
- // MSVC does not have scalar instructions.
- h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
- #else
- h.x = _cvtss_sh(ff, 0);
- #endif
+#if EIGEN_COMP_MSVC
+ // MSVC does not have scalar instructions.
+ h.x = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
+#else
+ h.x = _cvtss_sh(ff, 0);
+#endif
return h;
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
@@ -605,11 +543,12 @@
return h;
#else
- float32_bits f; f.f = ff;
+ float32_bits f;
+ f.f = ff;
- const float32_bits f32infty = { 255 << 23 };
- const float32_bits f16max = { (127 + 16) << 23 };
- const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+ const float32_bits f32infty = {255 << 23};
+ const float32_bits f16max = {(127 + 16) << 23};
+ const float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23};
unsigned int sign_mask = 0x80000000u;
__half_raw o;
o.x = static_cast<numext::uint16_t>(0x0u);
@@ -622,10 +561,10 @@
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
- if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
- o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
- } else { // (De)normalized number or zero
- if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
+ if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
+ o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+ } else { // (De)normalized number or zero
+ if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
@@ -634,7 +573,7 @@
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<numext::uint16_t>(f.u - denorm_magic.u);
} else {
- unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+ unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
// Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
@@ -654,51 +593,51 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
- #if EIGEN_COMP_MSVC
- // MSVC does not have scalar instructions.
- return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
- #else
- return _cvtsh_ss(h.x);
- #endif
+#if EIGEN_COMP_MSVC
+ // MSVC does not have scalar instructions.
+ return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
+#else
+ return _cvtsh_ss(h.x);
+#endif
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
return static_cast<float>(h.x);
#else
- const float32_bits magic = { 113 << 23 };
- const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
+ const float32_bits magic = {113 << 23};
+ const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
float32_bits o;
- o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
- unsigned int exp = shifted_exp & o.u; // just the exponent
- o.u += (127 - 15) << 23; // exponent adjust
+ o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
+ unsigned int exp = shifted_exp & o.u; // just the exponent
+ o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
- if (exp == shifted_exp) { // Inf/NaN?
- o.u += (128 - 16) << 23; // extra exp adjust
- } else if (exp == 0) { // Zero/Denormal?
- o.u += 1 << 23; // extra exp adjust
- o.f -= magic.f; // renormalize
+ if (exp == shifted_exp) { // Inf/NaN?
+ o.u += (128 - 16) << 23; // extra exp adjust
+ } else if (exp == 0) { // Zero/Denormal?
+ o.u += 1 << 23; // extra exp adjust
+ o.f -= magic.f; // renormalize
}
- o.u |= (h.x & 0x8000) << 16; // sign bit
+ o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
#endif
}
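
A round-trip sketch of the two conversion routines, assuming the software fallback path (the FP16C, NEON, and GPU branches produce the same bits for these cases, since all use round-to-nearest-even):

#include <cstdio>
#include <limits>
#include <Eigen/Core>

int main() {
  // Ties round to even: 2049.0f lies exactly between the representable
  // halfs 2048 and 2050 and lands on the even mantissa, 2048.
  std::printf("%g\n", static_cast<float>(Eigen::half(2049.0f)));  // 2048
  // A float quiet NaN maps to the canonical quiet pattern 0x7e00.
  Eigen::half n(std::numeric_limits<float>::quiet_NaN());
  std::printf("0x%04x\n", static_cast<unsigned>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(n)));
}
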
// --- standard functions ---
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
#ifdef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
#else
return (a.x & 0x7fff) == 0x7c00;
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hisnan(a);
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -706,8 +645,8 @@
return (a.x & 0x7fff) > 0x7c00;
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
- return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
+ return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
}
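
The three predicates above reduce to exponent-field tests on the raw bits; the equivalent standalone checks, for reference:

#include <cstdint>

// binary16 classification: exponent all-ones (0x7c00) with zero mantissa
// is Inf, with nonzero mantissa is NaN; a non-saturated exponent is finite.
inline bool half_bits_isinf(uint16_t x) { return (x & 0x7fff) == 0x7c00; }
inline bool half_bits_isnan(uint16_t x) { return (x & 0x7fff) > 0x7c00; }
inline bool half_bits_isfinite(uint16_t x) { return (x & 0x7c00) != 0x7c00; }
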
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
@@ -721,39 +660,34 @@
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
- defined(EIGEN_HIP_DEVICE_COMPILE)
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
- return half(::expf(float(a)));
+ return half(::expf(float(a)));
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
- return half(numext::expm1(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
+ EIGEN_CUDA_ARCH >= 530) || \
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return half(::hlog(a));
#else
return half(::logf(float(a)));
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
- return half(numext::log1p(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
- return half(::log10f(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { return half(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
- defined(EIGEN_HIP_DEVICE_COMPILE)
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
- return half(::sqrtf(float(a)));
+ return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
@@ -762,33 +696,17 @@
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan2(const half& a, const half& b) {
return half(::atan2f(float(a), float(b)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
- return half(::sinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
- return half(::cosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
- return half(::tanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
- return half(::tanhf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) {
- return half(::asinf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
- return half(::acosf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) {
- return half(::atanf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) {
- return half(::atanhf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { return half(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { return half(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { return half(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { return half(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
- defined(EIGEN_HIP_DEVICE_COMPILE)
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
@@ -796,25 +714,21 @@
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
- defined(EIGEN_HIP_DEVICE_COMPILE)
+ defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) {
- return half(::rintf(float(a)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) {
- return half(::roundf(float(a)));
-}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { return half(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { return half(::roundf(float(a))); }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
return half(::fmodf(float(a), float(b)));
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -822,9 +736,9 @@
return f2 < f1 ? b : a;
#endif
}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
- (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+ (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
@@ -834,51 +748,43 @@
}
#ifndef EIGEN_NO_IO
-EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
os << static_cast<float>(v);
return os;
}
#endif
-} // end namespace half_impl
+} // end namespace half_impl
// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace internal {
-template<>
-struct random_default_impl<half, false, false>
-{
- static inline half run(const half& x, const half& y)
- {
- return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
+template <>
+struct random_default_impl<half, false, false> {
+ static inline half run(const half& x, const half& y) {
+ return x + (y - x) * half(float(std::rand()) / float(RAND_MAX));
}
- static inline half run()
- {
- return run(half(-1.f), half(1.f));
- }
+ static inline half run() { return run(half(-1.f), half(1.f)); }
};
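
With this specialization in place, Random() on half expressions draws through std::rand(); a small usage sketch, assuming a host build:

#include <cstdlib>
#include <Eigen/Core>

int main() {
  std::srand(42);  // random_default_impl<half> above consumes std::rand()
  // Entries lie in [-1, 1], computed in float and rounded to half per run(x, y).
  Eigen::Matrix<Eigen::half, 2, 2> m = Eigen::Matrix<Eigen::half, 2, 2>::Random();
  (void)m;
}
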
-template<> struct is_arithmetic<half> { enum { value = true }; };
+template <>
+struct is_arithmetic<half> {
+ enum { value = true };
+};
-} // end namespace internal
+} // end namespace internal
-template<> struct NumTraits<Eigen::half>
- : GenericNumTraits<Eigen::half>
-{
- enum {
- IsSigned = true,
- IsInteger = false,
- IsComplex = false,
- RequireInitialization = false
- };
+template <>
+struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+ enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
- return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f);
+ return half_impl::raw_uint16_to_half(0x211f); // Eigen::half(1e-2f);
}
EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
return half_impl::raw_uint16_to_half(0x7bff);
@@ -894,10 +800,10 @@
}
};
-} // end namespace Eigen
+} // end namespace Eigen
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
- #pragma pop_macro("EIGEN_CONSTEXPR")
+#pragma pop_macro("EIGEN_CONSTEXPR")
#endif
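
Decoded, the raw NumTraits constants above are: epsilon() = 0x0800 = 2^-13 ~ 1.22e-4, dummy_precision() = 0x211f ~ 1.0e-2 (matching the inline comment), highest() = 0x7bff = 65504. A spot check, assuming a host build:

#include <cstdio>
#include <Eigen/Core>

int main() {
  using Eigen::NumTraits;
  std::printf("%g\n", static_cast<float>(NumTraits<Eigen::half>::epsilon()));          // 0.00012207
  std::printf("%g\n", static_cast<float>(NumTraits<Eigen::half>::dummy_precision()));  // ~0.0100021
  std::printf("%g\n", static_cast<float>(NumTraits<Eigen::half>::highest()));          // 65504
}
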
namespace Eigen {
@@ -946,63 +852,65 @@
// with native support for __half and __nv_bfloat16
//
// Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) \
- || defined(EIGEN_HIPCC)
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
+ int width = warpSize) {
const __half h = var;
return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
+ int width = warpSize) {
const __half h = var;
return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
+ int width = warpSize) {
const __half h = var;
return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
+ int width = warpSize) {
const __half h = var;
return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
}
-#else // HIP or CUDA SDK < 9.0
+#else // HIP or CUDA SDK < 9.0
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
}
-__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
}
-#endif // HIP vs CUDA
-#endif // __shfl*
+#endif // HIP vs CUDA
+#endif // __shfl*
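
A device-only usage sketch of the wrappers above: the standard warp tree reduction, written directly over Eigen::half, assuming CUDA SDK >= 9.0 and all 32 lanes active:

#include <Eigen/Core>

__device__ Eigen::half warp_sum(Eigen::half v) {
  // Each step folds the upper half of the active lanes onto the lower half;
  // after log2(warpSize) steps lane 0 holds the warp total.
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}
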
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) \
- || defined(EIGEN_HIPCC)
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
}
-#endif // __ldg
+#endif // __ldg
#if EIGEN_HAS_STD_HASH
namespace std {
@@ -1012,7 +920,7 @@
return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
}
};
-} // end namespace std
+} // end namespace std
#endif
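
With the std::hash specialization above, Eigen::half can key the standard unordered containers. Note the hash is the raw bit pattern, so +0 and -0 compare equal yet hash differently, a caveat shared by any bit-pattern hash of a signed-zero type. A minimal usage sketch:

#include <unordered_set>
#include <Eigen/Core>

int main() {
  std::unordered_set<Eigen::half> seen;
  seen.insert(Eigen::half(1.5f));
  // Lookup with an equal value (same bit pattern) succeeds.
  bool hit = seen.count(Eigen::half(1.5f)) > 0;
  (void)hit;
}
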
namespace Eigen {
@@ -1020,8 +928,7 @@
template <>
struct cast_impl<float, half> {
- EIGEN_DEVICE_FUNC
- static inline half run(const float& a) {
+ EIGEN_DEVICE_FUNC static inline half run(const float& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(a);
@@ -1033,8 +940,7 @@
template <>
struct cast_impl<int, half> {
- EIGEN_DEVICE_FUNC
- static inline half run(const int& a) {
+ EIGEN_DEVICE_FUNC static inline half run(const int& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __float2half(static_cast<float>(a));
@@ -1046,8 +952,7 @@
template <>
struct cast_impl<half, float> {
- EIGEN_DEVICE_FUNC
- static inline float run(const half& a) {
+ EIGEN_DEVICE_FUNC static inline float run(const half& a) {
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
return __half2float(a);
@@ -1060,4 +965,4 @@
} // namespace internal
} // namespace Eigen
-#endif // EIGEN_HALF_H
+#endif // EIGEN_HALF_H
diff --git a/Eigen/src/Core/arch/Default/Settings.h b/Eigen/src/Core/arch/Default/Settings.h
index a5c3ada..7e3a970 100644
--- a/Eigen/src/Core/arch/Default/Settings.h
+++ b/Eigen/src/Core/arch/Default/Settings.h
@@ -8,7 +8,6 @@
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
/* All the parameters defined in this file can be specialized in the
* architecture specific files, and/or by the user.
* More to come... */
@@ -17,33 +16,32 @@
#define EIGEN_DEFAULT_SETTINGS_H
/** Defines the maximal loop size to enable meta unrolling of loops.
- * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
- * it does not correspond to the number of iterations or the number of instructions
- */
+ * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+ * it does not correspond to the number of iterations or the number of instructions
+ */
#ifndef EIGEN_UNROLLING_LIMIT
#define EIGEN_UNROLLING_LIMIT 110
#endif
/** Defines the threshold between a "small" and a "large" matrix.
- * This threshold is mainly used to select the proper product implementation.
- */
+ * This threshold is mainly used to select the proper product implementation.
+ */
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
/** Defines the maximal width of the blocks used in the triangular product and solver
- * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
- */
+ * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+ */
#ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
#define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
#endif
-
/** Defines the default number of registers available for that architecture.
- * Currently it must be 8 or 16. Other values will fail.
- */
+ * Currently it must be 8 or 16. Other values will fail.
+ */
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
#endif
-#endif // EIGEN_DEFAULT_SETTINGS_H
+#endif // EIGEN_DEFAULT_SETTINGS_H
diff --git a/Eigen/src/Core/arch/GPU/Complex.h b/Eigen/src/Core/arch/GPU/Complex.h
index 8a7869c..fa46aec 100644
--- a/Eigen/src/Core/arch/GPU/Complex.h
+++ b/Eigen/src/Core/arch/GPU/Complex.h
@@ -31,7 +31,7 @@
// to the first inclusion of <complex>.
#if defined(EIGEN_GPUCC) && defined(EIGEN_GPU_COMPILE_PHASE)
-
+
// ICC already specializes std::complex<float> and std::complex<double>
// operators, preventing us from making them device functions here.
// This will lead to silent runtime errors if the operators are used on device.
@@ -62,33 +62,30 @@
// Specialized std::complex overloads.
namespace complex_operator_detail {
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_multiply(const std::complex<T>& a,
+ const std::complex<T>& b) {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
- return std::complex<T>(
- a_real * b_real - a_imag * b_imag,
- a_imag * b_real + a_real * b_imag);
+ return std::complex<T>(a_real * b_real - a_imag * b_imag, a_imag * b_real + a_real * b_imag);
}
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_fast(const std::complex<T>& a,
+ const std::complex<T>& b) {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
const T norm = (b_real * b_real + b_imag * b_imag);
- return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm,
- (a_imag * b_real - a_real * b_imag) / norm);
+ return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm, (a_imag * b_real - a_real * b_imag) / norm);
}
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_stable(const std::complex<T>& a,
+ const std::complex<T>& b) {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
@@ -99,13 +96,13 @@
const T rscale = scale_imag ? T(1) : b_real / b_imag;
const T iscale = scale_imag ? b_imag / b_real : T(1);
const T denominator = b_real * rscale + b_imag * iscale;
- return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
+ return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
(a_imag * rscale - a_real * iscale) / denominator);
}
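
complex_divide_stable is a Smith-style division: it scales by the larger-magnitude component of b, so nothing ever squares an operand the way the fast path's norm does. A host-side re-implementation sketch of both strategies (the scale_imag predicate, defined just above this hunk, is assumed to pick the larger component as its name suggests):

#include <cmath>
#include <complex>
#include <cstdio>

static std::complex<float> divide_fast(std::complex<float> a, std::complex<float> b) {
  const float norm = b.real() * b.real() + b.imag() * b.imag();  // overflows for large b
  return {(a.real() * b.real() + a.imag() * b.imag()) / norm,
          (a.imag() * b.real() - a.real() * b.imag()) / norm};
}

static std::complex<float> divide_stable(std::complex<float> a, std::complex<float> b) {
  const bool scale_imag = std::abs(b.imag()) <= std::abs(b.real());
  const float rscale = scale_imag ? 1.0f : b.real() / b.imag();
  const float iscale = scale_imag ? b.imag() / b.real() : 1.0f;
  const float denom = b.real() * rscale + b.imag() * iscale;  // stays finite
  return {(a.real() * rscale + a.imag() * iscale) / denom,
          (a.imag() * rscale - a.real() * iscale) / denom};
}

int main() {
  std::complex<float> a(1.0f, 0.0f), b(1e30f, 0.0f);  // exact quotient is 1e-30
  auto f = divide_fast(a, b);    // norm = 1e60 -> +Inf, quotient collapses to 0
  auto s = divide_stable(a, b);  // denom = 1e30, quotient 1e-30
  std::printf("fast: %g, stable: %g\n", f.real(), s.real());
}
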
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::complex<T>& a,
+ const std::complex<T>& b) {
#if EIGEN_FAST_MATH
return complex_divide_fast(a, b);
#else
@@ -118,131 +115,107 @@
// since they are already specialized for float/double/long double within
// the standard <complex> header. We also do not specialize the stream
// operators.
-#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T) \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator+(const std::complex<T>& a) { return a; } \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator-(const std::complex<T>& a) { \
- return std::complex<T>(-numext::real(a), -numext::imag(a)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) { \
- return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator+(const std::complex<T>& a, const T& b) { \
- return std::complex<T>(numext::real(a) + b, numext::imag(a)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator+(const T& a, const std::complex<T>& b) { \
- return std::complex<T>(a + numext::real(b), numext::imag(b)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) { \
- return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator-(const std::complex<T>& a, const T& b) { \
- return std::complex<T>(numext::real(a) - b, numext::imag(a)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator-(const T& a, const std::complex<T>& b) { \
- return std::complex<T>(a - numext::real(b), -numext::imag(b)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) { \
- return complex_multiply(a, b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator*(const std::complex<T>& a, const T& b) { \
- return std::complex<T>(numext::real(a) * b, numext::imag(a) * b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator*(const T& a, const std::complex<T>& b) { \
- return std::complex<T>(a * numext::real(b), a * numext::imag(b)); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) { \
- return complex_divide(a, b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator/(const std::complex<T>& a, const T& b) { \
- return std::complex<T>(numext::real(a) / b, numext::imag(a) / b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T> operator/(const T& a, const std::complex<T>& b) { \
- return complex_divide(std::complex<T>(a, 0), b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
- numext::real_ref(a) += numext::real(b); \
- numext::imag_ref(a) += numext::imag(b); \
- return a; \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) { \
- numext::real_ref(a) -= numext::real(b); \
- numext::imag_ref(a) -= numext::imag(b); \
- return a; \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
- a = complex_multiply(a, b); \
- return a; \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
- a = complex_divide(a, b); \
- return a; \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator==(const std::complex<T>& a, const std::complex<T>& b) { \
- return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator==(const std::complex<T>& a, const T& b) { \
- return numext::real(a) == b && numext::imag(a) == 0; \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator==(const T& a, const std::complex<T>& b) { \
- return a == numext::real(b) && 0 == numext::imag(b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator!=(const std::complex<T>& a, const std::complex<T>& b) { \
- return !(a == b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator!=(const std::complex<T>& a, const T& b) { \
- return !(a == b); \
-} \
- \
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
-bool operator!=(const T& a, const std::complex<T>& b) { \
- return !(a == b); \
-}
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T) \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a) { return a; } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a) { \
+ return std::complex<T>(-numext::real(a), -numext::imag(a)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a, \
+ const std::complex<T>& b) { \
+ return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a, const T& b) { \
+ return std::complex<T>(numext::real(a) + b, numext::imag(a)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const T& a, const std::complex<T>& b) { \
+ return std::complex<T>(a + numext::real(b), numext::imag(b)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a, \
+ const std::complex<T>& b) { \
+ return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a, const T& b) { \
+ return std::complex<T>(numext::real(a) - b, numext::imag(a)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const T& a, const std::complex<T>& b) { \
+ return std::complex<T>(a - numext::real(b), -numext::imag(b)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a, \
+ const std::complex<T>& b) { \
+ return complex_multiply(a, b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a, const T& b) { \
+ return std::complex<T>(numext::real(a) * b, numext::imag(a) * b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const T& a, const std::complex<T>& b) { \
+ return std::complex<T>(a * numext::real(b), a * numext::imag(b)); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a, \
+ const std::complex<T>& b) { \
+ return complex_divide(a, b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a, const T& b) { \
+ return std::complex<T>(numext::real(a) / b, numext::imag(a) / b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const T& a, const std::complex<T>& b) { \
+ return complex_divide(std::complex<T>(a, 0), b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
+ numext::real_ref(a) += numext::real(b); \
+ numext::imag_ref(a) += numext::imag(b); \
+ return a; \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) { \
+ numext::real_ref(a) -= numext::real(b); \
+ numext::imag_ref(a) -= numext::imag(b); \
+ return a; \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
+ a = complex_multiply(a, b); \
+ return a; \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
+ a = complex_divide(a, b); \
+ return a; \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const std::complex<T>& b) { \
+ return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const T& b) { \
+ return numext::real(a) == b && numext::imag(a) == 0; \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const T& a, const std::complex<T>& b) { \
+ return a == numext::real(b) && 0 == numext::imag(b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const std::complex<T>& b) { \
+ return !(a == b); \
+ } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const T& b) { return !(a == b); } \
+ \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const T& a, const std::complex<T>& b) { return !(a == b); }
// Do not specialize for long double, since that reduces to double on device.
EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
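
A device-only sketch of what the macro enables, assuming a CUDA build in which the generated operators are visible at the kernel's scope (e.g. via EIGEN_USING_STD_COMPLEX_OPERATORS): plain std::complex<float> arithmetic inside a kernel resolves to the __device__ overloads above rather than the host-only operators in <complex>.

#include <complex>
#include <Eigen/Core>

__global__ void axpy_kernel(std::complex<float>* y, std::complex<float> a,
                            const std::complex<float>* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] += a * x[i];  // operator* and operator+= generated by the macro
}
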
@@ -250,7 +223,6 @@
#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
-
} // namespace complex_operator_detail
EIGEN_USING_STD_COMPLEX_OPERATORS
diff --git a/Eigen/src/Core/arch/GPU/MathFunctions.h b/Eigen/src/Core/arch/GPU/MathFunctions.h
index f8191db..606215f 100644
--- a/Eigen/src/Core/arch/GPU/MathFunctions.h
+++ b/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -21,86 +21,73 @@
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog<float4>(const float4& a) {
return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog<double2>(const double2& a) {
using ::log;
return make_double2(log(a.x), log(a.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plog1p<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog1p<float4>(const float4& a) {
return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plog1p<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog1p<double2>(const double2& a) {
return make_double2(log1p(a.x), log1p(a.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexp<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexp<float4>(const float4& a) {
return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexp<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp<double2>(const double2& a) {
using ::exp;
return make_double2(exp(a.x), exp(a.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexpm1<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexpm1<float4>(const float4& a) {
return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexpm1<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexpm1<double2>(const double2& a) {
return make_double2(expm1(a.x), expm1(a.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 psqrt<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt<float4>(const float4& a) {
return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 psqrt<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psqrt<double2>(const double2& a) {
using ::sqrt;
return make_double2(sqrt(a.x), sqrt(a.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 prsqrt<float4>(const float4& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 prsqrt<float4>(const float4& a) {
return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 prsqrt<double2>(const double2& a)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 prsqrt<double2>(const double2& a) {
return make_double2(rsqrt(a.x), rsqrt(a.y));
}
-
#endif
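
These specializations apply scalar libm calls lane-wise, since float4/double2 are plain CUDA vector structs with no SIMD ISA behind them. A device-only usage sketch, assuming EIGEN_USE_GPU is defined:

#include <Eigen/Core>

__device__ float4 log_lanes(float4 v) {
  // Resolves to the plog<float4> specialization above: logf on x, y, z, w.
  return Eigen::internal::plog<float4>(v);
}
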
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_GPU_H
+#endif // EIGEN_MATH_FUNCTIONS_GPU_H
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 5c959ed..7900b0e 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -36,23 +36,29 @@
// we'll use on the host side (SSE, AVX, ...)
#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-template<> struct is_arithmetic<float4> { enum { value = true }; };
-template<> struct is_arithmetic<double2> { enum { value = true }; };
+template <>
+struct is_arithmetic<float4> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<double2> {
+ enum { value = true };
+};
-template<> struct packet_traits<float> : default_packet_traits
-{
+template <>
+struct packet_traits<float> : default_packet_traits {
typedef float4 type;
typedef float4 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=4,
+ size = 4,
- HasDiv = 1,
- HasSin = 0,
- HasCos = 0,
- HasLog = 1,
- HasExp = 1,
+ HasDiv = 1,
+ HasSin = 0,
+ HasCos = 0,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
@@ -74,18 +80,18 @@
};
};
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef double2 type;
typedef double2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=2,
+ size = 2,
- HasDiv = 1,
- HasLog = 1,
- HasExp = 1,
+ HasDiv = 1,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasLGamma = 1,
@@ -107,14 +113,37 @@
};
};
+template <>
+struct unpacket_traits<float4> {
+ typedef float type;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef float4 half;
+};
+template <>
+struct unpacket_traits<double2> {
+ typedef double type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef double2 half;
+};
-template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
return make_float4(from, from, from, from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
return make_double2(from, from);
}
@@ -123,259 +152,254 @@
// of the functions, while the latter can only deal with one of them.
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) & __float_as_int(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
- const double& b) {
- return __longlong_as_double(__double_as_longlong(a) &
- __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) | __float_as_int(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
- const double& b) {
- return __longlong_as_double(__double_as_longlong(a) |
- __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
- const double& b) {
- return __longlong_as_double(__double_as_longlong(a) ^
- __double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
- const double& b) {
- return __longlong_as_double(__double_as_longlong(a) &
- ~__double_as_longlong(b));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
+ return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
return __int_as_float(a == b ? 0xffffffffu : 0u);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
- const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
return __int_as_float(a < b ? 0xffffffffu : 0u);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
- const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a,
- const float& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
return __int_as_float(a <= b ? 0xffffffffu : 0u);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a,
- const double& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
}
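
These scalar helpers emulate packet-level bitwise ops on the device by reinterpreting float bits through the CUDA `__float_as_int`/`__int_as_float` pair. A portable host-side sketch of the same idea (illustrative only, not Eigen's API; the function name is invented), using memcpy to bit-cast without undefined behavior:

#include <cstdint>
#include <cstring>

// Bit-level AND of two floats, the host analogue of bitwise_and() above.
inline float bitwise_and_sketch(float a, float b) {
  std::uint32_t ia, ib;
  std::memcpy(&ia, &a, sizeof(ia));  // bit-cast without aliasing UB
  std::memcpy(&ib, &b, sizeof(ib));
  const std::uint32_t ir = ia & ib;
  float r;
  std::memcpy(&r, &ir, sizeof(r));
  return r;
}
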
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
- const float4& b) {
- return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
- bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
+ return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
- const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
- const float4& b) {
- return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
- bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
+ return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
- const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
- const float4& b) {
- return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
- bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
+ return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
- const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
- const float4& b) {
- return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
- bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
+ return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
+ bitwise_andnot(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pandnot<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
- const float4& b) {
- return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
- eq_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
+ return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
- const float4& b) {
- return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
- lt_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
+ return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a,
- const float4& b) {
- return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z),
- le_mask(a.w, b.w));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
+ return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_eq<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_lt<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_le<double2>(const double2& a, const double2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
}
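
The eq/lt/le helpers return all-ones or all-zero bit patterns (stored as float/double) precisely so they can drive branch-free blending. A minimal host sketch of such a blend (the name blend_by_mask is an assumption, not Eigen's):

#include <cstdint>
#include <cstring>

// Select a where mask is all-ones, b where it is all-zero:
// (mask & a) | (~mask & b), computed on the raw bits.
inline float blend_by_mask(float mask, float a, float b) {
  std::uint32_t m, x, y;
  std::memcpy(&m, &mask, sizeof(m));
  std::memcpy(&x, &a, sizeof(x));
  std::memcpy(&y, &b, sizeof(y));
  const std::uint32_t r = (x & m) | (y & ~m);
  float out;
  std::memcpy(&out, &r, sizeof(out));
  return out;
}
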
-#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG &&
+ // !EIGEN_COMP_NVCC)
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
- return make_float4(a, a+1, a+2, a+3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+ return make_float4(a, a + 1, a + 2, a + 3);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
- return make_double2(a, a+1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+ return make_double2(a, a + 1);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
- return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
- return make_double2(a.x+b.x, a.y+b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x + b.x, a.y + b.y);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
- return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
- return make_double2(a.x-b.x, a.y-b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x - b.x, a.y - b.y);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
return make_double2(-a.x, -a.y);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
- return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
+ return a;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
- return make_double2(a.x*b.x, a.y*b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
+ return a;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
- return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
- return make_double2(a.x/b.x, a.y/b.y);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x * b.x, a.y * b.y);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+ return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+ return make_double2(a.x / b.x, a.y / b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
}
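
pmin/pmax defer to fminf/fmaxf (fmin/fmax), which follow IEEE minNum/maxNum semantics: when exactly one operand is NaN, the numeric operand is returned. That differs from, e.g., SSE's _mm_min_ps, which yields its second operand when either input is NaN. A quick host check:

#include <cassert>
#include <cmath>

int main() {
  const float qnan = std::nanf("");
  assert(std::fminf(qnan, 2.0f) == 2.0f);  // the NaN operand is ignored
  assert(std::fmaxf(1.0f, qnan) == 1.0f);
  return 0;
}
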
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
return *reinterpret_cast<const float4*>(from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
return *reinterpret_cast<const double2*>(from);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
return make_float4(from[0], from[1], from[2], from[3]);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
return make_double2(from[0], from[1]);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
return make_float4(from[0], from[0], from[1], from[1]);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
return make_double2(from[0], from[0]);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
*reinterpret_cast<float4*>(to) = from;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
*reinterpret_cast<double2*>(to) = from;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
to[0] = from.x;
to[1] = from.y;
to[2] = from.z;
to[3] = from.w;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
to[0] = from.x;
to[1] = from.y;
}
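
The aligned variants compile to a single vector-width access through reinterpret_cast, so `from`/`to` must be 16-byte aligned on the GPU; the unaligned variants fall back to per-element accesses. A plain C++ analogue of the split (a sketch with an invented float4_sketch type; strict host C++ would prefer memcpy where the device code leans on CUDA's aliasing leniency):

struct float4_sketch { float x, y, z, w; };

// Vector-width access: valid only if from is suitably aligned.
inline float4_sketch load_aligned(const float* from) {
  return *reinterpret_cast<const float4_sketch*>(from);
}
// Element-wise assembly: valid at any address.
inline float4_sketch load_unaligned(const float* from) {
  return {from[0], from[1], from[2], from[3]};
}
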
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg(reinterpret_cast<const float4*>(from));
@@ -383,7 +407,7 @@
return make_float4(from[0], from[1], from[2], from[3]);
#endif
}
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg(reinterpret_cast<const double2*>(from));
@@ -392,93 +416,110 @@
#endif
}
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
- return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+ return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
#else
return make_float4(from[0], from[1], from[2], from[3]);
#endif
}
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
- return make_double2(__ldg(from+0), __ldg(from+1));
+ return make_double2(__ldg(from + 0), __ldg(from + 1));
#else
return make_double2(from[0], from[1]);
#endif
}
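
ploadt_ro routes loads through __ldg when available, which goes via the GPU's read-only (texture) data cache; that is only safe because the source is guaranteed not to be written during the kernel. A guarded CUDA sketch of the pattern (the sm_35 cutoff is an assumption based on where __ldg first appeared):

__device__ float load_readonly(const float* p) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
  return __ldg(p);  // read-only cached load
#else
  return *p;        // plain load on older targets
#endif
}
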
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
- return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+ return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
- return make_double2(from[0*stride], from[1*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+ return make_double2(from[0 * stride], from[1 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
- to[stride*0] = from.x;
- to[stride*1] = from.y;
- to[stride*2] = from.z;
- to[stride*3] = from.w;
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+ to[stride * 0] = from.x;
+ to[stride * 1] = from.y;
+ to[stride * 2] = from.z;
+ to[stride * 3] = from.w;
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
- to[stride*0] = from.x;
- to[stride*1] = from.y;
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+ to[stride * 0] = from.x;
+ to[stride * 1] = from.y;
}
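
pgather/pscatter read and write lanes spaced `stride` elements apart, which is how, for example, a column of a row-major matrix gets vectorized. A host sketch of the gather side (names are illustrative):

#include <cstddef>
struct float4_sketch { float x, y, z, w; };

// Gather lane i from from[i * stride]; with stride == 4 this pulls one
// column out of a 4x4 row-major matrix.
inline float4_sketch gather(const float* from, std::ptrdiff_t stride) {
  return {from[0 * stride], from[1 * stride],
          from[2 * stride], from[3 * stride]};
}
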
-template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
return a.x;
}
-template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
return a.x;
}
-template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
return a.x + a.y + a.z + a.w;
}
-template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
return a.x + a.y;
}
-template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
}
-template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
return fmax(a.x, a.y);
}
-template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
}
-template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
return fmin(a.x, a.y);
}
-template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
return a.x * a.y * a.z * a.w;
}
-template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
return a.x * a.y;
}
-template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
}
-template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
return make_double2(fabs(a.x), fabs(a.y));
}
-template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
}
-template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+template <>
+EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
return make_double2(floor(a.x), floor(a.y));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<float4,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
float tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
@@ -504,14 +545,13 @@
kernel.packet[3].z = tmp;
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<double2,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
double tmp = kernel.packet[0].y;
kernel.packet[0].y = kernel.packet[1].x;
kernel.packet[1].x = tmp;
}
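
Both ptranspose kernels transpose in place by swapping the off-diagonal element pairs; diagonal elements never move. The double2 case above needs exactly one swap, the float4 case six (one per off-diagonal pair of a 4x4 block). Plain C++ sketch of the 2x2 case:

#include <utility>
struct double2_sketch { double x, y; };

// Transpose a 2x2 block held as two row packets: one cross swap.
inline void transpose2x2(double2_sketch& row0, double2_sketch& row1) {
  std::swap(row0.y, row1.x);
}
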
-#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
// on device. There is no benefit to using them on the host anyway, since they are
@@ -519,41 +559,68 @@
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
typedef ulonglong2 Packet4h2;
-template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
-template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
+template <>
+struct unpacket_traits<Packet4h2> {
+ typedef Eigen::half type;
+ enum {
+ size = 8,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet4h2 half;
+};
+template <>
+struct is_arithmetic<Packet4h2> {
+ enum { value = true };
+};
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
-template<> struct is_arithmetic<half2> { enum { value = true }; };
+template <>
+struct unpacket_traits<half2> {
+ typedef Eigen::half type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef half2 half;
+};
+template <>
+struct is_arithmetic<half2> {
+ enum { value = true };
+};
-template<> struct packet_traits<Eigen::half> : default_packet_traits
-{
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
typedef Packet4h2 type;
typedef Packet4h2 half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=8,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasSqrt = 1,
- HasRsqrt = 1,
- HasExp = 1,
- HasExpm1 = 1,
- HasLog = 1,
- HasLog1p = 1
+ size = 8,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasSqrt = 1,
+ HasRsqrt = 1,
+ HasExp = 1,
+ HasExpm1 = 1,
+ HasLog = 1,
+ HasLog1p = 1
};
};
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
return __half2half2(from);
}
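
Packet4h2 is just ulonglong2 (16 bytes), and every Packet4h2 op below works through a reinterpret_cast<half2*> alias over its four 4-byte half2 lanes. The pattern as a self-contained CUDA sketch (broadcast8 is an invented name mirroring pset1<Packet4h2>):

#include <cuda_fp16.h>

// Broadcast one half into all eight lanes of a 16-byte packet by
// filling its four half2 sub-lanes.
__device__ ulonglong2 broadcast8(const __half& v) {
  ulonglong2 r;
  __half2* lanes = reinterpret_cast<__half2*>(&r);
  for (int i = 0; i < 4; ++i) lanes[i] = __half2half2(v);
  return r;
}
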
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pset1<Packet4h2>(const Eigen::half& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = pset1<half2>(from);
@@ -569,59 +636,48 @@
return *reinterpret_cast<const half2*>(from);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
- return __halves2half2(from[0], from[1]);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
return __halves2half2(from[0], from[0]);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
- const half2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
*reinterpret_cast<half2*>(to) = from;
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
- const half2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
to[0] = __low2half(from);
to[1] = __high2half(from);
}
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
- const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
// Input is guaranteed to be properly aligned.
return __ldg(reinterpret_cast<const half2*>(from));
#else
- return __halves2half2(*(from+0), *(from+1));
+ return __halves2half2(*(from + 0), *(from + 1));
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
- const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
- return __halves2half2(__ldg(from+0), __ldg(from+1));
+ return __halves2half2(__ldg(from + 0), __ldg(from + 1));
#else
- return __halves2half2(*(from+0), *(from+1));
+ return __halves2half2(*(from + 0), *(from + 1));
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
- Index stride) {
- return __halves2half2(from[0*stride], from[1*stride]);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
+ return __halves2half2(from[0 * stride], from[1 * stride]);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
- Eigen::half* to, const half2& from, Index stride) {
- to[stride*0] = __low2half(from);
- to[stride*1] = __high2half(from);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
+ to[stride * 0] = __low2half(from);
+ to[stride * 1] = __high2half(from);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
- return __low2half(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
half a1 = __low2half(a);
@@ -641,8 +697,7 @@
return pset1<half2>(false_half);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<half2,2>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
__half a1 = __low2half(kernel.packet[0]);
__half a2 = __high2half(kernel.packet[0]);
__half b1 = __low2half(kernel.packet[1]);
@@ -660,9 +715,7 @@
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
- const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
half mask_low = __low2half(mask);
half mask_high = __high2half(mask);
half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
@@ -670,8 +723,7 @@
return __halves2half2(result_low, result_high);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
half a1 = __low2half(a);
@@ -683,8 +735,7 @@
return __halves2half2(eq1, eq2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
half a1 = __low2half(a);
@@ -696,8 +747,7 @@
return __halves2half2(eq1, eq2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
half false_half = half_impl::raw_uint16_to_half(0x0000u);
half a1 = __low2half(a);
@@ -709,8 +759,7 @@
return __halves2half2(eq1, eq2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
half a1 = __low2half(a);
half a2 = __high2half(a);
half b1 = __low2half(b);
@@ -720,8 +769,7 @@
return __halves2half2(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
half a1 = __low2half(a);
half a2 = __high2half(a);
half b1 = __low2half(b);
@@ -731,8 +779,7 @@
return __halves2half2(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
half a1 = __low2half(a);
half a2 = __high2half(a);
half b1 = __low2half(b);
@@ -742,8 +789,7 @@
return __halves2half2(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
half a1 = __low2half(a);
half a2 = __high2half(a);
half b1 = __low2half(b);
@@ -753,8 +799,7 @@
return __halves2half2(result1, result2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
@@ -768,8 +813,7 @@
#endif
}
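
Every half2 arithmetic op here has the same two-path shape: native fp16 intrinsics (__hadd2 and friends) when EIGEN_GPU_HAS_FP16_ARITHMETIC is defined, otherwise widen each half to float, operate, and narrow with round-to-nearest. The fallback path as a standalone CUDA sketch (padd_fallback is an illustrative name):

#include <cuda_fp16.h>

// Widen-compute-narrow fallback, mirroring the #else branch of padd:
__device__ __half2 padd_fallback(__half2 a, __half2 b) {
  const float r1 = __low2float(a) + __low2float(b);
  const float r2 = __high2float(a) + __high2float(b);
  return __floats2half2_rn(r1, r2);
}
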
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hsub2(a, b);
#else
@@ -795,8 +839,7 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
@@ -810,11 +853,9 @@
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
- const half2& b,
- const half2& c) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
- return __hfma2(a, b, c);
+ return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
@@ -828,8 +869,7 @@
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
@@ -843,8 +883,7 @@
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -854,8 +893,7 @@
return __halves2half2(r1, r2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
- const half2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -925,28 +963,15 @@
return __floats2half2_rn(r1, r2);
}
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
- defined(EIGEN_HIP_DEVICE_COMPILE)
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 plog(const half2& a) {
- return h2log(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 pexp(const half2& a) {
- return h2exp(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 psqrt(const half2& a) {
- return h2sqrt(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 prsqrt(const half2& a) {
- return h2rsqrt(a);
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
#else
@@ -982,18 +1007,16 @@
return __floats2half2_rn(r1, r2);
}
#endif
-} // namespace
+} // namespace
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pload<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
return *reinterpret_cast<const Packet4h2*>(from);
}
// Unaligned load.
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploadu<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = ploadu(from + 0);
@@ -1004,8 +1027,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploaddup<Packet4h2>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = ploaddup(from + 0);
@@ -1016,24 +1038,21 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
- Eigen::half* to, const Packet4h2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
*reinterpret_cast<Packet4h2*>(to) = from;
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
- Eigen::half* to, const Packet4h2& from) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
const half2* from_alias = reinterpret_cast<const half2*>(&from);
- pstoreu(to + 0,from_alias[0]);
- pstoreu(to + 2,from_alias[1]);
- pstoreu(to + 4,from_alias[2]);
- pstoreu(to + 6,from_alias[3]);
+ pstoreu(to + 0, from_alias[0]);
+ pstoreu(to + 2, from_alias[1]);
+ pstoreu(to + 4, from_alias[2]);
+ pstoreu(to + 6, from_alias[3]);
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_GPU_HAS_LDG)
Packet4h2 r;
r = __ldg(reinterpret_cast<const Packet4h2*>(from));
@@ -1050,8 +1069,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
r_alias[0] = ploadt_ro_unaligned(from + 0);
@@ -1062,8 +1080,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
@@ -1074,8 +1091,8 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
- Eigen::half* to, const Packet4h2& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
+ Index stride) {
const half2* from_alias = reinterpret_cast<const half2*>(&from);
pscatter(to + stride * 0, from_alias[0], stride);
pscatter(to + stride * 2, from_alias[1], stride);
@@ -1084,14 +1101,12 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
return pfirst(*(reinterpret_cast<const half2*>(&a)));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1103,8 +1118,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
- const Packet4h2& /*a*/) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
half true_half = half_impl::raw_uint16_to_half(0xffffu);
return pset1<Packet4h2>(true_half);
}
@@ -1115,9 +1129,9 @@
return pset1<Packet4h2>(false_half);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
- double* d_row0, double* d_row1, double* d_row2, double* d_row3,
- double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
+ double* d_row3, double* d_row4, double* d_row5,
+ double* d_row6, double* d_row7) {
double d_tmp;
d_tmp = d_row0[1];
d_row0[1] = d_row4[0];
@@ -1136,8 +1150,8 @@
d_row7[0] = d_tmp;
}
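
ptranspose_double is the top level of a hierarchical 8x8 fp16 transpose: each 64-bit double covers four halves, so swapping doubles moves whole 4x1 groups, ptranspose_half2 then swaps 2x1 groups, and ptranspose_half finishes with single elements. The net effect equals a plain element transpose; for reference, a host-side sketch of that end result (not the in-register implementation):

// Element-wise 8x8 transpose equivalent to the block-swapping above.
template <typename T>
void transpose8x8(T (&m)[8][8]) {
  for (int i = 0; i < 8; ++i)
    for (int j = i + 1; j < 8; ++j) {
      T t = m[i][j];
      m[i][j] = m[j][i];
      m[j][i] = t;
    }
}
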
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
- half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
+ half2* f_row3) {
half2 f_tmp;
f_tmp = f_row0[1];
f_row0[1] = f_row2[0];
@@ -1148,8 +1162,7 @@
f_row3[0] = f_tmp;
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose_half(half2& f0, half2& f1) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
__half a1 = __low2half(f0);
__half a2 = __high2half(f0);
__half b1 = __low2half(f1);
@@ -1158,8 +1171,7 @@
f1 = __halves2half2(a2, b2);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h2,8>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
@@ -1168,9 +1180,7 @@
double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
- ptranspose_double(d_row0, d_row1, d_row2, d_row3,
- d_row4, d_row5, d_row6, d_row7);
-
+ ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
half2* f_row0 = reinterpret_cast<half2*>(d_row0);
half2* f_row1 = reinterpret_cast<half2*>(d_row1);
@@ -1211,23 +1221,18 @@
ptranspose_half(f_row0[1], f_row1[1]);
ptranspose_half(f_row2[0], f_row3[0]);
ptranspose_half(f_row2[1], f_row3[1]);
-
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plset<Packet4h2>(const Eigen::half& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
Packet4h2 r;
half2* p_alias = reinterpret_cast<half2*>(&r);
p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
- p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
- __hadd(a, __float2half(3.0f)));
- p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
- __hadd(a, __float2half(5.0f)));
- p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
- __hadd(a, __float2half(7.0f)));
+ p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
+ p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
+ p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
return r;
#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
Packet4h2 r;
@@ -1235,8 +1240,8 @@
half2 b = pset1<half2>(a);
half2 c;
- half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
- half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
+ half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
+ half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
c = __hadd2(b, half_offset0);
r_alias[0] = plset(__low2half(c));
@@ -1261,9 +1266,8 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
- const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+ const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
@@ -1277,8 +1281,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1291,8 +1294,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1305,8 +1307,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1319,8 +1320,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1333,8 +1333,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1347,8 +1346,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1361,8 +1359,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1375,8 +1372,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1389,8 +1385,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1420,8 +1415,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1434,8 +1428,8 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
+ const Packet4h2& c) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1449,8 +1443,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1463,8 +1456,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1477,8 +1469,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
- const Packet4h2& a, const Packet4h2& b) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1491,64 +1482,53 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- return predux(a_alias[0]) + predux(a_alias[1]) +
- predux(a_alias[2]) + predux(a_alias[3]);
+ return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_max(a_alias[0]),
- predux_max(a_alias[1]));
- half2 m1 = __halves2half2(predux_max(a_alias[2]),
- predux_max(a_alias[3]));
- __half first = predux_max(m0);
+ half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
+ half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
+ __half first = predux_max(m0);
__half second = predux_max(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
return (__hgt(first, second) ? first : second);
#else
- float ffirst = __half2float(first);
+ float ffirst = __half2float(first);
float fsecond = __half2float(second);
- return (ffirst > fsecond)? first: second;
+ return (ffirst > fsecond) ? first : second;
#endif
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- half2 m0 = __halves2half2(predux_min(a_alias[0]),
- predux_min(a_alias[1]));
- half2 m1 = __halves2half2(predux_min(a_alias[2]),
- predux_min(a_alias[3]));
- __half first = predux_min(m0);
+ half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
+ half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
+ __half first = predux_min(m0);
__half second = predux_min(m1);
#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
return (__hlt(first, second) ? first : second);
#else
- float ffirst = __half2float(first);
+ float ffirst = __half2float(first);
float fsecond = __half2float(second);
- return (ffirst < fsecond)? first: second;
+ return (ffirst < fsecond) ? first : second;
#endif
}
// likely overflow/underflow
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
- const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
const half2* a_alias = reinterpret_cast<const half2*>(&a);
- return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
- pmul(a_alias[2], a_alias[3])));
+ return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
}
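
The "likely overflow/underflow" warning is earned: fp16 tops out at 65504 (smallest normal about 6.1e-5), so a product over eight lanes leaves the representable range quickly. A host illustration with float standing in for the lane arithmetic:

#include <cstdio>

int main() {
  // Eight lanes of 16.0 multiplied together: 16^8 = 4.29e9 >> 65504,
  // so the corresponding fp16 result saturates to +inf.
  float prod = 1.0f;
  for (int i = 0; i < 8; ++i) prod *= 16.0f;
  std::printf("product = %g (fp16 max normal is 65504)\n", prod);
  return 0;
}
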
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plog1p<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1560,8 +1540,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pexpm1<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1609,8 +1588,7 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-prsqrt<Packet4h2>(const Packet4h2& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
Packet4h2 r;
half2* r_alias = reinterpret_cast<half2*>(&r);
const half2* a_alias = reinterpret_cast<const half2*>(&a);
@@ -1623,9 +1601,8 @@
// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
// the implementation of GPU half reduction.
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
- const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hadd2(a, b);
#else
@@ -1639,9 +1616,8 @@
#endif
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
- const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __hmul2(a, b);
#else
@@ -1655,9 +1631,8 @@
#endif
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
- const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
return __h2div(a, b);
#else
@@ -1671,9 +1646,8 @@
#endif
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
- const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -1683,9 +1657,8 @@
return __halves2half2(r1, r2);
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
- const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@@ -1695,15 +1668,14 @@
return __halves2half2(r1, r2);
}
-#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
#undef EIGEN_GPU_HAS_LDG
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_GPU_H
+#endif // EIGEN_PACKET_MATH_GPU_H
diff --git a/Eigen/src/Core/arch/GPU/Tuple.h b/Eigen/src/Core/arch/GPU/Tuple.h
index e223ca1..6bea9ac 100644
--- a/Eigen/src/Core/arch/GPU/Tuple.h
+++ b/Eigen/src/Core/arch/GPU/Tuple.h
@@ -20,196 +20,173 @@
namespace tuple_impl {
// Internal tuple implementation.
-template<size_t N, typename... Types>
+template <size_t N, typename... Types>
class TupleImpl;
// Generic recursive tuple.
-template<size_t N, typename T1, typename... Ts>
+template <size_t N, typename T1, typename... Ts>
class TupleImpl<N, T1, Ts...> {
public:
// Tuple may contain Eigen types.
EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-
+
// Default constructor, enable if all types are default-constructible.
- template<typename U1 = T1, typename EnableIf = std::enable_if_t<
- std::is_default_constructible<U1>::value
- && reduce_all<std::is_default_constructible<Ts>::value...>::value
- >>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC
- TupleImpl() : head_{}, tail_{} {}
-
+ template <typename U1 = T1,
+ typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
+ reduce_all<std::is_default_constructible<Ts>::value...>::value>>
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+
// Element constructor.
- template<typename U1, typename... Us,
- // Only enable if...
- typename EnableIf = std::enable_if_t<
- // the number of input arguments match, and ...
- sizeof...(Us) == sizeof...(Ts) && (
- // this does not look like a copy/move constructor.
- N > 1 || std::is_convertible<U1, T1>::value)
- >>
- EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC
- TupleImpl(U1&& arg1, Us&&... args)
- : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
-
- // The first stored value.
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- T1& head() {
- return head_;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- const T1& head() const {
- return head_;
- }
-
+ template <typename U1, typename... Us,
+ // Only enable if...
+ typename EnableIf = std::enable_if_t<
+ // the number of input arguments match, and ...
+ sizeof...(Us) == sizeof...(Ts) && (
+ // this does not look like a copy/move constructor.
+ N > 1 || std::is_convertible<U1, T1>::value)>>
+ EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+ : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
+
+ // The first stored value.
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& head() { return head_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& head() const { return head_; }
+
// The tail values.
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- TupleImpl<N-1, Ts...>& tail() {
- return tail_;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- const TupleImpl<N-1, Ts...>& tail() const {
- return tail_;
- }
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- void swap(TupleImpl& other) {
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TupleImpl<N - 1, Ts...>& tail() { return tail_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const TupleImpl<N - 1, Ts...>& tail() const { return tail_; }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(TupleImpl& other) {
using numext::swap;
swap(head_, other.head_);
swap(tail_, other.tail_);
}
-
- template<typename... UTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
+
+ template <typename... UTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
head_ = other.head_;
tail_ = other.tail_;
return *this;
}
-
- template<typename... UTypes>
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
+
+ template <typename... UTypes>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
head_ = std::move(other.head_);
tail_ = std::move(other.tail_);
return *this;
}
-
+
private:
// Allow related tuples to reference head_/tail_.
- template<size_t M, typename... UTypes>
+ template <size_t M, typename... UTypes>
friend class TupleImpl;
-
+
T1 head_;
- TupleImpl<N-1, Ts...> tail_;
+ TupleImpl<N - 1, Ts...> tail_;
};
// Empty tuple specialization.
-template<>
+template <>
class TupleImpl<size_t(0)> {};
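The head/tail recursion bottoms out in this empty specialization, so an N-element tuple nests one head per level. A rough picture of the storage for three elements (an illustrative sketch, not the literal instantiation):

// TupleImpl<3, A, B, C> stores, conceptually:
//   A head_;                       // element 0
//   TupleImpl<2, B, C> tail_;      // { B head_; TupleImpl<1, C> tail_; }
//                                  //   { C head_; TupleImpl<0> tail_; }
// with TupleImpl<0> as the empty terminator defined above.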
-template<typename TupleType>
+template <typename TupleType>
struct is_tuple : std::false_type {};
-template<typename... Types>
-struct is_tuple< TupleImpl<sizeof...(Types), Types...> > : std::true_type {};
+template <typename... Types>
+struct is_tuple<TupleImpl<sizeof...(Types), Types...>> : std::true_type {};
// Gets an element from a tuple.
-template<size_t Idx, typename T1, typename... Ts>
+template <size_t Idx, typename T1, typename... Ts>
struct tuple_get_impl {
using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
-
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- ReturnType& run(TupleType& tuple) {
- return tuple_get_impl<Idx-1, Ts...>::run(tuple.tail());
+
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+ return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
}
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- const ReturnType& run(const TupleType& tuple) {
- return tuple_get_impl<Idx-1, Ts...>::run(tuple.tail());
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+ return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
}
};
// Base case, getting the head element.
-template<typename T1, typename... Ts>
+template <typename T1, typename... Ts>
struct tuple_get_impl<0, T1, Ts...> {
using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
using ReturnType = T1;
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- T1& run(TupleType& tuple) {
- return tuple.head();
- }
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- const T1& run(const TupleType& tuple) {
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) {
return tuple.head();
}
};
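Assuming the recursion above, get<Idx> peels one tail per level until the base case reads a head; for a three-element tuple the expansion is (a sketch):

// tuple_get_impl<2, A, B, C>::run(t)
//   -> tuple_get_impl<1, B, C>::run(t.tail())
//   -> tuple_get_impl<0, C>::run(t.tail().tail())
//   -> t.tail().tail().head()   // the element of type C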
// Concatenates N Tuples.
-template<size_t NTuples, typename... Tuples>
+template <size_t NTuples, typename... Tuples>
struct tuple_cat_impl;
-template<size_t NTuples, size_t N1, typename... Args1, size_t N2, typename... Args2, typename... Tuples>
+template <size_t NTuples, size_t N1, typename... Args1, size_t N2, typename... Args2, typename... Tuples>
struct tuple_cat_impl<NTuples, TupleImpl<N1, Args1...>, TupleImpl<N2, Args2...>, Tuples...> {
using TupleType1 = TupleImpl<N1, Args1...>;
using TupleType2 = TupleImpl<N2, Args2...>;
using MergedTupleType = TupleImpl<N1 + N2, Args1..., Args2...>;
-
- using ReturnType = typename tuple_cat_impl<NTuples-1, MergedTupleType, Tuples...>::ReturnType;
-
+
+ using ReturnType = typename tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::ReturnType;
+
// Uses the index sequences to extract and merge elements from tuple1 and tuple2,
// then recurses on the remaining tuples.
- template<typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
- Tuple2&& tuple2, std::index_sequence<I2s...>,
- MoreTuples&&... tuples) {
- return tuple_cat_impl<NTuples-1, MergedTupleType, Tuples...>::run(
+ template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1,
+ std::index_sequence<I1s...>,
+ Tuple2&& tuple2,
+ std::index_sequence<I2s...>,
+ MoreTuples&&... tuples) {
+ return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
std::forward<MoreTuples>(tuples)...);
}
-
+
// Concatenates the first two tuples.
- template<typename Tuple1, typename Tuple2, typename... MoreTuples>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2, MoreTuples&&... tuples) {
- return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{},
- std::forward<Tuple2>(tuple2), std::make_index_sequence<N2>{},
- std::forward<MoreTuples>(tuples)...);
+ template <typename Tuple1, typename Tuple2, typename... MoreTuples>
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+ MoreTuples&&... tuples) {
+ return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
+ std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
}
};
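To see the index sequences at work, consider concatenating a 2-tuple with a 1-tuple; the overloads above expand roughly as follows (an illustrative sketch):

// tuple_cat_impl<2, TupleImpl<2, A, B>, TupleImpl<1, C>>::run(t1, t2)
//   -> run(t1, index_sequence<0, 1>{}, t2, index_sequence<0>{})
//   -> MergedTupleType == TupleImpl<3, A, B, C>
//   -> recurse with TupleImpl<3, A, B, C>(get<0>(t1), get<1>(t1), get<0>(t2))
// until the single-tuple base case below returns the merged result.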
// Base case with a single tuple.
-template<size_t N, typename... Args>
-struct tuple_cat_impl<1, TupleImpl<N, Args...> > {
+template <size_t N, typename... Args>
+struct tuple_cat_impl<1, TupleImpl<N, Args...>> {
using ReturnType = TupleImpl<N, Args...>;
-
- template<typename Tuple1>
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ReturnType run(Tuple1&& tuple1) {
+
+ template <typename Tuple1>
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
return tuple1;
}
};
// Special case of no tuples.
-template<>
-struct tuple_cat_impl<0> {
+template <>
+struct tuple_cat_impl<0> {
using ReturnType = TupleImpl<0>;
- static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- ReturnType run() {return ReturnType{}; }
+ static EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
};
// For use in make_tuple, unwraps a reference_wrapper.
template <typename T>
-struct unwrap_reference_wrapper { using type = T; };
-
+struct unwrap_reference_wrapper {
+ using type = T;
+};
+
template <typename T>
-struct unwrap_reference_wrapper<std::reference_wrapper<T> > { using type = T&; };
+struct unwrap_reference_wrapper<std::reference_wrapper<T>> {
+ using type = T&;
+};
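This unwrapping is what lets make_tuple(std::ref(x)) store a reference instead of a copy; in terms of the deduced types (a sketch):

// make_tuple(1, std::ref(x)) yields TupleImpl<2, int, X&> because
//   unwrap_reference_wrapper<int>::type                       == int
//   unwrap_reference_wrapper<std::reference_wrapper<X>>::type == X&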
// For use in make_tuple, decays a type and unwraps a reference_wrapper.
template <typename T>
@@ -220,11 +197,11 @@
/**
* Utility for determining a tuple's size.
*/
-template<typename Tuple>
+template <typename Tuple>
struct tuple_size;
-template<typename... Types >
-struct tuple_size< TupleImpl<sizeof...(Types), Types...> > : std::integral_constant<size_t, sizeof...(Types)> {};
+template <typename... Types>
+struct tuple_size<TupleImpl<sizeof...(Types), Types...>> : std::integral_constant<size_t, sizeof...(Types)> {};
/**
* Gets an element of a tuple.
@@ -233,17 +210,15 @@
* \param tuple the tuple.
* \return a reference to the desired element.
*/
-template<size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename tuple_get_impl<Idx, Types...>::ReturnType&
-get(const TupleImpl<sizeof...(Types), Types...>& tuple) {
+template <size_t Idx, typename... Types>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+ const TupleImpl<sizeof...(Types), Types...>& tuple) {
return tuple_get_impl<Idx, Types...>::run(tuple);
}
-template<size_t Idx, typename... Types>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename tuple_get_impl<Idx, Types...>::ReturnType&
-get(TupleImpl<sizeof...(Types), Types...>& tuple) {
+template <size_t Idx, typename... Types>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+ TupleImpl<sizeof...(Types), Types...>& tuple) {
return tuple_get_impl<Idx, Types...>::run(tuple);
}
@@ -252,31 +227,27 @@
* \param tuples ... list of tuples.
* \return concatenated tuple.
*/
-template<typename... Tuples,
- typename EnableIf = std::enable_if_t<
- internal::reduce_all<
- is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
+template <typename... Tuples, typename EnableIf = std::enable_if_t<
+ internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
-tuple_cat(Tuples&&... tuples) {
+ typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
+ tuple_cat(Tuples&&... tuples) {
return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
}
/**
* Tie arguments together into a tuple.
*/
-template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...> >
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
- return ReturnType{args...};
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) EIGEN_NOEXCEPT {
+ return ReturnType{args...};
}
/**
* Create a tuple of l-values with the supplied arguments.
*/
-template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...> >
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ReturnType make_tuple(Args&&... args) {
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
return ReturnType{std::forward<Args>(args)...};
}
@@ -284,15 +255,15 @@
* Forward a set of arguments as a tuple.
*/
template <typename... Args>
-EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
+EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(
+ Args&&... args) {
return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
}
/**
* Alternative to std::tuple that can be used on device.
*/
-template<typename... Types>
+template <typename... Types>
using tuple = TupleImpl<sizeof...(Types), Types...>;
} // namespace tuple_impl
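Taken together, these helpers mirror the subset of std::tuple that Eigen needs in device code; a minimal usage sketch, assuming the utilities are reachable as written once this header is included:

EIGEN_DEVICE_FUNC void tuple_demo() {
  tuple<int, float> a = make_tuple(1, 2.0f);  // TupleImpl<2, int, float>
  float f = get<1>(a);                        // 2.0f
  auto c = tuple_cat(a, make_tuple(3.0));     // tuple<int, float, double>
  int i = 0;
  tie(i) = forward_as_tuple(42);              // assigns 42 through the reference
  (void)f;
  (void)c;
}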
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index aa89cd2..ae43f8e 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -22,61 +22,56 @@
template <>
struct type_casting_traits<Eigen::half, float> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 1,
- TgtCoeffRatio = 2
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
};
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
float2 r1 = __half22float2(a);
float2 r2 = __half22float2(b);
return make_float4(r1.x, r1.y, r2.x, r2.y);
}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
Packet4h2 r;
- half2* r_alias=reinterpret_cast<half2*>(&r);
- r_alias[0]=__floats2half2_rn(a.x,a.y);
- r_alias[1]=__floats2half2_rn(a.z,a.w);
- r_alias[2]=__floats2half2_rn(b.x,b.y);
- r_alias[3]=__floats2half2_rn(b.z,b.w);
+ half2* r_alias = reinterpret_cast<half2*>(&r);
+ r_alias[0] = __floats2half2_rn(a.x, a.y);
+ r_alias[1] = __floats2half2_rn(a.z, a.w);
+ r_alias[2] = __floats2half2_rn(b.x, b.y);
+ r_alias[3] = __floats2half2_rn(b.z, b.w);
return r;
}
template <>
struct type_casting_traits<float, Eigen::half> {
- enum {
- VectorizedCast = 1,
- SrcCoeffRatio = 2,
- TgtCoeffRatio = 1
- };
+ enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
};
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
// Simply discard the second half of the input
float4 r;
- const half2* a_alias=reinterpret_cast<const half2*>(&a);
+ const half2* a_alias = reinterpret_cast<const half2*>(&a);
float2 r1 = __half22float2(a_alias[0]);
float2 r2 = __half22float2(a_alias[1]);
- r.x=static_cast<float>(r1.x);
- r.y=static_cast<float>(r1.y);
- r.z=static_cast<float>(r2.x);
- r.w=static_cast<float>(r2.y);
+ r.x = static_cast<float>(r1.x);
+ r.y = static_cast<float>(r1.y);
+ r.z = static_cast<float>(r2.x);
+ r.w = static_cast<float>(r2.y);
return r;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
// Simply discard the second half of the input
return __floats2half2_rn(a.x, a.y);
}
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_GPU_H
+#endif // EIGEN_TYPE_CASTING_GPU_H
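The SrcCoeffRatio/TgtCoeffRatio pairs tell the vectorizer how many packets each cast consumes and produces; reading the code above:

// half -> float (ratios 1:2): one float4 covers four floats, so the cast
// takes two half2 inputs:  pcast(a, b) == {a.x, a.y, b.x, b.y} as floats.
// float -> half (ratios 2:1): one Packet4h2 holds eight halves, so the
// cast takes two float4 inputs; the narrowing float4 -> half2 overload
// keeps only the first two lanes, as its "discard" comment notes.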
diff --git a/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
index 25375a0..99dd3ae 100644
--- a/Eigen/src/Core/arch/HIP/hcc/math_constants.h
+++ b/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -1,5 +1,5 @@
/*
- * math_constants.h -
+ * math_constants.h -
* HIP equivalent of the CUDA header of the same name
*/
@@ -8,16 +8,16 @@
/* single precision constants */
-#define HIPRT_INF_F __int_as_float(0x7f800000)
-#define HIPRT_NAN_F __int_as_float(0x7fffffff)
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
-#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
-#define HIPRT_ZERO_F 0.0f
-#define HIPRT_ONE_F 1.0f
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
/* double precision constants */
-#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
-#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
#endif
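These constants are raw IEEE-754 bit patterns: 0x7f800000 is single-precision +infinity, 0x7fffffff a quiet NaN, 0x80000000 negative zero. A host-side sketch of the same trick, with std::memcpy standing in for the device-only __int_as_float:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Reinterpret integer bits as a float, as __int_as_float does on device.
static float int_as_float(std::uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof f);  // well-defined type punning
  return f;
}

int main() {
  assert(std::isinf(int_as_float(0x7f800000u)));  // HIPRT_INF_F
  assert(std::isnan(int_as_float(0x7fffffffu)));  // HIPRT_NAN_F
  assert(std::signbit(int_as_float(0x80000000u)) &&
         int_as_float(0x80000000u) == 0.0f);      // HIPRT_NEG_ZERO_F
}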
diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
index 51f37fa..a159739 100644
--- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h
@@ -9,31 +9,26 @@
namespace internal {
template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
-class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target,
- PacketSize_>
- : public gebp_traits<float, float, ConjLhs_, ConjRhs_,
- Architecture::Generic, PacketSize_> {
+class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+ : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
public:
typedef Packet32qf AccPacket;
EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero<Packet32qf>(p); }
template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b,
- Packet32qf& c, Packet32f& /*tmp*/,
+ EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, Packet32qf& c, Packet32f& /*tmp*/,
const LaneIdType&) const {
c = pmadd_f32_to_qf32(a, b, c);
}
template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet32f& a,
- const QuadPacket<Packet32f>& b, Packet32qf& c,
- Packet32f& tmp, const LaneIdType& lane) const {
+ EIGEN_STRONG_INLINE void madd(const Packet32f& a, const QuadPacket<Packet32f>& b, Packet32qf& c, Packet32f& tmp,
+ const LaneIdType& lane) const {
madd(a, b.get(lane), c, tmp, lane);
}
- EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha,
- Packet32f& r) const {
+ EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, Packet32f& r) const {
r = pmadd_qf32_to_f32(c, alpha, r);
}
};
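These traits plug into Eigen's generic block-panel (gebp) kernel, which drives every accumulator through the same initAcc / madd / acc sequence; a scalar sketch of that contract, with plain float standing in for the packet types:

// One accumulator of the block-panel product, scalar stand-ins:
// the Packet32qf AccPacket keeps running sums in the wider qfloat32
// format, and acc() rounds back and applies alpha exactly once.
float gebp_acc_demo(const float* lhs, const float* rhs, int depth, float alpha) {
  float c = 0.0f;                    // initAcc: zero the accumulator
  for (int k = 0; k < depth; ++k) {
    c = lhs[k] * rhs[k] + c;         // madd: c = a*b + c along depth
  }
  float r = 0.0f;
  r = c * alpha + r;                 // acc: r = c*alpha + r
  return r;
}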
diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h
index cc8722f..7c69f3b 100644
--- a/Eigen/src/Core/arch/HVX/PacketMath.h
+++ b/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -18,21 +18,13 @@
namespace Eigen {
namespace internal {
-EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) {
- return *((HVX_Vector*)mem);
-}
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); }
-EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) {
- return *((HVX_UVector*)mem);
-}
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); }
-EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) {
- *((HVX_Vector*)mem) = v;
-}
+EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; }
-EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) {
- *((HVX_UVector*)mem) = v;
-}
+EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; }
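The HVX_Vector/HVX_UVector split is what distinguishes these one-liners: dereferencing an HVX_Vector* lets the compiler assume full vector alignment (128 bytes in the mode Eigen targets), while HVX_UVector* tolerates any address. A usage sketch:

// float buf[64], with buf aligned to the vector size:
//   HVX_load(buf)       // aligned load, pload-style
//   HVX_loadu(buf + 1)  // unaligned load through HVX_UVector, ploadu-style
//   HVX_load(buf + 1)   // undefined: violates the HVX_Vector contract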
// Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
// Wrap different vector type (float32, int32, etc) to different class with
@@ -106,24 +98,18 @@
}
template <>
-EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a,
- const Packet32f& b) {
- return Packet32f::Create(
- Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
+ return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
}
template <>
-EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a,
- const Packet32f& b) {
- return Packet32f::Create(
- Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
+ return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
}
template <>
-EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a,
- const Packet32f& b) {
- return Packet32f::Create(
- Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
+ return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
}
template <>
@@ -153,8 +139,7 @@
}
template <>
-EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a,
- const Packet32f& b) {
+EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
@@ -175,16 +160,12 @@
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
// Shuffle the 32-bit lanes.
- HVX_VectorPair v_0_1_0 =
- Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
- HVX_VectorPair v_0_3_2 =
- Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
// Shuffle the 64-bit lanes.
- HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2),
- HEXAGON_HVX_GET_V0(v_0_1_0), -8);
- HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2),
- HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
@@ -194,174 +175,94 @@
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
// Shuffle the 32-bit lanes.
- HVX_VectorPair v_0_1_0 =
- Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
- HVX_VectorPair v_0_3_2 =
- Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
- HVX_VectorPair v_0_5_4 =
- Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
- HVX_VectorPair v_0_7_6 =
- Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
- HVX_VectorPair v_0_9_8 =
- Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
- HVX_VectorPair v_0_11_10 =
- Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
- HVX_VectorPair v_0_13_12 =
- Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
- HVX_VectorPair v_0_15_14 =
- Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
- HVX_VectorPair v_0_17_16 =
- Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
- HVX_VectorPair v_0_19_18 =
- Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
- HVX_VectorPair v_0_21_20 =
- Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
- HVX_VectorPair v_0_23_22 =
- Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
- HVX_VectorPair v_0_25_24 =
- Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
- HVX_VectorPair v_0_27_26 =
- Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
- HVX_VectorPair v_0_29_28 =
- Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
- HVX_VectorPair v_0_31_30 =
- Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
+ HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+ HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+ HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+ HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+ HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+ HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+ HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+ HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+ HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
+ HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
+ HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
+ HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
+ HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
+ HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
+ HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
+ HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
// Shuffle the 64-bit lanes.
- HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2),
- HEXAGON_HVX_GET_V0(v_0_1_0), -8);
- HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2),
- HEXAGON_HVX_GET_V1(v_0_1_0), -8);
- HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6),
- HEXAGON_HVX_GET_V0(v_0_5_4), -8);
- HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6),
- HEXAGON_HVX_GET_V1(v_0_5_4), -8);
- HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10),
- HEXAGON_HVX_GET_V0(v_0_9_8), -8);
- HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10),
- HEXAGON_HVX_GET_V1(v_0_9_8), -8);
- HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14),
- HEXAGON_HVX_GET_V0(v_0_13_12), -8);
- HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14),
- HEXAGON_HVX_GET_V1(v_0_13_12), -8);
- HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18),
- HEXAGON_HVX_GET_V0(v_0_17_16), -8);
- HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18),
- HEXAGON_HVX_GET_V1(v_0_17_16), -8);
- HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22),
- HEXAGON_HVX_GET_V0(v_0_21_20), -8);
- HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22),
- HEXAGON_HVX_GET_V1(v_0_21_20), -8);
- HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26),
- HEXAGON_HVX_GET_V0(v_0_25_24), -8);
- HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26),
- HEXAGON_HVX_GET_V1(v_0_25_24), -8);
- HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30),
- HEXAGON_HVX_GET_V0(v_0_29_28), -8);
- HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30),
- HEXAGON_HVX_GET_V1(v_0_29_28), -8);
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+ HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+ HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+ HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
+ HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+ HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
+ HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+ HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
+ HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
+ HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
+ HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
+ HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
+ HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
+ HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
+ HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
+ HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
// Shuffle the 128-bit lanes.
- v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4),
- HEXAGON_HVX_GET_V0(v_1_1_0), -16);
- v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4),
- HEXAGON_HVX_GET_V1(v_1_1_0), -16);
- v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6),
- HEXAGON_HVX_GET_V0(v_1_3_2), -16);
- v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6),
- HEXAGON_HVX_GET_V1(v_1_3_2), -16);
- v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12),
- HEXAGON_HVX_GET_V0(v_1_9_8), -16);
- v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12),
- HEXAGON_HVX_GET_V1(v_1_9_8), -16);
- v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14),
- HEXAGON_HVX_GET_V0(v_1_11_10), -16);
- v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14),
- HEXAGON_HVX_GET_V1(v_1_11_10), -16);
- v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20),
- HEXAGON_HVX_GET_V0(v_1_17_16), -16);
- v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20),
- HEXAGON_HVX_GET_V1(v_1_17_16), -16);
- v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22),
- HEXAGON_HVX_GET_V0(v_1_19_18), -16);
- v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22),
- HEXAGON_HVX_GET_V1(v_1_19_18), -16);
- v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28),
- HEXAGON_HVX_GET_V0(v_1_25_24), -16);
- v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28),
- HEXAGON_HVX_GET_V1(v_1_25_24), -16);
- v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30),
- HEXAGON_HVX_GET_V0(v_1_27_26), -16);
- v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30),
- HEXAGON_HVX_GET_V1(v_1_27_26), -16);
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
// Shuffle the 256-bit lanes.
- v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8),
- HEXAGON_HVX_GET_V0(v_0_1_0), -32);
- v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8),
- HEXAGON_HVX_GET_V1(v_0_1_0), -32);
- v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10),
- HEXAGON_HVX_GET_V0(v_0_3_2), -32);
- v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10),
- HEXAGON_HVX_GET_V1(v_0_3_2), -32);
- v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12),
- HEXAGON_HVX_GET_V0(v_0_5_4), -32);
- v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12),
- HEXAGON_HVX_GET_V1(v_0_5_4), -32);
- v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14),
- HEXAGON_HVX_GET_V0(v_0_7_6), -32);
- v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14),
- HEXAGON_HVX_GET_V1(v_0_7_6), -32);
- v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24),
- HEXAGON_HVX_GET_V0(v_0_17_16), -32);
- v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24),
- HEXAGON_HVX_GET_V1(v_0_17_16), -32);
- v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26),
- HEXAGON_HVX_GET_V0(v_0_19_18), -32);
- v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26),
- HEXAGON_HVX_GET_V1(v_0_19_18), -32);
- v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28),
- HEXAGON_HVX_GET_V0(v_0_21_20), -32);
- v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28),
- HEXAGON_HVX_GET_V1(v_0_21_20), -32);
- v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30),
- HEXAGON_HVX_GET_V0(v_0_23_22), -32);
- v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30),
- HEXAGON_HVX_GET_V1(v_0_23_22), -32);
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+ v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+ v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+ v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+ v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
+ v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
+ v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
+ v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
+ v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
+ v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
+ v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
+ v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
+ v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
+ v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
+ v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
+ v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
// Shuffle the 512-bit lanes.
- v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16),
- HEXAGON_HVX_GET_V0(v_1_1_0), -64);
- v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16),
- HEXAGON_HVX_GET_V1(v_1_1_0), -64);
- v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18),
- HEXAGON_HVX_GET_V0(v_1_3_2), -64);
- v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18),
- HEXAGON_HVX_GET_V1(v_1_3_2), -64);
- v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20),
- HEXAGON_HVX_GET_V0(v_1_5_4), -64);
- v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20),
- HEXAGON_HVX_GET_V1(v_1_5_4), -64);
- v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22),
- HEXAGON_HVX_GET_V0(v_1_7_6), -64);
- v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22),
- HEXAGON_HVX_GET_V1(v_1_7_6), -64);
- v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24),
- HEXAGON_HVX_GET_V0(v_1_9_8), -64);
- v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24),
- HEXAGON_HVX_GET_V1(v_1_9_8), -64);
- v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26),
- HEXAGON_HVX_GET_V0(v_1_11_10), -64);
- v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26),
- HEXAGON_HVX_GET_V1(v_1_11_10), -64);
- v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28),
- HEXAGON_HVX_GET_V0(v_1_13_12), -64);
- v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28),
- HEXAGON_HVX_GET_V1(v_1_13_12), -64);
- v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30),
- HEXAGON_HVX_GET_V0(v_1_15_14), -64);
- v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30),
- HEXAGON_HVX_GET_V1(v_1_15_14), -64);
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
+ v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
+ v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
+ v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
+ v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
+ v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
+ v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
+ v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
+ v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
+ v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
+ v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
+ v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
+ v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
+ v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
+ v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
+ v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
@@ -401,12 +302,9 @@
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get());
HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4);
- HVX_Vector vsum_16 =
- Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
- HVX_Vector vsum_32 =
- Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
- HVX_Vector vsum_64 =
- Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
+ HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8);
+ HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16);
+ HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32);
return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64)));
}
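predux sums the 32 lanes in five rotate-and-add steps: rotations of 4, 8, 16, 32 and 64 bytes (1, 2, 4, 8, 16 float lanes) halve the number of distinct partial sums each time. The same log2 ladder in scalar form (a sketch):

// After step s, every lane holds the sum of a 2^(s+1)-lane group;
// after five steps all 32 lanes hold the full sum and pfirst reads lane 0.
float predux32_demo(const float* a) {
  float v[32];
  for (int i = 0; i < 32; ++i) v[i] = a[i];
  for (int step = 1; step < 32; step *= 2) {  // 1, 2, 4, 8, 16 lanes
    float t[32];
    for (int i = 0; i < 32; ++i) t[i] = v[i] + v[(i + step) % 32];
    for (int i = 0; i < 32; ++i) v[i] = t[i];
  }
  return v[0];
}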
@@ -421,8 +319,7 @@
EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
HVX_Vector load = HVX_loadu(from);
HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
- HVX_VectorPair quad =
- Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
+ HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
return Packet32f::Create(HEXAGON_HVX_GET_V0(quad));
}
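ploadquad reads eight floats and repeats each four times across the 32 lanes; the two vshuff passes perform that replication. Lane by lane (a sketch):

// ploadquad(from) == { from[0], from[0], from[0], from[0],
//                      from[1], from[1], from[1], from[1],
//                      ...
//                      from[7], from[7], from[7], from[7] }
// i.e. result[i] == from[i / 4] for i in [0, 32).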
@@ -463,8 +360,7 @@
}
template <>
-EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a,
- const Packet32f& b) {
+EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
}
@@ -472,14 +368,10 @@
template <typename Op>
EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) {
Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a);
- Packet32f vredux_8 =
- op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
- Packet32f vredux_16 =
- op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
- Packet32f vredux_32 =
- op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
- Packet32f vredux_64 =
- op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
+ Packet32f vredux_8 = op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4);
+ Packet32f vredux_16 = op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8);
+ Packet32f vredux_32 = op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16);
+ Packet32f vredux_64 = op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32);
return pfirst(vredux_64);
}
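predux_generic folds an arbitrary binary op over the lanes with the same rotate ladder as predux; the reductions below reuse it, e.g. (a sketch):

// predux_min(a) == predux_generic(a, pmin<Packet32f>)
// predux_max(a) == predux_generic(a, pmax<Packet32f>)
// predux_any(a) == (predux_generic(a, por<Packet32f>) != 0.0f)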
@@ -498,9 +390,9 @@
return predux_generic(a, por<Packet32f>) != 0.0f;
}
-static const float index_vsf[32] __attribute__((aligned(128))) = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+static const float index_vsf[32]
+ __attribute__((aligned(128))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
template <>
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
@@ -514,30 +406,23 @@
}
template <>
-EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a,
- const Packet32qf& b) {
+EIGEN_STRONG_INLINE Packet32qf pmul<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get()));
}
template <>
-EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a,
- const Packet32qf& b) {
+EIGEN_STRONG_INLINE Packet32qf padd<Packet32qf>(const Packet32qf& a, const Packet32qf& b) {
return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get()));
}
// Mixed float32 and qfloat32 operations.
-EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a,
- const Packet32f& b,
- const Packet32qf& c) {
- return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(
- Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
+EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, const Packet32f& b, const Packet32qf& c) {
+ return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get()));
}
-EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a,
- const Packet32f& b,
- const Packet32f& c) {
- return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(
- Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
+EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, const Packet32f& b, const Packet32f& c) {
+ return Packet32f::Create(Q6_Vsf_equals_Vqf32(
+ Q6_Vqf32_vadd_Vqf32Vsf(Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get())));
}
} // end namespace internal
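The two helpers above keep intermediate products in Hexagon's wider qfloat32 format and round back to IEEE float only at the end; schematically, with double standing in for qf32's extra range (a sketch):

// pmadd_f32_to_qf32: accumulate IEEE floats into a wider accumulator.
double pmadd_f32_to_qf32_demo(float a, float b, double c) {
  return static_cast<double>(a) * b + c;  // product kept in wide format
}

// pmadd_qf32_to_f32: fold a wide accumulator back into IEEE floats.
float pmadd_qf32_to_f32_demo(double a, float b, float c) {
  return static_cast<float>(a * b + c);   // rounds once, at the end
}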
diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h
index b64bd8d..2d2fbbc 100644
--- a/Eigen/src/Core/arch/MSA/Complex.h
+++ b/Eigen/src/Core/arch/MSA/Complex.h
@@ -24,17 +24,13 @@
//---------- float ----------
struct Packet2cf {
- EIGEN_STRONG_INLINE Packet2cf() {
- }
- EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
- const std::complex<float>& b) {
- Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
+ EIGEN_STRONG_INLINE Packet2cf() {}
+ EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a, const std::complex<float>& b) {
+ Packet4f t = {std::real(a), std::imag(a), std::real(b), std::imag(b)};
v = t;
}
- EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
- }
- EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
- }
+ EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+ EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {}
EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
v = b.v;
return *this;
@@ -61,33 +57,23 @@
v = padd(v1, v2);
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
- return Packet2cf(*this) *= b;
- }
+ EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
v = padd(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
- return Packet2cf(*this) += b;
- }
+ EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
v = psub(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
- return Packet2cf(*this) -= b;
- }
- EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
- return pdiv_complex(Packet2cf(*this), b);
- }
+ EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+ EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { return pdiv_complex(Packet2cf(*this), b); }
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
*this = Packet2cf(*this) / b;
return *this;
}
- EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
- return Packet2cf(pnegate(v));
- }
+ EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(pnegate(v)); }
Packet4f v;
};
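operator*= above assembles the textbook complex product from the packed {real, imag} lanes; a scalar reference for one lane:

#include <complex>

// Each 64-bit slot of the Packet4f holds one std::complex<float> as
// {real, imag}; multiplication combines the lanes as (a+bi)(c+di).
std::complex<float> cmul_ref(std::complex<float> x, std::complex<float> y) {
  float a = x.real(), b = x.imag(), c = y.real(), d = y.imag();
  return std::complex<float>(a * c - b * d,   // real part: ac - bd
                             a * d + b * c);  // imag part: ad + bc
}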
@@ -126,7 +112,13 @@
template <>
struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
- enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet2cf half;
};
@@ -135,8 +127,8 @@
EIGEN_MSA_DEBUG;
float f0 = from.real(), f1 = from.imag();
- Packet4f v0 = { f0, f0, f0, f0 };
- Packet4f v1 = { f1, f1, f1, f1 };
+ Packet4f v0 = {f0, f0, f0, f0};
+ Packet4f v1 = {f1, f1, f1, f1};
return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
}
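ilvr_w interleaves two vectors word by word from their low halves, so broadcasting one complex scalar is two splats plus a single interleave (a sketch of the lanes):

// v0 = {re, re, re, re}, v1 = {im, im, im, im}
// ilvr_w(v1, v0) == {re, im, re, im}  -- the scalar in both complex slots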
@@ -225,32 +217,29 @@
}
template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
- const Packet2cf& from) {
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
}
template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
- const Packet2cf& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
}
template <>
-EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
- const std::complex<float>* from, Index stride) {
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+ Index stride) {
EIGEN_MSA_DEBUG;
return Packet2cf(from[0 * stride], from[1 * stride]);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
- const Packet2cf& from,
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
Index stride) {
EIGEN_MSA_DEBUG;
@@ -300,8 +289,7 @@
EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
EIGEN_MSA_DEBUG;
- return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
- (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+ return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
}
EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
@@ -321,39 +309,33 @@
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
EIGEN_MSA_DEBUG;
- Packet4f tmp =
- (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
- kernel.packet[0].v =
- (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ Packet4f tmp = (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+ kernel.packet[0].v = (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
kernel.packet[1].v = tmp;
}
template <>
EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
const Packet2cf& elsePacket) {
- return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
- (Packet2d)elsePacket.v);
+ return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v, (Packet2d)elsePacket.v);
}
//---------- double ----------
struct Packet1cd {
- EIGEN_STRONG_INLINE Packet1cd() {
- }
+ EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
v[0] = std::real(a);
v[1] = std::imag(a);
}
- EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
- }
- EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
- }
+ EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+ EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {}
EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
v = b.v;
return *this;
}
EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
- static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
+ static const v2u64 p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
}
EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
@@ -375,23 +357,17 @@
v = padd(v1, v2);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
- return Packet1cd(*this) *= b;
- }
+ EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
v = padd(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
- return Packet1cd(*this) += b;
- }
+ EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
v = psub(v, b.v);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
- return Packet1cd(*this) -= b;
- }
+ EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
*this *= b.conjugate();
Packet2d s = pmul<Packet2d>(b.v, b.v);
@@ -399,12 +375,8 @@
v = pdiv(v, s);
return *this;
}
- EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
- return Packet1cd(*this) /= b;
- }
- EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
- return Packet1cd(pnegate(v));
- }
+ EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { return Packet1cd(*this) /= b; }
+ EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(pnegate(v)); }
Packet2d v;
};
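operator/= above uses the conjugate trick; as a formula for the single complex value a Packet1cd holds:

// (a+bi)/(c+di) = (a+bi)(c-di) / (c*c + d*d)
// The code multiplies by b.conjugate(), squares b.v lane-wise to get
// {c*c, d*d}, folds that into the denominator c*c + d*d (in the lines
// elided by the hunk above), and finishes with pdiv.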
@@ -439,7 +411,13 @@
template <>
struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
- enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 1,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet1cd half;
};
@@ -535,16 +513,14 @@
}
template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
- const Packet1cd& from) {
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
}
template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
- const Packet1cd& from) {
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
EIGEN_MSA_DEBUG;
EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
@@ -558,8 +534,8 @@
}
template <>
-EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
- const std::complex<double>* from, Index stride __attribute__((unused))) {
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+ Index stride __attribute__((unused))) {
EIGEN_MSA_DEBUG;
Packet1cd res;
@@ -569,10 +545,8 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
- const Packet1cd& from,
- Index stride
- __attribute__((unused))) {
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+ Index stride __attribute__((unused))) {
EIGEN_MSA_DEBUG;
pstore(to, from);
diff --git a/Eigen/src/Core/arch/MSA/MathFunctions.h b/Eigen/src/Core/arch/MSA/MathFunctions.h
index 3e77329..f68d254 100644
--- a/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ b/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -34,8 +34,7 @@
namespace internal {
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-plog<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog<Packet4f>(const Packet4f& _x) {
static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
@@ -122,8 +121,7 @@
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-pexp<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
// Limiting single-precision pexp's argument to [-128, +128] lets pexp
// reach 0 and INFINITY naturally.
static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
@@ -143,10 +141,8 @@
Packet4f x = _x;
// Clamp x.
- x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
- (v16u8)p4f_exp_lo);
- x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
- (v16u8)p4f_exp_hi);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, (v16u8)p4f_exp_lo);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, (v16u8)p4f_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
@@ -175,8 +171,7 @@
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& _x) {
static EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
static EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
// The monomial coefficients of the numerator polynomial (odd).
@@ -198,8 +193,7 @@
// Clamp the inputs to the range [-9, 9] since anything outside
// this range is -/+1.0f in single-precision.
- x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
- (v16u8)p4f_tanh_hi);
+ x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, (v16u8)p4f_tanh_hi);
// Since the polynomials are odd/even, we need x**2.
Packet4f x2 = pmul(x, x);
@@ -264,7 +258,7 @@
// x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
// Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
- Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
+ Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
y = __builtin_msa_ffint_s_w(y_int2);
// Compute the sign to apply to the polynomial.
@@ -308,25 +302,22 @@
// Update the sign.
sign_mask = pxor(sign_mask, (Packet4i)y);
- y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
+ y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
return y;
}
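The octant adjustment quoted above rounds an odd octant index up to the next even one, which is what the addvi/bclri pair implements; numerically (a worked sketch):

// octant = (octant + 1) & ~1:
//   0 -> (0+1) & ~1 = 0      1 -> (1+1) & ~1 = 2
//   2 -> (2+1) & ~1 = 2      3 -> (3+1) & ~1 = 4
// Odd octants fold onto the even octant above them, so every input is
// re-centered into a [-Pi/4, 0] interval, matching the comment above.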
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-psin<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ true>(x);
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-pcos<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos<Packet4f>(const Packet4f& x) {
return psincos_inner_msa_float</* sine */ false>(x);
}
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d
-pexp<Packet2d>(const Packet2d& _x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
// Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
// reach 0 and INFINITY naturally.
static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
@@ -348,10 +339,8 @@
Packet2d x = _x;
// Clamp x.
- x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
- (v16u8)p2d_exp_lo);
- x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
- (v16u8)p2d_exp_hi);
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, (v16u8)p2d_exp_lo);
+ x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, (v16u8)p2d_exp_hi);
// Round to nearest integer by adding 0.5 (with x's sign) and truncating.
Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h
index b36f024..c1843c3 100644
--- a/Eigen/src/Core/arch/MSA/PacketMath.h
+++ b/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -54,9 +54,9 @@
typedef v4i32 Packet4i;
typedef v4u32 Packet4ui;
-#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
-#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
-#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
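Each of these macros expands to a broadcast constant; for example (expansion sketch):

// EIGEN_DECLARE_CONST_Packet4f(half, 0.5f) expands to:
//   const Packet4f p4f_half = {0.5f, 0.5f, 0.5f, 0.5f};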
inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
@@ -115,14 +115,26 @@
template <>
struct unpacket_traits<Packet4f> {
typedef float type;
- enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet4f half;
};
template <>
struct unpacket_traits<Packet4i> {
typedef int32_t type;
- enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet4i half;
};
@@ -130,7 +142,7 @@
EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
EIGEN_MSA_DEBUG;
- Packet4f v = { from, from, from, from };
+ Packet4f v = {from, from, from, from};
return v;
}
@@ -146,7 +158,7 @@
EIGEN_MSA_DEBUG;
float f = *from;
- Packet4f v = { f, f, f, f };
+ Packet4f v = {f, f, f, f};
return v;
}
@@ -175,7 +187,7 @@
EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
EIGEN_MSA_DEBUG;
- static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
+ static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
return padd(pset1<Packet4f>(a), countdown);
}
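For reference, plset fills consecutive lanes starting at a, i.e. pset1 plus the static countdown. A scalar sketch (hypothetical helper):

inline void plset_ref(float a, float out[4]) {
  for (int i = 0; i < 4; ++i) out[i] = a + static_cast<float>(i);  // {a, a+1, a+2, a+3}
}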
@@ -183,7 +195,7 @@
EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
EIGEN_MSA_DEBUG;
- static const Packet4i countdown = { 0, 1, 2, 3 };
+ static const Packet4i countdown = {0, 1, 2, 3};
return padd(pset1<Packet4i>(a), countdown);
}
@@ -411,8 +423,8 @@
EIGEN_MSA_DEBUG;
float f0 = from[0], f1 = from[1];
- Packet4f v0 = { f0, f0, f0, f0 };
- Packet4f v1 = { f1, f1, f1, f1 };
+ Packet4f v0 = {f0, f0, f0, f0};
+ Packet4f v1 = {f1, f1, f1, f1};
return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
}
@@ -421,8 +433,8 @@
EIGEN_MSA_DEBUG;
int32_t i0 = from[0], i1 = from[1];
- Packet4i v0 = { i0, i0, i0, i0 };
- Packet4i v1 = { i1, i1, i1, i1 };
+ Packet4i v0 = {i0, i0, i0, i0};
+ Packet4i v1 = {i1, i1, i1, i1};
return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
}
@@ -459,7 +471,7 @@
EIGEN_MSA_DEBUG;
float f = *from;
- Packet4f v = { f, f, f, f };
+ Packet4f v = {f, f, f, f};
v[1] = from[stride];
v[2] = from[2 * stride];
v[3] = from[3 * stride];
@@ -471,7 +483,7 @@
EIGEN_MSA_DEBUG;
int32_t i = *from;
- Packet4i v = { i, i, i, i };
+ Packet4i v = {i, i, i, i};
v[1] = from[stride];
v[2] = from[2 * stride];
v[3] = from[3 * stride];
@@ -479,8 +491,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
EIGEN_MSA_DEBUG;
*to = from[0];
@@ -493,8 +504,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride) {
EIGEN_MSA_DEBUG;
*to = from[0];
@@ -572,7 +582,6 @@
return s[0];
}
-
template <>
EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
EIGEN_MSA_DEBUG;
@@ -618,8 +627,7 @@
#endif
// Continue with min computation.
Packet4f v = __builtin_msa_fmin_w(a, swapped);
- v = __builtin_msa_fmin_w(
- v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
#if !EIGEN_FAST_MATH
// Based on the mask, select between v and 4 qNaNs.
v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
@@ -653,8 +661,7 @@
#endif
// Continue with max computation.
Packet4f v = __builtin_msa_fmax_w(a, swapped);
- v = __builtin_msa_fmax_w(
- v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+ v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
#if !EIGEN_FAST_MATH
// Based on the mask, select between v and 4 qNaNs.
v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
@@ -801,8 +808,7 @@
template <>
EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
const Packet4f& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
- ifPacket.select[3] };
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
}
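pblend is a branch-free per-lane select: ceqi.w turns the selector into an all-ones/all-zeros mask and bsel.v routes bits accordingly. A scalar reference sketch (hypothetical helper):

inline float blend_lane(unsigned select, float thenVal, float elseVal) {
  // Lanes with a non-zero selector take thenVal; the select == 0 mask
  // routes the remaining lanes to elseVal, exactly as bsel does bitwise.
  return select ? thenVal : elseVal;
}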
@@ -810,8 +816,7 @@
template <>
EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
const Packet4i& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
- ifPacket.select[3] };
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
}
@@ -822,9 +827,9 @@
typedef v2i64 Packet2l;
typedef v2u64 Packet2ul;
-#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
-#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
-#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
os << "[ " << value[0] << ", " << value[1] << " ]";
@@ -864,7 +869,13 @@
template <>
struct unpacket_traits<Packet2d> {
typedef double type;
- enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet2d half;
};
@@ -872,7 +883,7 @@
EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
EIGEN_MSA_DEBUG;
- Packet2d value = { from, from };
+ Packet2d value = {from, from};
return value;
}
@@ -887,7 +898,7 @@
EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
EIGEN_MSA_DEBUG;
- static const Packet2d countdown = { 0.0, 1.0 };
+ static const Packet2d countdown = {0.0, 1.0};
return padd(pset1<Packet2d>(a), countdown);
}
@@ -1011,7 +1022,7 @@
EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
EIGEN_MSA_DEBUG;
- Packet2d value = { *from, *from };
+ Packet2d value = {*from, *from};
return value;
}
@@ -1041,8 +1052,7 @@
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
- Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
EIGEN_MSA_DEBUG;
*to = from[0];
@@ -1221,7 +1231,7 @@
template <>
EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
const Packet2d& elsePacket) {
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+ Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
}
diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index 98b76da..8240847 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h
@@ -18,70 +18,64 @@
namespace internal {
-inline uint32x4_t p4ui_CONJ_XOR()
-{
+inline uint32x4_t p4ui_CONJ_XOR() {
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
- uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+ uint32x4_t ret = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
return ret;
#else
- static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
- return vld1q_u32( conj_XOR_DATA );
+ static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+ return vld1q_u32(conj_XOR_DATA);
#endif
}
-inline uint32x2_t p2ui_CONJ_XOR()
-{
- static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
- return vld1_u32( conj_XOR_DATA );
+inline uint32x2_t p2ui_CONJ_XOR() {
+ static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000};
+ return vld1_u32(conj_XOR_DATA);
}
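Both CONJ_XOR masks place 0x80000000 in the imaginary lanes, so conjugation is a single XOR that flips the imaginary sign bits. A scalar sketch of the bit trick, assuming <cstdint> and <cstring> (hypothetical helper):

#include <cstdint>
#include <cstring>

inline float flip_sign(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof bits);  // type-pun safely
  bits ^= 0x80000000u;                  // flip the sign bit, as pconj does per imaginary lane
  std::memcpy(&x, &bits, sizeof x);
  return x;
}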
//---------- float ----------
-struct Packet1cf
-{
+struct Packet1cf {
EIGEN_STRONG_INLINE Packet1cf() {}
EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
Packet2f v;
};
-struct Packet2cf
-{
+struct Packet2cf {
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
Packet4f v;
};
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet1cf half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasNegate = 1,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet1cf>
-{
+template <>
+struct unpacket_traits<Packet1cf> {
typedef std::complex<float> type;
typedef Packet1cf half;
typedef Packet2f as_real;
- enum
- {
+ enum {
size = 1,
alignment = Aligned16,
vectorizable = true,
@@ -89,13 +83,12 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet2cf>
-{
+template <>
+struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
typedef Packet1cf half;
typedef Packet4f as_real;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -104,45 +97,65 @@
};
};
-template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
-{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
-{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcast<float, Packet1cf>(const float& a) {
+ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f, Packet2cf>(const Packet2f& a) {
+ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a))));
+}
-template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
-{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from) {
+ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
return Packet2cf(vcombine_f32(r64, r64));
}
-template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(padd<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(psub<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) {
+ return Packet1cf(pnegate<Packet2f>(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ return Packet2cf(pnegate<Packet4f>(a.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) {
const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
}
-template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
Packet2f v1, v2;
// Get the real values of a | a1_re | a1_re |
@@ -160,8 +173,8 @@
// Add and return the result
return Packet1cf(vadd_f32(v1, v2));
}
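The lane shuffles above implement the textbook complex product: duplicate re(a) against b, duplicate im(a) against the swapped b, negate one term, and add. The scalar equivalent (hypothetical helper):

inline void cmul_ref(float ar, float ai, float br, float bi, float& cr, float& ci) {
  cr = ar * br - ai * bi;  // v1 plus the negated lane of v2
  ci = ar * bi + ai * br;  // v1 plus v2, odd lane
}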
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
Packet4f v1, v2;
// Get the real values of a | a1_re | a1_re | a2_re | a2_re |
@@ -180,8 +193,8 @@
return Packet2cf(vaddq_f32(v1, v2));
}
-template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) {
// Compare real and imaginary parts of a and b to get the mask vector:
// [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
@@ -191,8 +204,8 @@
// Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
return Packet1cf(pand<Packet2f>(eq, eq_swapped));
}
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
// Compare real and imaginary parts of a and b to get the mask vector:
// [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
@@ -203,129 +216,178 @@
return Packet2cf(pand<Packet4f>(eq, eq_swapped));
}
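A complex lane compares equal only when both component comparisons hold, which is what ANDing eq with its swapped copy enforces. Scalar reference (hypothetical helper):

inline bool complex_eq_ref(float ar, float ai, float br, float bi) {
  bool eq_re = (ar == br);  // even lane of eq
  bool eq_im = (ai == bi);  // odd lane of eq; eq_swapped pairs the two
  return eq_re && eq_im;
}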
-template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
-{ return pset1<Packet1cf>(*from); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
-{ return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
- const std::complex<float>* from, Index stride)
-{
- const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
- return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
+template <>
+EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
}
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
- const std::complex<float>* from, Index stride)
-{
- Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
- res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
- res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
- res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from) {
+ return pset1<Packet1cf>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet1cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(const std::complex<float>* from,
+ Index stride) {
+ const Packet2f tmp = vdup_n_f32(std::real(from[0 * stride]));
+ return Packet1cf(vset_lane_f32(std::imag(from[0 * stride]), tmp, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+ Index stride) {
+ Packet4f res = vdupq_n_f32(std::real(from[0 * stride]));
+ res = vsetq_lane_f32(std::imag(from[0 * stride]), res, 1);
+ res = vsetq_lane_f32(std::real(from[1 * stride]), res, 2);
+ res = vsetq_lane_f32(std::imag(from[1 * stride]), res, 3);
return Packet2cf(res);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
- std::complex<float>* to, const Packet1cf& from, Index stride)
-{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
- std::complex<float>* to, const Packet2cf& from, Index stride)
-{
- to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
- to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(std::complex<float>* to, const Packet1cf& from,
+ Index stride) {
+ to[stride * 0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+ Index stride) {
+ to[stride * 0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+ to[stride * 1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
}
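pgather and pscatter address one complex every stride elements, so lane k maps to offset k * stride. A scalar reference sketch (hypothetical helper, illustration only):

#include <complex>
#include <cstddef>

inline void scatter2_ref(std::complex<float>* to, const std::complex<float> v[2],
                         std::ptrdiff_t stride) {
  to[0 * stride] = v[0];  // real/imag pair from lanes 0-1
  to[1 * stride] = v[1];  // real/imag pair from lanes 2-3
}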
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr));
+}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a) {
EIGEN_ALIGN16 std::complex<float> x;
vst1_f32(reinterpret_cast<float*>(&x), a.v);
return x;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
EIGEN_ALIGN16 std::complex<float> x[2];
vst1q_f32(reinterpret_cast<float*>(x), a.v);
return x[0];
}
-template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v)));
+}
-template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
-{ return Packet1cf(vrev64_f32(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{ return Packet2cf(vrev64q_f32(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a) {
+ return Packet1cf(vrev64_f32(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+ return Packet2cf(vrev64q_f32(a.v));
+}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a) {
std::complex<float> s;
- vst1_f32((float *)&s, a.v);
+ vst1_f32((float*)&s, a.v);
return s;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
std::complex<float> s;
vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
return s;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a) {
std::complex<float> s;
- vst1_f32((float *)&s, a.v);
+ vst1_f32((float*)&s, a.v);
return s;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
float32x2_t a1, a2, v1, v2, prod;
std::complex<float> s;
a1 = vget_low_f32(a.v);
a2 = vget_high_f32(a.v);
- // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+ // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
v1 = vdup_lane_f32(a1, 0);
// Get the imaginary values of a | a1_im | a1_im | a2_im | a2_im |
v2 = vdup_lane_f32(a1, 1);
@@ -345,31 +407,32 @@
return s;
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf, Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
return pdiv_complex(a, b);
}
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
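The 2x2 complex transpose above recombines 64-bit halves, which amounts to swapping the two off-diagonal elements. Scalar sketch (hypothetical helper):

#include <complex>
#include <utility>

inline void transpose2x2_ref(std::complex<float> p0[2], std::complex<float> p1[2]) {
  // Before: p0 = {m00, m01}, p1 = {m10, m11}; after: p0 = {m00, m10}, p1 = {m01, m11}.
  std::swap(p0[1], p1[0]);
}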
-template<> EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
return psqrt_complex<Packet1cf>(a);
}
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
return psqrt_complex<Packet2cf>(a);
}
@@ -378,84 +441,93 @@
// See bug 1325, clang fails to call vld1q_u64.
#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML || EIGEN_COMP_CPE
- static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
#else
- const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
- static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
+const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};
+static uint64x2_t p2ul_CONJ_XOR = vld1q_u64(p2ul_conj_XOR_DATA);
#endif
-struct Packet1cd
-{
+struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
Packet2d v;
};
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 0,
size = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet1cd>
-{
+template <>
+struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
typedef Packet1cd half;
typedef Packet2d as_real;
- enum
- {
- size=1,
- alignment=Aligned16,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ enum {
+ size = 1,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from)));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from)));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
/* here we really have to use unaligned loads :( */
return ploadu<Packet1cd>(&from);
}
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(padd<Packet2d>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(psub<Packet2d>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
-{ return Packet1cd(pnegate<Packet2d>(a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ return Packet1cd(pnegate<Packet2d>(a.v));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR)));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
Packet2d v1, v2;
// Get the real values of a
@@ -474,8 +546,8 @@
return Packet1cd(vaddq_f64(v1, v2));
}
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
// Compare real and imaginary parts of a and b to get the mask vector:
// [re(a)==re(b), im(a)==im(b)]
Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
@@ -486,81 +558,109 @@
return Packet1cd(pand<Packet2d>(eq, eq_swapped));
}
-template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
-{ return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ return pset1<Packet1cd>(*from);
+}
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v);
+}
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v);
+}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr));
+}
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
- const std::complex<double>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+ Index stride) {
Packet2d res = pset1<Packet2d>(0.0);
- res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
- res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1);
+ res = vsetq_lane_f64(std::real(from[0 * stride]), res, 0);
+ res = vsetq_lane_f64(std::imag(from[0 * stride]), res, 1);
return Packet1cd(res);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
- std::complex<double>* to, const Packet1cd& from, Index stride)
-{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+ Index stride) {
+ to[stride * 0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_ALIGN16 std::complex<double> res;
pstore<std::complex<double> >(&res, a);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+ return pfirst(a);
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+ return pfirst(a);
+}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
return pdiv_complex(a, b);
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{ return Packet1cd(preverse(Packet2d(x.v))); }
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+ return Packet1cd(preverse(Packet2d(x.v)));
+}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
kernel.packet[1].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
return psqrt_complex<Packet1cd>(a);
}
-#endif // EIGEN_ARCH_ARM64
+#endif // EIGEN_ARCH_ARM64
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX_NEON_H
+#endif // EIGEN_COMPLEX_NEON_H
diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
index 48410c5..4ecf7d1 100644
--- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -9,38 +9,28 @@
// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
// Here we specialize gebp_traits to eliminate these register spills.
// See #2138.
-template<>
-struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
- EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
- {
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+ : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+ EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
// This volatile inline ASM both acts as a barrier to prevent reordering,
// as well as enforces strict register use.
- asm volatile(
- "vmla.f32 %q[r], %q[c], %q[alpha]"
- : [r] "+w" (r)
- : [c] "w" (c),
- [alpha] "w" (alpha)
- : );
+ asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w"(r) : [c] "w"(c), [alpha] "w"(alpha) :);
}
template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
- Packet4f& c, Packet4f&,
- const LaneIdType&) const {
+ EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f&, const LaneIdType&) const {
acc(a, b, c);
}
-
+
template <typename LaneIdType>
- EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
- Packet4f& c, Packet4f& tmp,
+ EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c, Packet4f& tmp,
const LaneIdType& lane) const {
madd(a, b.get(lane), c, tmp, lane);
}
};
-#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
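For comparison, the accumulation in acc() could be written with plain intrinsics as sketched below; the asm volatile form is kept because it doubles as a reordering barrier and pins the operands to registers, which is what avoids the spills (see #2138). Sketch only, assuming <arm_neon.h> is in scope:

EIGEN_STRONG_INLINE void acc_plain(const Packet4f& c, const Packet4f& alpha, Packet4f& r) {
  r = vmlaq_f32(r, c, alpha);  // r += c * alpha, but leaves register allocation to clang
}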
#if EIGEN_ARCH_ARM64
@@ -48,139 +38,139 @@
#define EIGEN_NEON_GEBP_NR 8
#endif
-template<>
-struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+ : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
typedef float RhsPacket;
typedef float32x4_t RhsPacketx4;
enum { nr = EIGEN_NEON_GEBP_NR };
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const {
- dest = *b;
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
- {
- dest = vld1q_f32(b);
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); }
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = *b;
- }
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
- {}
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
- {
- loadRhs(b,dest);
- }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- {
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
c = vfmaq_n_f32(c, a, b);
}
// NOTE: Template parameter inference failed when compiled with Android NDK:
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- { madd_helper<0>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
- { madd_helper<1>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
- { madd_helper<2>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
- { madd_helper<3>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
+ madd_helper<0>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<1>&) const {
+ madd_helper<1>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<2>&) const {
+ madd_helper<2>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<3>&) const {
+ madd_helper<3>(a, b, c);
+ }
private:
- template<int LaneID>
- EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
- {
- #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
+ template <int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
// vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
// 2. workaround the gcc register split problem on arm64-neon
- if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
- else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
- else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
- else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
- #else
+ if (LaneID == 0)
+ asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
+ else if (LaneID == 1)
+ asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
+ else if (LaneID == 2)
+ asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
+ else if (LaneID == 3)
+ asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
+#else
c = vfmaq_laneq_f32(c, a, b, LaneID);
- #endif
+#endif
}
};
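On the non-workaround path, vfmaq_laneq_f32(c, a, b, LaneID) multiplies every lane of a by the single scalar b[LaneID] and accumulates into c. A scalar reference sketch (hypothetical helper):

inline void fma_lane_ref(float c[4], const float a[4], const float b[4], int lane) {
  for (int i = 0; i < 4; ++i) c[i] += a[i] * b[lane];
}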
-
-template<>
-struct gebp_traits <double,double,false,false,Architecture::NEON>
- : gebp_traits<double,double,false,false,Architecture::Generic>
-{
+template <>
+struct gebp_traits<double, double, false, false, Architecture::NEON>
+ : gebp_traits<double, double, false, false, Architecture::Generic> {
typedef double RhsPacket;
enum { nr = EIGEN_NEON_GEBP_NR };
struct RhsPacketx4 {
float64x2_t B_0, B_1;
};
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = *b;
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
- {
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
dest.B_0 = vld1q_f64(b);
- dest.B_1 = vld1q_f64(b+2);
+ dest.B_1 = vld1q_f64(b + 2);
}
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- loadRhs(b,dest);
- }
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
- {}
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
- {
- loadRhs(b,dest);
- }
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- {
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
c = vfmaq_n_f64(c, a, b);
}
// NOTE: Template parameter inference failed when compiled with Android NDK:
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- { madd_helper<0>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
- { madd_helper<1>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
- { madd_helper<2>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
- { madd_helper<3>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
+ madd_helper<0>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<1>&) const {
+ madd_helper<1>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<2>&) const {
+ madd_helper<2>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<3>&) const {
+ madd_helper<3>(a, b, c);
+ }
private:
template <int LaneID>
- EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
- {
- #if EIGEN_GNUC_STRICT_LESS_THAN(9,0,0)
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
// 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
// vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
// 2. workaround the gcc register split problem on arm64-neon
- if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
- else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
- else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
- else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
- #else
- if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
- else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
- else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
- else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
- #endif
+ if (LaneID == 0)
+ asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+ else if (LaneID == 1)
+ asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+ else if (LaneID == 2)
+ asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+ else if (LaneID == 3)
+ asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+#else
+ if (LaneID == 0)
+ c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+ else if (LaneID == 1)
+ c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+ else if (LaneID == 2)
+ c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+ else if (LaneID == 3)
+ c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+#endif
}
};
@@ -190,68 +180,64 @@
// through a costly dup in gcc compiler.
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
-template<>
-struct gebp_traits <half,half,false,false,Architecture::NEON>
- : gebp_traits<half,half,false,false,Architecture::Generic>
-{
+template <>
+struct gebp_traits<half, half, false, false, Architecture::NEON>
+ : gebp_traits<half, half, false, false, Architecture::Generic> {
typedef half RhsPacket;
typedef float16x4_t RhsPacketx4;
typedef float16x4_t PacketHalf;
enum { nr = EIGEN_NEON_GEBP_NR };
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = *b;
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
- EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
- {
- dest = vld1_f16((const __fp16 *)b);
- }
+ EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1_f16((const __fp16*)b); }
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
- {
- dest = *b;
- }
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
- EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
- {}
+ EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
- EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const
- {
+ EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const {
// If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
// using a single scalar value.
eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
}
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- {
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
c = vfmaq_n_f16(c, a, b);
}
- EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- {
+ EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
c = vfma_n_f16(c, a, b);
}
// NOTE: Template parameter inference failed when compiled with Android NDK:
// "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
- { madd_helper<0>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
- { madd_helper<1>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
- { madd_helper<2>(a, b, c); }
- EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
- { madd_helper<3>(a, b, c); }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<0>&) const {
+ madd_helper<0>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<1>&) const {
+ madd_helper<1>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<2>&) const {
+ madd_helper<2>(a, b, c);
+ }
+ EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+ const FixedInt<3>&) const {
+ madd_helper<3>(a, b, c);
+ }
+
private:
- template<int LaneID>
- EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
- {
+ template <int LaneID>
+ EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
c = vfmaq_lane_f16(c, a, b, LaneID);
}
};
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
-#endif // EIGEN_ARCH_ARM64
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
+#endif // EIGEN_ARCH_ARM64
} // namespace internal
} // namespace Eigen
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index 8611810..3d2e7bd 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -20,21 +20,18 @@
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet4hf ptanh<Packet4hf>(const Packet4hf& x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet4hf ptanh<Packet4hf>(const Packet4hf& x) {
// Convert to float, call the float ptanh, and then convert back.
return vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(x)));
}
template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
// Convert each 4 halfs to float, call the float ptanh, and then convert back.
- return vcombine_f16(
- vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
- vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
+ return vcombine_f16(vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
+ vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
}
-#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
@@ -63,8 +60,8 @@
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_NEON_H
+#endif // EIGEN_MATH_FUNCTIONS_NEON_H
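The reflowed ptanh specializations above preserve the widen-compute-narrow pattern; as a rough sketch of that pattern in isolation (the Kernel parameter is a stand-in for any float32 vector function, not an Eigen API):

#include <arm_neon.h>

// Apply a float32 kernel to eight fp16 values: widen each half of the
// vector to fp32, compute there, then narrow and recombine. This is the
// shape of the Packet8hf ptanh in the hunk above.
template <float32x4_t (*Kernel)(float32x4_t)>
float16x8_t apply_widened(float16x8_t x) {
  float32x4_t lo = vcvt_f32_f16(vget_low_f16(x));  // lower 4 halfs -> fp32
  float32x4_t hi = vcvt_high_f32_f16(x);           // upper 4 halfs -> fp32
  return vcombine_f16(vcvt_f16_f32(Kernel(lo)), vcvt_f16_f32(Kernel(hi)));
}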
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e70f8b0..4e3a14d 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -41,24 +41,24 @@
// are aliases to the same underlying type __n128.
// We thus have to wrap them to make them different C++ types.
// (See also bug 1428)
-typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
-typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
-typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
-typedef eigen_packet_wrapper<int8x8_t ,3> Packet8c;
-typedef eigen_packet_wrapper<int8x16_t ,4> Packet16c;
-typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
-typedef eigen_packet_wrapper<uint8x8_t ,6> Packet8uc;
-typedef eigen_packet_wrapper<uint8x16_t ,7> Packet16uc;
-typedef eigen_packet_wrapper<int16x4_t ,8> Packet4s;
-typedef eigen_packet_wrapper<int16x8_t ,9> Packet8s;
-typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
-typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
-typedef eigen_packet_wrapper<int32x2_t ,12> Packet2i;
-typedef eigen_packet_wrapper<int32x4_t ,13> Packet4i;
-typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
-typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
-typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
-typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
+typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
+typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
+typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
float from[4] = {a, b, c, d};
@@ -72,405 +72,380 @@
#else
-typedef float32x2_t Packet2f;
-typedef float32x4_t Packet4f;
-typedef eigen_packet_wrapper<int32_t ,2> Packet4c;
-typedef int8x8_t Packet8c;
-typedef int8x16_t Packet16c;
-typedef eigen_packet_wrapper<uint32_t ,5> Packet4uc;
-typedef uint8x8_t Packet8uc;
-typedef uint8x16_t Packet16uc;
-typedef int16x4_t Packet4s;
-typedef int16x8_t Packet8s;
-typedef uint16x4_t Packet4us;
-typedef uint16x8_t Packet8us;
-typedef int32x2_t Packet2i;
-typedef int32x4_t Packet4i;
-typedef uint32x2_t Packet2ui;
-typedef uint32x4_t Packet4ui;
-typedef int64x2_t Packet2l;
-typedef uint64x2_t Packet2ul;
+typedef float32x2_t Packet2f;
+typedef float32x4_t Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef int8x8_t Packet8c;
+typedef int8x16_t Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef uint8x8_t Packet8uc;
+typedef uint8x16_t Packet16uc;
+typedef int16x4_t Packet4s;
+typedef int16x8_t Packet8s;
+typedef uint16x4_t Packet4us;
+typedef uint16x8_t Packet8us;
+typedef int32x2_t Packet2i;
+typedef int32x4_t Packet4i;
+typedef uint32x2_t Packet2ui;
+typedef uint32x4_t Packet4ui;
+typedef int64x2_t Packet2l;
+typedef uint64x2_t Packet2ul;
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
-#endif // EIGEN_COMP_MSVC_STRICT
+#endif // EIGEN_COMP_MSVC_STRICT
-EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
const float* a = reinterpret_cast<const float*>(&m);
- Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
+ Packet4f res =
+ make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
return res;
}
// functionally equivalent to _mm_shuffle_ps in SSE when interleave
// == false (i.e. shuffle2<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
-// to enable a shared implementation for fast inversion of matrices of size 4.
-template<bool interleave>
-EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
-{
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template <bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
const float* a = reinterpret_cast<const float*>(&m);
const float* b = reinterpret_cast<const float*>(&n);
- Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+ Packet4f res =
+ make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
return res;
}
-template<>
-EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
const float* a = reinterpret_cast<const float*>(&m);
const float* b = reinterpret_cast<const float*>(&n);
- Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+ Packet4f res =
+ make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
return res;
}
-EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
+ return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
+}
-EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
-{
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
-{
- return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
+ return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
-{
- return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+ return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
-{
- return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+ return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
-{
- return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+ return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
-{
- return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+ return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
}
-#define vec4f_duplane(a, p) \
- Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
+#define vec4f_duplane(a, p) Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
- // __builtin_prefetch tends to do nothing on ARM64 compilers because the
- // prefetch instructions there are too detailed for __builtin_prefetch to map
- // meaningfully to them.
- #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : );
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
- #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#elif defined __pld
- #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
#elif EIGEN_ARCH_ARM
- #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
#else
- // by default no explicit prefetching
- #define EIGEN_ARM_PREFETCH(ADDR)
+// by default no explicit prefetching
+#define EIGEN_ARM_PREFETCH(ADDR)
#endif
template <>
-struct packet_traits<float> : default_packet_traits
-{
+struct packet_traits<float> : default_packet_traits {
typedef Packet4f type;
typedef Packet2f half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
+ HasBlend = 0,
- HasDiv = 1,
+ HasDiv = 1,
HasFloor = 1,
HasCeil = 1,
HasRint = 1,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasACos = 1,
- HasASin = 1,
- HasATan = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
HasATanh = 1,
- HasLog = 1,
- HasExp = 1,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
HasBessel = 0, // Issues with accuracy.
HasNdtri = 0
};
};
template <>
-struct packet_traits<int8_t> : default_packet_traits
-{
+struct packet_traits<int8_t> : default_packet_traits {
typedef Packet16c type;
typedef Packet8c half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasAbsDiff = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbsDiff = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0
+ HasBlend = 0
};
};
template <>
-struct packet_traits<uint8_t> : default_packet_traits
-{
+struct packet_traits<uint8_t> : default_packet_traits {
typedef Packet16uc type;
typedef Packet8uc half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 16,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 0,
- HasAbs = 1,
- HasAbsDiff = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 0,
+ HasAbs = 1,
+ HasAbsDiff = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
+ HasBlend = 0,
HasSqrt = 1
};
};
template <>
-struct packet_traits<int16_t> : default_packet_traits
-{
+struct packet_traits<int16_t> : default_packet_traits {
typedef Packet8s type;
typedef Packet4s half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasAbsDiff = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasAbsDiff = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0
+ HasBlend = 0
};
};
template <>
-struct packet_traits<uint16_t> : default_packet_traits
-{
+struct packet_traits<uint16_t> : default_packet_traits {
typedef Packet8us type;
typedef Packet4us half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 8,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 0,
- HasAbs = 1,
- HasAbsDiff = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 0,
+ HasAbs = 1,
+ HasAbsDiff = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
+ HasBlend = 0,
HasSqrt = 1
};
};
template <>
-struct packet_traits<int32_t> : default_packet_traits
-{
+struct packet_traits<int32_t> : default_packet_traits {
typedef Packet4i type;
typedef Packet2i half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0
+ HasBlend = 0
};
};
template <>
-struct packet_traits<uint32_t> : default_packet_traits
-{
+struct packet_traits<uint32_t> : default_packet_traits {
typedef Packet4ui type;
typedef Packet2ui half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 0,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 0,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
+ HasBlend = 0,
HasSqrt = 1
};
};
template <>
-struct packet_traits<int64_t> : default_packet_traits
-{
+struct packet_traits<int64_t> : default_packet_traits {
typedef Packet2l type;
typedef Packet2l half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0
+ HasBlend = 0
};
};
template <>
-struct packet_traits<uint64_t> : default_packet_traits
-{
+struct packet_traits<uint64_t> : default_packet_traits {
typedef Packet2ul type;
typedef Packet2ul half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 0,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 0,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0
+ HasBlend = 0
};
};
-template<> struct unpacket_traits<Packet2f>
-{
+template <>
+struct unpacket_traits<Packet2f> {
typedef float type;
typedef Packet2f half;
typedef Packet2i integer_packet;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -478,13 +453,12 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4f>
-{
+template <>
+struct unpacket_traits<Packet4f> {
typedef float type;
typedef Packet2f half;
typedef Packet4i integer_packet;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -492,12 +466,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4c>
-{
+template <>
+struct unpacket_traits<Packet4c> {
typedef int8_t type;
typedef Packet4c half;
- enum
- {
+ enum {
size = 4,
alignment = Unaligned,
vectorizable = true,
@@ -505,12 +478,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet8c>
-{
+template <>
+struct unpacket_traits<Packet8c> {
typedef int8_t type;
typedef Packet4c half;
- enum
- {
+ enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
@@ -518,12 +490,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet16c>
-{
+template <>
+struct unpacket_traits<Packet16c> {
typedef int8_t type;
typedef Packet8c half;
- enum
- {
+ enum {
size = 16,
alignment = Aligned16,
vectorizable = true,
@@ -531,12 +502,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4uc>
-{
+template <>
+struct unpacket_traits<Packet4uc> {
typedef uint8_t type;
typedef Packet4uc half;
- enum
- {
+ enum {
size = 4,
alignment = Unaligned,
vectorizable = true,
@@ -544,12 +514,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet8uc>
-{
+template <>
+struct unpacket_traits<Packet8uc> {
typedef uint8_t type;
typedef Packet4uc half;
- enum
- {
+ enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
@@ -557,24 +526,23 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet16uc>
-{
+template <>
+struct unpacket_traits<Packet16uc> {
typedef uint8_t type;
typedef Packet8uc half;
- enum
- {
+ enum {
size = 16,
alignment = Aligned16,
vectorizable = true,
masked_load_available = false,
- masked_store_available = false};
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet4s>
-{
+template <>
+struct unpacket_traits<Packet4s> {
typedef int16_t type;
typedef Packet4s half;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -582,12 +550,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet8s>
-{
+template <>
+struct unpacket_traits<Packet8s> {
typedef int16_t type;
typedef Packet4s half;
- enum
- {
+ enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
@@ -595,12 +562,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4us>
-{
+template <>
+struct unpacket_traits<Packet4us> {
typedef uint16_t type;
typedef Packet4us half;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -608,12 +574,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet8us>
-{
+template <>
+struct unpacket_traits<Packet8us> {
typedef uint16_t type;
typedef Packet4us half;
- enum
- {
+ enum {
size = 8,
alignment = Aligned16,
vectorizable = true,
@@ -621,12 +586,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet2i>
-{
+template <>
+struct unpacket_traits<Packet2i> {
typedef int32_t type;
typedef Packet2i half;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -634,12 +598,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4i>
-{
+template <>
+struct unpacket_traits<Packet4i> {
typedef int32_t type;
typedef Packet2i half;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -647,12 +610,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet2ui>
-{
+template <>
+struct unpacket_traits<Packet2ui> {
typedef uint32_t type;
typedef Packet2ui half;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -660,12 +622,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet4ui>
-{
+template <>
+struct unpacket_traits<Packet4ui> {
typedef uint32_t type;
typedef Packet2ui half;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -673,12 +634,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet2l>
-{
+template <>
+struct unpacket_traits<Packet2l> {
typedef int64_t type;
typedef Packet2l half;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -686,12 +646,11 @@
masked_store_available = false
};
};
-template<> struct unpacket_traits<Packet2ul>
-{
+template <>
+struct unpacket_traits<Packet2ul> {
typedef uint64_t type;
typedef Packet2ul half;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -700,1637 +659,2767 @@
};
};
-template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
+ return vdup_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+ return vdupq_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
+ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
+ return vdup_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
+ return vdupq_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
+ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
+ return vdup_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
+ return vdupq_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
+ return vdup_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
+ return vdupq_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
+ return vdup_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
+ return vdupq_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
+ return vdup_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+ return vdupq_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
+ return vdup_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+ return vdupq_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+ return vdupq_n_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
+ return vdupq_n_u64(from);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from)
-{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from)
-{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
+ return vreinterpret_f32_u32(vdup_n_u32(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+ return vreinterpretq_f32_u32(vdupq_n_u32(from));
+}
-template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
-{
- const float c[] = {0.0f,1.0f};
+template <>
+EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
+ const float c[] = {0.0f, 1.0f};
return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
}
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
-{
- const float c[] = {0.0f,1.0f,2.0f,3.0f};
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
}
-template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
-{
- const int8_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
+ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
+ const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
}
-template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
-{
- const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
+ const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
}
-template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
-{
- const uint8_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
+ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
+ const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
}
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
-{
- const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
+ const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
}
-template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
-{
- const int16_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
+ const int16_t c[] = {0, 1, 2, 3};
return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
}
-template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
-{
- const uint16_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
+ const uint16_t c[] = {0, 1, 2, 3};
return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
}
-template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
-{
- const int16_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
+ const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
}
-template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
-{
- const uint16_t c[] = {0,1,2,3,4,5,6,7};
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
+ const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
}
-template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
-{
- const int32_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
+ const int32_t c[] = {0, 1};
return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
}
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
-{
- const int32_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+ const int32_t c[] = {0, 1, 2, 3};
return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
}
-template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
-{
- const uint32_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
+ const uint32_t c[] = {0, 1};
return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
}
-template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
-{
- const uint32_t c[] = {0,1,2,3};
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+ const uint32_t c[] = {0, 1, 2, 3};
return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
}
-template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
-{
- const int64_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+ const int64_t c[] = {0, 1};
return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
}
-template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
-{
- const uint64_t c[] = {0,1};
+template <>
+EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
+ const uint64_t c[] = {0, 1};
return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
}
-template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vadd_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vaddq_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vadd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vaddq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vadd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vaddq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vadd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vaddq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vadd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vaddq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vadd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vaddq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vadd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vaddq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vaddq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vaddq_u64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vsub_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vsubq_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vsub_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vsubq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vsub_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vsubq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vsub_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vsubq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vsub_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vsubq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vsub_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vsubq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vsub_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vsubq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vsubq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vsubq_u64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
-template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template <>
+EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
return padd(a, pxor(mask, b));
}
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
-template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
return padd(a, pxor(mask, b));
}
-template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
+ return vneg_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+ return vnegq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
+ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
+ return vneg_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+ return vnegq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
+ return vneg_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+ return vnegq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
+ return vneg_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+ return vnegq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
#if EIGEN_ARCH_ARM64
return vnegq_s64(a);
#else
- return vcombine_s64(
- vdup_n_s64(-vgetq_lane_s64(a, 0)),
- vdup_n_s64(-vgetq_lane_s64(a, 1)));
+ return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
+ return a;
}
-template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+ return a;
}
-template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
- return vcombine_s64(
- vdup_n_s64(vgetq_lane_s64(a, 0)*vgetq_lane_s64(b, 0)),
- vdup_n_s64(vgetq_lane_s64(a, 1)*vgetq_lane_s64(b, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
+ return a;
}
-template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
- return vcombine_u64(
- vdup_n_u64(vgetq_lane_u64(a, 0)*vgetq_lane_u64(b, 0)),
- vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
+ return a;
}
-template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vmul_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vmulq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vmul_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vmulq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vmul_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vmulq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vmul_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vmulq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vmul_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vmulq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vmul_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vmulq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vmul_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vmulq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
+ vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
+ vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet4c>(0);
}
-template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet8c>(0);
}
-template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet16c>(0);
}
-template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet4uc>(0);
}
-template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
eigen_assert(false && "packet integer division are not supported by NEON");
return pset1<Packet8uc>(0);
}
-template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet16uc>(0);
}
-template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet4s>(0);
}
-template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet8s>(0);
}
-template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet4us>(0);
}
-template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet8us>(0);
}
-template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet2i>(0);
}
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet4i>(0);
}
-template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet2ui>(0);
}
-template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet4ui>(0);
}
-template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet2l>(0LL);
}
-template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
  eigen_assert(false && "packet integer division is not supported by NEON");
return pset1<Packet2ul>(0ULL);
}
-
#ifdef __ARM_FEATURE_FMA
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{ return vfmaq_f32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
-{ return vfma_f32(c,a,b); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
- return vmlaq_f32(c,a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vfmaq_f32(c, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
-{
- return vmla_f32(c,a,b);
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+ return vfma_f32(c, a, b);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vmlaq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+ return vmla_f32(c, a, b);
}
#endif
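// The two branches above are not bit-identical: vfmaq_f32/vfma_f32 lower to a fused
// multiply-add (a*b+c rounded once), whereas vmlaq_f32/vmla_f32 typically lower to a
// non-fused multiply followed by an add (two roundings), so results may differ in the
// last ulp. A scalar analogue of the fused branch, assuming <cmath>;
// pmadd_scalar_sketch is a hypothetical name:
static inline float pmadd_scalar_sketch(float a, float b, float c) {
  return std::fma(a, b, c);  // single rounding, like vfmaq_f32(c, a, b)
}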
// No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
- vreinterpret_s8_s32(vdup_n_s32(c)),
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
+ vreinterpret_s8_s32(vdup_n_s32(b)))),
+ 0);
}
-template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
-{ return vmla_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
-{ return vmlaq_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
- vreinterpret_u8_u32(vdup_n_u32(c)),
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
+ return vmla_s8(c, a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
-{ return vmla_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
-{ return vmlaq_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
-{ return vmla_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
-{ return vmlaq_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
-{ return vmla_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
-{ return vmlaq_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
-{ return vmla_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
-{ return vmlaq_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
-{ return vmla_u32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
-{ return vmlaq_u32(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+ return vmlaq_s8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
+ vreinterpret_u8_u32(vdup_n_u32(b)))),
+ 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
+ return vmla_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
+ return vmlaq_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
+ return vmla_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+ return vmlaq_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
+ return vmla_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+ return vmlaq_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
+ return vmla_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ return vmlaq_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
+ return vmla_u32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+ return vmlaq_u32(c, a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vabd_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vabdq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vabd_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vabd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vabdq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vabdq_f32(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vabd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vabdq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vabd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vabdq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vabd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vabdq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vabd_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vabdq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vabd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vabdq_u32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vabd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vabdq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vabd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vabdq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vabd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vabdq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vabd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vabdq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vabd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vabdq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vabd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vabdq_u32(a, b);
+}
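// Lane semantics of the vabd-based pabsdiff overloads above: the absolute difference is
// formed without the intermediate wrap-around a plain subtraction would suffer in the
// narrow type. A scalar sketch for the unsigned 8-bit case, assuming <cstdint>;
// absdiff_u8_sketch is a hypothetical name:
static inline uint8_t absdiff_u8_sketch(uint8_t a, uint8_t b) {
  return a > b ? static_cast<uint8_t>(a - b) : static_cast<uint8_t>(b - a);
}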
-template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vmin_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vminq_f32(a, b);
+}
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vminnmq_f32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vminnm_f32(a, b); }
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for
+// Armv8 systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vminnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vminnm_f32(a, b);
+}
#endif
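// vminnm_f32/vminnmq_f32 implement IEEE 754-2008 minNum: when exactly one operand is NaN,
// the other (numeric) operand is returned, which is exactly what the PropagateNumbers
// variant requires. A scalar analogue, assuming <cmath>; minnum_scalar_sketch is a
// hypothetical name:
static inline float minnum_scalar_sketch(float a, float b) {
  return std::fmin(a, b);  // std::fmin likewise returns the non-NaN operand
}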
-template<> EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmin<Packet4f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmin<Packet2f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
- return vcombine_s64(
- vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
- vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
- return vcombine_u64(
- vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
- vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return pmin<Packet4f>(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return pmin<Packet2f>(a, b);
+}
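// The PropagateNaN variants above can simply forward to the default pmin because NEON's
// vmin/vminq return NaN whenever either operand is NaN. A scalar sketch of the required
// semantics, assuming <limits>; min_propagate_nan_sketch is a hypothetical name:
static inline float min_propagate_nan_sketch(float a, float b) {
  if (a != a || b != b) return std::numeric_limits<float>::quiet_NaN();  // NaN wins
  return a < b ? a : b;
}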
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vmin_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vminq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vmin_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vminq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vmin_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vminq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vmin_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vminq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vmin_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vminq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vmin_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vminq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+ vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+ vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vmax_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vmaxq_f32(a, b);
+}
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxnmq_f32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) { return vmaxnm_f32(a, b); }
+// Numeric max and min are only available if __ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for
+// Armv8 systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vmaxnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vmaxnm_f32(a, b);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { return pmax<Packet4f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) { return pmax<Packet2f>(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
- return vcombine_s64(
- vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
- vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
- return vcombine_u64(
- vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
- vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return pmax<Packet4f>(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return pmax<Packet2f>(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vcle_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcleq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vcle_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcleq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vcle_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcleq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vmax_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vmaxq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vmax_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vmaxq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vmax_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vmaxq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vmax_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vmaxq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vmax_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vmaxq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vmax_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vmaxq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+ vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+ vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vcle_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vreinterpret_s8_u8(vcle_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vreinterpretq_s8_u8(vcleq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vcle_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vcleq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vreinterpret_s16_u16(vcle_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vreinterpretq_s16_u16(vcleq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vcle_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vcleq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vreinterpret_s32_u32(vcle_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vreinterpretq_s32_u32(vcleq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vcle_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vcleq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
#if EIGEN_ARCH_ARM64
- return vreinterpretq_s64_u64(vcleq_s64(a,b));
+ return vreinterpretq_s64_u64(vcleq_s64(a, b));
#else
- return vcombine_s64(
- vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
- vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+ return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+ vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
#if EIGEN_ARCH_ARM64
- return vcleq_u64(a,b);
+ return vcleq_u64(a, b);
#else
- return vcombine_u64(
- vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
- vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+ return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+ vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
#endif
}
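// All pcmp_* overloads return per-lane masks: every bit set for "true", every bit clear
// for "false", so the result can feed bitwise selection (pand and friends). Where
// AArch64's vcleq_s64/vcleq_u64 are unavailable, the mask is built by hand, as in the
// fallbacks above. A minimal sketch, assuming <arm_neon.h> and <cstdint>;
// cmple_s64_sketch is a hypothetical name:
static inline int64x2_t cmple_s64_sketch(int64x2_t a, int64x2_t b) {
  int64_t m0 = vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? int64_t(-1) : int64_t(0);
  int64_t m1 = vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? int64_t(-1) : int64_t(0);
  return vcombine_s64(vdup_n_s64(m0), vdup_n_s64(m1));  // -1 == all bits set
}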
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vclt_f32(a, b));
}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vcltq_f32(a, b));
}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vclt_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcltq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vclt_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcltq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vclt_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcltq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vreinterpret_s8_u8(vclt_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vreinterpretq_s8_u8(vcltq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vclt_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vcltq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vreinterpret_s16_u16(vclt_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vreinterpretq_s16_u16(vcltq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vclt_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vcltq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vreinterpret_s32_u32(vclt_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vreinterpretq_s32_u32(vcltq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vclt_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vcltq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
#if EIGEN_ARCH_ARM64
- return vreinterpretq_s64_u64(vcltq_s64(a,b));
+ return vreinterpretq_s64_u64(vcltq_s64(a, b));
#else
- return vcombine_s64(
- vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
- vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+ return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+ vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
#if EIGEN_ARCH_ARM64
- return vcltq_u64(a,b);
+ return vcltq_u64(a, b);
#else
- return vcombine_u64(
- vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
- vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+ return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+ vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
- return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
- vreinterpret_s8_s32(vdup_n_s32(a)),
- vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vceq_f32(a, b));
}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
- vreinterpret_u8_u32(vdup_n_u32(a)),
- vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vceqq_f32(a, b));
}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vceq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vceqq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vceq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vceqq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vceq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vceqq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return vget_lane_s32(
+ vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vreinterpret_s8_u8(vceq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vreinterpretq_s8_u8(vceqq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return vget_lane_u32(
+ vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vceq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vceqq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vreinterpret_s16_u16(vceq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vreinterpretq_s16_u16(vceqq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vceq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vceqq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vreinterpret_s32_u32(vceq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vreinterpretq_s32_u32(vceqq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vceq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vceqq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
#if EIGEN_ARCH_ARM64
- return vreinterpretq_s64_u64(vceqq_s64(a,b));
+ return vreinterpretq_s64_u64(vceqq_s64(a, b));
#else
- return vcombine_s64(
- vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
- vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+ return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+ vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
#if EIGEN_ARCH_ARM64
- return vceqq_u64(a,b);
+ return vceqq_u64(a, b);
#else
- return vcombine_u64(
- vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
- vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+ return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+ vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
+}
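// pcmp_lt_or_nan builds its mask as NOT(a >= b): vcge/vcgeq yield "false" whenever either
// operand is NaN, so inverting the result marks both a < b and unordered lanes. A scalar
// sketch of the predicate; lt_or_nan_sketch is a hypothetical name:
static inline bool lt_or_nan_sketch(float a, float b) {
  return !(a >= b);  // true if a < b, or if a or b is NaN
}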
// Logical operations are not supported for float, so we have to use reinterpret casts via NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vand_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vandq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vand_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vandq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vand_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vandq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vand_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vandq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vandq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
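// A minimal standalone sketch of the reinterpret idiom used by the float overloads above,
// assuming <arm_neon.h>; with an all-ones/all-zeros mask produced by a comparison, this
// acts as a lane select. pand_f32_sketch is a hypothetical name:
static inline float32x2_t pand_f32_sketch(float32x2_t a, float32x2_t mask) {
  uint32x2_t bits = vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(mask));
  return vreinterpret_f32_u32(bits);  // same bits, float view restored
}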
+template <>
+EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vand_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vandq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vand_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vandq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vand_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vandq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vand_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vandq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vand_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vandq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vand_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vandq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vandq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vandq_u64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vorrq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vorr_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vorrq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vorr_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vorrq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vorr_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vorrq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vorr_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vorrq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vorrq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vorrq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vorr_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vorrq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vorr_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vorrq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vorr_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vorrq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vorr_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vorrq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vorr_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vorrq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vorr_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vorrq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vorrq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vorrq_u64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return veor_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return veorq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return veor_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return veorq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return veor_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return veorq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return veor_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return veorq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return veorq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return veorq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return veor_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return veorq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return veor_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return veorq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return veor_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return veorq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return veor_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return veorq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return veor_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return veorq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return veor_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return veorq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return veorq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return veorq_u64(a, b);
+}
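
A typical consumer of pxor on float packets is sign-bit manipulation: XOR with the sign mask negates a value without touching exponent or mantissa bits. A hedged sketch under the same <arm_neon.h> assumption (the helper name is hypothetical):

#include <arm_neon.h>
// Negate four floats by flipping their sign bits, the same bit trick
// that higher-level routines build on top of pxor.
static inline float32x4_t negate_via_xor_sketch(float32x4_t a) {
  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000u);
  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), sign_mask));
}
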
-template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vbic_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vbicq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vbic_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vbicq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vbic_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vbicq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vbic_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vbicq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vbic_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vbicq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vbicq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vbicq_u64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
+ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
+ return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
+ return vbic_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
+ return vbicq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+ return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+ return vbic_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+ return vbicq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
+ return vbic_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
+ return vbicq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
+ return vbic_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
+ return vbicq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
+ return vbic_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vbicq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+ return vbic_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return vbicq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+ return vbicq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+ return vbicq_u64(a, b);
+}
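
pandnot(a, b) computes a & ~b, and NEON's vbic family does exactly that in a single instruction, which is why no explicit NOT appears above. For illustration, a hypothetical helper that clears the sign bit to get |x| (same a-AND-NOT-b operand order as vbic):

#include <arm_neon.h>
// |x| for four floats: a & ~sign_mask in one vbic.
static inline float32x4_t abs_via_bic_sketch(float32x4_t a) {
  const uint32x4_t sign_mask = vdupq_n_u32(0x80000000u);
  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), sign_mask));
}
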
-template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+template <int N>
+EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
+  return vshr_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
+  return vshrq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
+  return vshr_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return vshrq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
+  return vshr_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
+  return vshrq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
+  return vshrq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
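
An arithmetic right shift replicates the sign bit, so the signed lanes map directly onto vshr_n_s*; Packet4c/Packet4uc keep four lanes in a 32-bit scalar and must first be widened into a real vector with vdup, shifted, and have the low word extracted again. A one-lane scalar reference (sketch only; right-shifting a negative signed value is arithmetic on the targets Eigen supports, though formally implementation-defined before C++20):

#include <stdint.h>
// Reference: the sign bit is copied into the vacated high positions.
static inline int8_t asr8_ref(int8_t x, int n) {
  return (int8_t)(x >> n);
}
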
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
-{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
-{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
-{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
-{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
-{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
-{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
-{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
+  return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
+  return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
+  return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
+  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
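
A logical right shift must zero-fill, so the signed packet types are reinterpreted to their unsigned counterparts, shifted, and reinterpreted back. One oddity worth noting: the Packet4uc specialization routes through the signed vshr_n_s8, i.e. an arithmetic shift on the reinterpreted bytes, which looks inconsistent with the other unsigned variants; it is preserved verbatim here since this commit only reformats. A sketch of the reinterpret idiom for int32 lanes (N must be a compile-time constant because vshrq_n_* takes an immediate):

#include <arm_neon.h>
// Zero-filling right shift for signed lanes: detour through the unsigned type.
template <int N>
static inline int32x4_t lsr_s32_sketch(int32x4_t a) {
  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
}
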
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
+  return vshl_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
+  return vshlq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
+  return vshl_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
+  return vshlq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
+  return vshl_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return vshlq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
+  return vshl_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
+  return vshlq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
+  return vshl_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
+  return vshlq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
+  return vshl_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
+  return vshlq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
+  return vshlq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
+  return vshlq_n_u64(a, N);
+}
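
Left shifts shift in zeros for signed and unsigned alike, so no reinterpret detour is needed here. The count is a template parameter throughout because the vshl_n_*/vshr_n_* intrinsics require a compile-time immediate; a hypothetical usage sketch:

// Packet4i x = pset1<Packet4i>(3);
// Packet4i y = plogical_shift_left<2>(x);  // every lane becomes 12
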
-
-template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
Packet4c res;
memcpy(&res, from, sizeof(Packet4c));
return res;
}
-template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
Packet4uc res;
memcpy(&res, from, sizeof(Packet4uc));
return res;
}
-template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from);
+}
-template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
Packet4c res;
memcpy(&res, from, sizeof(Packet4c));
return res;
}
-template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
Packet4uc res;
memcpy(&res, from, sizeof(Packet4uc));
return res;
}
-template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
+}
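
pload and ploadu compile to the same vld1 intrinsics, since NEON loads tolerate unaligned addresses; EIGEN_DEBUG_ALIGNED_LOAD / EIGEN_DEBUG_UNALIGNED_LOAD are debug-build hooks, not different instructions. The sub-register Packet4c/Packet4uc types keep four lanes in a plain 32-bit scalar, so they go through memcpy, the strict-aliasing-safe way to type-pun. A standalone sketch of that idiom:

#include <stdint.h>
#include <string.h>
// Load four int8 lanes into the int32_t that backs Packet4c; memcpy avoids
// the undefined behavior of dereferencing a type-punned pointer.
static inline int32_t load4c_sketch(const int8_t* from) {
  int32_t res;
  memcpy(&res, from, sizeof(res));
  return res;
}
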
-template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
-{ return vld1_dup_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
+ return vld1_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
- return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+ return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
}
-template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
const int8x8_t a = vld1_s8(from);
- return vzip_s8(a,a).val[0];
+ return vzip_s8(a, a).val[0];
}
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
const int8x8_t a = vld1_s8(from);
- const int8x8x2_t b = vzip_s8(a,a);
+ const int8x8x2_t b = vzip_s8(a, a);
return vcombine_s8(b.val[0], b.val[1]);
}
-template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
- return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
+ return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
}
-template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
const uint8x8_t a = vld1_u8(from);
- return vzip_u8(a,a).val[0];
+ return vzip_u8(a, a).val[0];
}
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
const uint8x8_t a = vld1_u8(from);
- const uint8x8x2_t b = vzip_u8(a,a);
+ const uint8x8x2_t b = vzip_u8(a, a);
return vcombine_u8(b.val[0], b.val[1]);
}
-template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
-{
- return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
- vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
+ return vreinterpret_s16_u32(
+ vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
}
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
const int16x4_t a = vld1_s16(from);
- const int16x4x2_t b = vzip_s16(a,a);
+ const int16x4x2_t b = vzip_s16(a, a);
return vcombine_s16(b.val[0], b.val[1]);
}
-template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
-{
- return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
- vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
+ return vreinterpret_u16_u32(
+ vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
}
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
const uint16x4_t a = vld1_u16(from);
- const uint16x4x2_t b = vzip_u16(a,a);
+ const uint16x4x2_t b = vzip_u16(a, a);
return vcombine_u16(b.val[0], b.val[1]);
}
-template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
-{ return vld1_dup_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
-{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
-{ return vld1_dup_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
-{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
-{ return vld1q_dup_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
-{ return vld1q_dup_u64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
-{
- return vreinterpret_s8_u32(vzip_u32(
- vreinterpret_u32_s8(vld1_dup_s8(from)),
- vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
+ return vld1_dup_s32(from);
}
-template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
-{
- const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
- vreinterpret_u32_s8(vld1_dup_s8(from)),
- vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
- const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
- vreinterpret_u32_s8(vld1_dup_s8(from+2)),
- vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
- return vcombine_s8(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
}
-template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
-{
- return vreinterpret_u8_u32(vzip_u32(
- vreinterpret_u32_u8(vld1_dup_u8(from)),
- vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
+ return vld1_dup_u32(from);
}
-template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
-{
- const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
- vreinterpret_u32_u8(vld1_dup_u8(from)),
- vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
- const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
- vreinterpret_u32_u8(vld1_dup_u8(from+2)),
- vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
- return vcombine_u8(a,b);
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
}
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
-{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
-{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+ return vld1q_dup_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
+ return vld1q_dup_u64(from);
+}
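
ploaddup reads size/2 scalars and duplicates each one, so {a, b} becomes {a, a, b, b}; the vzip-based bodies above obtain that pattern by interleaving a vector with itself. A scalar reference for the semantics (sketch only):

// Reference: out[i] = from[i / 2], i.e. each source element repeated twice.
template <typename T, int N>
static inline void ploaddup_ref(const T* from, T (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i / 2];
}
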
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
+ return vld1q_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
+ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
+ return vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
+ const int8x8_t a = vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+ const int8x8_t b = vreinterpret_s8_u32(
+ vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
+ return vcombine_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
+ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
+ return vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
+ const uint8x8_t a = vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+ const uint8x8_t b = vreinterpret_u8_u32(
+ vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
+ return vcombine_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
+ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
+ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
+ return vld1q_dup_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
+ return vld1q_dup_u32(from);
+}
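
ploadquad is the factor-four analogue: size/4 scalars, each repeated four times, so for four-lane packets it degenerates to a single vld1q_dup broadcast. Scalar reference (sketch only):

// Reference: out[i] = from[i / 4], i.e. each source element repeated four times.
template <typename T, int N>
static inline void ploadquad_ref(const T* from, T (&out)[N]) {
  for (int i = 0; i < N; ++i) out[i] = from[i / 4];
}
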
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
+ memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
+ memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from);
+}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
+ memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
+ memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
+}
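
The stores mirror the loads: EIGEN_DEBUG_ALIGNED_STORE / EIGEN_DEBUG_UNALIGNED_STORE are debug-time checks over the same vst1 intrinsics, and memcpy again covers the sub-register byte packets. A hypothetical round-trip usage sketch:

// EIGEN_ALIGN16 float buf[4];
// pstore<float>(buf, pset1<Packet4f>(1.f));  // aligned store
// Packet4f r = pload<Packet4f>(buf);         // aligned reload
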
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
Packet2f res = vld1_dup_f32(from);
- res = vld1_lane_f32(from + 1*stride, res, 1);
+ res = vld1_lane_f32(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
Packet4f res = vld1q_dup_f32(from);
- res = vld1q_lane_f32(from + 1*stride, res, 1);
- res = vld1q_lane_f32(from + 2*stride, res, 2);
- res = vld1q_lane_f32(from + 3*stride, res, 3);
+ res = vld1q_lane_f32(from + 1 * stride, res, 1);
+ res = vld1q_lane_f32(from + 2 * stride, res, 2);
+ res = vld1q_lane_f32(from + 3 * stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
Packet4c res;
- for (int i = 0; i != 4; i++)
- reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+ for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
Packet8c res = vld1_dup_s8(from);
- res = vld1_lane_s8(from + 1*stride, res, 1);
- res = vld1_lane_s8(from + 2*stride, res, 2);
- res = vld1_lane_s8(from + 3*stride, res, 3);
- res = vld1_lane_s8(from + 4*stride, res, 4);
- res = vld1_lane_s8(from + 5*stride, res, 5);
- res = vld1_lane_s8(from + 6*stride, res, 6);
- res = vld1_lane_s8(from + 7*stride, res, 7);
+ res = vld1_lane_s8(from + 1 * stride, res, 1);
+ res = vld1_lane_s8(from + 2 * stride, res, 2);
+ res = vld1_lane_s8(from + 3 * stride, res, 3);
+ res = vld1_lane_s8(from + 4 * stride, res, 4);
+ res = vld1_lane_s8(from + 5 * stride, res, 5);
+ res = vld1_lane_s8(from + 6 * stride, res, 6);
+ res = vld1_lane_s8(from + 7 * stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
Packet16c res = vld1q_dup_s8(from);
- res = vld1q_lane_s8(from + 1*stride, res, 1);
- res = vld1q_lane_s8(from + 2*stride, res, 2);
- res = vld1q_lane_s8(from + 3*stride, res, 3);
- res = vld1q_lane_s8(from + 4*stride, res, 4);
- res = vld1q_lane_s8(from + 5*stride, res, 5);
- res = vld1q_lane_s8(from + 6*stride, res, 6);
- res = vld1q_lane_s8(from + 7*stride, res, 7);
- res = vld1q_lane_s8(from + 8*stride, res, 8);
- res = vld1q_lane_s8(from + 9*stride, res, 9);
- res = vld1q_lane_s8(from + 10*stride, res, 10);
- res = vld1q_lane_s8(from + 11*stride, res, 11);
- res = vld1q_lane_s8(from + 12*stride, res, 12);
- res = vld1q_lane_s8(from + 13*stride, res, 13);
- res = vld1q_lane_s8(from + 14*stride, res, 14);
- res = vld1q_lane_s8(from + 15*stride, res, 15);
+ res = vld1q_lane_s8(from + 1 * stride, res, 1);
+ res = vld1q_lane_s8(from + 2 * stride, res, 2);
+ res = vld1q_lane_s8(from + 3 * stride, res, 3);
+ res = vld1q_lane_s8(from + 4 * stride, res, 4);
+ res = vld1q_lane_s8(from + 5 * stride, res, 5);
+ res = vld1q_lane_s8(from + 6 * stride, res, 6);
+ res = vld1q_lane_s8(from + 7 * stride, res, 7);
+ res = vld1q_lane_s8(from + 8 * stride, res, 8);
+ res = vld1q_lane_s8(from + 9 * stride, res, 9);
+ res = vld1q_lane_s8(from + 10 * stride, res, 10);
+ res = vld1q_lane_s8(from + 11 * stride, res, 11);
+ res = vld1q_lane_s8(from + 12 * stride, res, 12);
+ res = vld1q_lane_s8(from + 13 * stride, res, 13);
+ res = vld1q_lane_s8(from + 14 * stride, res, 14);
+ res = vld1q_lane_s8(from + 15 * stride, res, 15);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
Packet4uc res;
- for (int i = 0; i != 4; i++)
- reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+ for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
Packet8uc res = vld1_dup_u8(from);
- res = vld1_lane_u8(from + 1*stride, res, 1);
- res = vld1_lane_u8(from + 2*stride, res, 2);
- res = vld1_lane_u8(from + 3*stride, res, 3);
- res = vld1_lane_u8(from + 4*stride, res, 4);
- res = vld1_lane_u8(from + 5*stride, res, 5);
- res = vld1_lane_u8(from + 6*stride, res, 6);
- res = vld1_lane_u8(from + 7*stride, res, 7);
+ res = vld1_lane_u8(from + 1 * stride, res, 1);
+ res = vld1_lane_u8(from + 2 * stride, res, 2);
+ res = vld1_lane_u8(from + 3 * stride, res, 3);
+ res = vld1_lane_u8(from + 4 * stride, res, 4);
+ res = vld1_lane_u8(from + 5 * stride, res, 5);
+ res = vld1_lane_u8(from + 6 * stride, res, 6);
+ res = vld1_lane_u8(from + 7 * stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
Packet16uc res = vld1q_dup_u8(from);
- res = vld1q_lane_u8(from + 1*stride, res, 1);
- res = vld1q_lane_u8(from + 2*stride, res, 2);
- res = vld1q_lane_u8(from + 3*stride, res, 3);
- res = vld1q_lane_u8(from + 4*stride, res, 4);
- res = vld1q_lane_u8(from + 5*stride, res, 5);
- res = vld1q_lane_u8(from + 6*stride, res, 6);
- res = vld1q_lane_u8(from + 7*stride, res, 7);
- res = vld1q_lane_u8(from + 8*stride, res, 8);
- res = vld1q_lane_u8(from + 9*stride, res, 9);
- res = vld1q_lane_u8(from + 10*stride, res, 10);
- res = vld1q_lane_u8(from + 11*stride, res, 11);
- res = vld1q_lane_u8(from + 12*stride, res, 12);
- res = vld1q_lane_u8(from + 13*stride, res, 13);
- res = vld1q_lane_u8(from + 14*stride, res, 14);
- res = vld1q_lane_u8(from + 15*stride, res, 15);
+ res = vld1q_lane_u8(from + 1 * stride, res, 1);
+ res = vld1q_lane_u8(from + 2 * stride, res, 2);
+ res = vld1q_lane_u8(from + 3 * stride, res, 3);
+ res = vld1q_lane_u8(from + 4 * stride, res, 4);
+ res = vld1q_lane_u8(from + 5 * stride, res, 5);
+ res = vld1q_lane_u8(from + 6 * stride, res, 6);
+ res = vld1q_lane_u8(from + 7 * stride, res, 7);
+ res = vld1q_lane_u8(from + 8 * stride, res, 8);
+ res = vld1q_lane_u8(from + 9 * stride, res, 9);
+ res = vld1q_lane_u8(from + 10 * stride, res, 10);
+ res = vld1q_lane_u8(from + 11 * stride, res, 11);
+ res = vld1q_lane_u8(from + 12 * stride, res, 12);
+ res = vld1q_lane_u8(from + 13 * stride, res, 13);
+ res = vld1q_lane_u8(from + 14 * stride, res, 14);
+ res = vld1q_lane_u8(from + 15 * stride, res, 15);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
Packet4s res = vld1_dup_s16(from);
- res = vld1_lane_s16(from + 1*stride, res, 1);
- res = vld1_lane_s16(from + 2*stride, res, 2);
- res = vld1_lane_s16(from + 3*stride, res, 3);
+ res = vld1_lane_s16(from + 1 * stride, res, 1);
+ res = vld1_lane_s16(from + 2 * stride, res, 2);
+ res = vld1_lane_s16(from + 3 * stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
Packet8s res = vld1q_dup_s16(from);
- res = vld1q_lane_s16(from + 1*stride, res, 1);
- res = vld1q_lane_s16(from + 2*stride, res, 2);
- res = vld1q_lane_s16(from + 3*stride, res, 3);
- res = vld1q_lane_s16(from + 4*stride, res, 4);
- res = vld1q_lane_s16(from + 5*stride, res, 5);
- res = vld1q_lane_s16(from + 6*stride, res, 6);
- res = vld1q_lane_s16(from + 7*stride, res, 7);
+ res = vld1q_lane_s16(from + 1 * stride, res, 1);
+ res = vld1q_lane_s16(from + 2 * stride, res, 2);
+ res = vld1q_lane_s16(from + 3 * stride, res, 3);
+ res = vld1q_lane_s16(from + 4 * stride, res, 4);
+ res = vld1q_lane_s16(from + 5 * stride, res, 5);
+ res = vld1q_lane_s16(from + 6 * stride, res, 6);
+ res = vld1q_lane_s16(from + 7 * stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
Packet4us res = vld1_dup_u16(from);
- res = vld1_lane_u16(from + 1*stride, res, 1);
- res = vld1_lane_u16(from + 2*stride, res, 2);
- res = vld1_lane_u16(from + 3*stride, res, 3);
+ res = vld1_lane_u16(from + 1 * stride, res, 1);
+ res = vld1_lane_u16(from + 2 * stride, res, 2);
+ res = vld1_lane_u16(from + 3 * stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
Packet8us res = vld1q_dup_u16(from);
- res = vld1q_lane_u16(from + 1*stride, res, 1);
- res = vld1q_lane_u16(from + 2*stride, res, 2);
- res = vld1q_lane_u16(from + 3*stride, res, 3);
- res = vld1q_lane_u16(from + 4*stride, res, 4);
- res = vld1q_lane_u16(from + 5*stride, res, 5);
- res = vld1q_lane_u16(from + 6*stride, res, 6);
- res = vld1q_lane_u16(from + 7*stride, res, 7);
+ res = vld1q_lane_u16(from + 1 * stride, res, 1);
+ res = vld1q_lane_u16(from + 2 * stride, res, 2);
+ res = vld1q_lane_u16(from + 3 * stride, res, 3);
+ res = vld1q_lane_u16(from + 4 * stride, res, 4);
+ res = vld1q_lane_u16(from + 5 * stride, res, 5);
+ res = vld1q_lane_u16(from + 6 * stride, res, 6);
+ res = vld1q_lane_u16(from + 7 * stride, res, 7);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
Packet2i res = vld1_dup_s32(from);
- res = vld1_lane_s32(from + 1*stride, res, 1);
+ res = vld1_lane_s32(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
Packet4i res = vld1q_dup_s32(from);
- res = vld1q_lane_s32(from + 1*stride, res, 1);
- res = vld1q_lane_s32(from + 2*stride, res, 2);
- res = vld1q_lane_s32(from + 3*stride, res, 3);
+ res = vld1q_lane_s32(from + 1 * stride, res, 1);
+ res = vld1q_lane_s32(from + 2 * stride, res, 2);
+ res = vld1q_lane_s32(from + 3 * stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
Packet2ui res = vld1_dup_u32(from);
- res = vld1_lane_u32(from + 1*stride, res, 1);
+ res = vld1_lane_u32(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
Packet4ui res = vld1q_dup_u32(from);
- res = vld1q_lane_u32(from + 1*stride, res, 1);
- res = vld1q_lane_u32(from + 2*stride, res, 2);
- res = vld1q_lane_u32(from + 3*stride, res, 3);
+ res = vld1q_lane_u32(from + 1 * stride, res, 1);
+ res = vld1q_lane_u32(from + 2 * stride, res, 2);
+ res = vld1q_lane_u32(from + 3 * stride, res, 3);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
Packet2l res = vld1q_dup_s64(from);
- res = vld1q_lane_s64(from + 1*stride, res, 1);
+ res = vld1q_lane_s64(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
Packet2ul res = vld1q_dup_u64(from);
- res = vld1q_lane_u64(from + 1*stride, res, 1);
+ res = vld1q_lane_u64(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
-{
- vst1_lane_f32(to + stride*0, from, 0);
- vst1_lane_f32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
+ vst1_lane_f32(to + stride * 0, from, 0);
+ vst1_lane_f32(to + stride * 1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
- vst1q_lane_f32(to + stride*0, from, 0);
- vst1q_lane_f32(to + stride*1, from, 1);
- vst1q_lane_f32(to + stride*2, from, 2);
- vst1q_lane_f32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+ vst1q_lane_f32(to + stride * 0, from, 0);
+ vst1q_lane_f32(to + stride * 1, from, 1);
+ vst1q_lane_f32(to + stride * 2, from, 2);
+ vst1q_lane_f32(to + stride * 3, from, 3);
}
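 // Illustrative note, not part of the upstream patch: pscatter is the
 // inverse of pgather; lane i is stored to `to + i * stride` via
 // vst1[q]_lane, so for stride == 1 it degenerates to a contiguous store:
 //
 //   for (int i = 0; i < N; ++i)
 //     to[i * stride] = reinterpret_cast<const T*>(&from)[i];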
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
-{
- for (int i = 0; i != 4; i++)
- *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
+ for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
-{
- vst1_lane_s8(to + stride*0, from, 0);
- vst1_lane_s8(to + stride*1, from, 1);
- vst1_lane_s8(to + stride*2, from, 2);
- vst1_lane_s8(to + stride*3, from, 3);
- vst1_lane_s8(to + stride*4, from, 4);
- vst1_lane_s8(to + stride*5, from, 5);
- vst1_lane_s8(to + stride*6, from, 6);
- vst1_lane_s8(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
+ vst1_lane_s8(to + stride * 0, from, 0);
+ vst1_lane_s8(to + stride * 1, from, 1);
+ vst1_lane_s8(to + stride * 2, from, 2);
+ vst1_lane_s8(to + stride * 3, from, 3);
+ vst1_lane_s8(to + stride * 4, from, 4);
+ vst1_lane_s8(to + stride * 5, from, 5);
+ vst1_lane_s8(to + stride * 6, from, 6);
+ vst1_lane_s8(to + stride * 7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
-{
- vst1q_lane_s8(to + stride*0, from, 0);
- vst1q_lane_s8(to + stride*1, from, 1);
- vst1q_lane_s8(to + stride*2, from, 2);
- vst1q_lane_s8(to + stride*3, from, 3);
- vst1q_lane_s8(to + stride*4, from, 4);
- vst1q_lane_s8(to + stride*5, from, 5);
- vst1q_lane_s8(to + stride*6, from, 6);
- vst1q_lane_s8(to + stride*7, from, 7);
- vst1q_lane_s8(to + stride*8, from, 8);
- vst1q_lane_s8(to + stride*9, from, 9);
- vst1q_lane_s8(to + stride*10, from, 10);
- vst1q_lane_s8(to + stride*11, from, 11);
- vst1q_lane_s8(to + stride*12, from, 12);
- vst1q_lane_s8(to + stride*13, from, 13);
- vst1q_lane_s8(to + stride*14, from, 14);
- vst1q_lane_s8(to + stride*15, from, 15);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
+ Index stride) {
+ vst1q_lane_s8(to + stride * 0, from, 0);
+ vst1q_lane_s8(to + stride * 1, from, 1);
+ vst1q_lane_s8(to + stride * 2, from, 2);
+ vst1q_lane_s8(to + stride * 3, from, 3);
+ vst1q_lane_s8(to + stride * 4, from, 4);
+ vst1q_lane_s8(to + stride * 5, from, 5);
+ vst1q_lane_s8(to + stride * 6, from, 6);
+ vst1q_lane_s8(to + stride * 7, from, 7);
+ vst1q_lane_s8(to + stride * 8, from, 8);
+ vst1q_lane_s8(to + stride * 9, from, 9);
+ vst1q_lane_s8(to + stride * 10, from, 10);
+ vst1q_lane_s8(to + stride * 11, from, 11);
+ vst1q_lane_s8(to + stride * 12, from, 12);
+ vst1q_lane_s8(to + stride * 13, from, 13);
+ vst1q_lane_s8(to + stride * 14, from, 14);
+ vst1q_lane_s8(to + stride * 15, from, 15);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
-{
- for (int i = 0; i != 4; i++)
- *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
+ Index stride) {
+ for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
-{
- vst1_lane_u8(to + stride*0, from, 0);
- vst1_lane_u8(to + stride*1, from, 1);
- vst1_lane_u8(to + stride*2, from, 2);
- vst1_lane_u8(to + stride*3, from, 3);
- vst1_lane_u8(to + stride*4, from, 4);
- vst1_lane_u8(to + stride*5, from, 5);
- vst1_lane_u8(to + stride*6, from, 6);
- vst1_lane_u8(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
+ Index stride) {
+ vst1_lane_u8(to + stride * 0, from, 0);
+ vst1_lane_u8(to + stride * 1, from, 1);
+ vst1_lane_u8(to + stride * 2, from, 2);
+ vst1_lane_u8(to + stride * 3, from, 3);
+ vst1_lane_u8(to + stride * 4, from, 4);
+ vst1_lane_u8(to + stride * 5, from, 5);
+ vst1_lane_u8(to + stride * 6, from, 6);
+ vst1_lane_u8(to + stride * 7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
-{
- vst1q_lane_u8(to + stride*0, from, 0);
- vst1q_lane_u8(to + stride*1, from, 1);
- vst1q_lane_u8(to + stride*2, from, 2);
- vst1q_lane_u8(to + stride*3, from, 3);
- vst1q_lane_u8(to + stride*4, from, 4);
- vst1q_lane_u8(to + stride*5, from, 5);
- vst1q_lane_u8(to + stride*6, from, 6);
- vst1q_lane_u8(to + stride*7, from, 7);
- vst1q_lane_u8(to + stride*8, from, 8);
- vst1q_lane_u8(to + stride*9, from, 9);
- vst1q_lane_u8(to + stride*10, from, 10);
- vst1q_lane_u8(to + stride*11, from, 11);
- vst1q_lane_u8(to + stride*12, from, 12);
- vst1q_lane_u8(to + stride*13, from, 13);
- vst1q_lane_u8(to + stride*14, from, 14);
- vst1q_lane_u8(to + stride*15, from, 15);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
+ Index stride) {
+ vst1q_lane_u8(to + stride * 0, from, 0);
+ vst1q_lane_u8(to + stride * 1, from, 1);
+ vst1q_lane_u8(to + stride * 2, from, 2);
+ vst1q_lane_u8(to + stride * 3, from, 3);
+ vst1q_lane_u8(to + stride * 4, from, 4);
+ vst1q_lane_u8(to + stride * 5, from, 5);
+ vst1q_lane_u8(to + stride * 6, from, 6);
+ vst1q_lane_u8(to + stride * 7, from, 7);
+ vst1q_lane_u8(to + stride * 8, from, 8);
+ vst1q_lane_u8(to + stride * 9, from, 9);
+ vst1q_lane_u8(to + stride * 10, from, 10);
+ vst1q_lane_u8(to + stride * 11, from, 11);
+ vst1q_lane_u8(to + stride * 12, from, 12);
+ vst1q_lane_u8(to + stride * 13, from, 13);
+ vst1q_lane_u8(to + stride * 14, from, 14);
+ vst1q_lane_u8(to + stride * 15, from, 15);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
-{
- vst1_lane_s16(to + stride*0, from, 0);
- vst1_lane_s16(to + stride*1, from, 1);
- vst1_lane_s16(to + stride*2, from, 2);
- vst1_lane_s16(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
+ Index stride) {
+ vst1_lane_s16(to + stride * 0, from, 0);
+ vst1_lane_s16(to + stride * 1, from, 1);
+ vst1_lane_s16(to + stride * 2, from, 2);
+ vst1_lane_s16(to + stride * 3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
-{
- vst1q_lane_s16(to + stride*0, from, 0);
- vst1q_lane_s16(to + stride*1, from, 1);
- vst1q_lane_s16(to + stride*2, from, 2);
- vst1q_lane_s16(to + stride*3, from, 3);
- vst1q_lane_s16(to + stride*4, from, 4);
- vst1q_lane_s16(to + stride*5, from, 5);
- vst1q_lane_s16(to + stride*6, from, 6);
- vst1q_lane_s16(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
+ Index stride) {
+ vst1q_lane_s16(to + stride * 0, from, 0);
+ vst1q_lane_s16(to + stride * 1, from, 1);
+ vst1q_lane_s16(to + stride * 2, from, 2);
+ vst1q_lane_s16(to + stride * 3, from, 3);
+ vst1q_lane_s16(to + stride * 4, from, 4);
+ vst1q_lane_s16(to + stride * 5, from, 5);
+ vst1q_lane_s16(to + stride * 6, from, 6);
+ vst1q_lane_s16(to + stride * 7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
-{
- vst1_lane_u16(to + stride*0, from, 0);
- vst1_lane_u16(to + stride*1, from, 1);
- vst1_lane_u16(to + stride*2, from, 2);
- vst1_lane_u16(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
+ Index stride) {
+ vst1_lane_u16(to + stride * 0, from, 0);
+ vst1_lane_u16(to + stride * 1, from, 1);
+ vst1_lane_u16(to + stride * 2, from, 2);
+ vst1_lane_u16(to + stride * 3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
-{
- vst1q_lane_u16(to + stride*0, from, 0);
- vst1q_lane_u16(to + stride*1, from, 1);
- vst1q_lane_u16(to + stride*2, from, 2);
- vst1q_lane_u16(to + stride*3, from, 3);
- vst1q_lane_u16(to + stride*4, from, 4);
- vst1q_lane_u16(to + stride*5, from, 5);
- vst1q_lane_u16(to + stride*6, from, 6);
- vst1q_lane_u16(to + stride*7, from, 7);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
+ Index stride) {
+ vst1q_lane_u16(to + stride * 0, from, 0);
+ vst1q_lane_u16(to + stride * 1, from, 1);
+ vst1q_lane_u16(to + stride * 2, from, 2);
+ vst1q_lane_u16(to + stride * 3, from, 3);
+ vst1q_lane_u16(to + stride * 4, from, 4);
+ vst1q_lane_u16(to + stride * 5, from, 5);
+ vst1q_lane_u16(to + stride * 6, from, 6);
+ vst1q_lane_u16(to + stride * 7, from, 7);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
-{
- vst1_lane_s32(to + stride*0, from, 0);
- vst1_lane_s32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
+ Index stride) {
+ vst1_lane_s32(to + stride * 0, from, 0);
+ vst1_lane_s32(to + stride * 1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
-{
- vst1q_lane_s32(to + stride*0, from, 0);
- vst1q_lane_s32(to + stride*1, from, 1);
- vst1q_lane_s32(to + stride*2, from, 2);
- vst1q_lane_s32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+ Index stride) {
+ vst1q_lane_s32(to + stride * 0, from, 0);
+ vst1q_lane_s32(to + stride * 1, from, 1);
+ vst1q_lane_s32(to + stride * 2, from, 2);
+ vst1q_lane_s32(to + stride * 3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
-{
- vst1_lane_u32(to + stride*0, from, 0);
- vst1_lane_u32(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
+ Index stride) {
+ vst1_lane_u32(to + stride * 0, from, 0);
+ vst1_lane_u32(to + stride * 1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
-{
- vst1q_lane_u32(to + stride*0, from, 0);
- vst1q_lane_u32(to + stride*1, from, 1);
- vst1q_lane_u32(to + stride*2, from, 2);
- vst1q_lane_u32(to + stride*3, from, 3);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
+ Index stride) {
+ vst1q_lane_u32(to + stride * 0, from, 0);
+ vst1q_lane_u32(to + stride * 1, from, 1);
+ vst1q_lane_u32(to + stride * 2, from, 2);
+ vst1q_lane_u32(to + stride * 3, from, 3);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
-{
- vst1q_lane_s64(to + stride*0, from, 0);
- vst1q_lane_s64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
+ Index stride) {
+ vst1q_lane_s64(to + stride * 0, from, 0);
+ vst1q_lane_s64(to + stride * 1, from, 1);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
-{
- vst1q_lane_u64(to + stride*0, from, 0);
- vst1q_lane_u64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
+ Index stride) {
+ vst1q_lane_u64(to + stride * 0, from, 0);
+ vst1q_lane_u64(to + stride * 1, from, 1);
}
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
-template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
+ return vget_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ return vgetq_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
+ return static_cast<int8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
+ return vget_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
+ return vgetq_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
+ return static_cast<uint8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
+ return vget_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
+ return vgetq_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
+ return vget_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
+ return vgetq_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
+ return vget_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
+ return vgetq_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
+ return vget_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+ return vgetq_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
+ return vget_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+ return vgetq_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+ return vgetq_lane_s64(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
+ return vgetq_lane_u64(a, 0);
+}
-template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
+ return vrev64_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
const float32x4_t a_r64 = vrev64q_f32(a);
return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
}
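 // Illustrative note, not part of the upstream patch: a full 128-bit
 // reverse takes two steps because vrev64q_f32 only reverses lanes within
 // each 64-bit half; vcombine_f32(vget_high_..., vget_low_...) then swaps
 // the halves, e.g. |a b c d| -> |b a d c| -> |d c b a|.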
-template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
+ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
+ return vrev64_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
const int8x16_t a_r64 = vrev64q_s8(a);
return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
+ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
+ return vrev64_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
const uint8x16_t a_r64 = vrev64q_u8(a);
return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
+ return vrev64_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
const int16x8_t a_r64 = vrev64q_s16(a);
return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
+ return vrev64_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
const uint16x8_t a_r64 = vrev64q_u16(a);
return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
+ return vrev64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
const int32x4_t a_r64 = vrev64q_s32(a);
return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
+ return vrev64_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
const uint32x4_t a_r64 = vrev64q_u32(a);
return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
}
-template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
-{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
-{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+ return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
+ return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
+}
-template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
+ return vabs_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+ return vabsq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
+ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
+ return vabs_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+ return vabsq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
+ return vabs_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+ return vabsq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
+ return vabs_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+ return vabsq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
#if EIGEN_ARCH_ARM64
return vabsq_s64(a);
#else
- return vcombine_s64(
- vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
- vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+ return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
#endif
}
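 // Illustrative note, not part of the upstream patch: vabsq_s64 is an
 // AArch64-only intrinsic, so the 32-bit fallback above extracts each
 // 64-bit lane, takes (std::abs) scalarly, and rebuilds the vector with
 // vcombine_s64 of two vdup_n_s64 halves.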
-template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
+ return a;
+}
template <>
EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
@@ -2341,47 +3430,70 @@
return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
}
-template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
-{ return pfrexp_generic(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
-{ return pfrexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
+ return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_generic(a, exponent);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
-{ return pldexp_generic(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
-{ return pldexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
+ return pldexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_generic(a, exponent);
+}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vaddv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { return vaddvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+ return vaddv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+ return vaddvq_f32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+ return vget_lane_f32(vpadd_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
return vget_lane_f32(vpadd_f32(sum, sum), 0);
}
#endif
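 // Illustrative note, not part of the upstream patch: without vaddv[q]
 // (AArch64-only), the horizontal sum is formed by log2(N) pairwise
 // steps, e.g. for four floats:
 //
 //   |a b c d| --vadd(lo,hi)--> |a+c b+d| --vpadd--> |a+b+c+d ...|
 //
 // and the result is read from lane 0; the integer predux below follow
 // the same pattern with vpadd of their element type.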
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
int8x8_t sum = vpadd_s8(a_dup, a_dup);
sum = vpadd_s8(sum, sum);
return vget_lane_s8(sum, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) { return vaddv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) { return vaddvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+ return vaddv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+ return vaddvq_s8(a);
+}
#else
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
-{
- int8x8_t sum = vpadd_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+ int8x8_t sum = vpadd_s8(a, a);
sum = vpadd_s8(sum, sum);
sum = vpadd_s8(sum, sum);
return vget_lane_s8(sum, 0);
}
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
sum = vpadd_s8(sum, sum);
sum = vpadd_s8(sum, sum);
@@ -2389,144 +3501,204 @@
return vget_lane_s8(sum, 0);
}
#endif
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
uint8x8_t sum = vpadd_u8(a_dup, a_dup);
sum = vpadd_u8(sum, sum);
return vget_lane_u8(sum, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) { return vaddv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) { return vaddvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) { return vaddv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) { return vaddvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) { return vaddv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) { return vaddvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vaddv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { return vaddvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vaddv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) { return vaddvq_u32(a); }
-template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) { return vaddvq_s64(a); }
-template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) { return vaddvq_u64(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+ return vaddv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+ return vaddvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+ return vaddv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+ return vaddvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+ return vaddv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+ return vaddvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+ return vaddv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+ return vaddvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+ return vaddv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+ return vaddvq_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+ return vaddvq_s64(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+ return vaddvq_u64(a);
+}
#else
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
-{
- uint8x8_t sum = vpadd_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+ uint8x8_t sum = vpadd_u8(a, a);
sum = vpadd_u8(sum, sum);
sum = vpadd_u8(sum, sum);
return vget_lane_u8(sum, 0);
}
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
sum = vpadd_u8(sum, sum);
sum = vpadd_u8(sum, sum);
sum = vpadd_u8(sum, sum);
return vget_lane_u8(sum, 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
-{
- const int16x4_t sum = vpadd_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+ const int16x4_t sum = vpadd_s16(a, a);
return vget_lane_s16(vpadd_s16(sum, sum), 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
sum = vpadd_s16(sum, sum);
sum = vpadd_s16(sum, sum);
return vget_lane_s16(sum, 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
-{
- const uint16x4_t sum = vpadd_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+ const uint16x4_t sum = vpadd_u16(a, a);
return vget_lane_u16(vpadd_u16(sum, sum), 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
sum = vpadd_u16(sum, sum);
sum = vpadd_u16(sum, sum);
return vget_lane_u16(sum, 0);
}
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+ return vget_lane_s32(vpadd_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
return vget_lane_s32(vpadd_s32(sum, sum), 0);
}
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+ return vget_lane_u32(vpadd_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
return vget_lane_u32(vpadd_u32(sum, sum), 0);
}
-template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+}
#endif
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a)
-{
- return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
- vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
+ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a)
-{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a)
-{
- return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
- vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
+ return vadd_s8(vget_high_s8(a), vget_low_s8(a));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a)
-{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a)
-{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a)
-{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
+ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
+ return vadd_u8(vget_high_u8(a), vget_low_u8(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
+ return vadd_s16(vget_high_s16(a), vget_low_s16(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
+ return vadd_u16(vget_high_u16(a), vget_low_u16(a));
+}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
+ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
prod = vmul_s8(prod, vrev16_s8(prod));
return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
}
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
int8x8_t prod = vmul_s8(a, vrev16_s8(a));
prod = vmul_s8(prod, vrev32_s8(prod));
return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
}
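 // Illustrative note, not part of the upstream patch: NEON has no
 // horizontal-multiply intrinsic, so predux_mul multiplies the vector by
 // lane-reversed copies of itself (vrev16/vrev32 swap 8-bit lanes within
 // 16- and 32-bit groups), halving the number of distinct partial
 // products each step, then multiplies the two surviving lanes scalarly.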
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
prod = vmul_u8(prod, vrev16_u8(prod));
return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
}
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
prod = vmul_u8(prod, vrev32_u8(prod));
return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
}
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
}
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
int16x4_t prod;
// Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
@@ -2536,13 +3708,13 @@
// Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
uint16x4_t prod;
// Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
@@ -2552,52 +3724,78 @@
// Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
}
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
-template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
+ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
+ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
+ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
+ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
+}
// min
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) { return vminv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { return vminvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+ return vminv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+ return vminvq_f32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmin_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+ return vget_lane_f32(vpmin_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
return vget_lane_f32(vpmin_f32(min, min), 0);
}
#endif
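 // Illustrative note, not part of the upstream patch: the min reduction
 // mirrors the sum reduction, replacing vadd/vpadd with vmin/vpmin (or a
 // single vminv[q] on AArch64); vpmin folds adjacent lane pairs, so
 // log2(N) applications leave the overall minimum in lane 0.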
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
int8x8_t min = vpmin_s8(a_dup, a_dup);
min = vpmin_s8(min, min);
return vget_lane_s8(min, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) { return vminv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) { return vminvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+ return vminv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+ return vminvq_s8(a);
+}
#else
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
-{
- int8x8_t min = vpmin_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+ int8x8_t min = vpmin_s8(a, a);
min = vpmin_s8(min, min);
min = vpmin_s8(min, min);
return vget_lane_s8(min, 0);
}
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
min = vpmin_s8(min, min);
min = vpmin_s8(min, min);
@@ -2605,117 +3803,169 @@
return vget_lane_s8(min, 0);
}
#endif
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
uint8x8_t min = vpmin_u8(a_dup, a_dup);
min = vpmin_u8(min, min);
return vget_lane_u8(min, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) { return vminv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) { return vminvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) { return vminv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) { return vminvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) { return vminv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) { return vminvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) { return vminv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { return vminvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) { return vminv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) { return vminvq_u32(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+ return vminv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+ return vminvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+ return vminv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+ return vminvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+ return vminv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+ return vminvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+ return vminv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+ return vminvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+ return vminv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+ return vminvq_u32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
-{
- uint8x8_t min = vpmin_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+ uint8x8_t min = vpmin_u8(a, a);
min = vpmin_u8(min, min);
min = vpmin_u8(min, min);
return vget_lane_u8(min, 0);
}
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
min = vpmin_u8(min, min);
min = vpmin_u8(min, min);
min = vpmin_u8(min, min);
return vget_lane_u8(min, 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
-{
- const int16x4_t min = vpmin_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+ const int16x4_t min = vpmin_s16(a, a);
return vget_lane_s16(vpmin_s16(min, min), 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
min = vpmin_s16(min, min);
min = vpmin_s16(min, min);
return vget_lane_s16(min, 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
-{
- const uint16x4_t min = vpmin_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+ const uint16x4_t min = vpmin_u16(a, a);
return vget_lane_u16(vpmin_u16(min, min), 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
min = vpmin_u16(min, min);
min = vpmin_u16(min, min);
return vget_lane_u16(min, 0);
}
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmin_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+ return vget_lane_s32(vpmin_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
return vget_lane_s32(vpmin_s32(min, min), 0);
}
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmin_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+ return vget_lane_u32(vpmin_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
return vget_lane_u32(vpmin_u32(min, min), 0);
}
#endif
-template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
-{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
-{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
+ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
+ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
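
// A minimal scalar sketch of the pairwise-reduction pattern used above
// (assumed names, not Eigen API): each vpmin/vmin round halves the number of
// live candidates, so an N-lane packet needs log2(N) rounds; N is assumed to
// be a power of two.
template <typename T, int N>
T reduce_min_pairwise(const T (&v)[N]) {
  T buf[N];
  for (int i = 0; i < N; ++i) buf[i] = v[i];
  for (int n = N; n > 1; n /= 2)  // one round per vpmin/vmin call above
    for (int i = 0; i < n / 2; ++i)
      buf[i] = buf[2 * i] < buf[2 * i + 1] ? buf[2 * i] : buf[2 * i + 1];
  return buf[0];
}
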
// max
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) { return vmaxv_f32(a); }
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { return vmaxvq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+ return vmaxv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+ return vmaxvq_f32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmax_f32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+ return vget_lane_f32(vpmax_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
return vget_lane_f32(vpmax_f32(max, max), 0);
}
#endif
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
int8x8_t max = vpmax_s8(a_dup, a_dup);
max = vpmax_s8(max, max);
return vget_lane_s8(max, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) { return vmaxv_s8(a); }
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) { return vmaxvq_s8(a); }
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+ return vmaxv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+ return vmaxvq_s8(a);
+}
#else
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
-{
- int8x8_t max = vpmax_s8(a,a);
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+ int8x8_t max = vpmax_s8(a, a);
max = vpmax_s8(max, max);
max = vpmax_s8(max, max);
return vget_lane_s8(max, 0);
}
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
-{
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
max = vpmax_s8(max, max);
max = vpmax_s8(max, max);
@@ -2723,201 +3973,238 @@
return vget_lane_s8(max, 0);
}
#endif
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
uint8x8_t max = vpmax_u8(a_dup, a_dup);
max = vpmax_u8(max, max);
return vget_lane_u8(max, 0);
}
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) { return vmaxv_u8(a); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) { return vmaxvq_u8(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) { return vmaxv_s16(a); }
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) { return vmaxvq_s16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) { return vmaxv_u16(a); }
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) { return vmaxvq_u16(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) { return vmaxv_s32(a); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) { return vmaxvq_s32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) { return vmaxv_u32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) { return vmaxvq_u32(a); }
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+ return vmaxv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+ return vmaxvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+ return vmaxv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+ return vmaxvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+ return vmaxv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+ return vmaxvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+ return vmaxv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+ return vmaxvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+ return vmaxv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+ return vmaxvq_u32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
-{
- uint8x8_t max = vpmax_u8(a,a);
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+ uint8x8_t max = vpmax_u8(a, a);
max = vpmax_u8(max, max);
max = vpmax_u8(max, max);
return vget_lane_u8(max, 0);
}
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
max = vpmax_u8(max, max);
max = vpmax_u8(max, max);
max = vpmax_u8(max, max);
return vget_lane_u8(max, 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
-{
- const int16x4_t max = vpmax_s16(a,a);
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+ const int16x4_t max = vpmax_s16(a, a);
return vget_lane_s16(vpmax_s16(max, max), 0);
}
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
-{
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
max = vpmax_s16(max, max);
max = vpmax_s16(max, max);
return vget_lane_s16(max, 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
-{
- const uint16x4_t max = vpmax_u16(a,a);
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+ const uint16x4_t max = vpmax_u16(a, a);
return vget_lane_u16(vpmax_u16(max, max), 0);
}
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
max = vpmax_u16(max, max);
max = vpmax_u16(max, max);
return vget_lane_u16(max, 0);
}
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmax_s32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+ return vget_lane_s32(vpmax_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
return vget_lane_s32(vpmax_s32(max, max), 0);
}
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmax_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+ return vget_lane_u32(vpmax_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
return vget_lane_u32(vpmax_u32(max, max), 0);
}
#endif
-template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
-{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
-{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
+ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
+ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
- uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
- vget_high_u32(vreinterpretq_u32_f32(x)));
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
+ uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}
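
// A scalar model of predux_any above (hypothetical helper, for illustration):
// OR the lanes' bit patterns together and report whether any bit survives,
// which is what the vorr/vpmax funnel computes.
inline bool predux_any_model(const uint32_t (&lanes)[4]) {  // uint32_t from <cstdint>
  uint32_t acc = 0;
  for (int i = 0; i < 4; ++i) acc |= lanes[i];
  return acc != 0;
}
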
// Helpers for ptranspose.
namespace detail {
-
-template<typename Packet>
+
+template <typename Packet>
void zip_in_place(Packet& p1, Packet& p2);
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
const float32x2x2_t tmp = vzip_f32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
const float32x4x2_t tmp = vzipq_f32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
const int8x8x2_t tmp = vzip_s8(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
const int8x16x2_t tmp = vzipq_s8(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
const uint8x8x2_t tmp = vzip_u8(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
const uint8x16x2_t tmp = vzipq_u8(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
const int32x2x2_t tmp = vzip_s32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
const int32x4x2_t tmp = vzipq_s32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
const uint32x2x2_t tmp = vzip_u32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
const uint32x4x2_t tmp = vzipq_u32(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
const int16x4x2_t tmp = vzip_s16(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
const int16x8x2_t tmp = vzipq_s16(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
const uint16x4x2_t tmp = vzip_u16(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<>
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
const uint16x8x2_t tmp = vzipq_u16(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-template<typename Packet>
+template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
zip_in_place(kernel.packet[0], kernel.packet[1]);
}
-template<typename Packet>
+template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
zip_in_place(kernel.packet[0], kernel.packet[2]);
zip_in_place(kernel.packet[1], kernel.packet[3]);
@@ -2925,7 +4212,7 @@
zip_in_place(kernel.packet[2], kernel.packet[3]);
}
-template<typename Packet>
+template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
zip_in_place(kernel.packet[0], kernel.packet[4]);
zip_in_place(kernel.packet[1], kernel.packet[5]);
@@ -2936,31 +4223,31 @@
zip_in_place(kernel.packet[1], kernel.packet[3]);
zip_in_place(kernel.packet[4], kernel.packet[6]);
zip_in_place(kernel.packet[5], kernel.packet[7]);
-
+
zip_in_place(kernel.packet[0], kernel.packet[1]);
zip_in_place(kernel.packet[2], kernel.packet[3]);
zip_in_place(kernel.packet[4], kernel.packet[5]);
zip_in_place(kernel.packet[6], kernel.packet[7]);
}
-template<typename Packet>
+template <typename Packet>
EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
EIGEN_UNROLL_LOOP
- for (int i=0; i<4; ++i) {
+ for (int i = 0; i < 4; ++i) {
const int m = (1 << i);
EIGEN_UNROLL_LOOP
- for (int j=0; j<m; ++j) {
- const int n = (1 << (3-i));
+ for (int j = 0; j < m; ++j) {
+ const int n = (1 << (3 - i));
EIGEN_UNROLL_LOOP
- for (int k=0; k<n; ++k) {
- const int idx = 2*j*n+k;
+ for (int k = 0; k < n; ++k) {
+ const int idx = 2 * j * n + k;
zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
}
}
}
}
-} // namespace detail
+} // namespace detail
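
// A worked 4x4 trace of the zip-based transpose above (an illustration; take
// r0 = {a0,a1,a2,a3}, r1 = {b0,b1,b2,b3}, r2 = {c0,c1,c2,c3}, r3 = {d0,d1,d2,d3}):
//   round 1: zip(r0,r2), zip(r1,r3) ->
//     r0 = {a0,c0,a1,c1}, r2 = {a2,c2,a3,c3}, r1 = {b0,d0,b1,d1}, r3 = {b2,d2,b3,d3}
//   round 2: zip(r0,r1), zip(r2,r3) ->
//     r0 = {a0,b0,c0,d0}, r1 = {a1,b1,c1,d1}, r2 = {a2,b2,c2,d2}, r3 = {a3,b3,c3,d3}
// After log2(4) = 2 rounds, row k holds the original column k. The 8- and
// 16-packet overloads run the same butterfly, with the pair distance halving
// on each round.
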
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
detail::ptranspose_impl(kernel);
@@ -2969,12 +4256,11 @@
detail::ptranspose_impl(kernel);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
- const int8x8x2_t zip8 = vzip_s8(a,b);
+ const int8x8x2_t zip8 = vzip_s8(a, b);
const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
@@ -2998,12 +4284,11 @@
detail::ptranspose_impl(kernel);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
- const uint8x8x2_t zip8 = vzip_u8(a,b);
+ const uint8x8x2_t zip8 = vzip_u8(a, b);
const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
@@ -3051,7 +4336,7 @@
detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
- detail::ptranspose_impl(kernel);
+ detail::ptranspose_impl(kernel);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
@@ -3060,158 +4345,195 @@
detail::ptranspose_impl(kernel);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2l, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
#if EIGEN_ARCH_ARM64
const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = tmp1;
#else
- const int64x1_t tmp[2][2] = {
- { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
- { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
- };
+ const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
+ {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
#endif
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2ul, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
#if EIGEN_ARCH_ARM64
const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = tmp1;
#else
- const uint64x1_t tmp[2][2] = {
- { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
- { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
- };
+ const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
+ {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
#endif
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
-{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
-{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
-{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
-{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
-{ return vbsl_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
-{ return vbslq_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
-{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
-{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
-{ return vbsl_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
-{ return vbslq_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
-{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
-{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
-{ return vbsl_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
-{ return vbslq_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
-{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
-{ return vbslq_u64(mask, a, b); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
+ return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
+ return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
+ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
+ return vbsl_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
+ const Packet16uc& b) {
+ return vbslq_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
+ return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
+ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
+ return vbsl_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
+ return vbslq_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
+ return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
+ return vbsl_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+ return vbslq_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
+ return vbslq_u64(mask, a, b);
+}
// Use armv8 rounding intrinsics if available.
#if EIGEN_ARCH_ARMV8
-template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
-{ return vrndn_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
+ return vrndn_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
-{ return vrndnq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+ return vrndnq_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{ return vrndm_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
+ return vrndm_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{ return vrndmq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ return vrndmq_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
-{ return vrndp_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
+ return vrndp_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{ return vrndpq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ return vrndpq_f32(a);
+}
#else
-template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
// Adds and subtracts signum(a) * 2^23 to force rounding.
- const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+ const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
const Packet4f abs_a = pabs(a);
Packet4f r = padd(abs_a, limit);
// Don't compile-away addition and subtraction.
EIGEN_OPTIMIZATION_BARRIER(r);
r = psub(r, limit);
// If greater than limit, simply return a. Otherwise, account for sign.
- r = pselect(pcmp_lt(abs_a, limit),
- pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+ r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
return r;
}
-template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
// Adds and subtracts signum(a) * 2^23 to force rounding.
- const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+ const Packet2f limit = pset1<Packet2f>(static_cast<float>(1 << 23));
const Packet2f abs_a = pabs(a);
Packet2f r = padd(abs_a, limit);
// Don't compile-away addition and subtraction.
EIGEN_OPTIMIZATION_BARRIER(r);
r = psub(r, limit);
// If greater than limit, simply return a. Otherwise, account for sign.
- r = pselect(pcmp_lt(abs_a, limit),
- pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+ r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
return r;
}
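
// A scalar model of the 2^23 trick above (hypothetical helper, for
// illustration): once |a| + 2^23 lands in [2^23, 2^24), the float spacing is
// exactly 1, so the addition itself rounds to the nearest integer (ties to
// even) and subtracting 2^23 recovers that integer.
inline float rint_model(float a) {
  const float limit = 8388608.0f;    // 2^23
  const float abs_a = a < 0.0f ? -a : a;
  if (!(abs_a < limit)) return a;    // already integral (or NaN)
  volatile float r = abs_a + limit;  // volatile plays the role of EIGEN_OPTIMIZATION_BARRIER
  const float rounded = r - limit;
  return a < 0.0f ? -rounded : rounded;
}
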
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4f tmp = print<Packet4f>(a);
+ Packet4f tmp = print<Packet4f>(a);
// If greater, subtract one.
Packet4f mask = pcmp_lt(a, tmp);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
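
// Worked examples of the fix-up above: a = 1.7 gives print(a) = 2 and a < 2,
// so one is subtracted (floor 1); a = -1.3 gives print(a) = -1 and a < -1,
// subtracting one again (floor -2); a = 1.3 gives print(a) = 1 and no fix-up.
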
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
const Packet2f cst_1 = pset1<Packet2f>(1.0f);
- Packet2f tmp = print<Packet2f>(a);
+ Packet2f tmp = print<Packet2f>(a);
// If greater, subtract one.
Packet2f mask = pcmp_lt(a, tmp);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4f tmp = print<Packet4f>(a);
+ Packet4f tmp = print<Packet4f>(a);
// If smaller, add one.
Packet4f mask = pcmp_lt(tmp, a);
mask = pand(mask, cst_1);
return padd(tmp, mask);
}
-template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
const Packet2f cst_1 = pset1<Packet2f>(1.0);
- Packet2f tmp = print<Packet2f>(a);
+ Packet2f tmp = print<Packet2f>(a);
// If smaller, add one.
Packet2f mask = pcmp_lt(tmp, a);
mask = pand(mask, cst_1);
@@ -3226,12 +4548,12 @@
* and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
* value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
*/
-template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
uint8x8_t res = vdup_n_u8(0);
uint8x8_t add = vdup_n_u8(0x8);
- for (int i = 0; i < 4; i++)
- {
+ for (int i = 0; i < 4; i++) {
const uint8x8_t temp = vorr_u8(res, add);
res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
add = vshr_n_u8(add, 1);
@@ -3239,11 +4561,11 @@
return vget_lane_u32(vreinterpret_u32_u8(res), 0);
}
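
// A scalar model of the digit-by-digit method described above (hypothetical
// helper, shown for one uint8_t element): from the highest candidate result
// bit down, tentatively set the bit and keep it only while the square still
// does not exceed the input.
inline uint8_t isqrt8_model(uint8_t x) {  // uint8_t from <cstdint>
  uint8_t res = 0;
  for (uint8_t add = 0x8; add != 0; add >>= 1) {  // 4 result bits: sqrt(255) < 16
    const uint8_t temp = res | add;
    if (temp * temp <= x) res = temp;  // mirrors vbsl_u8(vcge_u8(x, temp*temp), temp, res)
  }
  return res;
}
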
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
uint8x8_t res = vdup_n_u8(0);
uint8x8_t add = vdup_n_u8(0x8);
- for (int i = 0; i < 4; i++)
- {
+ for (int i = 0; i < 4; i++) {
const uint8x8_t temp = vorr_u8(res, add);
res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
add = vshr_n_u8(add, 1);
@@ -3251,11 +4573,11 @@
return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+template <>
+EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
uint8x16_t res = vdupq_n_u8(0);
uint8x16_t add = vdupq_n_u8(0x8);
- for (int i = 0; i < 4; i++)
- {
+ for (int i = 0; i < 4; i++) {
const uint8x16_t temp = vorrq_u8(res, add);
res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
add = vshrq_n_u8(add, 1);
@@ -3263,11 +4585,11 @@
return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
uint16x4_t res = vdup_n_u16(0);
uint16x4_t add = vdup_n_u16(0x80);
- for (int i = 0; i < 8; i++)
- {
+ for (int i = 0; i < 8; i++) {
const uint16x4_t temp = vorr_u16(res, add);
res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
add = vshr_n_u16(add, 1);
@@ -3275,11 +4597,11 @@
return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
uint16x8_t res = vdupq_n_u16(0);
uint16x8_t add = vdupq_n_u16(0x80);
- for (int i = 0; i < 8; i++)
- {
+ for (int i = 0; i < 8; i++) {
const uint16x8_t temp = vorrq_u16(res, add);
res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
add = vshrq_n_u16(add, 1);
@@ -3287,11 +4609,11 @@
return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
uint32x2_t res = vdup_n_u32(0);
uint32x2_t add = vdup_n_u32(0x8000);
- for (int i = 0; i < 16; i++)
- {
+ for (int i = 0; i < 16; i++) {
const uint32x2_t temp = vorr_u32(res, add);
res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
add = vshr_n_u32(add, 1);
@@ -3299,11 +4621,11 @@
return res;
}
/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
uint32x4_t res = vdupq_n_u32(0);
uint32x4_t add = vdupq_n_u32(0x8000);
- for (int i = 0; i < 16; i++)
- {
+ for (int i = 0; i < 16; i++) {
const uint32x4_t temp = vorrq_u32(res, add);
res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
add = vshrq_n_u32(add, 1);
@@ -3329,7 +4651,8 @@
return result;
}
-template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
+template <typename Packet>
+Packet prsqrt_float_common(const Packet& a) {
const Packet cst_zero = pzero(a);
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
Packet return_zero = pcmp_eq(a, cst_inf);
@@ -3340,16 +4663,18 @@
return result;
}
-template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
return prsqrt_float_common(a);
}
-template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
return prsqrt_float_common(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
// Compute approximate reciprocal.
float32x4_t result = vrecpeq_f32(a);
result = vmulq_f32(vrecpsq_f32(a, result), result);
@@ -3357,8 +4682,8 @@
return result;
}
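
// A scalar sketch of the refinement above (hypothetical helper): vrecps(a, x)
// computes 2 - a*x, so each multiply-by-estimate performs one Newton-Raphson
// step x <- x * (2 - a*x), roughly doubling the correct bits of vrecpe's
// initial ~8-bit estimate.
inline float refine_recip(float a, float x /* coarse estimate of 1/a */) {
  x = x * (2.0f - a * x);  // one vrecps/vmul pair
  x = x * (2.0f - a * x);  // a second pair squares the remaining error again
  return x;
}
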
-template<> EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
// Compute approximate reciprocal.
float32x2_t result = vrecpe_f32(a);
result = vmul_f32(vrecps_f32(a, result), result);
@@ -3368,37 +4693,51 @@
// Unfortunately vsqrt_f32 is only available for A64.
#if EIGEN_ARCH_ARM64
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+ return vsqrtq_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+ return vsqrt_f32(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
+ return vdivq_f32(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
+ return vdiv_f32(a, b);
+}
#else
-template<typename Packet>
+template <typename Packet>
EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
const Packet cst_zero = pzero(a);
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
-
- Packet result = pmul(a, prsqrt_float_unsafe(a));
+
+ Packet result = pmul(a, prsqrt_float_unsafe(a));
Packet a_is_zero = pcmp_eq(a, cst_zero);
Packet a_is_inf = pcmp_eq(a, cst_inf);
Packet return_a = por(a_is_zero, a_is_inf);
-
+
result = pselect(return_a, a, result);
return result;
}
-template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
return psqrt_float_common(a);
}
-template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
return psqrt_float_common(a);
}
-template<typename Packet>
+template <typename Packet>
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
  // If b is large, the NEON reciprocal intrinsics will flush preciprocal(b) to zero.
  // Avoid that underflow with the following manipulation:
@@ -3407,18 +4746,20 @@
const Packet cst_one = pset1<Packet>(1.0f);
const Packet cst_quarter = pset1<Packet>(0.25f);
const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
-
+
Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
return result;
}
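
// A scalar model of the scaling trick above (hypothetical helper): for |b|
// within a factor of 4 of the largest float, 1/b is subnormal and the NEON
// estimate flushes it to zero, so b is pre-scaled by f = 1/4 and the quotient
// rescaled by the same f, since a/b == f * (a * (1 / (b * f))).
inline float pdiv_model(float a, float b) {
  const float thresh = 3.4028235e38f / 4.0f;  // NumTraits<float>::highest() / 4
  const float abs_b = b < 0.0f ? -b : b;
  const float f = (abs_b >= thresh) ? 0.25f : 1.0f;
  return f * (a * (1.0f / (b * f)));  // 1/x stands in for preciprocal()
}
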
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
return pdiv_float_common(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
return pdiv_float_common(a, b);
}
#endif
@@ -3429,56 +4770,57 @@
// TODO: Guard if we have native bfloat16 support
typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
-template<> struct is_arithmetic<Packet4bf> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4bf> {
+ enum { value = true };
+};
-template<> struct packet_traits<bfloat16> : default_packet_traits
-{
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
typedef Packet4bf type;
typedef Packet4bf half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 4,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
- HasDiv = 1,
- HasFloor = 1,
- HasCeil = 1,
- HasRint = 1,
+ HasBlend = 0,
+ HasDiv = 1,
+ HasFloor = 1,
+ HasCeil = 1,
+ HasRint = 1,
- HasSin = EIGEN_FAST_MATH,
- HasCos = EIGEN_FAST_MATH,
- HasLog = 1,
- HasExp = 1,
+ HasSin = EIGEN_FAST_MATH,
+ HasCos = EIGEN_FAST_MATH,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 0,
HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH,
+ HasErf = EIGEN_FAST_MATH,
HasBessel = 0, // Issues with accuracy.
HasNdtri = 0
};
};
-template<> struct unpacket_traits<Packet4bf>
-{
+template <>
+struct unpacket_traits<Packet4bf> {
typedef bfloat16 type;
typedef Packet4bf half;
- enum
- {
+ enum {
size = 4,
alignment = Aligned16,
vectorizable = true,
@@ -3487,23 +4829,22 @@
};
};
-namespace detail {
-template<>
+namespace detail {
+template <>
EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
const uint16x4x2_t tmp = vzip_u16(p1, p2);
p1 = tmp.val[0];
p2 = tmp.val[1];
}
-} // namespace detail
+} // namespace detail
-EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
-{
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
// See the scalar implementation in BFloat16.h for a comprehensible explanation
// of this fast rounding algorithm
Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
// lsb = (input >> 16) & 1
- Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+ Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
// rounding_bias = 0x7fff + lsb
Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
@@ -3523,215 +4864,216 @@
return vmovn_u32(input);
}
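
// A scalar model of the rounding above (hypothetical helper; NaN handling is
// omitted in this sketch): adding 0x7fff plus the lowest surviving mantissa
// bit rounds the truncated value to nearest, with ties going to even.
inline uint16_t f32_to_bf16_model(float f) {  // std::memcpy from <cstring>
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  const uint32_t lsb = (bits >> 16) & 1u;  // lowest bit kept after truncation
  bits += 0x7fffu + lsb;                   // round-to-nearest-even bias
  return static_cast<uint16_t>(bits >> 16);
}
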
-EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
-{
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
}
-EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
- return vmovn_u32(vreinterpretq_u32_f32(p));
-}
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
-template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
return Packet4bf(pset1<Packet4us>(from.value));
}
-template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
}
-template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
}
-template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template <> EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf &a,
- const Packet4bf &b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
- const Packet4bf& b)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<>
-EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
}
-template<>
-EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
return Packet4bf(preverse<Packet4us>(Packet4us(a)));
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
detail::ptranspose_impl(kernel);
}
-template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
}
-template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
}
@@ -3756,9 +5098,15 @@
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
// and has lower priority in overload resolution.
// This doesn't work with MSVC though, since the function names are macros.
-template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a) {
+ return (uint64x2_t)a;
+}
-template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a) {
+ return (float64x2_t)a;
+}
#endif
#if EIGEN_COMP_MSVC_STRICT
@@ -3777,85 +5125,73 @@
EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
#endif
-
// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
// for fast inversion of matrices of size 4.
-EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
-{
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
const double* a = reinterpret_cast<const double*>(&m);
const double* b = reinterpret_cast<const double*>(&n);
Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
return res;
}
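
// Usage notes for the emulation above: with m = {m0, m1} and n = {n0, n1},
// bit 0 of the mask selects the lane of m and bit 1 the lane of n, matching
// _mm_shuffle_pd:
//   shuffle(m, n, 0) -> {m0, n0}   (vec2d_unpacklo below)
//   shuffle(m, n, 3) -> {m1, n1}   (vec2d_unpackhi below)
//   shuffle(m, n, 1) -> {m1, n0}
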
-EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask)
-{
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
return shuffle(a, b, mask);
}
-EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a,const Packet2d& b)
-{
- return shuffle(a, b, 0);
-}
-EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
-{
- return shuffle(a, b, 3);
-}
-#define vec2d_duplane(a, p) \
- Packet2d(vdupq_laneq_f64(a, p))
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
+#define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef Packet2d type;
typedef Packet2d half;
- enum
- {
+ enum {
Vectorizable = 1,
AlignedOnScalar = 1,
size = 2,
- HasCmp = 1,
- HasAdd = 1,
- HasSub = 1,
- HasShift = 1,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 1,
- HasArg = 0,
- HasAbs2 = 1,
- HasAbsDiff = 1,
- HasMin = 1,
- HasMax = 1,
- HasConj = 1,
+ HasCmp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasShift = 1,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 1,
+ HasArg = 0,
+ HasAbs2 = 1,
+ HasAbsDiff = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasConj = 1,
HasSetLinear = 1,
- HasBlend = 0,
+ HasBlend = 0,
- HasDiv = 1,
+ HasDiv = 1,
HasFloor = 1,
HasCeil = 1,
HasRint = 1,
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
- HasExp = 1,
- HasLog = 1,
+ HasExp = 1,
+ HasLog = 1,
HasATan = 1,
#endif
- HasSin = 0,
- HasCos = 0,
+ HasSin = 0,
+ HasCos = 0,
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = 0,
- HasErf = 0
+ HasErf = 0
};
};
-template<> struct unpacket_traits<Packet2d>
-{
+template <>
+struct unpacket_traits<Packet2d> {
typedef double type;
typedef Packet2d half;
typedef Packet2l integer_packet;
- enum
- {
+ enum {
size = 2,
alignment = Aligned16,
vectorizable = true,
@@ -3864,149 +5200,239 @@
};
};
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+ return vdupq_n_f64(from);
+}
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
-{
- const double c[] = {0.0,1.0};
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ const double c[] = {0.0, 1.0};
return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
}
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vaddq_f64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vsubq_f64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
-template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
return padd(a, pxor(mask, b));
}
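
paddsub above builds the mixed subtract/add from an XOR: the mask carries the sign bit only in lane 0, so pxor(mask, b) negates b's first lane before the plain padd. A scalar model of the resulting per-lane behavior (sketch, illustrative names):

#include <cassert>

static void paddsub2(const double a[2], const double b[2], double out[2]) {
  out[0] = a[0] - b[0];  // lane 0: sign of b flipped by the mask, then added
  out[1] = a[1] + b[1];  // lane 1: plain addition, mask is 0.0 there
}

int main() {
  const double a[2] = {5.0, 5.0}, b[2] = {2.0, 2.0};
  double r[2];
  paddsub2(a, b, r);
  assert(r[0] == 3.0 && r[1] == 7.0);
  return 0;
}
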
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+ return vnegq_f64(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vmulq_f64(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vdivq_f64(a, b);
+}
#ifdef __ARM_FEATURE_FMA
// See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vfmaq_f64(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vfmaq_f64(c, a, b);
+}
#else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vmlaq_f64(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vmlaq_f64(c, a, b);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vminq_f64(a, b);
+}
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vminnmq_f64(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxnmq_f64(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vminnmq_f64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vmaxnmq_f64(a, b);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmin<Packet2d>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return pmin<Packet2d>(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vmaxq_f64(a, b);
+}
-
-template<> EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { return pmax<Packet2d>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return pmax<Packet2d>(a, b);
+}
 // Logical operations are not supported for floating-point types, so we implement them via reinterpret casts to integer types using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vcleq_f64(a, b));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vcltq_f64(a, b));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a,b)))); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
+}
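
pcmp_lt_or_nan above gets its NaN behavior for free by negating the opposite comparison: vcgeq_f64 yields false whenever either operand is NaN, so its bitwise complement is true in exactly the "less than, or unordered" cases. A scalar model:

#include <cassert>
#include <cmath>

static bool lt_or_nan(double a, double b) { return !(a >= b); }  // complement of >=

int main() {
  assert(lt_or_nan(1.0, 2.0));
  assert(!lt_or_nan(2.0, 1.0));
  assert(lt_or_nan(std::nan(""), 1.0));  // NaN compares false to everything
  return 0;
}
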
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+ return vreinterpretq_f64_u64(vceqq_f64(a, b));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from);
+}
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
+}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+ return vld1q_dup_f64(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from);
+}
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
+}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
Packet2d res = pset1<Packet2d>(0.0);
- res = vld1q_lane_f64(from + 0*stride, res, 0);
- res = vld1q_lane_f64(from + 1*stride, res, 1);
+ res = vld1q_lane_f64(from + 0 * stride, res, 0);
+ res = vld1q_lane_f64(from + 1 * stride, res, 1);
return res;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
- vst1q_lane_f64(to + stride*0, from, 0);
- vst1q_lane_f64(to + stride*1, from, 1);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+ vst1q_lane_f64(to + stride * 0, from, 0);
+ vst1q_lane_f64(to + stride * 1, from, 1);
}
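
The strided gather/scatter pair above reads and writes packet lanes at from[0], from[stride], and so on, one lane at a time via vld1q_lane_f64/vst1q_lane_f64. A scalar model of the access pattern (standalone sketch, names illustrative):

#include <cassert>
#include <cstddef>

static void gather2(const double* from, std::ptrdiff_t stride, double out[2]) {
  out[0] = from[0 * stride];
  out[1] = from[1 * stride];
}

static void scatter2(double* to, const double in[2], std::ptrdiff_t stride) {
  to[0 * stride] = in[0];
  to[1 * stride] = in[1];
}

int main() {
  double buf[4] = {10.0, 0.0, 20.0, 0.0};
  double p[2];
  gather2(buf, 2, p);       // picks buf[0] and buf[2]
  assert(p[0] == 10.0 && p[1] == 20.0);
  scatter2(buf + 1, p, 2);  // writes buf[1] and buf[3]
  assert(buf[1] == 10.0 && buf[3] == 20.0);
  return 0;
}
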
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ EIGEN_ARM_PREFETCH(addr);
+}
 // FIXME only store the first 2 elements?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ return vgetq_lane_f64(a, 0);
+}
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+ return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+ return vabsq_f64(a);
+}
template <>
EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
}
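
psignbit above exploits the arithmetic right shift: shifting the 64-bit lane pattern right by 63 smears the sign bit across the whole lane, producing an all-ones mask for negative values (including -0.0) and all-zeros otherwise. A scalar model of the bit trick:

#include <cassert>
#include <cstdint>
#include <cstring>

static std::uint64_t signbit_mask(double x) {
  std::int64_t bits;
  std::memcpy(&bits, &x, sizeof bits);  // same reinterpret as vreinterpretq_s64_f64
  // Arithmetic shift replicates the sign bit (guaranteed since C++20,
  // universal in practice before that).
  return static_cast<std::uint64_t>(bits >> 63);
}

int main() {
  assert(signbit_mask(-1.5) == UINT64_MAX);
  assert(signbit_mask(-0.0) == UINT64_MAX);
  assert(signbit_mask(2.0) == 0u);
  return 0;
}
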
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{ return vaddvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+ return vaddvq_f64(a);
+}
// Other reduction functions:
// mul
#if EIGEN_COMP_CLANGAPPLE
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ return (vget_low_f64(a) * vget_high_f64(a))[0];
+}
#else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
+}
#endif
// min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{ return vminvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ return vminvq_f64(a);
+}
// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{ return vmaxvq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ return vmaxvq_f64(a);
+}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet2d, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
@@ -4014,35 +5440,53 @@
kernel.packet[1] = tmp2;
}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
-{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
-{ return vrndnq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+ return vrndnq_f64(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{ return vrndmq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ return vrndmq_f64(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
-{ return vrndpq_f64(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ return vrndpq_f64(a);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
-{ return pldexp_generic(a, exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+ return pldexp_generic(a, exponent);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent)
-{ return pfrexp_generic(a,exponent); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+ return pfrexp_generic(a, exponent);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from)
-{ return vreinterpretq_f64_u64(vdupq_n_u64(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+ return vreinterpretq_f64_u64(vdupq_n_u64(from));
+}
-template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
// Do Newton iterations for 1/sqrt(x).
return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
}
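
The comment above refers to the classic Newton-Raphson refinement for y approximating 1/sqrt(x): y' = y * (1.5 - 0.5 * x * y * y), applied here three times to the vrsqrteq_f64 estimate. A scalar sketch of the recurrence:

#include <cassert>
#include <cmath>

static double rsqrt_newton_step(double x, double y) {
  return y * (1.5 - 0.5 * x * y * y);  // one quadratically convergent step
}

int main() {
  const double x = 2.0;
  double y = 0.7;  // coarse initial guess for 1/sqrt(2), like a hardware estimate
  for (int i = 0; i < 3; ++i) y = rsqrt_newton_step(x, y);  // Steps=3, as above
  assert(std::fabs(y - 1.0 / std::sqrt(x)) < 1e-12);
  return 0;
}
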
-template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
+ return vsqrtq_f64(_x);
+}
-#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 // Do we have fp16 types and supporting Neon intrinsics?
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
@@ -4119,7 +5563,7 @@
};
};
-template<>
+template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
return vadd_f16(vget_low_f16(a), vget_high_f16(a));
}
@@ -4229,14 +5673,27 @@
}
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vminnm_f16(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vminnmq_f16(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+ return vminnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+ return vminnmq_f16(a, b);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmin<Packet4hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+ return pmin<Packet4hf>(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmin<Packet8hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+ return pmin<Packet8hf>(a, b);
+}
template <>
EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
@@ -4249,14 +5706,27 @@
}
#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
-// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8 systems).
-template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return vmaxnm_f16(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return vmaxnmq_f16(a, b); }
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+ return vmaxnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+ return vmaxnmq_f16(a, b);
+}
#endif
-template<> EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) { return pmax<Packet4hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+ return pmax<Packet4hf>(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) { return pmax<Packet8hf>(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+ return pmax<Packet8hf>(a, b);
+}
#define EIGEN_MAKE_ARM_FP16_CMP_8(name) \
template <> \
@@ -4292,28 +5762,34 @@
}
template <>
-EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
-{ return vrndnq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
+ return vrndnq_f16(a);
+}
template <>
-EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
-{ return vrndn_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
+ return vrndn_f16(a);
+}
template <>
-EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
-{ return vrndmq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
+ return vrndmq_f16(a);
+}
template <>
-EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
-{ return vrndm_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
+ return vrndm_f16(a);
+}
template <>
-EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
-{ return vrndpq_f16(a); }
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
+ return vrndpq_f16(a);
+}
template <>
-EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
-{ return vrndp_f16(a); }
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
+ return vrndp_f16(a);
+}
template <>
EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
@@ -4415,13 +5891,17 @@
EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
Packet4hf lo, hi;
lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
- hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from+1));
+ hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
return vcombine_f16(lo, hi);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
+ return vsetq_lane_f16(b.x, a, 0);
+}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 0); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
+ return vset_lane_f16(b.x, a, 0);
+}
template <>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
@@ -4433,9 +5913,13 @@
return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) { return vsetq_lane_f16(b.x, a, 7); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
+ return vsetq_lane_f16(b.x, a, 7);
+}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) { return vset_lane_f16(b.x, a, 3); }
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
+ return vset_lane_f16(b.x, a, 3);
+}
template <>
EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
@@ -4482,7 +5966,8 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
+ Index stride) {
to[stride * 0].x = vgetq_lane_f16(from, 0);
to[stride * 1].x = vgetq_lane_f16(from, 1);
to[stride * 2].x = vgetq_lane_f16(from, 2);
@@ -4494,7 +5979,8 @@
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from, Index stride) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
+ Index stride) {
to[stride * 0].x = vget_lane_f16(from, 0);
to[stride * 1].x = vget_lane_f16(from, 1);
to[stride * 2].x = vget_lane_f16(from, 2);
@@ -4524,7 +6010,8 @@
return h;
}
-template<> EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
float16x4_t a_lo, a_hi;
Packet8hf a_r64;
@@ -4544,7 +6031,7 @@
return vabsq_f16(a);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
}
@@ -4556,7 +6043,7 @@
template <>
EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
- return vreinterpret_f16_s16( vshr_n_s16( vreinterpret_s16_f16(a), 15));
+ return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
}
template <>
@@ -4636,8 +6123,7 @@
return h;
}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
@@ -4690,10 +6176,10 @@
kernel.packet[6] = T_3[1].val[1];
kernel.packet[7] = T_3[3].val[1];
}
-#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+#endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_NEON_H
+#endif // EIGEN_PACKET_MATH_NEON_H
diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h
index 68566b0..58d7b8c 100644
--- a/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ b/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -18,7 +18,6 @@
namespace internal {
-
//==============================================================================
// preinterpret (truncation operations)
//==============================================================================
@@ -93,7 +92,6 @@
return Packet4f(vreinterpretq_f32_u32(a));
}
-
template <>
EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
return static_cast<Packet4c>(a);
@@ -107,7 +105,6 @@
return Packet16c(vreinterpretq_s8_u8(a));
}
-
template <>
EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
return static_cast<Packet4uc>(a);
@@ -185,7 +182,6 @@
// pcast, SrcType = float
//==============================================================================
-
template <>
struct type_casting_traits<float, numext::int64_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
@@ -415,7 +411,6 @@
return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))));
}
-
template <>
struct type_casting_traits<numext::int8_t, numext::uint32_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
@@ -477,7 +472,6 @@
return preinterpret<Packet4us>(pcast<Packet4c, Packet4s>(a));
}
-
//==============================================================================
// pcast, SrcType = uint8_t
//==============================================================================
@@ -577,7 +571,6 @@
return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))));
}
-
template <>
struct type_casting_traits<numext::uint8_t, numext::int16_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
@@ -595,7 +588,6 @@
return preinterpret<Packet4s>(pcast<Packet4uc, Packet4us>(a));
}
-
//==============================================================================
// pcast, SrcType = int16_t
//==============================================================================
@@ -673,7 +665,6 @@
return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
}
-
template <>
struct type_casting_traits<numext::int16_t, numext::int8_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -794,7 +785,6 @@
return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
}
-
template <>
struct type_casting_traits<numext::uint16_t, numext::uint8_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -881,7 +871,6 @@
return preinterpret<Packet2ul>(pcast<Packet2i, Packet2l>(a));
}
-
template <>
struct type_casting_traits<numext::int32_t, numext::int16_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1013,7 +1002,6 @@
return preinterpret<Packet2l>(pcast<Packet2ui, Packet2ul>(a));
}
-
template <>
struct type_casting_traits<numext::uint32_t, numext::uint16_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1273,7 +1261,6 @@
#endif
}
-
template <>
struct type_casting_traits<numext::uint64_t, numext::uint32_t> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1407,7 +1394,6 @@
return Packet4i(vreinterpretq_s32_f64(a));
}
-
template <>
struct type_casting_traits<double, float> {
enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
@@ -1534,7 +1520,7 @@
}
template <>
EIGEN_STRONG_INLINE Packet8uc pcast<Packet2d, Packet8uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
- const Packet2d& d) {
+ const Packet2d& d) {
return preinterpret<Packet8uc>(pcast<Packet2d, Packet8c>(a, b, c, d));
}
template <>
diff --git a/Eigen/src/Core/arch/NEON/UnaryFunctors.h b/Eigen/src/Core/arch/NEON/UnaryFunctors.h
index 09da91c..8be5bb0 100644
--- a/Eigen/src/Core/arch/NEON/UnaryFunctors.h
+++ b/Eigen/src/Core/arch/NEON/UnaryFunctors.h
@@ -17,38 +17,31 @@
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
/** \internal
- * \brief Template specialization of the logistic function for Eigen::half.
- */
+ * \brief Template specialization of the logistic function for Eigen::half.
+ */
template <>
struct scalar_logistic_op<Eigen::half> {
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Eigen::half operator()(const Eigen::half& x) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator()(const Eigen::half& x) const {
// Convert to float and call scalar_logistic_op<float>.
const scalar_logistic_op<float> float_op;
return Eigen::half(float_op(float(x)));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Eigen::half packetOp(const Eigen::half& x) const {
- return this->operator()(x);
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half packetOp(const Eigen::half& x) const { return this->operator()(x); }
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Packet4hf packetOp(const Packet4hf& x) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf packetOp(const Packet4hf& x) const {
const scalar_logistic_op<float> float_op;
return vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(x)));
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
- Packet8hf packetOp(const Packet8hf& x) const {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf packetOp(const Packet8hf& x) const {
const scalar_logistic_op<float> float_op;
- return vcombine_f16(
- vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
- vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
+ return vcombine_f16(vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
+ vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
}
};
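
The specialization above follows the widen-evaluate-narrow pattern: convert the half packet to float, run the float implementation, convert back. A scalar model of the same flow over four lanes (logistic_f and logistic_half4 are illustrative names, not Eigen functions):

#include <cassert>
#include <cmath>

static float logistic_f(float x) { return 1.0f / (1.0f + std::exp(-x)); }

static void logistic_half4(const float in_half[4], float out_half[4]) {
  for (int i = 0; i < 4; ++i) {
    const float widened = in_half[i];   // vcvt_f32_f16 in the vectorized code
    out_half[i] = logistic_f(widened);  // scalar_logistic_op<float>
  }                                     // vcvt_f16_f32 on the way back
}

int main() {
  const float x[4] = {-1.0f, 0.0f, 1.0f, 2.0f};
  float y[4];
  logistic_half4(x, y);
  assert(std::fabs(y[1] - 0.5f) < 1e-6f);  // logistic(0) == 0.5
  return 0;
}
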
-template<>
+template <>
struct functor_traits<scalar_logistic_op<Eigen::half>> {
enum {
Cost = functor_traits<scalar_logistic_op<float>>::Cost,
@@ -57,8 +50,8 @@
};
#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_NEON_UNARY_FUNCTORS_H
+#endif // EIGEN_NEON_UNARY_FUNCTORS_H
diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index d068806..4c5c499 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h
@@ -18,8 +18,7 @@
namespace internal {
//---------- float ----------
-struct Packet2cf
-{
+struct Packet2cf {
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
Packet4f v;
@@ -28,8 +27,8 @@
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet2cf half;
enum {
@@ -37,138 +36,179 @@
AlignedOnScalar = 1,
size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0,
- HasBlend = 1
+ HasBlend = 1
};
};
#endif
-template<> struct unpacket_traits<Packet2cf> {
+template <>
+struct unpacket_traits<Packet2cf> {
typedef std::complex<float> type;
typedef Packet2cf half;
typedef Packet4f as_real;
enum {
- size=2,
- alignment=Aligned16,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_add_ps(a.v, b.v));
}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
- return Packet2cf(_mm_xor_ps(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_sub_ps(a.v, b.v));
}
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
- #ifdef EIGEN_VECTORIZE_SSE3
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+ return Packet2cf(_mm_xor_ps(a.v, mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+ const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000));
+ return Packet2cf(_mm_xor_ps(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
- _mm_mul_ps(_mm_movehdup_ps(a.v),
- vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-// return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-// _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-// vec4f_swizzle1(b.v, 1, 0, 3, 2))));
- #else
- const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
- return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
- _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
- vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
- #endif
+ _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+ // return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+ // _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
+ // vec4f_swizzle1(b.v, 1, 0, 3, 2))));
+#else
+ const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x00000000, 0x80000000, 0x00000000));
+ return Packet2cf(
+ _mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
+ _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
+#endif
}
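
The SSE3 path of pmul above computes (ar + i*ai)(br + i*bi) from two lanewise products: the duplicated real parts of a times b (moveldup), and the duplicated imaginary parts of a times b with re/im swapped (movehdup); _mm_addsub_ps then subtracts in the real lanes and adds in the imaginary lanes. A scalar model (sketch, names illustrative):

#include <cassert>
#include <complex>

static std::complex<float> cmul(std::complex<float> a, std::complex<float> b) {
  const float t1_re = a.real() * b.real();  // moveldup lane times b
  const float t1_im = a.real() * b.imag();
  const float t2_re = a.imag() * b.imag();  // movehdup lane times swapped b
  const float t2_im = a.imag() * b.real();
  return {t1_re - t2_re, t1_im + t2_im};    // addsub: subtract, add per lane pair
}

int main() {
  const std::complex<float> a(1.0f, 2.0f), b(3.0f, 4.0f);
  assert(cmul(a, b) == a * b);  // (1+2i)(3+4i) = -5 + 10i
  return 0;
}
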
-template<> EIGEN_STRONG_INLINE Packet2cf ptrue <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
+ return Packet2cf(ptrue(Packet4f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(_mm_andnot_ps(b.v, a.v));
+}
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+}
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
const float re = std::real(from);
const float im = std::imag(from);
return Packet2cf(_mm_set_ps(im, re, im, re));
}
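
pset1 above relies on the interleaved packet layout: a Packet2cf holds two complex values as four floats {re0, im0, re1, im1}, so broadcasting a scalar fills {re, im, re, im} (note that _mm_set_ps takes its arguments highest lane first). A scalar model of the layout:

#include <cassert>
#include <complex>

static void pset1_model(std::complex<float> from, float out[4]) {
  out[0] = out[2] = from.real();  // even lanes carry the real parts
  out[1] = out[3] = from.imag();  // odd lanes carry the imaginary parts
}

int main() {
  float lanes[4];
  pset1_model({1.0f, 2.0f}, lanes);
  assert(lanes[0] == 1.0f && lanes[1] == 2.0f);
  assert(lanes[2] == 1.0f && lanes[3] == 2.0f);
  return 0;
}
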
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); }
-
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
- return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]),
- std::imag(from[0*stride]), std::real(from[0*stride])));
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ return pset1<Packet2cf>(*from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
- to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
- to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
- _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+ Index stride) {
+ return Packet2cf(_mm_set_ps(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+ std::real(from[0 * stride])));
+}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+ Index stride) {
+ to[stride * 0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+ _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+ to[stride * 1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+ _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
alignas(alignof(__m64)) std::complex<float> res;
_mm_storel_pi((__m64*)&res, a.v);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+ return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v)))));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
- return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+ return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v, a.v))));
}
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+ return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v, a.v))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
return pdiv_complex(a, b);
}
//---------- double ----------
-struct Packet1cd
-{
+struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
Packet2d v;
@@ -177,8 +217,8 @@
// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
enum {
@@ -186,112 +226,155 @@
AlignedOnScalar = 0,
size = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasSqrt = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasSqrt = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
#endif
-template<> struct unpacket_traits<Packet1cd> {
+template <>
+struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
typedef Packet1cd half;
typedef Packet2d as_real;
enum {
- size=1,
- alignment=Aligned16,
- vectorizable=true,
- masked_load_available=false,
- masked_store_available=false
+ size = 1,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
};
};
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
- return Packet1cd(_mm_xor_pd(a.v,mask));
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000, 0x0, 0x0, 0x0));
+ return Packet1cd(_mm_xor_pd(a.v, mask));
}
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
- #ifdef EIGEN_VECTORIZE_SSE3
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v),
- _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0))));
- #else
- const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
+ _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0))));
+#else
+ const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0));
return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
- _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
- vec2d_swizzle1(b.v, 1, 0)), mask)));
- #endif
+ _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)), mask)));
+#endif
}
-template<> EIGEN_STRONG_INLINE Packet1cd ptrue <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
+ return Packet1cd(ptrue(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(_mm_andnot_pd(b.v, a.v));
+}
// FIXME: force unaligned load; this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+ return ploadu<Packet1cd>(&from);
+}
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ return pset1<Packet1cd>(*from);
+}
// FIXME: force unaligned store; this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_ALIGN16 double res[2];
_mm_store_pd(res, a.v);
- return std::complex<double>(res[0],res[1]);
+ return std::complex<double>(res[0], res[1]);
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
return pfirst(a);
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
return pfirst(a);
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
return pdiv_complex(a, b);
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
return Packet1cd(preverse(Packet2d(x.v)));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2cf,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
__m128d w1 = _mm_castps_pd(kernel.packet[0].v);
__m128d w2 = _mm_castps_pd(kernel.packet[1].v);
@@ -300,32 +383,36 @@
kernel.packet[1].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
__m128 eq = _mm_cmpeq_ps(a.v, b.v);
return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
}
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
__m128d eq = _mm_cmpeq_pd(a.v, b.v);
return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
}
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
__m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
}
-template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
return psqrt_complex<Packet1cd>(a);
}
-template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
return psqrt_complex<Packet2cf>(a);
}
-} // end namespace internal
-} // end namespace Eigen
+} // end namespace internal
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX_SSE_H
+#endif // EIGEN_COMPLEX_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 0f86bcf..30c1f07 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -29,17 +29,23 @@
// iteration for square root. In particular, Skylake and Zen2 processors
// have approximately doubled throughput of the _mm_sqrt_ps instruction
// compared to their predecessors.
-template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet16b psqrt<Packet16b>(const Packet16b& x) { return x; }
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+ return _mm_sqrt_ps(x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+ return _mm_sqrt_pd(x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt<Packet16b>(const Packet16b& x) {
+ return x;
+}
#if EIGEN_FAST_MATH
// Even on Skylake, using Newton iteration is a win for reciprocal square root.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt<Packet4f>(const Packet4f& x) {
return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
}
@@ -47,28 +53,25 @@
// Trying to speed up reciprocal using Newton-Raphson is counterproductive
// unless FMA is available. Without FMA, pdiv(pset1<Packet>(Scalar(1)), a) is
// 30% faster.
-template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
}
#endif
#endif
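
The comment above weighs the reciprocal Newton step y' = y * (2 - x * y), which converges quadratically to 1/x and, with FMA, costs one fused operation per term. A scalar sketch of the Steps=1 refinement applied to a coarse estimate of the kind _mm_rcp_ps produces:

#include <cassert>
#include <cmath>

static float rcp_newton_step(float x, float y) { return y * (2.0f - x * y); }

int main() {
  const float x = 3.0f;
  float y = 0.32f;            // coarse estimate of 1/3, like the _mm_rcp_ps output
  y = rcp_newton_step(x, y);  // one step, matching Steps=1 in the code above
  assert(std::fabs(y - 1.0f / 3.0f) < 1e-3f);
  return 0;
}
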
-} // end namespace internal
+} // end namespace internal
namespace numext {
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float sqrt(const float &x)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float& x) {
return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double sqrt(const double &x)
-{
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double& x) {
#if EIGEN_COMP_GNUC_STRICT
// This works around a GCC bug generating poor code for _mm_sqrt_pd
// See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
@@ -78,8 +81,8 @@
#endif
}
-} // end namespace numex
+} // namespace numext
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
+#endif // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 8dd553d..be8183c 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -25,7 +25,7 @@
#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
// 32 bits => 8 registers
// 64 bits => 16 registers
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
#endif
#ifdef EIGEN_VECTORIZE_FMA
@@ -34,16 +34,18 @@
#endif
#endif
-#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
+ (__GXX_ABI_VERSION < 1004)) || \
+ EIGEN_OS_QNX
// With GCC's default ABI version, a __m128 and a __m256 are treated as the same type, and therefore we cannot
// have overloads for both types without a linker error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping 128-bit types in the following helper
// structure:
-typedef eigen_packet_wrapper<__m128> Packet4f;
+typedef eigen_packet_wrapper<__m128> Packet4f;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
-typedef __m128 Packet4f;
+typedef __m128 Packet4f;
typedef __m128d Packet2d;
#endif
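// Rough sketch of the wrapper idea (simplified, not the actual definition): a
// thin struct with implicit conversions gives __m128 a distinct C++ type, and
// thus a distinct mangled name, without changing codegen. The integer tag mints
// further distinct types from one raw vector, which is how Packet16b and
// Packet4ui below both reuse __m128i.
template <typename T, int UniqueId = 0>
struct packet_wrapper_sketch {
  packet_wrapper_sketch() {}
  packet_wrapper_sketch(const T& v) : m_val(v) {}
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  T m_val;
};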
@@ -51,87 +53,90 @@
typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
-template<> struct is_arithmetic<__m128> { enum { value = true }; };
-template<> struct is_arithmetic<__m128i> { enum { value = true }; };
-template<> struct is_arithmetic<__m128d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
+template <>
+struct is_arithmetic<__m128> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128i> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128d> {
+ enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet4i> {
+ enum { value = true };
+};
// Note that `Packet4ui` uses the underlying type `__m128i`, which is
// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
// operations used in `GenericPacketMath.h`.
-template<> struct is_arithmetic<Packet4ui> { enum { value = false }; };
-template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
+template <>
+struct is_arithmetic<Packet4ui> {
+ enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet16b> {
+ enum { value = true };
+};
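// Scalar sketch of the caveat above: the same bit pattern orders differently
// once the top bit is set, so signed SSE compares cannot be applied to
// unsigned lanes directly (see the biased pcmp_lt<Packet4ui> further down).
inline bool signed_reinterpret_flips_order_sketch() {
  uint32_t big = 0x80000000u;                     // 2147483648 as unsigned
  int32_t as_signed = static_cast<int32_t>(big);  // INT32_MIN after the reinterpret
  return (as_signed < 1) && !(big < 1u);          // true: the orderings disagree
}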
-template<int p, int q, int r, int s>
-struct shuffle_mask{
- enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
+template <int p, int q, int r, int s>
+struct shuffle_mask {
+ enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
};
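// Worked example: the mask packs four 2-bit lane selectors little-end first,
// matching _MM_SHUFFLE(s, r, q, p); reversing the lanes gives 0b00011011 and
// the identity permutation gives the familiar 0xE4.
static_assert(shuffle_mask<3, 2, 1, 0>::mask == 0x1B, "full lane reversal");
static_assert(shuffle_mask<0, 1, 2, 3>::mask == 0xE4, "identity permutation");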
// TODO: change the implementation of all swizzle* ops from macro to template,
-#define vec4f_swizzle1(v,p,q,r,s) \
- Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
+#define vec4f_swizzle1(v, p, q, r, s) \
+ Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
-#define vec4i_swizzle1(v,p,q,r,s) \
- Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
+#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))
-#define vec4ui_swizzle1(v, p, q, r, s) \
- Packet4ui(vec4i_swizzle1(v,p,q,r,s))
+#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))
-#define vec2d_swizzle1(v,p,q) \
- Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
+#define vec2d_swizzle1(v, p, q) \
+ Packet2d(_mm_castsi128_pd( \
+ _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))
-#define vec4f_swizzle2(a,b,p,q,r,s) \
- Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
+#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))
-#define vec4i_swizzle2(a,b,p,q,r,s) \
- Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
+#define vec4i_swizzle2(a, b, p, q, r, s) \
+ Packet4i( \
+ _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
-#define vec4ui_swizzle2(a,b,p,q,r,s) \
- Packet4i(vec4i_swizzle2(a,b,p,q,r,s))
+#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4i(vec4i_swizzle2(a, b, p, q, r, s))
-EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
-{
- return Packet4f(_mm_movelh_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+ return Packet4f(_mm_movelh_ps(a, b));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
-{
- return Packet4f(_mm_movehl_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+ return Packet4f(_mm_movehl_ps(a, b));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
-{
- return Packet4f(_mm_unpacklo_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+ return Packet4f(_mm_unpacklo_ps(a, b));
}
-EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
-{
- return Packet4f(_mm_unpackhi_ps(a,b));
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+ return Packet4f(_mm_unpackhi_ps(a, b));
}
-#define vec4f_duplane(a,p) \
- vec4f_swizzle2(a,a,p,p,p,p)
+#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
-#define vec2d_swizzle2(a,b,mask) \
- Packet2d(_mm_shuffle_pd(a,b,mask))
+#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
-EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
-{
- return Packet2d(_mm_unpacklo_pd(a,b));
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
+ return Packet2d(_mm_unpacklo_pd(a, b));
}
-EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
-{
- return Packet2d(_mm_unpackhi_pd(a,b));
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
+ return Packet2d(_mm_unpackhi_pd(a, b));
}
-#define vec2d_duplane(a,p) \
- vec2d_swizzle2(a,a,(p<<1)|p)
+#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
- const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
@@ -147,7 +152,7 @@
AlignedOnScalar = 1,
size = 4,
- HasCmp = 1,
+ HasCmp = 1,
HasDiv = 1,
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
@@ -173,7 +178,7 @@
HasRound = 1,
#endif
HasRint = 1,
- HasSign = 0 // The manually vectorized version is slightly slower for SSE.
+ HasSign = 0 // The manually vectorized version is slightly slower for SSE.
};
};
template <>
@@ -183,12 +188,12 @@
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=2,
+ size = 2,
- HasCmp = 1,
- HasDiv = 1,
- HasLog = 1,
- HasExp = 1,
+ HasCmp = 1,
+ HasDiv = 1,
+ HasLog = 1,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasATan = 1,
@@ -201,23 +206,23 @@
HasRint = 1
};
};
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
typedef Packet4i type;
typedef Packet4i half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
HasCmp = 1,
- HasDiv=1,
- size=4,
+ HasDiv = 1,
+ size = 4,
HasShift = 1,
HasBlend = 1
};
};
-template<> struct packet_traits<uint32_t> : default_packet_traits
-{
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
typedef Packet4ui type;
typedef Packet4ui half;
enum {
@@ -236,81 +241,167 @@
};
};
#endif
-template<> struct packet_traits<bool> : default_packet_traits
-{
+template <>
+struct packet_traits<bool> : default_packet_traits {
typedef Packet16b type;
typedef Packet16b half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=16,
+ size = 16,
- HasAdd = 1,
- HasSub = 1,
- HasCmp = 1, // note -- only pcmp_eq is defined
- HasShift = 0,
- HasMul = 1,
- HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasConj = 0,
- HasSqrt = 1,
- HasSign = 0 // Don't try to vectorize psign<bool> = identity.
+ HasAdd = 1,
+ HasSub = 1,
+ HasCmp = 1, // note -- only pcmp_eq is defined
+ HasShift = 0,
+ HasMul = 1,
+ HasNegate = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasConj = 0,
+ HasSqrt = 1,
+ HasSign = 0 // Don't try to vectorize psign<bool> = identity.
};
};
-template<> struct unpacket_traits<Packet4f> {
- typedef float type;
- typedef Packet4f half;
- typedef Packet4i integer_packet;
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4f> {
+ typedef float type;
+ typedef Packet4f half;
+ typedef Packet4i integer_packet;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet2d> {
- typedef double type;
- typedef Packet2d half;
- enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet2d> {
+ typedef double type;
+ typedef Packet2d half;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet4i> {
- typedef int type;
- typedef Packet4i half;
- enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet4i> {
+ typedef int type;
+ typedef Packet4i half;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet4ui> {
+template <>
+struct unpacket_traits<Packet4ui> {
typedef uint32_t type;
typedef Packet4ui half;
- enum {size = 4, alignment = Aligned16, vectorizable = true, masked_load_available = false, masked_store_available = false};
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
-template<> struct unpacket_traits<Packet16b> {
- typedef bool type;
- typedef Packet16b half;
- enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet16b> {
+ typedef bool type;
+ typedef Packet16b half;
+ enum {
+ size = 16,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
};
#ifndef EIGEN_VECTORIZE_AVX
-template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
-template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
+template <>
+struct scalar_div_cost<float, true> {
+ enum { value = 7 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+ enum { value = 8 };
+};
#endif
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return _mm_set1_epi32(numext::bit_cast<int32_t>(from)); }
-template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+ return _mm_set_ps1(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+ return _mm_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+ return _mm_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+ return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
+ return _mm_set1_epi8(static_cast<char>(from));
+}
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+ return _mm_castsi128_ps(pset1<Packet4i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+ return _mm_castsi128_pd(_mm_set1_epi64x(from));
+}
-template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
-template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) { return _mm_set_epi32(0, -1, 0, -1); }
-template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
+ return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
+ return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
+ return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
+ return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
+}
-template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
-template<> EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) { return _mm_setzero_si128(); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
+ return _mm_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
+ return _mm_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
+ return _mm_setzero_si128();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
+ return _mm_setzero_si128();
+}
// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203)
@@ -318,242 +409,455 @@
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
-template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
- return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+ return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) { return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+ return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+ return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
+}
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_add_epi32(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_add_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_add_epi32(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_or_si128(a, b);
+}
-template<typename Packet> EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
-template<> EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ss(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_sd(a,b); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_add_ss(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_add_sd(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_sub_epi32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_xor_si128(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
-template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
#ifdef EIGEN_VECTORIZE_SSE3
- return _mm_addsub_ps(a,b);
+ return _mm_addsub_ps(a, b);
#else
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
return padd(a, pxor(mask, b));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
-template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
#ifdef EIGEN_VECTORIZE_SSE3
- return _mm_addsub_pd(a,b);
+ return _mm_addsub_pd(a, b);
#else
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0));
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
return padd(a, pxor(mask, b));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
- return _mm_xor_ps(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+ return _mm_xor_ps(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
- return _mm_xor_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
+ return _mm_xor_pd(a, mask);
}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
- return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+ return psub(Packet4i(_mm_setr_epi32(0, 0, 0, 0)), a);
}
-template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a) {
return psub(pset1<Packet16b>(false), a);
}
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_mullo_epi32(a,b);
+ return _mm_mullo_epi32(a, b);
#else
// this version is slightly faster than 4 scalar products
return vec4i_swizzle1(
- vec4i_swizzle2(
- _mm_mul_epu32(a,b),
- _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
- vec4i_swizzle1(b,1,0,3,2)),
- 0,2,0,2),
- 0,2,1,3);
+ vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
+ 0, 2, 0, 2),
+ 0, 2, 1, 3);
#endif
}
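// How the SSE2 fallback above works (sketch): _mm_mul_epu32 multiplies only the
// even lanes (0 and 2) into 64-bit products; the (1,0,3,2) swizzles feed the odd
// lanes through a second _mm_mul_epu32; vec4i_swizzle2(..., 0, 2, 0, 2) packs the
// low halves as {a0*b0, a2*b2, a1*b1, a3*b3}; and the outer (0, 2, 1, 3) swizzle
// restores lane order. Per lane this yields the signed product mod 2^32:
inline int32_t mullo32_sketch(int32_t a, int32_t b) {
  return (int32_t)((uint64_t)(uint32_t)a * (uint32_t)b);  // keep low 32 bits
}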
-template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_mullo_epi32(a,b);
+ return _mm_mullo_epi32(a, b);
#else
// this version is slightly faster than 4 scalar products
return vec4ui_swizzle1(
- vec4ui_swizzle2(
- _mm_mul_epu32(a,b),
- _mm_mul_epu32(vec4ui_swizzle1(a,1,0,3,2),
- vec4ui_swizzle1(b,1,0,3,2)),
- 0,2,0,2),
- 0,2,1,3);
+ vec4ui_swizzle2(_mm_mul_epu32(a, b),
+ _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
+ 0, 2, 1, 3);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_and_si128(a, b);
+}
template <>
-EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a,
- const Packet4i& b) {
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_div_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_AVX
- return _mm256_cvttpd_epi32(
- _mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
+ return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
#else
__m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
- __m128i q_hi =
- _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)),
- _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
+ __m128i q_hi = _mm_cvttpd_epi32(
+ _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
#endif
}
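// Why the double round-trip above is exact (sketch): every int32 fits exactly in
// a double's 53-bit significand, and _mm_cvttpd_epi32 truncates toward zero, which
// matches C++ integer division. Scalar model, assuming b != 0 and no INT_MIN / -1:
inline int32_t int_div_via_double_sketch(int32_t a, int32_t b) {
  return (int32_t)((double)a / (double)b);
}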
-
// for some weird reasons, it has to be overloaded for packets of integers
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { return padd(pmul(a, b), c); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ return padd(pmul(a, b), c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+ return padd(pmul(a, b), c);
+}
#ifdef EIGEN_VECTORIZE_FMA
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmsub_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmsub_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmadd_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmadd_pd(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fnmsub_ps(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fnmsub_pd(a,b,c); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return _mm_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return _mm_fmadd_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return _mm_fmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return _mm_fmsub_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return _mm_fnmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return _mm_fnmadd_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return _mm_fnmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return _mm_fnmsub_pd(a, b, c);
+}
-template<typename Packet> EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
-template<> EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ss(a,b,c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_sd(a,b,c); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return _mm_fmadd_ss(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return _mm_fmadd_sd(a, b, c);
+}
#endif
#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
- return _mm_blendv_ps(b,a,mask);
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+ return _mm_blendv_ps(b, a, mask);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
- return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
+template <>
+EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+ return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); }
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+ return _mm_blendv_pd(b, a, mask);
+}
-template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
- return _mm_blendv_epi8(b,a,mask);
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+ return _mm_blendv_epi8(b, a, mask);
}
#else
-template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
Packet16b a_part = _mm_and_si128(mask, a);
Packet16b b_part = _mm_andnot_si128(mask, b);
return _mm_or_si128(a_part, b_part);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
-template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
-template<> EIGEN_STRONG_INLINE Packet4f
-ptrue<Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
+ return _mm_cmpeq_epi32(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) {
+ return _mm_cmpeq_epi8(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
Packet4i b = _mm_castps_si128(a);
return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
}
-template<> EIGEN_STRONG_INLINE Packet2d
-ptrue<Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
Packet4i b = _mm_castpd_si128(a);
return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_and_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_and_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_and_si128(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_and_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_or_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_or_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_or_si128(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_or_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_xor_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
+ return _mm_xor_si128(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_xor_si128(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return _mm_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return _mm_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_andnot_si128(b, a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return _mm_andnot_si128(b, a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+ return _mm_cmple_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+ return _mm_cmplt_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+ return _mm_cmpnge_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+ return _mm_cmpeq_ps(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+ return _mm_cmple_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+ return _mm_cmplt_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+ return _mm_cmpnge_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+ return _mm_cmpeq_pd(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+ return _mm_cmplt_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+ return _mm_cmpeq_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
+ return _mm_cmpeq_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
+ return _mm_cmpeq_epi8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+ return por(pcmp_lt(a, b), pcmp_eq(a, b));
+}
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) { return _mm_cmpeq_epi32(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
Packet4f res;
- asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
Packet4f res = b;
- asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
return res;
#else
@@ -561,18 +865,19 @@
return _mm_min_ps(b, a);
#endif
}
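// Why the asm must pin the operand order (explanatory note): minps is not
// commutative in IEEE corner cases -- when either input is NaN, or when comparing
// +0.0 with -0.0, it returns its second operand -- so _mm_min_ps(b, a) and
// _mm_min_ps(a, b) disagree for NaN inputs. If the optimizer commutes the call
// (gcc bug 72867), pmin's NaN behaviour silently changes.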
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_min_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
Packet2d res;
- asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
Packet2d res = b;
- asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
return res;
#else
@@ -580,17 +885,18 @@
return _mm_min_pd(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_min_epi32(a,b);
+ return _mm_min_epi32(a, b);
#else
// after some benchmarking, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmplt_epi32(a,b);
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+ Packet4i mask = _mm_cmplt_epi32(a, b);
+ return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_min_epu32(a, b);
#else
@@ -600,19 +906,19 @@
#endif
}
-
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_ps, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
Packet4f res;
- asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
Packet4f res = b;
- asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
return res;
#else
@@ -620,18 +926,19 @@
return _mm_max_ps(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_GNUC_STRICT_LESS_THAN(6,3,0)
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
// There appears to be a bug in GCC, by which the optimizer may
// flip the argument order in calls to _mm_max_pd, so we have to
// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
#ifdef EIGEN_VECTORIZE_AVX
Packet2d res;
- asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
+ asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
#else
Packet2d res = b;
- asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
+ asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
#endif
return res;
#else
@@ -639,17 +946,18 @@
return _mm_max_pd(b, a);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- return _mm_max_epi32(a,b);
+ return _mm_max_epi32(a, b);
#else
// after some benchmarking, this version *is* faster than a scalar implementation
- Packet4i mask = _mm_cmpgt_epi32(a,b);
- return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
+ Packet4i mask = _mm_cmpgt_epi32(a, b);
+ return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_max_epu32(a, b);
#else
@@ -659,7 +967,8 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
#else
@@ -667,7 +976,8 @@
(Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
#endif
}
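// The SSE2 branch above is the classic bias trick (sketch): subtracting
// 0x80000000 flips only the sign bit (for this constant psub is equivalent to
// pxor), mapping unsigned order onto signed order so _mm_cmplt_epi32 applies.
inline bool unsigned_lt_via_signed_sketch(uint32_t a, uint32_t b) {
  return (int32_t)(a ^ 0x80000000u) < (int32_t)(b ^ 0x80000000u);  // == (a < b)
}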
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
#ifdef EIGEN_VECTORIZE_SSE4_1
return pcmp_eq(a, pmin(a, b));
#else
@@ -695,167 +1005,212 @@
}
// Add specializations for min/max with prescribed NaN propagation.
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
return pminmax_propagate_nan(a, b, pmin<Packet4f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
return pminmax_propagate_nan(a, b, pmin<Packet2d>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
return pminmax_propagate_nan(a, b, pmax<Packet4f>);
}
-template<>
+template <>
EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
return pminmax_propagate_nan(a, b, pmax<Packet2d>);
}
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right (const Packet4ui& a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left (const Packet4ui& a) { return _mm_slli_epi32(a,N); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
- const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
- return _mm_and_ps(a,mask);
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+ return _mm_srai_epi32(a, N);
}
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
- const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
- return _mm_and_pd(a,mask);
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+ return _mm_srli_epi32(a, N);
}
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+ return _mm_slli_epi32(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
+ return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+ return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+ return _mm_slli_epi32(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+ const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+ return _mm_and_ps(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+ const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+ return _mm_and_pd(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
return _mm_abs_epi32(a);
#else
- Packet4i aux = _mm_srai_epi32(a,31);
- return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
+ Packet4i aux = _mm_srai_epi32(a, 31);
+ return _mm_sub_epi32(_mm_xor_si128(a, aux), aux);
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31)); }
-template<> EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+ return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
#ifdef EIGEN_VECTORIZE_AVX
return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
#else
return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
-#endif // EIGEN_VECTORIZE_AVX
+#endif // EIGEN_VECTORIZE_AVX
}
-template<> EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) { return pzero(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
+ return pzero(a);
+}
#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
// Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
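// What the trick above computes (worked example): por(pand(a, mask), prev0dot5)
// is copysign(0x1.fffffep-2f, a), the largest float below 0.5 carrying a's sign;
// adding it and truncating rounds halfway cases away from zero, as numext::round
// requires. E.g. 2.5f + 0.49999997f rounds (as a float sum) to 3.0f, truncating
// to 3. Using exactly 0.5f instead would nudge 0.49999997f up to 1.0f, which
// truncates to 1 rather than the correct 0.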
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
}
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+ return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+ return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ return _mm_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ return _mm_ceil_pd(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ return _mm_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ return _mm_floor_pd(a);
+}
#else
-template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
// Adds and subtracts signum(a) * 2^23 to force rounding.
- const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+ const Packet4f limit = pset1<Packet4f>(static_cast<float>(1 << 23));
const Packet4f abs_a = pabs(a);
Packet4f r = padd(abs_a, limit);
// Don't compile-away addition and subtraction.
EIGEN_OPTIMIZATION_BARRIER(r);
r = psub(r, limit);
// If greater than limit, simply return a. Otherwise, account for sign.
- r = pselect(pcmp_lt(abs_a, limit),
- pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+ r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
return r;
}
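// The 2^23 trick above (worked example): for |a| < 2^23, |a| + 2^23 lies in a
// binade whose float spacing is exactly 1, so the addition itself rounds the
// fractional part to nearest-even; subtracting 2^23 recovers the rounded value
// (2^52 plays the same role for double below). E.g. 1.5f + 8388608.0f rounds to
// 8388610.0f (tie to even), and 8388610.0f - 8388608.0f = 2.0f = rint(1.5f).
// The optimization barrier keeps the compiler from folding (r + limit) - limit.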
-template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
// Adds and subtracts signum(a) * 2^52 to force rounding.
- const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+ const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull << 52));
const Packet2d abs_a = pabs(a);
Packet2d r = padd(abs_a, limit);
// Don't compile-away addition and subtraction.
EIGEN_OPTIMIZATION_BARRIER(r);
r = psub(r, limit);
// If greater than limit, simply return a. Otherwise, account for sign.
- r = pselect(pcmp_lt(abs_a, limit),
- pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+ r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
return r;
}
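A scalar model of the SSE2 fallback above (illustrative; print_model is a hypothetical name). Doubles at or above 2^52 have no fractional bits, so adding and then subtracting 2^52 forces the FPU to round |a| to an integer in the current rounding mode; the volatile qualifier stands in for EIGEN_OPTIMIZATION_BARRIER:

#include <cmath>

double print_model(double a) {
  const double limit = 9007199254740992.0;  // 2^52
  double abs_a = std::fabs(a);
  if (!(abs_a < limit)) return a;           // already integral, or NaN
  volatile double r = abs_a + limit;        // barrier: keep the round trip
  r = r - limit;
  return a < 0.0 ? -r : r;                  // restore the sign
}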
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4f tmp = print<Packet4f>(a);
+ Packet4f tmp = print<Packet4f>(a);
// If greater, subtract one.
Packet4f mask = _mm_cmpgt_ps(tmp, a);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet2d tmp = print<Packet2d>(a);
+ Packet2d tmp = print<Packet2d>(a);
// If greater, subtract one.
Packet2d mask = _mm_cmpgt_pd(tmp, a);
mask = pand(mask, cst_1);
return psub(tmp, mask);
}
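Both fallbacks use the same correction scheme, sketched here in scalar form (pfloor_model is a hypothetical name; std::nearbyint stands in for print, since both honor the current, normally round-to-nearest, mode): round first, then adjust by one whenever rounding moved past the input. pceil below is the mirror image, adding one when rounding fell short.

#include <cmath>

double pfloor_model(double a) {
  double tmp = std::nearbyint(a);  // plays the role of print<Packet2d>
  return tmp > a ? tmp - 1.0 : tmp;
}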
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
- Packet4f tmp = print<Packet4f>(a);
+ Packet4f tmp = print<Packet4f>(a);
// If smaller, add one.
Packet4f mask = _mm_cmplt_ps(tmp, a);
mask = pand(mask, cst_1);
return padd(tmp, mask);
}
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
const Packet2d cst_1 = pset1<Packet2d>(1.0);
- Packet2d tmp = print<Packet2d>(a);
+ Packet2d tmp = print<Packet2d>(a);
// If smaller, add one.
Packet2d mask = _mm_cmplt_pd(tmp, a);
mask = pand(mask, cst_1);
@@ -863,71 +1218,104 @@
}
#endif
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
#if EIGEN_COMP_MSVC
- template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_ps(from);
}
#else
// NOTE: with the code below, MSVC's compiler crashes!
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_ps(from);
}
#endif
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_pd(from);
}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
-template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
EIGEN_DEBUG_UNALIGNED_LOAD
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}
// Load the lower half of the packet, zero-extending the upper half.
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
-template<> EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))); }
-template<> EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
// Load scalar
-template<typename Packet> EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
-template<> EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{ return pset1<Packet2d>(from[0]); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+ return pset1<Packet2d>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
Packet4i tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}
-template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
Packet4ui tmp;
tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
@@ -935,154 +1323,268 @@
// Loads 8 bools from memory and returns the packet
// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
-template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
__m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
- return _mm_unpacklo_epi8(tmp, tmp);
+ return _mm_unpacklo_epi8(tmp, tmp);
}
// Loads 4 bools from memory and returns the packet
// {b0, b0, b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
-template<> EIGEN_STRONG_INLINE Packet16b
-ploadquad<Packet16b>(const bool* from) {
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
__m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
tmp = _mm_unpacklo_epi8(tmp, tmp);
- return _mm_unpacklo_epi16(tmp, tmp);
+ return _mm_unpacklo_epi16(tmp, tmp);
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-
-template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
-template<> EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from); }
-
-template<typename Scalar, typename Packet> EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
-template<> EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from); }
-template<> EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
- return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
- return _mm_set_pd(from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
- return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
+ EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+ return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+ return _mm_set_pd(from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
+ return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
}
-template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
-{
- return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
- from[11*stride], from[10*stride], from[9*stride], from[8*stride],
- from[7*stride], from[6*stride], from[5*stride], from[4*stride],
- from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
+template <>
+EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
+ return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
+ from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
+ from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
+ from[0 * stride]);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
- to[stride*0] = _mm_cvtss_f32(from);
- to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
- to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
- to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+ to[stride * 0] = _mm_cvtss_f32(from);
+ to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
+ to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
+ to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
- to[stride*0] = _mm_cvtsd_f64(from);
- to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+ to[stride * 0] = _mm_cvtsd_f64(from);
+ to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
- to[stride*0] = _mm_cvtsi128_si32(from);
- to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
- to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
- to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
+ to[stride * 0] = _mm_cvtsi128_si32(from);
+ to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+ to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+ to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
-{
- to[4*stride*0] = _mm_cvtsi128_si32(from);
- to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
- to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
- to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
+ to[4 * stride * 0] = _mm_cvtsi128_si32(from);
+ to[4 * stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+ to[4 * stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+ to[4 * stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}
-
// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
Packet4f pa = _mm_set_ss(a);
- pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
+ pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
Packet2d pa = _mm_set_sd(a);
- pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
+ pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
}
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
-typedef const void * SsePrefetchPtrType;
+typedef const void* SsePrefetchPtrType;
#else
-typedef const char * SsePrefetchPtrType;
+typedef const char* SsePrefetchPtrType;
#endif
#ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+ _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
#endif
#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
// Direct access of the struct members fixed bug #62.
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ return a.m128_f32[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ return a.m128d_f64[0];
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+ int x = _mm_cvtsi128_si32(a);
+ return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+ uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+ return x;
+}
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); return x; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ float x = _mm_cvtss_f32(a);
+ return x;
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ double x = _mm_cvtsd_f64(a);
+ return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+ int x = _mm_cvtsi128_si32(a);
+ return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+ uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+ return x;
+}
#else
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a)); }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ return _mm_cvtss_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ return _mm_cvtsd_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+ return _mm_cvtsi128_si32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+ return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+}
#endif
-template<> EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
+template <>
+EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
+ int x = _mm_cvtsi128_si32(a);
+ return static_cast<bool>(x & 1);
+}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { return _mm_shuffle_epi32(a, 0x1B); }
-template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+ return _mm_shuffle_ps(a, a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+ return _mm_shuffle_pd(a, a, 0x1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+ return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+ return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
#ifdef EIGEN_VECTORIZE_SSSE3
__m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
return _mm_shuffle_epi8(a, mask);
@@ -1093,30 +1595,33 @@
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
- return pfrexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+ return pfrexp_generic(a, exponent);
}
// Extract the exponent without relying on the existence of Packet2l.
-template<>
-EIGEN_STRONG_INLINE
-Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
- const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+ const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
__m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
}
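For illustration, the per-lane scalar computation this packet code performs (biased_exponent_model is a hypothetical name): mask the 11 exponent bits of an IEEE-754 double and shift them down by 52. The vec4i_swizzle1(a_expo, 0, 2, 1, 3) step is SSE-specific plumbing: after the 64-bit shift the two results sit in 32-bit lanes 0 and 2, and the swizzle packs them into lanes 0 and 1 so _mm_cvtepi32_pd can convert them.

#include <cstdint>
#include <cstring>

double biased_exponent_model(double a) {
  uint64_t bits;
  std::memcpy(&bits, &a, sizeof(bits));
  return static_cast<double>((bits & 0x7ff0000000000000ull) >> 52);
}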
-template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
return pfrexp_generic(a, exponent);
}
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
- return pldexp_generic(a,exponent);
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+ return pldexp_generic(a, exponent);
}
// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
// supported by SSE, and has more range than is needed for exponents.
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
// Clamp exponent to [-2099, 2099]
const Packet2d max_exponent = pset1<Packet2d>(2099.0);
const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
@@ -1126,226 +1631,223 @@
// Split 2^e into four factors and multiply:
const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
- Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
+ Packet4i b = parithmetic_shift_right<2>(ei); // floor(e/4)
Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^b
- Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
- b = psub(psub(psub(ei, b), b), b); // e - 3b
- c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
- out = pmul(out, c); // a * 2^e
+ Packet2d out = pmul(pmul(pmul(a, c), c), c); // a * 2^(3b)
+ b = psub(psub(psub(ei, b), b), b); // e - 3b
+ c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52)); // 2^(e - 3b)
+ out = pmul(out, c); // a * 2^e
return out;
}
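Why four factors: a single bias-and-shift can only synthesize 2^b for b in roughly [-1022, 1023], yet ldexp must survive exponents up to about +-2099 (e.g. scaling the smallest subnormal up to the largest finite double). Writing e = 3b + (e - 3b) with b = floor(e/4) keeps every factor representable. A scalar sketch under those assumptions (pldexp_model is hypothetical; the integer conversion mirrors the packet code's round-to-nearest):

#include <cmath>

double pldexp_model(double a, double exponent) {
  double e = std::fmin(std::fmax(exponent, -2099.0), 2099.0);
  int ei = static_cast<int>(std::nearbyint(e));
  int b = ei >> 2;                           // arithmetic shift: floor(ei/4)
  double c = std::ldexp(1.0, b);             // 2^b
  double out = ((a * c) * c) * c;            // a * 2^(3b)
  return out * std::ldexp(1.0, ei - 3 * b);  // a * 2^e
}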
// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
a3 = pload<Packet4f>(a);
- a0 = vec4f_swizzle1(a3, 0,0,0,0);
- a1 = vec4f_swizzle1(a3, 1,1,1,1);
- a2 = vec4f_swizzle1(a3, 2,2,2,2);
- a3 = vec4f_swizzle1(a3, 3,3,3,3);
+ a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
+ a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
+ a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
+ a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+ Packet2d& a3) {
#ifdef EIGEN_VECTORIZE_SSE3
- a0 = _mm_loaddup_pd(a+0);
- a1 = _mm_loaddup_pd(a+1);
- a2 = _mm_loaddup_pd(a+2);
- a3 = _mm_loaddup_pd(a+3);
+ a0 = _mm_loaddup_pd(a + 0);
+ a1 = _mm_loaddup_pd(a + 1);
+ a2 = _mm_loaddup_pd(a + 2);
+ a3 = _mm_loaddup_pd(a + 3);
#else
a1 = pload<Packet2d>(a);
- a0 = vec2d_swizzle1(a1, 0,0);
- a1 = vec2d_swizzle1(a1, 1,1);
- a3 = pload<Packet2d>(a+2);
- a2 = vec2d_swizzle1(a3, 0,0);
- a3 = vec2d_swizzle1(a3, 1,1);
+ a0 = vec2d_swizzle1(a1, 0, 0);
+ a1 = vec2d_swizzle1(a1, 1, 1);
+ a3 = pload<Packet2d>(a + 2);
+ a2 = vec2d_swizzle1(a3, 0, 0);
+ a3 = vec2d_swizzle1(a3, 1, 1);
#endif
}
#endif
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
-{
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
  // Disable the SSE3 _mm_hadd_ps path, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
// #else
- Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+ Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a, a));
+ return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
// #endif
}
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
  // Disable the SSE3 _mm_hadd_pd path, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell)
// #ifdef EIGEN_VECTORIZE_SSE3
// return pfirst<Packet2d>(_mm_hadd_pd(a, a));
// #else
- return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+ return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a, a)));
// #endif
}
#ifdef EIGEN_VECTORIZE_SSSE3
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- Packet4i tmp0 = _mm_hadd_epi32(a,a);
- return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+ Packet4i tmp0 = _mm_hadd_epi32(a, a);
+ return pfirst<Packet4i>(_mm_hadd_epi32(tmp0, tmp0));
}
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp0 = _mm_hadd_epi32(a, a);
return pfirst<Packet4ui>(_mm_hadd_epi32(tmp0, tmp0));
}
#else
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
- Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+ Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
Packet4ui tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a, a));
return pfirst(tmp) + pfirst<Packet4ui>(_mm_shuffle_epi32(tmp, 1));
}
#endif
-template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
- Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
+template <>
+EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
+ Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
}
// Other reduction functions:
-
// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+ Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a, a));
+ return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a, a)));
}
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);
+ return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!)
// TODO try to call _mm_mul_epu32 directly
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
- return (aux[0] * aux[1]) * (aux[2] * aux[3]);
+ return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}
-template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
- Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
- return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
- (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+template <>
+EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
+ Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
+ return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
}
// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+ Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a, a));
+ return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a, a)));
}
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst<Packet4i>(_mm_min_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
- int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
- return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+ int aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
+ int aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
+ return aux0 < aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
}
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4ui>(_mm_min_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ Packet4ui tmp = _mm_min_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst<Packet4ui>(_mm_min_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
- uint32_t aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
- uint32_t aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
- return aux0<aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+ uint32_t aux0 = aux[0] < aux[1] ? aux[0] : aux[1];
+ uint32_t aux2 = aux[2] < aux[3] ? aux[2] : aux[3];
+ return aux0 < aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
}
// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
- Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
- return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+ Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a, a));
+ return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)));
}
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a, a)));
}
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst<Packet4i>(_mm_max_epi32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
- int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
- int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
- return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+ int aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
+ int aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
+ return aux0 > aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
}
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
-{
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
#ifdef EIGEN_VECTORIZE_SSE4_1
- Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
- return pfirst<Packet4ui>(_mm_max_epu32(tmp,_mm_shuffle_epi32(tmp, 1)));
+ Packet4ui tmp = _mm_max_epu32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 3, 2)));
+ return pfirst<Packet4ui>(_mm_max_epu32(tmp, _mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!)
EIGEN_ALIGN16 uint32_t aux[4];
pstore(aux, a);
- uint32_t aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
- uint32_t aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
- return aux0>aux2 ? aux0 : aux2;
-#endif // EIGEN_VECTORIZE_SSE4_1
+ uint32_t aux0 = aux[0] > aux[1] ? aux[0] : aux[1];
+ uint32_t aux2 = aux[2] > aux[3] ? aux[2] : aux[3];
+ return aux0 > aux2 ? aux0 : aux2;
+#endif // EIGEN_VECTORIZE_SSE4_1
}
// not needed yet
@@ -1354,34 +1856,31 @@
// return _mm_movemask_ps(x) == 0xF;
// }
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
return _mm_movemask_ps(x) != 0x0;
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4i& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x)
-{
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& x) {
return _mm_movemask_ps(_mm_castsi128_ps(x)) != 0x0;
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
_MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
__m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
kernel.packet[1] = tmp;
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
__m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
__m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
__m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
@@ -1396,20 +1895,18 @@
ptranspose((PacketBlock<Packet4i, 4>&)kernel);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16b,4>& kernel) {
- __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
- __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
- __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
- __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
+ __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+ __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+ __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+ __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16b,16>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
// If we number the elements in the input thus:
// kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
// kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
@@ -1421,67 +1918,72 @@
// kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
// ...
// kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
- __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
- __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
- __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
- __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
- __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
- __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
- __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
- __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
- __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
- __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
- __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
- __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
- __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
- __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
- __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+ __m128i t0 =
+ _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ __m128i t1 =
+ _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+ __m128i t2 =
+ _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); // 20 30 21 31 22 32 ... 27 37
+ __m128i t3 =
+ _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); // 28 38 29 39 2a 3a ... 2f 3f
+ __m128i t4 =
+ _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]); // 40 50 41 51 42 52 47 57
+ __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]); // 48 58 49 59 4a 5a
+ __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+ __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+ __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+ __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+ __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+ __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+ __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+ __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+ __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+ __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
- __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
- __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
- __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
- __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
- __m128i s6 = _mm_unpacklo_epi16(t5, t7);
- __m128i s7 = _mm_unpackhi_epi16(t5, t7);
- __m128i s8 = _mm_unpacklo_epi16(t8, ta);
- __m128i s9 = _mm_unpackhi_epi16(t8, ta);
- __m128i sa = _mm_unpacklo_epi16(t9, tb);
- __m128i sb = _mm_unpackhi_epi16(t9, tb);
- __m128i sc = _mm_unpacklo_epi16(tc, te);
- __m128i sd = _mm_unpackhi_epi16(tc, te);
- __m128i se = _mm_unpacklo_epi16(td, tf);
- __m128i sf = _mm_unpackhi_epi16(td, tf);
+ __m128i s0 = _mm_unpacklo_epi16(t0, t2); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ __m128i s1 = _mm_unpackhi_epi16(t0, t2); // 04 14 24 34
+ __m128i s2 = _mm_unpacklo_epi16(t1, t3); // 08 18 28 38 ...
+ __m128i s3 = _mm_unpackhi_epi16(t1, t3); // 0c 1c 2c 3c ...
+ __m128i s4 = _mm_unpacklo_epi16(t4, t6); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ __m128i s5 = _mm_unpackhi_epi16(t4, t6); // 44 54 64 74 ...
+ __m128i s6 = _mm_unpacklo_epi16(t5, t7);
+ __m128i s7 = _mm_unpackhi_epi16(t5, t7);
+ __m128i s8 = _mm_unpacklo_epi16(t8, ta);
+ __m128i s9 = _mm_unpackhi_epi16(t8, ta);
+ __m128i sa = _mm_unpacklo_epi16(t9, tb);
+ __m128i sb = _mm_unpackhi_epi16(t9, tb);
+ __m128i sc = _mm_unpacklo_epi16(tc, te);
+ __m128i sd = _mm_unpackhi_epi16(tc, te);
+ __m128i se = _mm_unpacklo_epi16(td, tf);
+ __m128i sf = _mm_unpackhi_epi16(td, tf);
- __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- __m128i u2 = _mm_unpacklo_epi32(s1, s5);
- __m128i u3 = _mm_unpackhi_epi32(s1, s5);
- __m128i u4 = _mm_unpacklo_epi32(s2, s6);
- __m128i u5 = _mm_unpackhi_epi32(s2, s6);
- __m128i u6 = _mm_unpacklo_epi32(s3, s7);
- __m128i u7 = _mm_unpackhi_epi32(s3, s7);
- __m128i u8 = _mm_unpacklo_epi32(s8, sc);
- __m128i u9 = _mm_unpackhi_epi32(s8, sc);
- __m128i ua = _mm_unpacklo_epi32(s9, sd);
- __m128i ub = _mm_unpackhi_epi32(s9, sd);
- __m128i uc = _mm_unpacklo_epi32(sa, se);
- __m128i ud = _mm_unpackhi_epi32(sa, se);
- __m128i ue = _mm_unpacklo_epi32(sb, sf);
- __m128i uf = _mm_unpackhi_epi32(sb, sf);
+ __m128i u0 = _mm_unpacklo_epi32(s0, s4); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ __m128i u1 = _mm_unpackhi_epi32(s0, s4); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ __m128i u2 = _mm_unpacklo_epi32(s1, s5);
+ __m128i u3 = _mm_unpackhi_epi32(s1, s5);
+ __m128i u4 = _mm_unpacklo_epi32(s2, s6);
+ __m128i u5 = _mm_unpackhi_epi32(s2, s6);
+ __m128i u6 = _mm_unpacklo_epi32(s3, s7);
+ __m128i u7 = _mm_unpackhi_epi32(s3, s7);
+ __m128i u8 = _mm_unpacklo_epi32(s8, sc);
+ __m128i u9 = _mm_unpackhi_epi32(s8, sc);
+ __m128i ua = _mm_unpacklo_epi32(s9, sd);
+ __m128i ub = _mm_unpackhi_epi32(s9, sd);
+ __m128i uc = _mm_unpacklo_epi32(sa, se);
+ __m128i ud = _mm_unpackhi_epi32(sa, se);
+ __m128i ue = _mm_unpacklo_epi32(sb, sf);
+ __m128i uf = _mm_unpackhi_epi32(sb, sf);
- kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
- kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
- kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
- kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
- kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
- kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
- kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
- kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
- kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
- kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
+ kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
+ kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
+ kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
+ kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
+ kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
+ kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
+ kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
+ kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
+ kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
+ kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
@@ -1490,7 +1992,9 @@
kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
}
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+ const Packet4i& elsePacket) {
const __m128i zero = _mm_setzero_si128();
const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128i false_mask = _mm_cmpeq_epi32(select, zero);
@@ -1500,11 +2004,14 @@
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
- const Packet4ui& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
+ const Packet4ui& elsePacket) {
return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
const __m128 zero = _mm_setzero_ps();
const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128 false_mask = _mm_cmpeq_ps(select, zero);
@@ -1514,7 +2021,9 @@
return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+ const Packet2d& elsePacket) {
const __m128d zero = _mm_setzero_pd();
const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
__m128d false_mask = _mm_cmpeq_pd(select, zero);
@@ -1527,29 +2036,37 @@
// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
#ifdef EIGEN_VECTORIZE_FMA
-template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
- return ::fmaf(a,b,c);
+template <>
+EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+ return ::fmaf(a, b, c);
}
-template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
- return ::fma(a,b,c);
+template <>
+EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+ return ::fma(a, b, c);
}
-template<> EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
- return ::fmaf(a,b,-c);
+template <>
+EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
+ return ::fmaf(a, b, -c);
}
-template<> EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
- return ::fma(a,b,-c);
+template <>
+EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
+ return ::fma(a, b, -c);
}
-template<> EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
- return ::fmaf(-a,b,c);
+template <>
+EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
+ return ::fmaf(-a, b, c);
}
-template<> EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
- return ::fma(-a,b,c);
+template <>
+EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
+ return ::fma(-a, b, c);
}
-template<> EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
- return ::fmaf(-a,b,-c);
+template <>
+EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
+ return ::fmaf(-a, b, -c);
}
-template<> EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
- return ::fma(-a,b,-c);
+template <>
+EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
+ return ::fma(-a, b, -c);
}
#endif
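The consistency point is observable: with FMA the product is not rounded before the addition, so a plain a * b + c and fmaf(a, b, c) can differ in the low bits. A small demonstration (build with contraction disabled, e.g. -ffp-contract=off, or the compiler may fuse the plain expression too):

#include <cmath>
#include <cstdio>

int main() {
  float a = 1.0f + 0x1.0p-23f, b = 1.0f - 0x1.0p-23f, c = -1.0f;
  // a*b rounds to exactly 1.0f, so the unfused result is 0; the fused
  // result keeps the low product bits: -2^-46.
  std::printf("%a vs %a\n", a * b + c, std::fmaf(a, b, c));
  return 0;
}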
@@ -1571,8 +2088,7 @@
// Inf/NaN?
__m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
// Inf/NaN adjust
- __m128i naninf_adj =
- _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
+ __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
// extra exp adjust for Inf/NaN
ou = _mm_add_epi32(ou, naninf_adj);
@@ -1584,11 +2100,9 @@
// magic.u = 113 << 23
__m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
// o.f -= magic.f
- ou = _mm_castps_si128(
- _mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
+ ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
- __m128i sign =
- _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
+ __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
// o.u |= (h.x & 0x8000) << 16; // sign bit
ou = _mm_or_si128(ou, sign);
// return o.f;
@@ -1622,8 +2136,7 @@
__m128i naninf_value = _mm_or_si128(inf_value, nan_value);
__m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
- __m128i subnorm_mask =
- _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
+ __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
// f.f += denorm_magic.f;
f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
// f.u - denorm_magic.u
@@ -1656,7 +2169,7 @@
// Packet math for Eigen::half
// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0
typedef struct {
@@ -1859,19 +2372,18 @@
#endif
+} // end namespace internal
-} // end namespace internal
-
-} // end namespace Eigen
+} // end namespace Eigen
#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
// PGI++ does not define the following intrinsics in C++ mode.
-static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
-static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
-static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
-static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif
-#endif // EIGEN_PACKET_MATH_SSE_H
+#endif // EIGEN_PACKET_MATH_SSE_H
diff --git a/Eigen/src/Core/arch/SSE/TypeCasting.h b/Eigen/src/Core/arch/SSE/TypeCasting.h
index 7e3099b..cbc6d47 100644
--- a/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -18,23 +18,29 @@
namespace internal {
#ifndef EIGEN_VECTORIZE_AVX
-template<> struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
-template<> struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
-template<> struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
-template<> struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
-template<> struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
-template<> struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
-template<> struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
-template<> struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
#endif
template <>
-EIGEN_STRONG_INLINE Packet16b pcast<Packet4f, Packet16b>(const Packet4f& a,
- const Packet4f& b,
- const Packet4f& c,
+EIGEN_STRONG_INLINE Packet16b pcast<Packet4f, Packet16b>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
const Packet4f& d) {
__m128 zero = pzero(a);
__m128 nonzero_a = _mm_cmpneq_ps(a, zero);
@@ -50,79 +56,92 @@
template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet16b, Packet4f>(const Packet16b& a) {
const __m128 cst_one = _mm_set_ps1(1.0f);
- #ifdef EIGEN_VECTORIZE_SSE4_1
+#ifdef EIGEN_VECTORIZE_SSE4_1
__m128i a_extended = _mm_cvtepi8_epi32(a);
__m128i abcd = _mm_cmpeq_epi32(a_extended, _mm_setzero_si128());
- #else
+#else
__m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
__m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
__m128i abcd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
- #endif
+#endif
__m128 result = _mm_andnot_ps(_mm_castsi128_ps(abcd), cst_one);
return result;
}
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
return _mm_cvttps_epi32(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
- return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)),
- _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+ return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)), _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
(1 << 2) | (1 << 6)));
}
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
return _mm_cvtepi32_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
}
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
// Simply discard the second half of the input
return _mm_cvtepi32_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
// Simply discard the second half of the input
return _mm_cvtps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
return _mm_castps_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
return _mm_castpd_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
return _mm_castps_si128(a);
}
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
return _mm_castsi128_ps(a);
}
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
return _mm_castsi128_pd(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet2d>(const Packet2d& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
return _mm_castpd_si128(a);
}
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
return Packet4ui(a);
}
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
return Packet4i(a);
}
// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#if 0
template <>
@@ -171,8 +190,8 @@
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_TYPE_CASTING_SSE_H
+#endif // EIGEN_TYPE_CASTING_SSE_H
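
A quick illustration of the double-to-int cast above (a standalone usage sketch, assuming SSE2): `_mm_cvttpd_epi32` truncates each `Packet2d` into the low two lanes of a 128-bit register, and the `(1 << 2) | (1 << 6)` shuffle immediate selects lanes {a0, a1, b0, b1} to pack both halves into one `Packet4i`:

```cpp
// How pcast<Packet2d, Packet4i> packs two double packets (illustrative).
#include <emmintrin.h>
#include <cstdio>

int main() {
  __m128d a = _mm_set_pd(-2.9, 1.7);  // lanes {1.7, -2.9}
  __m128d b = _mm_set_pd(4.99, 3.1);  // lanes {3.1, 4.99}
  // Truncation toward zero leaves two ints in the low 64 bits of each result.
  __m128 lo = _mm_castsi128_ps(_mm_cvttpd_epi32(a));
  __m128 hi = _mm_castsi128_ps(_mm_cvttpd_epi32(b));
  // imm8 = (1 << 2) | (1 << 6) selects {lo[0], lo[1], hi[0], hi[1]}.
  __m128i packed = _mm_castps_si128(_mm_shuffle_ps(lo, hi, (1 << 2) | (1 << 6)));
  int out[4];
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), packed);
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 -2 3 4
}
```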
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 64b710f..6a03de9 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -13,10 +13,8 @@
// IWYU pragma: private
#include "../../InternalHeaderCheck.h"
-namespace Eigen
-{
-namespace internal
-{
+namespace Eigen {
+namespace internal {
#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif
@@ -75,174 +73,146 @@
};
template <>
-EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
-{
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
-{
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
return svdup_n_s32(from);
}
template <>
-EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
-{
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
numext::int32_t c[packet_traits<numext::int32_t>::size];
for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
}
template <>
-EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svadd_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svsub_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
return svneg_s32_z(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
return a;
}
template <>
-EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svmul_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svdiv_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
-{
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
return svmla_s32_z(svptrue_b32(), c, a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svmin_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svmax_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
}
template <>
-EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
return svdup_n_s32_z(svptrue_b32(), 0);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svand_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svorr_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
return sveor_s32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
-{
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
return svbic_s32_z(svptrue_b32(), a, b);
}
template <int N>
-EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
return svasrd_n_s32_z(svptrue_b32(), a, N);
}
template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
return svreinterpret_s32_u32(svlsr_n_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), N));
}
template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
-{
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
return svlsl_n_s32_z(svptrue_b32(), a, N);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from) {
EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
}
template <>
-EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
}
template <>
-EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}
template <>
-EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
-{
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
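
The `svindex`/`svzip1` comments above describe a vector-length-agnostic way to build duplication indices. A scalar model (not part of the patch, fixing the lane count at 8 purely for illustration) makes the pattern concrete:

```cpp
// Scalar model of the ploaddup/ploadquad index construction (illustrative).
#include <cstdio>

int main() {
  const int VL = 8;  // pretend vector length
  unsigned idx[VL], zipped[VL];
  for (int i = 0; i < VL; ++i) idx[i] = i;  // svindex_u32(0, 1): {0,1,2,...}
  // svzip1 interleaves the low halves of its two inputs; zipping a vector
  // with itself therefore duplicates each element.
  for (int i = 0; i < VL / 2; ++i) {
    zipped[2 * i] = idx[i];
    zipped[2 * i + 1] = idx[i];
  }
  // zipped = {0,0,1,1,2,2,3,3}: gathering with these indices is ploaddup.
  // Zipping once more yields {0,0,0,0,1,1,1,1}, the ploadquad pattern.
  for (int i = 0; i < VL; ++i) std::printf("%u ", zipped[i]);
  std::printf("\n");
}
```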
@@ -250,63 +220,54 @@
}
template <>
-EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
-{
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
}
template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
-{
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
}
template <>
-EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
svint32_t indices = svindex_s32(0, stride);
return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
+ Index stride) {
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
svint32_t indices = svindex_s32(0, stride);
svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
}
template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
// svlasta returns the first element if all predicate bits are 0
return svlasta_s32(svpfalse_b(), a);
}
template <>
-EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
return svrev_s32(a);
}
template <>
-EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
return svabs_s32_z(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a) {
return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
}
template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
-{
- EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
+ EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// Multiply the vector by its reverse
svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
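
The reverse-multiply step that opens predux_mul sets up a log2-depth reduction: after `a * reverse(a)`, the low half of the vector already touches every input lane exactly once, so only the low half needs further halving. A scalar model (8 lanes for illustration; the static assert above restricts the real code to vector lengths that are a multiple of 128 bits):

```cpp
// Scalar model of the reverse-multiply product reduction (illustrative).
#include <cstdio>

int main() {
  const int N = 8;
  long v[N] = {1, 2, 3, 4, 5, 6, 7, 8};  // full product: 8! = 40320
  long buf[N];
  // Lane i of v * reverse(v) holds v[i] * v[N-1-i]; the first N/2 lanes
  // together now cover every input exactly once.
  for (int i = 0; i < N; ++i) buf[i] = v[i] * v[N - 1 - i];
  // Halve the active region until lane 0 carries the full product.
  for (int n = N / 4; n >= 1; n /= 2)
    for (int i = 0; i < n; ++i) buf[i] *= buf[i + n];
  std::printf("%ld\n", buf[0]);  // 40320
}
```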
@@ -338,14 +299,12 @@
}
template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
return svminv_s32(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
-{
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
return svmaxv_s32(svptrue_b32(), a);
}
@@ -422,120 +381,101 @@
};
template <>
-EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
-{
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
return svdup_n_f32(from);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
-{
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
}
template <>
-EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
-{
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
float c[packet_traits<float>::size];
for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
}
template <>
-EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svadd_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svsub_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
return svneg_f32_z(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
return a;
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svmul_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svdiv_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
-{
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
return svmla_f32_z(svptrue_b32(), c, a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svmin_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
return pmin<PacketXf>(a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
return svminnm_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svmax_f32_z(svptrue_b32(), a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
return pmax<PacketXf>(a, b);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
return svmaxnm_f32_z(svptrue_b32(), a, b);
}
// Float comparisons in SVE return svbool (predicate). Use svdup to set active
// lanes to 1 (0xffffffffu) and inactive lanes to 0.
template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
}
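
The comment above the comparison block is worth unpacking: SVE comparisons produce a predicate, not a data vector, so `svdup_n_u32_z` is used to materialize all-ones/all-zero lanes that the rest of Eigen's packet math can treat as a bit mask. A scalar model of the convention:

```cpp
// Scalar model of predicate -> mask materialization (illustrative).
#include <cstdint>
#include <cstdio>

int main() {
  float a[4] = {1.f, 2.f, 3.f, 4.f};
  float b[4] = {2.f, 2.f, 2.f, 2.f};
  uint32_t mask[4];
  // svcmple_f32 yields a predicate; svdup_n_u32_z writes 0xffffffff where
  // the predicate is active and 0 elsewhere. Element-wise, that is:
  for (int i = 0; i < 4; ++i) mask[i] = (a[i] <= b[i]) ? 0xffffffffu : 0u;
  for (int i = 0; i < 4; ++i) std::printf("0x%08x ", mask[i]);
  std::printf("\n");  // 0xffffffff 0xffffffff 0x00000000 0x00000000
}
```

The reinterpret back to `PacketXf` is what lets pand/por/pxor below operate on these masks with plain integer bit operations.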
@@ -543,71 +483,60 @@
// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
// active elements.
template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
return svrintm_f32_z(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
-{
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
}
// Logical Operations are not supported for float, so reinterpret casts
template <>
-EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
template <>
-EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
-{
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
template <>
-EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
}
template <>
-EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
}
template <>
-EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}
template <>
-EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
-{
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
svuint32_t indices = svindex_u32(0, 1); // index {base=0, base+step=1, base+step*2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a1, a1, a2, a2, ...}
indices = svzip1_u32(indices, indices); // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
@@ -615,63 +544,54 @@
}
template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
-{
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
}
template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
-{
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
}
template <>
-EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
svint32_t indices = svindex_s32(0, stride);
return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
}
template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
-{
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
svint32_t indices = svindex_s32(0, stride);
svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
}
template <>
-EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
// svlasta returns the first element if all predicate bits are 0
return svlasta_f32(svpfalse_b(), a);
}
template <>
-EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
return svrev_f32(a);
}
template <>
-EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
return svabs_f32_z(svptrue_b32(), a);
}
-// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
// all vector extensions and the generic version.
template <>
-EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
-{
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
return pfrexp_generic(a, exponent);
}
template <>
-EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
return svaddv_f32(svptrue_b32(), a);
}
@@ -679,10 +599,8 @@
// mul
// Only works for SVE Vls multiple of 128
template <>
-EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
-{
- EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
- EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+ EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
// Multiply the vector by its reverse
svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
svfloat32_t half_prod;
@@ -713,20 +631,17 @@
}
template <>
-EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
return svminv_f32(svptrue_b32(), a);
}
template <>
-EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
-{
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
return svmaxv_f32(svptrue_b32(), a);
}
-template<int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
-{
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
float buffer[packet_traits<float>::size * N] = {0};
int i = 0;
@@ -741,9 +656,8 @@
}
}
-template<>
-EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
-{
+template <>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
return pldexp_generic(a, exponent);
}
diff --git a/Eigen/src/Core/arch/SVE/TypeCasting.h b/Eigen/src/Core/arch/SVE/TypeCasting.h
index 068ff48..b451676 100644
--- a/Eigen/src/Core/arch/SVE/TypeCasting.h
+++ b/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -49,4 +49,4 @@
} // namespace internal
} // namespace Eigen
-#endif // EIGEN_TYPE_CASTING_SVE_H
+#endif // EIGEN_TYPE_CASTING_SVE_H
diff --git a/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
index 27d9a82..578e0f3 100644
--- a/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ b/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -78,12 +78,11 @@
};
#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
- template <> \
- struct packet_traits<unpacket_type> \
- : sycl_packet_traits<has_blend, lengths> { \
- typedef packet_type type; \
- typedef packet_type half; \
+#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
+ template <> \
+ struct packet_traits<unpacket_type> : sycl_packet_traits<has_blend, lengths> { \
+ typedef packet_type type; \
+ typedef packet_type half; \
};
SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
@@ -134,15 +133,13 @@
#ifndef SYCL_DEVICE_ONLY
template <typename PacketReturnType, int PacketSize>
struct PacketWrapper {
- typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
- Scalar;
+ typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE");
abort();
}
- EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in,
- Scalar) {
+ EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
return ::Eigen::internal::template plset<PacketReturnType>(in);
}
EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
@@ -154,8 +151,7 @@
#elif defined(SYCL_DEVICE_ONLY)
template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 4> {
- typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
- Scalar;
+ typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
switch (index) {
@@ -168,15 +164,14 @@
case 3:
return in.w();
default:
- //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here.
- // The code will never reach here
- __builtin_unreachable();
+          // INDEX MUST BE BETWEEN 0 and 3. There is no abort function in a SYCL kernel, so we cannot use abort here.
+          // The code will never reach here.
+ __builtin_unreachable();
}
__builtin_unreachable();
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
- Scalar in, Scalar other) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
return PacketReturnType(in, other, other, other);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
@@ -186,25 +181,20 @@
template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 1> {
- typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
- Scalar;
+ typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
return in;
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in,
- Scalar) {
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
return PacketReturnType(in);
}
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
- lhs = rhs[0];
- }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { lhs = rhs[0]; }
};
template <typename PacketReturnType>
struct PacketWrapper<PacketReturnType, 2> {
- typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
- Scalar;
+ typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
switch (index) {
@@ -213,15 +203,14 @@
case 1:
return in.y();
default:
- //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here.
- // The code will never reach here
+        // INDEX MUST BE BETWEEN 0 and 1. There is no abort function in a SYCL kernel, so we cannot use abort here.
+        // The code will never reach here.
__builtin_unreachable();
}
__builtin_unreachable();
}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
- Scalar in, Scalar other) {
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
return PacketReturnType(in, other);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
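
To summarize the PacketWrapper contract reformatted in this file: `scalarize` extracts one lane by index via a switch, `convert_to_packet_type(in, other)` puts `in` in lane 0 and fills the remaining lanes with `other`, and `set_packet` loads lanes from a scalar buffer. A host-side model of the 4-lane case (hypothetical `Vec4` standing in for `cl::sycl::cl_float4`, purely illustrative):

```cpp
// Minimal host-side model of the PacketWrapper<_, 4> contract (illustrative).
#include <cassert>

struct Vec4 { float x, y, z, w; };  // stand-in for cl::sycl::cl_float4

struct Wrapper4 {
  static float scalarize(int index, const Vec4& in) {
    switch (index) {
      case 0: return in.x;
      case 1: return in.y;
      case 2: return in.z;
      case 3: return in.w;
      default: assert(false && "index must be between 0 and 3"); return 0.f;
    }
  }
  static Vec4 convert_to_packet_type(float in, float other) {
    return Vec4{in, other, other, other};  // lane 0 differs, rest are filler
  }
  static void set_packet(Vec4& lhs, const float* rhs) {
    lhs = Vec4{rhs[0], rhs[1], rhs[2], rhs[3]};
  }
};

int main() {
  Vec4 p = Wrapper4::convert_to_packet_type(1.f, 0.f);
  assert(Wrapper4::scalarize(0, p) == 1.f);
  assert(Wrapper4::scalarize(3, p) == 0.f);
}
```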
diff --git a/Eigen/src/Core/arch/SYCL/MathFunctions.h b/Eigen/src/Core/arch/SYCL/MathFunctions.h
index a8adc46..b20c32b 100644
--- a/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ b/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -31,11 +31,10 @@
// introduce conflicts between these packet_traits definitions and the ones
// we'll use on the host side (SSE, AVX, ...)
#if defined(SYCL_DEVICE_ONLY)
-#define SYCL_PLOG(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::log(a); \
+#define SYCL_PLOG(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>(const packet_type& a) { \
+ return cl::sycl::log(a); \
}
SYCL_PLOG(cl::sycl::cl_half8)
@@ -43,11 +42,10 @@
SYCL_PLOG(cl::sycl::cl_double2)
#undef SYCL_PLOG
-#define SYCL_PLOG1P(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::log1p(a); \
+#define SYCL_PLOG1P(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>(const packet_type& a) { \
+ return cl::sycl::log1p(a); \
}
SYCL_PLOG1P(cl::sycl::cl_half8)
@@ -55,11 +53,10 @@
SYCL_PLOG1P(cl::sycl::cl_double2)
#undef SYCL_PLOG1P
-#define SYCL_PLOG10(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::log10(a); \
+#define SYCL_PLOG10(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>(const packet_type& a) { \
+ return cl::sycl::log10(a); \
}
SYCL_PLOG10(cl::sycl::cl_half8)
@@ -67,11 +64,10 @@
SYCL_PLOG10(cl::sycl::cl_double2)
#undef SYCL_PLOG10
-#define SYCL_PEXP(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::exp(a); \
+#define SYCL_PEXP(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>(const packet_type& a) { \
+ return cl::sycl::exp(a); \
}
SYCL_PEXP(cl::sycl::cl_half8)
@@ -81,11 +77,10 @@
SYCL_PEXP(cl::sycl::cl_double2)
#undef SYCL_PEXP
-#define SYCL_PEXPM1(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::expm1(a); \
+#define SYCL_PEXPM1(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>(const packet_type& a) { \
+ return cl::sycl::expm1(a); \
}
SYCL_PEXPM1(cl::sycl::cl_half8)
@@ -93,11 +88,10 @@
SYCL_PEXPM1(cl::sycl::cl_double2)
#undef SYCL_PEXPM1
-#define SYCL_PSQRT(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::sqrt(a); \
+#define SYCL_PSQRT(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>(const packet_type& a) { \
+ return cl::sycl::sqrt(a); \
}
SYCL_PSQRT(cl::sycl::cl_half8)
@@ -105,11 +99,10 @@
SYCL_PSQRT(cl::sycl::cl_double2)
#undef SYCL_PSQRT
-#define SYCL_PRSQRT(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::rsqrt(a); \
+#define SYCL_PRSQRT(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>(const packet_type& a) { \
+ return cl::sycl::rsqrt(a); \
}
SYCL_PRSQRT(cl::sycl::cl_half8)
@@ -118,11 +111,10 @@
#undef SYCL_PRSQRT
/** \internal \returns the sine of \a a (coeff-wise) */
-#define SYCL_PSIN(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::sin(a); \
+#define SYCL_PSIN(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>(const packet_type& a) { \
+ return cl::sycl::sin(a); \
}
SYCL_PSIN(cl::sycl::cl_half8)
@@ -131,11 +123,10 @@
#undef SYCL_PSIN
/** \internal \returns the cosine of \a a (coeff-wise) */
-#define SYCL_PCOS(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::cos(a); \
+#define SYCL_PCOS(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>(const packet_type& a) { \
+ return cl::sycl::cos(a); \
}
SYCL_PCOS(cl::sycl::cl_half8)
@@ -144,11 +135,10 @@
#undef SYCL_PCOS
/** \internal \returns the tangent of \a a (coeff-wise) */
-#define SYCL_PTAN(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::tan(a); \
+#define SYCL_PTAN(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>(const packet_type& a) { \
+ return cl::sycl::tan(a); \
}
SYCL_PTAN(cl::sycl::cl_half8)
@@ -157,11 +147,10 @@
#undef SYCL_PTAN
/** \internal \returns the arc sine of \a a (coeff-wise) */
-#define SYCL_PASIN(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::asin(a); \
+#define SYCL_PASIN(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>(const packet_type& a) { \
+ return cl::sycl::asin(a); \
}
SYCL_PASIN(cl::sycl::cl_half8)
@@ -170,11 +159,10 @@
#undef SYCL_PASIN
/** \internal \returns the arc cosine of \a a (coeff-wise) */
-#define SYCL_PACOS(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::acos(a); \
+#define SYCL_PACOS(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>(const packet_type& a) { \
+ return cl::sycl::acos(a); \
}
SYCL_PACOS(cl::sycl::cl_half8)
@@ -183,11 +171,10 @@
#undef SYCL_PACOS
/** \internal \returns the arc tangent of \a a (coeff-wise) */
-#define SYCL_PATAN(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::atan(a); \
+#define SYCL_PATAN(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>(const packet_type& a) { \
+ return cl::sycl::atan(a); \
}
SYCL_PATAN(cl::sycl::cl_half8)
@@ -196,11 +183,10 @@
#undef SYCL_PATAN
/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSINH(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::sinh(a); \
+#define SYCL_PSINH(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>(const packet_type& a) { \
+ return cl::sycl::sinh(a); \
}
SYCL_PSINH(cl::sycl::cl_half8)
@@ -209,11 +195,10 @@
#undef SYCL_PSINH
/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOSH(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::cosh(a); \
+#define SYCL_PCOSH(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>(const packet_type& a) { \
+ return cl::sycl::cosh(a); \
}
SYCL_PCOSH(cl::sycl::cl_half8)
@@ -222,11 +207,10 @@
#undef SYCL_PCOSH
/** \internal \returns the hyperbolic tangent of \a a (coeff-wise) */
-#define SYCL_PTANH(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::tanh(a); \
+#define SYCL_PTANH(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>(const packet_type& a) { \
+ return cl::sycl::tanh(a); \
}
SYCL_PTANH(cl::sycl::cl_half8)
@@ -234,11 +218,10 @@
SYCL_PTANH(cl::sycl::cl_double2)
#undef SYCL_PTANH
-#define SYCL_PCEIL(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::ceil(a); \
+#define SYCL_PCEIL(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>(const packet_type& a) { \
+ return cl::sycl::ceil(a); \
}
SYCL_PCEIL(cl::sycl::cl_half)
@@ -246,11 +229,10 @@
SYCL_PCEIL(cl::sycl::cl_double2)
#undef SYCL_PCEIL
-#define SYCL_PROUND(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::round(a); \
+#define SYCL_PROUND(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>(const packet_type& a) { \
+ return cl::sycl::round(a); \
}
SYCL_PROUND(cl::sycl::cl_half8)
@@ -258,11 +240,10 @@
SYCL_PROUND(cl::sycl::cl_double2)
#undef SYCL_PROUND
-#define SYCL_PRINT(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::rint(a); \
+#define SYCL_PRINT(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>(const packet_type& a) { \
+ return cl::sycl::rint(a); \
}
SYCL_PRINT(cl::sycl::cl_half8)
@@ -270,11 +251,10 @@
SYCL_PRINT(cl::sycl::cl_double2)
#undef SYCL_PRINT
-#define SYCL_FLOOR(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>( \
- const packet_type& a) { \
- return cl::sycl::floor(a); \
+#define SYCL_FLOOR(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>(const packet_type& a) { \
+ return cl::sycl::floor(a); \
}
SYCL_FLOOR(cl::sycl::cl_half8)
@@ -282,11 +262,10 @@
SYCL_FLOOR(cl::sycl::cl_double2)
#undef SYCL_FLOOR
-#define SYCL_PMIN(packet_type, expr) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>( \
- const packet_type& a, const packet_type& b) { \
- return expr; \
+#define SYCL_PMIN(packet_type, expr) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { \
+ return expr; \
}
SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
@@ -294,11 +273,10 @@
SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
#undef SYCL_PMIN
-#define SYCL_PMAX(packet_type, expr) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>( \
- const packet_type& a, const packet_type& b) { \
- return expr; \
+#define SYCL_PMAX(packet_type, expr) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { \
+ return expr; \
}
SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
@@ -306,13 +284,10 @@
SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
#undef SYCL_PMAX
-#define SYCL_PLDEXP(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp( \
- const packet_type& a, const packet_type& exponent) { \
- return cl::sycl::ldexp( \
- a, exponent.template convert<cl::sycl::cl_int, \
- cl::sycl::rounding_mode::automatic>()); \
+#define SYCL_PLDEXP(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(const packet_type& a, const packet_type& exponent) { \
+ return cl::sycl::ldexp(a, exponent.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>()); \
}
SYCL_PLDEXP(cl::sycl::cl_half8)
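
All of these blocks follow the same stamp-a-specialization pattern: one macro instantiates a `p<func>` template specialization per SYCL vector type, forwarding to the matching `cl::sycl` builtin, then `#undef`s itself. A self-contained model of the pattern with dummy types (nothing SYCL-specific, purely illustrative):

```cpp
// Standalone model of the SYCL_P* macro pattern used above (illustrative).
#include <cmath>
#include <cstdio>

template <typename Packet>
Packet plog(const Packet& a);  // generic declaration, specialized per type

struct Float2 { float v[2]; };  // dummy 2-lane packet

#define MY_PLOG(packet_type, lanes)                            \
  template <>                                                  \
  packet_type plog<packet_type>(const packet_type& a) {        \
    packet_type r;                                             \
    for (int i = 0; i < lanes; ++i) r.v[i] = std::log(a.v[i]); \
    return r;                                                  \
  }

MY_PLOG(Float2, 2)
#undef MY_PLOG  // same discipline as the #undef lines above

int main() {
  Float2 x{{1.0f, 2.718281828f}};
  Float2 y = plog(x);
  std::printf("%f %f\n", y.v[0], y.v[1]);  // ~0 and ~1
}
```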
diff --git a/Eigen/src/Core/arch/SYCL/PacketMath.h b/Eigen/src/Core/arch/SYCL/PacketMath.h
index 4b0b1c6..6b6bfe4 100644
--- a/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ b/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -29,15 +29,16 @@
namespace internal {
#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PLOAD(packet_type, AlignedType) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \
- pload##AlignedType<packet_type>( \
- const typename unpacket_traits<packet_type>::type* from) { \
- auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(from);\
- packet_type res{}; \
- res.load(0, ptr); \
- return res; \
+#define SYCL_PLOAD(packet_type, AlignedType) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType<packet_type>( \
+ const typename unpacket_traits<packet_type>::type* from) { \
+ auto ptr = \
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+ from); \
+ packet_type res{}; \
+ res.load(0, ptr); \
+ return res; \
}
SYCL_PLOAD(cl::sycl::cl_float4, u)
@@ -47,37 +48,34 @@
#undef SYCL_PLOAD
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
- pload<cl::sycl::cl_half8>(
- const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
- auto ptr = cl::sycl::address_space_cast<
- cl::sycl::access::address_space::generic_space,
- cl::sycl::access::decorated::no>(
- reinterpret_cast<const cl::sycl::cl_half*>(from));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pload<cl::sycl::cl_half8>(
+ const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+ auto ptr =
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+ reinterpret_cast<const cl::sycl::cl_half*>(from));
cl::sycl::cl_half8 res{};
res.load(0, ptr);
return res;
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
-ploadu<cl::sycl::cl_half8>(
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 ploadu<cl::sycl::cl_half8>(
const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
- auto ptr = cl::sycl::address_space_cast<
- cl::sycl::access::address_space::generic_space,
- cl::sycl::access::decorated::no>(
- reinterpret_cast<const cl::sycl::cl_half*>(from));
+ auto ptr =
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+ reinterpret_cast<const cl::sycl::cl_half*>(from));
cl::sycl::cl_half8 res{};
res.load(0, ptr);
return res;
}
-#define SYCL_PSTORE(scalar, packet_type, alignment) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
- scalar* to, const packet_type& from) { \
- auto ptr = cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(to);\
- from.store(0, ptr); \
+#define SYCL_PSTORE(scalar, packet_type, alignment) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(scalar* to, const packet_type& from) { \
+ auto ptr = \
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+ to); \
+ from.store(0, ptr); \
}
SYCL_PSTORE(float, cl::sycl::cl_float4, )
@@ -87,22 +85,18 @@
#undef SYCL_PSTORE
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(
- Eigen::half* to, const cl::sycl::cl_half8& from) {
- auto ptr = cl::sycl::address_space_cast<
- cl::sycl::access::address_space::generic_space,
- cl::sycl::access::decorated::no>(
- reinterpret_cast<cl::sycl::cl_half*>(to));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(Eigen::half* to, const cl::sycl::cl_half8& from) {
+ auto ptr =
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+ reinterpret_cast<cl::sycl::cl_half*>(to));
from.store(0, ptr);
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(
- Eigen::half* to, const cl::sycl::cl_half8& from) {
- auto ptr = cl::sycl::address_space_cast<
- cl::sycl::access::address_space::generic_space,
- cl::sycl::access::decorated::no>(
- reinterpret_cast<cl::sycl::cl_half*>(to));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(Eigen::half* to, const cl::sycl::cl_half8& from) {
+ auto ptr =
+ cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+ reinterpret_cast<cl::sycl::cl_half*>(to));
from.store(0, ptr);
}
@@ -123,44 +117,33 @@
template <typename packet_type>
struct get_base_packet {
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
- get_ploaddup(sycl_multi_pointer) {}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer) {}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
- get_pgather(sycl_multi_pointer, Index) {}
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer, Index) {}
};
template <>
struct get_base_packet<cl::sycl::cl_half8> {
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(
- sycl_multi_pointer from) {
- return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]),
- static_cast<cl::sycl::half>(from[0]),
- static_cast<cl::sycl::half>(from[1]),
- static_cast<cl::sycl::half>(from[1]),
- static_cast<cl::sycl::half>(from[2]),
- static_cast<cl::sycl::half>(from[2]),
- static_cast<cl::sycl::half>(from[3]),
- static_cast<cl::sycl::half>(from[3]));
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(sycl_multi_pointer from) {
+ return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]), static_cast<cl::sycl::half>(from[0]),
+ static_cast<cl::sycl::half>(from[1]), static_cast<cl::sycl::half>(from[1]),
+ static_cast<cl::sycl::half>(from[2]), static_cast<cl::sycl::half>(from[2]),
+ static_cast<cl::sycl::half>(from[3]), static_cast<cl::sycl::half>(from[3]));
}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(
- sycl_multi_pointer from, Index stride) {
- return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0 * stride]),
- static_cast<cl::sycl::half>(from[1 * stride]),
- static_cast<cl::sycl::half>(from[2 * stride]),
- static_cast<cl::sycl::half>(from[3 * stride]),
- static_cast<cl::sycl::half>(from[4 * stride]),
- static_cast<cl::sycl::half>(from[5 * stride]),
- static_cast<cl::sycl::half>(from[6 * stride]),
- static_cast<cl::sycl::half>(from[7 * stride]));
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(sycl_multi_pointer from, Index stride) {
+ return cl::sycl::cl_half8(
+ static_cast<cl::sycl::half>(from[0 * stride]), static_cast<cl::sycl::half>(from[1 * stride]),
+ static_cast<cl::sycl::half>(from[2 * stride]), static_cast<cl::sycl::half>(from[3 * stride]),
+ static_cast<cl::sycl::half>(from[4 * stride]), static_cast<cl::sycl::half>(from[5 * stride]),
+ static_cast<cl::sycl::half>(from[6 * stride]), static_cast<cl::sycl::half>(from[7 * stride]));
}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
- sycl_multi_pointer to, const cl::sycl::cl_half8& from, Index stride) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_half8& from,
+ Index stride) {
auto tmp = stride;
to[0] = Eigen::half(from.s0());
to[tmp] = Eigen::half(from.s1());
@@ -171,45 +154,36 @@
to[tmp += stride] = Eigen::half(from.s6());
to[tmp += stride] = Eigen::half(from.s7());
}
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(
- const cl::sycl::half& a) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(const cl::sycl::half& a) {
return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
- static_cast<cl::sycl::half>(a + 2),
- static_cast<cl::sycl::half>(a + 3),
- static_cast<cl::sycl::half>(a + 4),
- static_cast<cl::sycl::half>(a + 5),
- static_cast<cl::sycl::half>(a + 6),
- static_cast<cl::sycl::half>(a + 7));
+ static_cast<cl::sycl::half>(a + 2), static_cast<cl::sycl::half>(a + 3),
+ static_cast<cl::sycl::half>(a + 4), static_cast<cl::sycl::half>(a + 5),
+ static_cast<cl::sycl::half>(a + 6), static_cast<cl::sycl::half>(a + 7));
}
};
template <>
struct get_base_packet<cl::sycl::cl_float4> {
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
- sycl_multi_pointer from) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(
- sycl_multi_pointer from, Index stride) {
- return cl::sycl::cl_float4(from[0 * stride], from[1 * stride],
- from[2 * stride], from[3 * stride]);
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
+ return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
- sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_float4& from,
+ Index stride) {
auto tmp = stride;
to[0] = from.x();
to[tmp] = from.y();
to[tmp += stride] = from.z();
to[tmp += stride] = from.w();
}
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(
- const float& a) {
- return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1),
- static_cast<float>(a + 2),
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
+ return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1), static_cast<float>(a + 2),
static_cast<float>(a + 3));
}
};
@@ -217,28 +191,25 @@
template <>
struct get_base_packet<cl::sycl::cl_double2> {
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2
- get_ploaddup(const sycl_multi_pointer from) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
return cl::sycl::cl_double2(from[0], from[0]);
}
template <typename sycl_multi_pointer, typename Index>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(
- const sycl_multi_pointer from, Index stride) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from,
+ Index stride) {
return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
}
template <typename sycl_multi_pointer>
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
- sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) {
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to,
+ const cl::sycl::cl_double2& from, Index stride) {
to[0] = from.x();
to[stride] = from.y();
}
- static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(
- const double& a) {
- return cl::sycl::cl_double2(static_cast<double>(a),
- static_cast<double>(a + 1));
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
+ return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
}
};
@@ -268,15 +239,14 @@
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
- return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half &) a);
+ return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half&)a);
}
-#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \
- pgather<scalar, packet_type>( \
- const typename unpacket_traits<packet_type>::type* from, Index stride) { \
- return get_base_packet<packet_type>::get_pgather(from, stride); \
+#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pgather<scalar, packet_type>( \
+ const typename unpacket_traits<packet_type>::type* from, Index stride) { \
+ return get_base_packet<packet_type>::get_pgather(from, stride); \
}
SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
@@ -284,12 +254,11 @@
SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
#undef SYCL_PGATHER_SPECILIZE
-#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
- typename unpacket_traits<packet_type>::type * to, \
- const packet_type& from, Index stride) { \
- get_base_packet<packet_type>::set_pscatter(to, from, stride); \
+#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
+ typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride) { \
+ get_base_packet<packet_type>::set_pscatter(to, from, stride); \
}
SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
@@ -298,11 +267,11 @@
#undef SYCL_PSCATTER_SPECILIZE
-#define SYCL_PMAD(packet_type) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( \
- const packet_type& a, const packet_type& b, const packet_type& c) { \
- return cl::sycl::mad(a, b, c); \
+#define SYCL_PMAD(packet_type) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(const packet_type& a, const packet_type& b, \
+ const packet_type& c) { \
+ return cl::sycl::mad(a, b, c); \
}
SYCL_PMAD(cl::sycl::cl_half8)
@@ -311,146 +280,109 @@
#undef SYCL_PMAD
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(
- const cl::sycl::cl_half8& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
return Eigen::half(a.s0());
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
- const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
- const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(
- const cl::sycl::cl_half8& a) {
- return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5()
- + a.s6() + a.s7());
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5() + a.s6() + a.s7());
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
- const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() + a.y() + a.z() + a.w();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
- const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() + a.y();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(
- const cl::sycl::cl_half8& a) {
- return Eigen::half(cl::sycl::fmax(
- cl::sycl::fmax(
- cl::sycl::fmax(a.s0(), a.s1()),
- cl::sycl::fmax(a.s2(), a.s3())),
- cl::sycl::fmax(
- cl::sycl::fmax(a.s4(), a.s5()),
- cl::sycl::fmax(a.s6(), a.s7()))));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return Eigen::half(cl::sycl::fmax(cl::sycl::fmax(cl::sycl::fmax(a.s0(), a.s1()), cl::sycl::fmax(a.s2(), a.s3())),
+ cl::sycl::fmax(cl::sycl::fmax(a.s4(), a.s5()), cl::sycl::fmax(a.s6(), a.s7()))));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
- const cl::sycl::cl_float4& a) {
- return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
- cl::sycl::fmax(a.z(), a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
- const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmax(a.x(), a.y());
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(
- const cl::sycl::cl_half8& a) {
- return Eigen::half(cl::sycl::fmin(
- cl::sycl::fmin(
- cl::sycl::fmin(a.s0(), a.s1()),
- cl::sycl::fmin(a.s2(), a.s3())),
- cl::sycl::fmin(
- cl::sycl::fmin(a.s4(), a.s5()),
- cl::sycl::fmin(a.s6(), a.s7()))));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return Eigen::half(cl::sycl::fmin(cl::sycl::fmin(cl::sycl::fmin(a.s0(), a.s1()), cl::sycl::fmin(a.s2(), a.s3())),
+ cl::sycl::fmin(cl::sycl::fmin(a.s4(), a.s5()), cl::sycl::fmin(a.s6(), a.s7()))));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
- const cl::sycl::cl_float4& a) {
- return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
- cl::sycl::fmin(a.z(), a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
- const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::fmin(a.x(), a.y());
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(
- const cl::sycl::cl_half8& a) {
- return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() *
- a.s6() * a.s7());
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() * a.s6() * a.s7());
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
- const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
return a.x() * a.y() * a.z() * a.w();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
- const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return a.x() * a.y();
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8
-pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
- return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()),
- cl::sycl::fabs(a.s2()), cl::sycl::fabs(a.s3()),
- cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+ return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()), cl::sycl::fabs(a.s2()),
+ cl::sycl::fabs(a.s3()), cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
- return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
- cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+ return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()),
+ cl::sycl::fabs(a.w()));
}
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
}
template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a,
- const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet& a, const Packet& b) {
return (a <= b).template as<Packet>();
}
template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a,
- const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet& a, const Packet& b) {
return (a < b).template as<Packet>();
}
template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
- const Packet &b) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet& a, const Packet& b) {
return (a == b).template as<Packet>();
}
-#define SYCL_PCMP(OP, TYPE) \
- template <> \
- EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE &a, \
- const TYPE &b) { \
- return sycl_pcmp_##OP<TYPE>(a, b); \
+#define SYCL_PCMP(OP, TYPE) \
+ template <> \
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE& a, const TYPE& b) { \
+ return sycl_pcmp_##OP<TYPE>(a, b); \
}
SYCL_PCMP(le, cl::sycl::cl_half8)
@@ -464,8 +396,7 @@
SYCL_PCMP(eq, cl::sycl::cl_double2)
#undef SYCL_PCMP
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
- PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
cl::sycl::cl_half tmp = kernel.packet[0].s1();
kernel.packet[0].s1() = kernel.packet[1].s0();
kernel.packet[1].s0() = tmp;
@@ -579,8 +510,7 @@
kernel.packet[7].s6() = tmp;
}
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
- PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
float tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
@@ -606,8 +536,7 @@
kernel.packet[3].z() = tmp;
}
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
- PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
double tmp = kernel.packet[0].y();
kernel.packet[0].y() = kernel.packet[1].x();
kernel.packet[1].x() = tmp;
@@ -615,35 +544,27 @@
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
- const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket,
- const cl::sycl::cl_half8& thenPacket,
+ const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket, const cl::sycl::cl_half8& thenPacket,
const cl::sycl::cl_half8& elsePacket) {
- cl::sycl::cl_short8 condition(
- ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
- ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1,
- ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
- ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+ cl::sycl::cl_short8 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+ ifPacket.select[3] ? 0 : -1, ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+ ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
- const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
- const cl::sycl::cl_float4& thenPacket,
+ const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket, const cl::sycl::cl_float4& thenPacket,
const cl::sycl::cl_float4& elsePacket) {
- cl::sycl::cl_int4 condition(
- ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
- ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1);
+ cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+ ifPacket.select[3] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
template <>
-inline cl::sycl::cl_double2 pblend(
- const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
- const cl::sycl::cl_double2& thenPacket,
- const cl::sycl::cl_double2& elsePacket) {
- cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
- ifPacket.select[1] ? 0 : -1);
+inline cl::sycl::cl_double2 pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+ const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
+ cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1);
return cl::sycl::select(thenPacket, elsePacket, condition);
}
#endif // SYCL_DEVICE_ONLY
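
Note for readers skimming the reformatted SYCL kernels above: here is a minimal standalone sketch, in plain C++ rather than the Eigen/SYCL API (Packet4, gather4 and scatter4 are hypothetical names), of what the get_pgather / set_pscatter helpers compute. A gather reads one scalar every stride elements into a packet; a scatter writes a packet back out with the same spacing.

  #include <array>
  #include <cstddef>

  using Packet4 = std::array<float, 4>;

  // Mirrors get_pgather above: lanes come from from[0*stride], from[1*stride], ...
  static Packet4 gather4(const float* from, std::ptrdiff_t stride) {
    return {from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]};
  }

  // Mirrors set_pscatter above: each lane lands stride elements apart.
  static void scatter4(float* to, const Packet4& p, std::ptrdiff_t stride) {
    for (std::size_t i = 0; i < 4; ++i) to[i * stride] = p[i];
  }
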
diff --git a/Eigen/src/Core/arch/SYCL/TypeCasting.h b/Eigen/src/Core/arch/SYCL/TypeCasting.h
index 9f193c1..6e3fa4f 100644
--- a/Eigen/src/Core/arch/SYCL/TypeCasting.h
+++ b/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -34,10 +34,9 @@
};
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4
-pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
- return a
- .template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(
+ const cl::sycl::cl_float4& a) {
+ return a.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
}
template <>
@@ -46,10 +45,9 @@
};
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
- return a.template convert<cl::sycl::cl_float,
- cl::sycl::rounding_mode::automatic>();
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(
+ const cl::sycl::cl_int4& a) {
+ return a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
}
template <>
@@ -58,13 +56,10 @@
};
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
- auto a1 = a.template convert<cl::sycl::cl_float,
- cl::sycl::rounding_mode::automatic>();
- auto b1 = b.template convert<cl::sycl::cl_float,
- cl::sycl::rounding_mode::automatic>();
+ auto a1 = a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+ auto b1 = b.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
}
@@ -74,8 +69,8 @@
};
template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(
+ const cl::sycl::cl_float4& a) {
// Simply discard the second half of the input
return cl::sycl::cl_double2(a.x(), a.y());
}
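
A side note on the casts above: the pcast specializations have to respect packet sizes, so narrowing double to float consumes two source packets, while widening float to double keeps only the first half. A hedged plain-C++ model of that bookkeeping (Double2, Float4, cast_down and cast_up are illustrative names, not Eigen API):

  #include <array>

  using Double2 = std::array<double, 2>;
  using Float4 = std::array<float, 4>;

  // Two double packets fill one float packet of twice the lane count.
  static Float4 cast_down(const Double2& a, const Double2& b) {
    return {static_cast<float>(a[0]), static_cast<float>(a[1]),
            static_cast<float>(b[0]), static_cast<float>(b[1])};
  }

  // The reverse direction simply discards the second half of the input.
  static Double2 cast_up(const Float4& a) {
    return {static_cast<double>(a[0]), static_cast<double>(a[1])};
  }
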
diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h
index 4d74d3d..4000e05 100644
--- a/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/Eigen/src/Core/arch/ZVector/Complex.h
@@ -19,21 +19,22 @@
namespace internal {
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+static Packet4ui p4ui_CONJ_XOR = {0x00000000, 0x80000000, 0x00000000,
+ 0x80000000}; // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
#endif
-static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
-static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR1 =
+ (Packet2ul)vec_sld((Packet4ui)p2d_ZERO_, (Packet4ui)p2l_ZERO, 8); //{ 0x8000000000000000, 0x0000000000000000 };
+static Packet2ul p2ul_CONJ_XOR2 =
+ (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_ZERO_, 8); //{ 0x0000000000000000, 0x8000000000000000 };
-struct Packet1cd
-{
+struct Packet1cd {
EIGEN_STRONG_INLINE Packet1cd() {}
EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
Packet2d v;
};
-struct Packet2cf
-{
+struct Packet2cf {
EIGEN_STRONG_INLINE Packet2cf() {}
EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
@@ -46,8 +47,8 @@
#endif
};
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
typedef Packet2cf type;
typedef Packet2cf half;
enum {
@@ -55,23 +56,22 @@
AlignedOnScalar = 1,
size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
- HasBlend = 1,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
+ HasBlend = 1,
HasSetLinear = 0
};
};
-
-template<> struct packet_traits<std::complex<double> > : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
typedef Packet1cd type;
typedef Packet1cd half;
enum {
@@ -79,58 +79,101 @@
AlignedOnScalar = 1,
size = 1,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasNegate = 1,
- HasAbs = 0,
- HasAbs2 = 0,
- HasMin = 0,
- HasMax = 0,
+ HasAbs = 0,
+ HasAbs2 = 0,
+ HasMin = 0,
+ HasMax = 0,
HasSetLinear = 0
};
};
-template<> struct unpacket_traits<Packet2cf> {
- typedef std::complex<float> type;
- enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+template <>
+struct unpacket_traits<Packet2cf> {
+ typedef std::complex<float> type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet2cf half;
typedef Packet4f as_real;
};
-template<> struct unpacket_traits<Packet1cd> {
+template <>
+struct unpacket_traits<Packet1cd> {
typedef std::complex<double> type;
- enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+ enum {
+ size = 1,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
typedef Packet1cd half;
typedef Packet2d as_real;
};
/* Forward declaration */
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel);
/* complex<double> first */
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+ return ploadu<Packet1cd>(&from);
+}
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+ Index stride EIGEN_UNUSED) {
return pload<Packet1cd>(from);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+ Index stride EIGEN_UNUSED) {
pstore<std::complex<double> >(to, from);
}
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+ return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+ return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
Packet2d a_re, a_im, v1, v2;
// Permute and multiply the real parts of a and b
@@ -141,219 +184,285 @@
v1 = vec_madd(a_re, b.v, p2d_ZERO);
// multiply a_im * b and get the conjugate result
v2 = vec_madd(a_im, b.v, p2d_ZERO);
- v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
- v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1);
+ v2 = (Packet2d)vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+ v2 = (Packet2d)vec_xor((Packet2d)v2, (Packet2d)p2ul_CONJ_XOR1);
return Packet1cd(v1 + v2);
}
-template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
- Packet2d eq = vec_cmpeq (a.v, b.v);
- Packet2d tmp = { eq[1], eq[0] };
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vec_and(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vec_or(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vec_xor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+ return Packet1cd(vec_and(a.v, vec_nor(b.v, b.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+ return pset1<Packet1cd>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+ Packet2d eq = vec_cmpeq(a.v, b.v);
+ Packet2d tmp = {eq[1], eq[0]};
return (Packet1cd)pand<Packet2d>(eq, tmp);
}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+ EIGEN_ZVECTOR_PREFETCH(addr);
+}
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
EIGEN_ALIGN16 std::complex<double> res;
pstore<std::complex<double> >(&res, a);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
return pfirst(a);
}
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
return pfirst(a);
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
return pdiv_complex(a, b);
}
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
return Packet1cd(preverse(Packet2d(x.v)));
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
/* complex<float> follows */
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
EIGEN_ALIGN16 std::complex<float> res[2];
pstore<std::complex<float> >(res, a);
return res[0];
}
-
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
Packet2cf res;
- res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+ res.cd[0] = Packet1cd(vec_ld2f((const float*)&from));
res.cd[1] = res.cd[0];
return res;
}
#else
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
Packet2cf res;
- if((std::ptrdiff_t(&from) % 16) == 0)
- res.v = pload<Packet4f>((const float *)&from);
+ if ((std::ptrdiff_t(&from) % 16) == 0)
+ res.v = pload<Packet4f>((const float*)&from);
else
- res.v = ploadu<Packet4f>((const float *)&from);
+ res.v = ploadu<Packet4f>((const float*)&from);
res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
return res;
}
#endif
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+ Index stride) {
EIGEN_ALIGN16 std::complex<float> af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
+ af[0] = from[0 * stride];
+ af[1] = from[1 * stride];
return pload<Packet2cf>(af);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+ Index stride) {
EIGEN_ALIGN16 std::complex<float> af[2];
- pstore<std::complex<float> >((std::complex<float> *) af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
+ pstore<std::complex<float> >((std::complex<float>*)af, from);
+ to[0 * stride] = af[0];
+ to[1 * stride] = af[1];
}
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+ return Packet2cf(pnegate(Packet4f(a.v)));
+}
-template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+ return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+ return pset1<Packet2cf>(*from);
+}
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+ EIGEN_ZVECTOR_PREFETCH(addr);
+}
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
- Packet4f eq = pcmp_eq<Packet4f> (a.v, b.v);
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+ Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
Packet2cf res;
- Packet2d tmp1 = { eq.v4f[0][1], eq.v4f[0][0] };
- Packet2d tmp2 = { eq.v4f[1][1], eq.v4f[1][0] };
+ Packet2d tmp1 = {eq.v4f[0][1], eq.v4f[0][0]};
+ Packet2d tmp2 = {eq.v4f[1][1], eq.v4f[1][0]};
res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
Packet2cf res;
res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
return res;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
Packet2cf res;
- res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
- res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+ res.v.v4f[0] =
+ pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+ res.v.v4f[1] =
+ pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
return res;
}
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
Packet2cf res;
res.cd[0] = a.cd[1];
res.cd[1] = a.cd[0];
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
std::complex<float> res;
Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
vec_st2f(b.v, (float*)&res);
return res;
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
std::complex<float> res;
Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
vec_st2f(b.v, (float*)&res);
return res;
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
return pdiv_complex(a, b);
}
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /*<Packet2cf>*/ (const Packet2cf& x) {
Packet2cf res;
res.cd[0] = pcplxflip(x.cd[0]);
res.cd[1] = pcplxflip(x.cd[1]);
return res;
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
Packet1cd tmp = kernel.packet[0].cd[1];
kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
kernel.packet[1].cd[0] = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
Packet2cf result;
- const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] };
+ const Selector<4> ifPacket4 = {ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1]};
result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
return result;
}
#else
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
- Packet4f eq = vec_cmpeq (a.v, b.v);
- Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+ Packet4f eq = vec_cmpeq(a.v, b.v);
+ Packet4f tmp = {eq[1], eq[0], eq[3], eq[2]};
return (Packet2cf)pand<Packet4f>(eq, tmp);
}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+ return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
Packet4f a_re, a_im, prod, prod_im;
// Permute and multiply the real parts of a and b
a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-
+
// Get the imaginary parts of a
a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
@@ -365,27 +474,27 @@
// multiply a_re * b, add prod_im
prod = pmadd<Packet4f>(a_re, b.v, prod_im);
-
+
return Packet2cf(prod);
}
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
Packet4f rev_a;
rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
return Packet2cf(rev_a);
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
Packet4f b;
b = vec_sld(a.v, a.v, 8);
b = padd<Packet4f>(a.v, b);
return pfirst<Packet2cf>(Packet2cf(b));
}
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
Packet4f b;
Packet2cf prod;
b = vec_sld(a.v, a.v, 8);
@@ -394,34 +503,36 @@
return pfirst<Packet2cf>(prod);
}
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
return pdiv_complex(a, b);
}
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
}
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
kernel.packet[0].v = tmp;
}
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+ const Packet2cf& elsePacket) {
Packet2cf result;
- result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+ result.v = reinterpret_cast<Packet4f>(
+ pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
return result;
}
#endif
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_COMPLEX32_ZVECTOR_H
+#endif // EIGEN_COMPLEX32_ZVECTOR_H
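
The pmul kernels reformatted above all implement the same trick: splat the real and imaginary parts of a, multiply each against the whole of b, swap the lanes of the imaginary product (vec_sld) and flip one sign (the CONJ_XOR masks), then add. A scalar model of that dataflow (cmul is a hypothetical helper, shown only to make the lane bookkeeping explicit):

  #include <complex>

  static std::complex<double> cmul(std::complex<double> a, std::complex<double> b) {
    double v1_re = a.real() * b.real();  // a_re * b, lane 0
    double v1_im = a.real() * b.imag();  // a_re * b, lane 1
    double v2_im = a.imag() * b.real();  // a_im * b, lane 0 before the swap
    double v2_re = a.imag() * b.imag();  // a_im * b, lane 1 before the swap
    // After the lane swap, the XOR mask negates lane 0, giving the usual identity:
    return {v1_re - v2_re, v1_im + v2_im};
  }
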
diff --git a/Eigen/src/Core/arch/ZVector/MathFunctions.h b/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 1b43878..5c55350 100644
--- a/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -24,7 +24,7 @@
namespace internal {
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
static EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
static EIGEN_DECLARE_CONST_Packet4i(23, 23);
@@ -32,27 +32,27 @@
static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
/* the smallest non denormalized float number */
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
-static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
-
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
+
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-static EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
+static EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
@@ -67,11 +67,11 @@
static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
#endif
-static EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-static EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+static EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
static EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-static EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
+static EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
@@ -88,9 +88,8 @@
static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
Packet2d x = _x;
Packet2d tmp, fx;
@@ -108,40 +107,38 @@
x = psub(x, tmp);
x = psub(x, z);
- Packet2d x2 = pmul(x,x);
+ Packet2d x2 = pmul(x, x);
Packet2d px = p2d_cephes_exp_p0;
px = pmadd(px, x2, p2d_cephes_exp_p1);
px = pmadd(px, x2, p2d_cephes_exp_p2);
- px = pmul (px, x);
+ px = pmul(px, x);
Packet2d qx = p2d_cephes_exp_q0;
qx = pmadd(qx, x2, p2d_cephes_exp_q1);
qx = pmadd(qx, x2, p2d_cephes_exp_q2);
qx = pmadd(qx, x2, p2d_cephes_exp_q3);
- x = pdiv(px,psub(qx,px));
- x = pmadd(p2d_2,x,p2d_1);
+ x = pdiv(px, psub(qx, px));
+ x = pmadd(p2d_2, x, p2d_1);
// build 2^n
emm0 = vec_ctsl(fx, 0);
- static const Packet2l p2l_1023 = { 1023, 1023 };
- static const Packet2ul p2ul_52 = { 52, 52 };
+ static const Packet2l p2l_1023 = {1023, 1023};
+ static const Packet2ul p2ul_52 = {52, 52};
emm0 = emm0 + p2l_1023;
emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
- // Altivec's max & min operators just drop silent NaNs. Check NaNs in
+ // Altivec's max & min operators just drop silent NaNs. Check NaNs in
// inputs and return them unmodified.
Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
- return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
- isnumber_mask);
+ return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), isnumber_mask);
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
Packet4f x = _x;
@@ -161,7 +158,7 @@
x = psub(x, tmp);
x = psub(x, z);
- z = pmul(x,x);
+ z = pmul(x, x);
Packet4f y = p4f_cephes_exp_p0;
y = pmadd(y, x, p4f_cephes_exp_p1);
@@ -173,7 +170,7 @@
y = padd(y, p4f_1);
// build 2^n
- emm0 = (Packet4i){ (int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3] };
+ emm0 = (Packet4i){(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
emm0 = emm0 + p4i_0x7f;
emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
@@ -186,15 +183,13 @@
#endif
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d psqrt<Packet2d>(const Packet2d& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
return vec_sqrt(x);
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f psqrt<Packet4f>(const Packet4f& x)
-{
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
Packet4f res;
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
res = vec_sqrt(x);
@@ -205,13 +200,13 @@
return res;
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
}
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
Packet4f res;
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
@@ -224,8 +219,7 @@
// Hyperbolic Tangent function.
template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f
-ptanh<Packet4f>(const Packet4f& x) {
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x) {
return internal::generic_fast_tanh_float(x);
}
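
The pexp kernels above follow the classic Cephes range reduction: write x = n*ln(2) + r with |r| small, evaluate a polynomial for e^r, and build 2^n directly in the floating-point exponent field, which is what the emm0 + p2l_1023 and shift-by-52 lines compute. A scalar sketch under simplifying assumptions (double precision, no overflow clamping or NaN handling, std::exp standing in for the polynomial, and a single ln(2) constant where the kernel uses the split cephes_exp_C1 + C2 for accuracy; exp_sketch is a made-up name):

  #include <cmath>
  #include <cstdint>
  #include <cstring>

  static double exp_sketch(double x) {
    double n = std::floor(x * 1.4426950408889634 + 0.5);  // n = round(x / ln 2)
    double r = x - n * 0.6931471805599453;                // r = x - n*ln(2)
    // Build 2^n by placing (n + 1023) in the IEEE-754 exponent bits, as the kernel does.
    std::uint64_t bits = static_cast<std::uint64_t>(static_cast<std::int64_t>(n) + 1023) << 52;
    double two_n;
    std::memcpy(&two_n, &bits, sizeof two_n);
    return two_n * std::exp(r);
  }
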
diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h
index 07de778..8ac8f77 100644
--- a/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -26,135 +26,136 @@
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
-typedef __vector int Packet4i;
-typedef __vector unsigned int Packet4ui;
-typedef __vector __bool int Packet4bi;
-typedef __vector short int Packet8i;
-typedef __vector unsigned char Packet16uc;
-typedef __vector double Packet2d;
-typedef __vector unsigned long long Packet2ul;
-typedef __vector long long Packet2l;
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8i;
+typedef __vector unsigned char Packet16uc;
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
// Z14 has builtin support for float vectors
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-typedef __vector float Packet4f;
+typedef __vector float Packet4f;
#else
typedef struct {
- Packet2d v4f[2];
+ Packet2d v4f[2];
} Packet4f;
#endif
typedef union {
- numext::int32_t i[4];
+ numext::int32_t i[4];
numext::uint32_t ui[4];
- numext::int64_t l[2];
+ numext::int64_t l[2];
numext::uint64_t ul[2];
- double d[2];
- float f[4];
- Packet4i v4i;
+ double d[2];
+ float f[4];
+ Packet4i v4i;
Packet4ui v4ui;
- Packet2l v2l;
+ Packet2l v2l;
Packet2ul v2ul;
- Packet2d v2d;
+ Packet2d v2d;
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
- Packet4f v4f;
+ Packet4f v4f;
#endif
} Packet;
// We don't want to write the same code all the time, but we need to reuse the constants
// and it doesn't really work to declare them global, so we define macros instead
-#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
-#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \
- Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
-#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \
- Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
-#define EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
- Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
-#define EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
- Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
-#define EIGEN_DECLARE_CONST_Packet2l(NAME,X) \
- Packet2l p2l_##NAME = pset1<Packet2l>(X)
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
// These constants are endian-agnostic
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
-static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
- numext::bit_cast<double>(0x8000000000000000ull) };
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
+ numext::bit_cast<double>(0x8000000000000000ull)};
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
-#define EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
- Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
-#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
-static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
-static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1); //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
#endif
-static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
-static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
-static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
+ vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
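// vec_sld concatenates its two operands and takes 16 bytes starting at the
// given byte offset, so the line above picks the last 8 bytes of p2d_ZERO and
// the first 8 bytes of p2d_ONE, giving p2d_COUNTDOWN = {0.0, 1.0}.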
-static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
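// These are byte-index masks for vec_perm(a, b, mask): byte i of the result is
// byte mask[i] of the 32-byte concatenation a:b (0..15 from a, 16..31 from b).
// E.g. on 32-bit lanes {x0,x1,x2,x3}, p16uc_DUPLICATE32_HI produces
// {x0,x0,x1,x1}, which is what ploaddup uses below.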
// Mask alignment
-#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
+#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
-#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
+#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
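// EIGEN_ALIGNED_PTR clears the low four address bits, rounding a pointer down
// to its enclosing 16-byte boundary, e.g. 0x1003 -> 0x1000.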
// Handle endianness properly while loading constants
// Define global static constants:
-static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
-static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
-static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+static Packet16uc p16uc_PSET32_WODD =
+ vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+ 8); //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+ 8); //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
+8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
-static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
-static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
-/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
-static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
-static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
+(Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+ (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
+16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
+8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+static Packet16uc p16uc_COMPLEX32_REV =
+ vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-
+static Packet16uc p16uc_COMPLEX32_REV2 =
+ vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
- #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
#else
- #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm(" pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
#endif
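// "pfd" is the z/Architecture PREFETCH DATA instruction; the asm fallback
// issues it directly when __builtin_prefetch is unavailable.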
-template<> struct packet_traits<int> : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
typedef Packet4i type;
typedef Packet4i half;
enum {
@@ -162,10 +163,10 @@
AlignedOnScalar = 1,
size = 4,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
HasBlend = 1
};
};
@@ -202,26 +203,26 @@
};
};
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
typedef Packet2d type;
typedef Packet2d half;
enum {
Vectorizable = 1,
AlignedOnScalar = 1,
- size=2,
+ size = 2,
- HasAdd = 1,
- HasSub = 1,
- HasMul = 1,
- HasDiv = 1,
- HasMin = 1,
- HasMax = 1,
- HasAbs = 1,
- HasSin = 0,
- HasCos = 0,
- HasLog = 0,
- HasExp = 1,
+ HasAdd = 1,
+ HasSub = 1,
+ HasMul = 1,
+ HasDiv = 1,
+ HasMin = 1,
+ HasMax = 1,
+ HasAbs = 1,
+ HasSin = 0,
+ HasCos = 0,
+ HasLog = 0,
+ HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasRound = 1,
@@ -232,47 +233,75 @@
};
};
-template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template <>
+struct unpacket_traits<Packet4i> {
+ typedef int type;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet4i half;
+};
+template <>
+struct unpacket_traits<Packet4f> {
+ typedef float type;
+ enum {
+ size = 4,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet4f half;
+};
+template <>
+struct unpacket_traits<Packet2d> {
+ typedef double type;
+ enum {
+ size = 2,
+ alignment = Aligned16,
+ vectorizable = true,
+ masked_load_available = false,
+ masked_store_available = false
+ };
+ typedef Packet2d half;
+};
/* Forward declaration */
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
-
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
+
+inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
Packet vt;
vt.v4i = v;
s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
Packet vt;
vt.v4ui = v;
s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
Packet vt;
vt.v2l = v;
s << vt.l[0] << ", " << vt.l[1];
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
Packet vt;
vt.v2ul = v;
- s << vt.ul[0] << ", " << vt.ul[1] ;
+ s << vt.ul[0] << ", " << vt.ul[1];
return s;
}
-inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
Packet vt;
vt.v2d = v;
s << vt.d[0] << ", " << vt.d[1];
@@ -280,8 +309,7 @@
}
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
+inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
Packet vt;
vt.v4f = v;
s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
@@ -289,54 +317,53 @@
}
#endif
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_LOAD
- Packet *vfrom;
- vfrom = (Packet *) from;
+ Packet* vfrom;
+ vfrom = (Packet*)from;
return vfrom->v4i;
}
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_LOAD
- Packet *vfrom;
- vfrom = (Packet *) from;
+ Packet* vfrom;
+ vfrom = (Packet*)from;
return vfrom->v2d;
}
-template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_STORE
- Packet *vto;
- vto = (Packet *) to;
+ Packet* vto;
+ vto = (Packet*)to;
vto->v4i = from;
}
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_STORE
- Packet *vto;
- vto = (Packet *) to;
+ Packet* vto;
+ vto = (Packet*)to;
vto->v2d = from;
}
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
return vec_splats(from);
}
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
return vec_splats(from);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
- Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
a3 = pload<Packet4i>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
@@ -344,187 +371,316 @@
a3 = vec_splat(a3, 3);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
- Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+ Packet2d& a3) {
a1 = pload<Packet2d>(a);
a0 = vec_splat(a1, 0);
a1 = vec_splat(a1, 1);
- a3 = pload<Packet2d>(a+2);
+ a3 = pload<Packet2d>(a + 2);
a2 = vec_splat(a3, 0);
a3 = vec_splat(a3, 1);
}
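// After pbroadcast4, each output packet holds one input element in every lane,
// e.g. for Packet2d: a0 = {a[0], a[0]}, a1 = {a[1], a[1]}, and so on.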
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
EIGEN_ALIGN16 int ai[4];
- ai[0] = from[0*stride];
- ai[1] = from[1*stride];
- ai[2] = from[2*stride];
- ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
+ ai[0] = from[0 * stride];
+ ai[1] = from[1 * stride];
+ ai[2] = from[2 * stride];
+ ai[3] = from[3 * stride];
+ return pload<Packet4i>(ai);
}
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
EIGEN_ALIGN16 double af[2];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- return pload<Packet2d>(af);
+ af[0] = from[0 * stride];
+ af[1] = from[1 * stride];
+ return pload<Packet2d>(af);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
EIGEN_ALIGN16 int ai[4];
- pstore<int>((int *)ai, from);
- to[0*stride] = ai[0];
- to[1*stride] = ai[1];
- to[2*stride] = ai[2];
- to[3*stride] = ai[3];
+ pstore<int>((int*)ai, from);
+ to[0 * stride] = ai[0];
+ to[1 * stride] = ai[1];
+ to[2 * stride] = ai[2];
+ to[3 * stride] = ai[3];
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
EIGEN_ALIGN16 double af[2];
pstore<double>(af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
+ to[0 * stride] = af[0];
+ to[1 * stride] = af[1];
}
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return (a + b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return (a - b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return (a * b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return (a / b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+ return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+ return (-a);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+ return a;
+}
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+ return padd<Packet4i>(pmul<Packet4i>(a, b), c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+ return vec_madd(a, b, c);
+}
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+ return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+ return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_min(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_max(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_and(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_or(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_xor(a, b);
+}
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+ return pand<Packet4i>(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+ return vec_and(a, vec_nor(b, b));
+}
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+ return vec_round(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+ return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+ return vec_floor(a);
+}
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
+ return pload<Packet4i>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+ return pload<Packet2d>(from);
+}
-
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
Packet4i p = pload<Packet4i>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
Packet2d p = pload<Packet2d>(from);
return vec_perm(p, p, p16uc_PSET64_HI);
}
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
-template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
- return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+ pstore<int>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+ pstore<double>(to, from);
}
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{
- return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+ EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+ EIGEN_ZVECTOR_PREFETCH(addr);
}
-template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+ EIGEN_ALIGN16 int x[4];
+ pstore(x, a);
+ return x[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+ EIGEN_ALIGN16 double x[2];
+ pstore(x, a);
+ return x[0];
+}
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+ return reinterpret_cast<Packet4i>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+ return reinterpret_cast<Packet2d>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
+ return vec_abs(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
Packet4i b, sum;
- b = vec_sld(a, a, 8);
+ b = vec_sld(a, a, 8);
sum = padd<Packet4i>(a, b);
- b = vec_sld(sum, sum, 4);
+ b = vec_sld(sum, sum, 4);
sum = padd<Packet4i>(sum, b);
return pfirst(sum);
}
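// The horizontal sum works by rotating with vec_sld: for a = {a0,a1,a2,a3},
// vec_sld(a, a, 8) = {a2,a3,a0,a1} and vec_sld(s, s, 4) = {s1,s2,s3,s0}, so
// after two add-and-rotate steps every lane holds a0+a1+a2+a3.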
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
Packet2d b, sum;
- b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+ b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
sum = padd<Packet2d>(a, b);
return pfirst(sum);
}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
EIGEN_ALIGN16 int aux[4];
pstore(aux, a);
return aux[0] * aux[1] * aux[2] * aux[3];
}
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+ return pfirst(
+ pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
// min
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
Packet4i b, res;
- b = pmin<Packet4i>(a, vec_sld(a, a, 8));
+ b = pmin<Packet4i>(a, vec_sld(a, a, 8));
res = pmin<Packet4i>(b, vec_sld(b, b, 4));
return pfirst(res);
}
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+ return pfirst(pmin<Packet2d>(
+ a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
// max
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
Packet4i b, res;
b = pmax<Packet4i>(a, vec_sld(a, a, 8));
res = pmax<Packet4i>(b, vec_sld(b, b, 4));
@@ -532,13 +688,13 @@
}
// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
- return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+ return pfirst(pmax<Packet2d>(
+ a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -549,23 +705,25 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
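// Worked example: vec_mergeh(a, b) = {a0,b0,a1,b1} and vec_mergel(a, b) =
// {a2,b2,a3,b3}, so t0 = {k00,k20,k01,k21}, t2 = {k10,k30,k11,k31}, and
// vec_mergeh(t0, t2) = {k00,k10,k20,k30}, the first row of the transpose.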
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
kernel.packet[0] = t0;
kernel.packet[1] = t1;
}
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+ const Packet4i& elsePacket) {
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
-
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
- Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+ const Packet2d& elsePacket) {
+ Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
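// pblend builds a per-lane mask: vec_cmpeq sets a lane to all-ones where
// select[i] == 1, and vec_sel then takes that lane from thenPacket and every
// other lane from elsePacket.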
@@ -576,32 +734,32 @@
#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
/* Helper function to emulate vec_splat on the two-Packet2d representation of Packet4f
 */
-template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from)
-{
+template <int element>
+EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
Packet4f splat;
switch (element) {
- case 0:
- splat.v4f[0] = vec_splat(from.v4f[0], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 1:
- splat.v4f[0] = vec_splat(from.v4f[0], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 2:
- splat.v4f[0] = vec_splat(from.v4f[1], 0);
- splat.v4f[1] = splat.v4f[0];
- break;
- case 3:
- splat.v4f[0] = vec_splat(from.v4f[1], 1);
- splat.v4f[1] = splat.v4f[0];
- break;
+ case 0:
+ splat.v4f[0] = vec_splat(from.v4f[0], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 1:
+ splat.v4f[0] = vec_splat(from.v4f[0], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 2:
+ splat.v4f[0] = vec_splat(from.v4f[1], 0);
+ splat.v4f[1] = splat.v4f[0];
+ break;
+ case 3:
+ splat.v4f[0] = vec_splat(from.v4f[1], 1);
+ splat.v4f[1] = splat.v4f[0];
+ break;
}
return splat;
}
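// Element e of the emulated Packet4f lives in half v4f[e / 2] at lane e % 2;
// the switch above splats that lane into both halves.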
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_LOAD
Packet4f vfrom;
@@ -610,26 +768,24 @@
return vfrom;
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_STORE
vec_st2f(from.v4f[0], &to[0]);
vec_st2f(from.v4f[1], &to[2]);
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
Packet4f to;
to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
to.v4f[1] = to.v4f[0];
return to;
}
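// Note the widening: the emulated Packet4f stores its floats as two double
// subvectors, so the scalar is broadcast via pset1<Packet2d>.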
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
a3 = pload<Packet4f>(a);
a0 = vec_splat_packet4f<0>(a3);
a1 = vec_splat_packet4f<1>(a3);
@@ -637,207 +793,213 @@
a3 = vec_splat_packet4f<3>(a3);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
EIGEN_ALIGN16 float ai[4];
- ai[0] = from[0*stride];
- ai[1] = from[1*stride];
- ai[2] = from[2*stride];
- ai[3] = from[3*stride];
- return pload<Packet4f>(ai);
+ ai[0] = from[0 * stride];
+ ai[1] = from[1 * stride];
+ ai[2] = from[2 * stride];
+ ai[3] = from[3 * stride];
+ return pload<Packet4f>(ai);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
EIGEN_ALIGN16 float ai[4];
- pstore<float>((float *)ai, from);
- to[0*stride] = ai[0];
- to[1*stride] = ai[1];
- to[2*stride] = ai[2];
- to[3*stride] = ai[3];
+ pstore<float>((float*)ai, from);
+ to[0 * stride] = ai[0];
+ to[1 * stride] = ai[1];
+ to[2 * stride] = ai[2];
+ to[3 * stride] = ai[3];
}
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f c;
c.v4f[0] = a.v4f[0] + b.v4f[0];
c.v4f[1] = a.v4f[1] + b.v4f[1];
return c;
}
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f c;
c.v4f[0] = a.v4f[0] - b.v4f[0];
c.v4f[1] = a.v4f[1] - b.v4f[1];
return c;
}
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f c;
c.v4f[0] = a.v4f[0] * b.v4f[0];
c.v4f[1] = a.v4f[1] * b.v4f[1];
return c;
}
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f c;
c.v4f[0] = a.v4f[0] / b.v4f[0];
c.v4f[1] = a.v4f[1] / b.v4f[1];
return c;
}
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
Packet4f c;
c.v4f[0] = -a.v4f[0];
c.v4f[1] = -a.v4f[1];
return c;
}
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
Packet4f res;
res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = por(a.v4f[0], b.v4f[0]);
res.v4f[1] = por(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
Packet4f res;
res.v4f[0] = vec_round(a.v4f[0]);
res.v4f[1] = vec_round(a.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
Packet4f res;
res.v4f[0] = vec_ceil(a.v4f[0]);
res.v4f[1] = vec_ceil(a.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
Packet4f res;
res.v4f[0] = vec_floor(a.v4f[0]);
res.v4f[1] = vec_floor(a.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
Packet4f p = pload<Packet4f>(from);
p.v4f[1] = vec_splat(p.v4f[0], 1);
p.v4f[0] = vec_splat(p.v4f[0], 0);
return p;
}
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ EIGEN_ALIGN16 float x[2];
+ vec_st2f(a.v4f[0], &x[0]);
+ return x[0];
+}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
Packet4f rev;
rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
return rev;
}
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
Packet4f res;
res.v4f[0] = pabs(a.v4f[0]);
res.v4f[1] = pabs(a.v4f[1]);
return res;
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
Packet2d sum;
sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
double first = predux<Packet2d>(sum);
return static_cast<float>(first);
}
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
// Return predux_mul<Packet2d> of the product of the two subvectors
return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
}
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
Packet2d b, res;
- b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmin<Packet2d>(
+ b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
return static_cast<float>(pfirst(res));
}
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
Packet2d b, res;
- b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
- res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+ b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+ res = pmax<Packet2d>(
+ b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
return static_cast<float>(pfirst(res));
}
/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
*/
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
- PacketBlock<Packet2d,2> t0,t1,t2,t3;
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+ PacketBlock<Packet2d, 2> t0, t1, t2, t3;
// copy top-left 2x2 Packet2d block
t0.packet[0] = kernel.packet[0].v4f[0];
t0.packet[1] = kernel.packet[1].v4f[0];
@@ -871,9 +1033,11 @@
kernel.packet[3].v4f[1] = t3.packet[1];
}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
- Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
- Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
+ Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
+ Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
Packet4f result;
@@ -882,24 +1046,24 @@
return result;
}
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
return res;
}
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
return res;
}
-template<> Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
Packet4f res;
res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
@@ -907,33 +1071,31 @@
}
#else
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_LOAD
- Packet *vfrom;
- vfrom = (Packet *) from;
+ Packet* vfrom;
+ vfrom = (Packet*)from;
return vfrom->v4f;
}
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
// FIXME: No intrinsic yet
EIGEN_DEBUG_ALIGNED_STORE
- Packet *vto;
- vto = (Packet *) to;
+ Packet* vto;
+ vto = (Packet*)to;
vto->v4f = from;
}
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
return vec_splats(from);
}
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
- Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
a3 = pload<Packet4f>(a);
a0 = vec_splat(a3, 0);
a1 = vec_splat(a3, 1);
@@ -941,95 +1103,151 @@
a3 = vec_splat(a3, 3);
}
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
EIGEN_ALIGN16 float af[4];
- af[0] = from[0*stride];
- af[1] = from[1*stride];
- af[2] = from[2*stride];
- af[3] = from[3*stride];
- return pload<Packet4f>(af);
+ af[0] = from[0 * stride];
+ af[1] = from[1 * stride];
+ af[2] = from[2 * stride];
+ af[3] = from[3 * stride];
+ return pload<Packet4f>(af);
}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
EIGEN_ALIGN16 float af[4];
pstore<float>((float*)af, from);
- to[0*stride] = af[0];
- to[1*stride] = af[1];
- to[2*stride] = af[2];
- to[3*stride] = af[3];
+ to[0 * stride] = af[0];
+ to[1 * stride] = af[1];
+ to[2 * stride] = af[2];
+ to[3 * stride] = af[3];
}
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f> (const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f> (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
+ return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
+ return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+ return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+ return vec_and(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+ return vec_round(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+ return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+ return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
+ return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+ EIGEN_ALIGN16 float x[4];
+ pstore(x, a);
+ return x[0];
+}
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
Packet4f p = pload<Packet4f>(from);
return vec_perm(p, p, p16uc_DUPLICATE32_HI);
}
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
- return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+ return reinterpret_cast<Packet4f>(
+ vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
}
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
Packet4f b, sum;
- b = vec_sld(a, a, 8);
+ b = vec_sld(a, a, 8);
sum = padd<Packet4f>(a, b);
- b = vec_sld(sum, sum, 4);
+ b = vec_sld(sum, sum, 4);
sum = padd<Packet4f>(sum, b);
return pfirst(sum);
}
// Other reduction functions:
// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
Packet4f prod;
prod = pmul(a, vec_sld(a, a, 8));
return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
}
// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
Packet4f b, res;
- b = pmin<Packet4f>(a, vec_sld(a, a, 8));
+ b = pmin<Packet4f>(a, vec_sld(a, a, 8));
res = pmin<Packet4f>(b, vec_sld(b, b, 4));
return pfirst(res);
}
// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
Packet4f b, res;
b = pmax<Packet4f>(a, vec_sld(a, a, 8));
res = pmax<Packet4f>(b, vec_sld(b, b, 4));
return pfirst(res);
}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
@@ -1040,21 +1258,35 @@
kernel.packet[3] = vec_mergel(t1, t3);
}
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
- Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+ const Packet4f& elsePacket) {
+ Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
return vec_sel(elsePacket, thenPacket, mask);
}
#endif
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f> (const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+ EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+ return pload<Packet4f>(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+ pstore<float>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+ return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
+}
-} // end namespace internal
+} // end namespace internal
-} // end namespace Eigen
+} // end namespace Eigen
-#endif // EIGEN_PACKET_MATH_ZVECTOR_H
+#endif // EIGEN_PACKET_MATH_ZVECTOR_H