| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn> |
| // Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| #ifndef EIGEN_PACKET_MATH_LSX_H |
| #define EIGEN_PACKET_MATH_LSX_H |
| |
| // IWYU pragma: private |
| #include "../../InternalHeaderCheck.h" |
| |
| namespace Eigen { |
| |
| namespace internal { |
| |
| #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD |
| #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 |
| #endif |
| |
| #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS |
| #if EIGEN_ARCH_LOONGARCH64 |
| #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 |
| #endif |
| #endif |
| |
| #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD |
| #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD |
| #endif |
| |
| typedef __m128 Packet4f; |
| typedef __m128d Packet2d; |
| |
| typedef eigen_packet_wrapper<__m128i, 0> Packet16c; |
| typedef eigen_packet_wrapper<__m128i, 1> Packet8s; |
| typedef eigen_packet_wrapper<__m128i, 2> Packet4i; |
| typedef eigen_packet_wrapper<__m128i, 3> Packet2l; |
| typedef eigen_packet_wrapper<__m128i, 4> Packet16uc; |
| typedef eigen_packet_wrapper<__m128i, 5> Packet8us; |
| typedef eigen_packet_wrapper<__m128i, 6> Packet4ui; |
| typedef eigen_packet_wrapper<__m128i, 7> Packet2ul; |
| |
| template <> |
| struct is_arithmetic<__m128> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<__m128i> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<__m128d> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<Packet16c> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<Packet8s> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<Packet4i> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<Packet2l> { |
| enum { value = true }; |
| }; |
| template <> |
| struct is_arithmetic<Packet16uc> { |
| enum { value = false }; |
| }; |
| template <> |
| struct is_arithmetic<Packet8us> { |
| enum { value = false }; |
| }; |
| template <> |
| struct is_arithmetic<Packet4ui> { |
| enum { value = false }; |
| }; |
| template <> |
| struct is_arithmetic<Packet2ul> { |
| enum { value = false }; |
| }; |
| |
| EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { |
| float from[4] = {a, b, c, d}; |
| return (Packet4f)__lsx_vld(from, 0); |
| } |
| |
| EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) { |
| const float* a = reinterpret_cast<const float*>(&m); |
| Packet4f res = |
| make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3))); |
| return res; |
| } |
| |
| template <bool interleave> |
| EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) { |
| const float* a = reinterpret_cast<const float*>(&m); |
| const float* b = reinterpret_cast<const float*>(&n); |
| Packet4f res = |
| make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); |
| return res; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) { |
| const float* a = reinterpret_cast<const float*>(&m); |
| const float* b = reinterpret_cast<const float*>(&n); |
| Packet4f res = |
| make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); |
| return res; |
| } |
| |
| EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) { |
| return ((s) << 6 | (r) << 4 | (q) << 2 | (p)); |
| } |
| |
| EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) { |
| return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s)); |
| } |
| EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) { |
| return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s)); |
| } |
| EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) { |
| return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1)); |
| } |
| EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) { |
| return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3)); |
| } |
| EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) { |
| return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1)); |
| } |
| EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) { |
| return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3)); |
| } |
| |
| EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { |
| double from[2] = {a, b}; |
| return (Packet2d)__lsx_vld(from, 0); |
| } |
| |
| EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) { |
| const double* a = reinterpret_cast<const double*>(&m); |
| const double* b = reinterpret_cast<const double*>(&n); |
| Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1))); |
| return res; |
| } |
| |
| EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) { |
| return shuffle(a, b, mask); |
| } |
| EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); } |
| EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); } |
| |
| template <> |
| struct packet_traits<int8_t> : default_packet_traits { |
| typedef Packet16c type; |
| typedef Packet16c half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 16, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasCmp = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<int16_t> : default_packet_traits { |
| typedef Packet8s type; |
| typedef Packet8s half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 8, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<int32_t> : default_packet_traits { |
| typedef Packet4i type; |
| typedef Packet4i half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 4, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<int64_t> : default_packet_traits { |
| typedef Packet2l type; |
| typedef Packet2l half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 2, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<uint8_t> : default_packet_traits { |
| typedef Packet16uc type; |
| typedef Packet16uc half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 16, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasNegate = 0, |
| HasCmp = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<uint16_t> : default_packet_traits { |
| typedef Packet8us type; |
| typedef Packet8us half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 8, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasNegate = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<uint32_t> : default_packet_traits { |
| typedef Packet4ui type; |
| typedef Packet4ui half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 4, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasNegate = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<uint64_t> : default_packet_traits { |
| typedef Packet2ul type; |
| typedef Packet2ul half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 2, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasNegate = 0, |
| HasCmp = 1, |
| HasDiv = 1, |
| HasBlend = 0 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<float> : default_packet_traits { |
| typedef Packet4f type; |
| typedef Packet4f half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 4, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasBlend = 0, |
| HasSign = 0, |
| HasDiv = 1, |
| HasExp = 1, |
| HasSqrt = 1, |
| HasLog = 1, |
| HasRsqrt = 1 |
| }; |
| }; |
| |
| template <> |
| struct packet_traits<double> : default_packet_traits { |
| typedef Packet2d type; |
| typedef Packet2d half; |
| enum { |
| Vectorizable = 1, |
| AlignedOnScalar = 1, |
| size = 2, |
| |
| HasAbs2 = 0, |
| HasSetLinear = 0, |
| HasBlend = 0, |
| HasSign = 0, |
| HasDiv = 1, |
| HasSqrt = 1, |
| HasLog = 1, |
| HasRsqrt = 1 |
| }; |
| }; |
| |
| template <> |
| struct unpacket_traits<Packet16c> { |
| typedef int8_t type; |
| typedef Packet16c half; |
| enum { |
| size = 16, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet8s> { |
| typedef int16_t type; |
| typedef Packet8s half; |
| enum { |
| size = 8, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet4i> { |
| typedef int32_t type; |
| typedef Packet4i half; |
| enum { |
| size = 4, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet2l> { |
| typedef int64_t type; |
| typedef Packet2l half; |
| enum { |
| size = 2, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet16uc> { |
| typedef uint8_t type; |
| typedef Packet16uc half; |
| enum { |
| size = 16, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet8us> { |
| typedef uint16_t type; |
| typedef Packet8us half; |
| enum { |
| size = 8, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet4ui> { |
| typedef uint32_t type; |
| typedef Packet4ui half; |
| enum { |
| size = 4, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet2ul> { |
| typedef uint64_t type; |
| typedef Packet2ul half; |
| enum { |
| size = 2, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet4f> { |
| typedef float type; |
| typedef Packet4f half; |
| typedef Packet4i integer_packet; |
| enum { |
| size = 4, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| template <> |
| struct unpacket_traits<Packet2d> { |
| typedef double type; |
| typedef Packet2d half; |
| typedef Packet2l integer_packet; |
| enum { |
| size = 2, |
| alignment = Aligned16, |
| vectorizable = true, |
| masked_load_available = false, |
| masked_store_available = false |
| }; |
| }; |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { |
| return __lsx_vreplgr2vr_b(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { |
| return __lsx_vreplgr2vr_h(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { |
| return __lsx_vreplgr2vr_w(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { |
| return __lsx_vreplgr2vr_d(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { |
| return __lsx_vreplgr2vr_b(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { |
| return __lsx_vreplgr2vr_h(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { |
| return __lsx_vreplgr2vr_w(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { |
| return __lsx_vreplgr2vr_d(from); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { |
| Packet4f v = {from, from, from, from}; |
| return v; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { |
| Packet2d v = {from, from}; |
| return v; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) { |
| return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { |
| return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from)); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) { |
| const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) { |
| const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7}; |
| return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) { |
| const int32_t countdown[] = {0, 1, 2, 3}; |
| return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) { |
| const int64_t countdown[] = {0, 1}; |
| return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) { |
| const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; |
| return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) { |
| const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7}; |
| return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) { |
| const uint32_t countdown[] = {0, 1, 2, 3}; |
| return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) { |
| const uint64_t countdown[] = {0, 1}; |
| return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { |
| static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f}; |
| return __lsx_vfadd_s(pset1<Packet4f>(a), countdown); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { |
| static const Packet2d countdown = {0.0f, 1.0f}; |
| return __lsx_vfadd_d(pset1<Packet2d>(a), countdown); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vadd_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vadd_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vadd_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vadd_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vadd_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vadd_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vadd_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vadd_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return __lsx_vfadd_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return __lsx_vfadd_d(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vsub_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vsub_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vsub_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vsub_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vsub_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vsub_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vsub_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vsub_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return __lsx_vfsub_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return __lsx_vfsub_d(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b); |
| template <> |
| EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| const Packet4f mask = |
| make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f); |
| return padd(a, pxor(mask, b)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b); |
| template <> |
| EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0); |
| return padd(a, pxor(mask, b)); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { |
| Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000), |
| numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000)); |
| return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { |
| Packet2d mask = |
| make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000)); |
| return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { |
| return __lsx_vneg_b(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { |
| return __lsx_vneg_h(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { |
| return __lsx_vneg_w(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { |
| return __lsx_vneg_d(a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { |
| return a; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return __lsx_vfmul_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return __lsx_vfmul_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vmul_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vmul_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vmul_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vmul_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vmul_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vmul_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vmul_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vmul_d(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return __lsx_vfdiv_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return __lsx_vfdiv_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vdiv_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vdiv_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vdiv_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vdiv_hu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vdiv_wu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vdiv_du(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { |
| return __lsx_vfmadd_s(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { |
| return __lsx_vfmadd_d(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { |
| return __lsx_vfmsub_s(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { |
| return __lsx_vfmsub_d(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { |
| return __lsx_vfnmsub_s(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { |
| return __lsx_vfnmsub_d(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) { |
| return __lsx_vfnmadd_s(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) { |
| return __lsx_vfnmadd_d(a, b, c); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) { |
| return __lsx_vmadd_b(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { |
| return __lsx_vmadd_h(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { |
| return __lsx_vmadd_w(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) { |
| return __lsx_vmadd_d(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) { |
| return __lsx_vmadd_b(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { |
| return __lsx_vmadd_h(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) { |
| return __lsx_vmadd_w(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) { |
| return __lsx_vmadd_d(c, a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vand_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vand_v(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vor_v(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vxor_v(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vandn_v(b, a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vfcmp_cle_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vfcmp_cle_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vsle_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vsle_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vsle_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vsle_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vsle_bu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vsle_hu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vsle_wu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vsle_du(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vfcmp_clt_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vfcmp_clt_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vslt_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vslt_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vslt_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vslt_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vslt_bu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vslt_hu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vslt_wu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vslt_du(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vfcmp_sult_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vfcmp_sult_d(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vfcmp_seq_s(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vfcmp_seq_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vseq_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vseq_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vseq_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vseq_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vseq_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vseq_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vseq_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vseq_d(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vmin_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vmin_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vmin_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vmin_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vmin_bu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vmin_hu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vmin_wu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vmin_du(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| return __lsx_vmax_b(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| return __lsx_vmax_h(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| return __lsx_vmax_w(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) { |
| return __lsx_vmax_d(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vmax_bu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| return __lsx_vmax_hu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vmax_wu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vmax_du(a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| Packet4i aNaN = __lsx_vfcmp_cun_s(a, a); |
| Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN); |
| return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| Packet2l aNaN = __lsx_vfcmp_cun_d(a, a); |
| Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN); |
| return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| Packet4i aNaN = __lsx_vfcmp_cun_s(a, a); |
| Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN); |
| return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { |
| Packet2l aNaN = __lsx_vfcmp_cun_d(a, a); |
| Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN); |
| return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN); |
| } |
| |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) { |
| return __lsx_vsrai_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) { |
| return __lsx_vsrai_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { |
| return __lsx_vsrai_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) { |
| return __lsx_vsrai_d((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) { |
| return __lsx_vsrli_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) { |
| return __lsx_vsrli_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) { |
| return __lsx_vsrli_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) { |
| return __lsx_vsrli_d((__m128i)a, N); |
| } |
| |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) { |
| return __lsx_vsrli_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) { |
| return __lsx_vsrli_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) { |
| return __lsx_vsrli_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) { |
| return __lsx_vsrli_d((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) { |
| return __lsx_vsrli_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) { |
| return __lsx_vsrli_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) { |
| return __lsx_vsrli_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) { |
| return __lsx_vsrli_d((__m128i)a, N); |
| } |
| |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) { |
| return __lsx_vslli_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) { |
| return __lsx_vslli_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) { |
| return __lsx_vslli_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) { |
| return __lsx_vslli_d((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) { |
| return __lsx_vslli_b((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) { |
| return __lsx_vslli_h((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) { |
| return __lsx_vslli_w((__m128i)a, N); |
| } |
| template <int N> |
| EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) { |
| return __lsx_vslli_d((__m128i)a, N); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { |
| return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { |
| return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { |
| return __lsx_vabsd_b(a, pzero(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { |
| return __lsx_vabsd_h(a, pzero(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { |
| return __lsx_vabsd_w(a, pzero(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { |
| return __lsx_vabsd_d(a, pzero(a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { |
| return a; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { |
| return a; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) { |
| EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) { |
| EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { |
| float f0 = from[0], f1 = from[1]; |
| return make_packet4f(f0, f0, f1, f1); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { |
| return pset1<Packet2d>(from[0]); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) { |
| Packet16c tmp = pload<Packet16c>(from); |
| return __lsx_vilvl_b(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) { |
| Packet8s tmp = pload<Packet8s>(from); |
| return __lsx_vilvl_h(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) { |
| Packet4i tmp = pload<Packet4i>(from); |
| return __lsx_vilvl_w(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) { |
| return pset1<Packet2l>(from[0]); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) { |
| Packet16uc tmp = pload<Packet16uc>(from); |
| return __lsx_vilvl_b(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) { |
| Packet8us tmp = pload<Packet8us>(from); |
| return __lsx_vilvl_h(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) { |
| Packet4ui tmp = pload<Packet4ui>(from); |
| return __lsx_vilvl_w(tmp, tmp); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) { |
| return pset1<Packet2ul>(from[0]); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) { |
| EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) { |
| EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0); |
| } |
| |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) { |
| Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]}; |
| return v; |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) { |
| Packet2d v = {from[0], from[stride]}; |
| return v; |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) { |
| int8_t v[16] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| v[4] = from[4 * stride]; |
| v[5] = from[5 * stride]; |
| v[6] = from[6 * stride]; |
| v[7] = from[7 * stride]; |
| v[8] = from[8 * stride]; |
| v[9] = from[9 * stride]; |
| v[10] = from[10 * stride]; |
| v[11] = from[11 * stride]; |
| v[12] = from[12 * stride]; |
| v[13] = from[13 * stride]; |
| v[14] = from[14 * stride]; |
| v[15] = from[15 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) { |
| int16_t v[8] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| v[4] = from[4 * stride]; |
| v[5] = from[5 * stride]; |
| v[6] = from[6 * stride]; |
| v[7] = from[7 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) { |
| int32_t v[4] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) { |
| int64_t v[2] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) { |
| uint8_t v[16] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| v[4] = from[4 * stride]; |
| v[5] = from[5 * stride]; |
| v[6] = from[6 * stride]; |
| v[7] = from[7 * stride]; |
| v[8] = from[8 * stride]; |
| v[9] = from[9 * stride]; |
| v[10] = from[10 * stride]; |
| v[11] = from[11 * stride]; |
| v[12] = from[12 * stride]; |
| v[13] = from[13 * stride]; |
| v[14] = from[14 * stride]; |
| v[15] = from[15 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) { |
| uint16_t v[8] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| v[4] = from[4 * stride]; |
| v[5] = from[5 * stride]; |
| v[6] = from[6 * stride]; |
| v[7] = from[7 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) { |
| uint32_t v[4] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| v[2] = from[2 * stride]; |
| v[3] = from[3 * stride]; |
| return __lsx_vld(v, 0); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) { |
| uint64_t v[2] __attribute__((aligned(16))); |
| v[0] = from[0]; |
| v[1] = from[stride]; |
| return __lsx_vld(v, 0); |
| } |
| |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) { |
| __lsx_vstelm_w(from, to, 0, 0); |
| __lsx_vstelm_w(from, to + stride * 1, 0, 1); |
| __lsx_vstelm_w(from, to + stride * 2, 0, 2); |
| __lsx_vstelm_w(from, to + stride * 3, 0, 3); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) { |
| __lsx_vstelm_d(from, to, 0, 0); |
| __lsx_vstelm_d(from, to + stride, 0, 1); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, |
| Index stride) { |
| __lsx_vstelm_b((__m128i)from, to, 0, 0); |
| __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3); |
| __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4); |
| __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5); |
| __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6); |
| __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7); |
| __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8); |
| __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9); |
| __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10); |
| __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11); |
| __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12); |
| __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13); |
| __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14); |
| __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, |
| Index stride) { |
| __lsx_vstelm_h((__m128i)from, to, 0, 0); |
| __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3); |
| __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4); |
| __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5); |
| __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6); |
| __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, |
| Index stride) { |
| __lsx_vstelm_w((__m128i)from, to, 0, 0); |
| __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, |
| Index stride) { |
| __lsx_vstelm_d((__m128i)from, to, 0, 0); |
| __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, |
| Index stride) { |
| __lsx_vstelm_b((__m128i)from, to, 0, 0); |
| __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3); |
| __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4); |
| __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5); |
| __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6); |
| __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7); |
| __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8); |
| __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9); |
| __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10); |
| __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11); |
| __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12); |
| __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13); |
| __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14); |
| __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, |
| Index stride) { |
| __lsx_vstelm_h((__m128i)from, to, 0, 0); |
| __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3); |
| __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4); |
| __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5); |
| __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6); |
| __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, |
| Index stride) { |
| __lsx_vstelm_w((__m128i)from, to, 0, 0); |
| __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1); |
| __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2); |
| __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, |
| Index stride) { |
| __lsx_vstelm_d((__m128i)from, to, 0, 0); |
| __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| template <> |
| EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { |
| __builtin_prefetch(addr); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { |
| float v; |
| __lsx_vstelm_w(a, &v, 0, 0); |
| return v; |
| } |
| template <> |
| EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { |
| double v; |
| __lsx_vstelm_d(a, &v, 0, 0); |
| return v; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { |
| return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { |
| return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { |
| return __lsx_vpickve2gr_w((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { |
| return __lsx_vpickve2gr_d((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { |
| return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { |
| return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { |
| return __lsx_vpickve2gr_wu((__m128i)a, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { |
| return __lsx_vpickve2gr_du((__m128i)a, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { |
| return (Packet4f)__lsx_vshuf4i_w(a, 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { |
| return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) { |
| return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) { |
| return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { |
| return __lsx_vshuf4i_w((__m128i)a, 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) { |
| return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) { |
| return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) { |
| return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) { |
| return __lsx_vshuf4i_w((__m128i)a, 0x1B); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) { |
| return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { |
| Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3)); |
| return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { |
| return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) { |
| Packet8s tmp1 = __lsx_vhaddw_h_b(a, a); |
| Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1); |
| Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2); |
| return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) { |
| Packet4i tmp1 = __lsx_vhaddw_w_h(a, a); |
| Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1); |
| return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { |
| Packet2l tmp = __lsx_vhaddw_d_w(a, a); |
| return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) { |
| return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) { |
| Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a); |
| Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1); |
| Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2); |
| return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) { |
| Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a); |
| Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1); |
| return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) { |
| Packet2ul tmp = __lsx_vhaddw_du_wu(a, a); |
| return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) { |
| return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) { |
| Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3)); |
| return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { |
| return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) { |
| Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a)); |
| Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1)); |
| Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2)); |
| return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) { |
| Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a)); |
| Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1)); |
| return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) { |
| Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a)); |
| return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) { |
| return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) { |
| Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a)); |
| Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1)); |
| Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2)); |
| return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) { |
| Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a)); |
| Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1)); |
| return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) { |
| Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a)); |
| return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) { |
| return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { |
| Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E)); |
| return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { |
| return pfirst(__lsx_vfmin_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) { |
| Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); |
| return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) { |
| Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { |
| Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) { |
| return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) { |
| Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); |
| return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) { |
| Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) { |
| Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) { |
| return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a))); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { |
| Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E)); |
| return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { |
| return pfirst(__lsx_vfmax_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) { |
| Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); |
| return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) { |
| Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) { |
| Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) { |
| return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) { |
| Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E)); |
| return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) { |
| Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E)); |
| return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) { |
| Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E)); |
| return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1))); |
| } |
| template <> |
| EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) { |
| return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a))); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { |
| return __lsx_vfsqrt_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { |
| return __lsx_vfsqrt_d(a); |
| } |
| |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) { |
| Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); |
| Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); |
| Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]); |
| Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]); |
| |
| kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0); |
| kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0); |
| kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1); |
| kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) { |
| Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); |
| kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]); |
| kernel.packet[1] = tmp; |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]); |
| __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]); |
| __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]); |
| __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]); |
| __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]); |
| __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]); |
| __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]); |
| __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]); |
| |
| __m128i s0 = __lsx_vilvl_h(t2, t0); |
| __m128i s1 = __lsx_vilvh_h(t2, t0); |
| __m128i s2 = __lsx_vilvl_h(t3, t1); |
| __m128i s3 = __lsx_vilvh_h(t3, t1); |
| __m128i s4 = __lsx_vilvl_h(t6, t4); |
| __m128i s5 = __lsx_vilvh_h(t6, t4); |
| __m128i s6 = __lsx_vilvl_h(t7, t5); |
| __m128i s7 = __lsx_vilvh_h(t7, t5); |
| __m128i s8 = __lsx_vilvl_h(ta, t8); |
| __m128i s9 = __lsx_vilvh_h(ta, t8); |
| __m128i sa = __lsx_vilvl_h(tb, t9); |
| __m128i sb = __lsx_vilvh_h(tb, t9); |
| __m128i sc = __lsx_vilvl_h(te, tc); |
| __m128i sd = __lsx_vilvh_h(te, tc); |
| __m128i se = __lsx_vilvl_h(tf, td); |
| __m128i sf = __lsx_vilvh_h(tf, td); |
| |
| __m128i u0 = __lsx_vilvl_w(s4, s0); |
| __m128i u1 = __lsx_vilvh_w(s4, s0); |
| __m128i u2 = __lsx_vilvl_w(s5, s1); |
| __m128i u3 = __lsx_vilvh_w(s5, s1); |
| __m128i u4 = __lsx_vilvl_w(s6, s2); |
| __m128i u5 = __lsx_vilvh_w(s6, s2); |
| __m128i u6 = __lsx_vilvl_w(s7, s3); |
| __m128i u7 = __lsx_vilvh_w(s7, s3); |
| __m128i u8 = __lsx_vilvl_w(sc, s8); |
| __m128i u9 = __lsx_vilvh_w(sc, s8); |
| __m128i ua = __lsx_vilvl_w(sd, s9); |
| __m128i ub = __lsx_vilvh_w(sd, s9); |
| __m128i uc = __lsx_vilvl_w(se, sa); |
| __m128i ud = __lsx_vilvh_w(se, sa); |
| __m128i ue = __lsx_vilvl_w(sf, sb); |
| __m128i uf = __lsx_vilvh_w(sf, sb); |
| |
| kernel.packet[0] = __lsx_vilvl_d(u8, u0); |
| kernel.packet[1] = __lsx_vilvh_d(u8, u0); |
| kernel.packet[2] = __lsx_vilvl_d(u9, u1); |
| kernel.packet[3] = __lsx_vilvh_d(u9, u1); |
| kernel.packet[4] = __lsx_vilvl_d(ua, u2); |
| kernel.packet[5] = __lsx_vilvh_d(ua, u2); |
| kernel.packet[6] = __lsx_vilvl_d(ub, u3); |
| kernel.packet[7] = __lsx_vilvh_d(ub, u3); |
| kernel.packet[8] = __lsx_vilvl_d(uc, u4); |
| kernel.packet[9] = __lsx_vilvh_d(uc, u4); |
| kernel.packet[10] = __lsx_vilvl_d(ud, u5); |
| kernel.packet[11] = __lsx_vilvh_d(ud, u5); |
| kernel.packet[12] = __lsx_vilvl_d(ue, u6); |
| kernel.packet[13] = __lsx_vilvh_d(ue, u6); |
| kernel.packet[14] = __lsx_vilvl_d(uf, u7); |
| kernel.packet[15] = __lsx_vilvh_d(uf, u7); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); |
| |
| __m128i s0 = __lsx_vilvl_h(t2, t0); |
| __m128i s1 = __lsx_vilvh_h(t2, t0); |
| __m128i s2 = __lsx_vilvl_h(t3, t1); |
| __m128i s3 = __lsx_vilvh_h(t3, t1); |
| __m128i s4 = __lsx_vilvl_h(t6, t4); |
| __m128i s5 = __lsx_vilvh_h(t6, t4); |
| __m128i s6 = __lsx_vilvl_h(t7, t5); |
| __m128i s7 = __lsx_vilvh_h(t7, t5); |
| |
| kernel.packet[0] = __lsx_vilvl_w(s4, s0); |
| kernel.packet[1] = __lsx_vilvh_w(s4, s0); |
| kernel.packet[2] = __lsx_vilvl_w(s5, s1); |
| kernel.packet[3] = __lsx_vilvh_w(s5, s1); |
| kernel.packet[4] = __lsx_vilvl_w(s6, s2); |
| kernel.packet[5] = __lsx_vilvh_w(s6, s2); |
| kernel.packet[6] = __lsx_vilvl_w(s7, s3); |
| kernel.packet[7] = __lsx_vilvh_w(s7, s3); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_h(t2, t0); |
| kernel.packet[1] = __lsx_vilvh_h(t2, t0); |
| kernel.packet[2] = __lsx_vilvl_h(t3, t1); |
| kernel.packet[3] = __lsx_vilvh_h(t3, t1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) { |
| __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]); |
| |
| __m128i s0 = __lsx_vilvl_w(t2, t0); |
| __m128i s1 = __lsx_vilvh_w(t2, t0); |
| __m128i s2 = __lsx_vilvl_w(t3, t1); |
| __m128i s3 = __lsx_vilvh_w(t3, t1); |
| __m128i s4 = __lsx_vilvl_w(t6, t4); |
| __m128i s5 = __lsx_vilvh_w(t6, t4); |
| __m128i s6 = __lsx_vilvl_w(t7, t5); |
| __m128i s7 = __lsx_vilvh_w(t7, t5); |
| |
| kernel.packet[0] = __lsx_vilvl_d(s4, s0); |
| kernel.packet[1] = __lsx_vilvh_d(s4, s0); |
| kernel.packet[2] = __lsx_vilvl_d(s5, s1); |
| kernel.packet[3] = __lsx_vilvh_d(s5, s1); |
| kernel.packet[4] = __lsx_vilvl_d(s6, s2); |
| kernel.packet[5] = __lsx_vilvh_d(s6, s2); |
| kernel.packet[6] = __lsx_vilvl_d(s7, s3); |
| kernel.packet[7] = __lsx_vilvh_d(s7, s3); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) { |
| __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_w(t2, t0); |
| kernel.packet[1] = __lsx_vilvh_w(t2, t0); |
| kernel.packet[2] = __lsx_vilvl_w(t3, t1); |
| kernel.packet[3] = __lsx_vilvh_w(t3, t1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) { |
| __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]); |
| __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]); |
| __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]); |
| __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_d(T2, T0); |
| kernel.packet[1] = __lsx_vilvh_d(T2, T0); |
| kernel.packet[2] = __lsx_vilvl_d(T3, T1); |
| kernel.packet[3] = __lsx_vilvh_d(T3, T1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) { |
| __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]); |
| kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]); |
| kernel.packet[1] = tmp; |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]); |
| __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]); |
| __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]); |
| __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]); |
| __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]); |
| __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]); |
| __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]); |
| __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]); |
| |
| __m128i s0 = __lsx_vilvl_h(t2, t0); |
| __m128i s1 = __lsx_vilvh_h(t2, t0); |
| __m128i s2 = __lsx_vilvl_h(t3, t1); |
| __m128i s3 = __lsx_vilvh_h(t3, t1); |
| __m128i s4 = __lsx_vilvl_h(t6, t4); |
| __m128i s5 = __lsx_vilvh_h(t6, t4); |
| __m128i s6 = __lsx_vilvl_h(t7, t5); |
| __m128i s7 = __lsx_vilvh_h(t7, t5); |
| __m128i s8 = __lsx_vilvl_h(ta, t8); |
| __m128i s9 = __lsx_vilvh_h(ta, t8); |
| __m128i sa = __lsx_vilvl_h(tb, t9); |
| __m128i sb = __lsx_vilvh_h(tb, t9); |
| __m128i sc = __lsx_vilvl_h(te, tc); |
| __m128i sd = __lsx_vilvh_h(te, tc); |
| __m128i se = __lsx_vilvl_h(tf, td); |
| __m128i sf = __lsx_vilvh_h(tf, td); |
| |
| __m128i u0 = __lsx_vilvl_w(s4, s0); |
| __m128i u1 = __lsx_vilvh_w(s4, s0); |
| __m128i u2 = __lsx_vilvl_w(s5, s1); |
| __m128i u3 = __lsx_vilvh_w(s5, s1); |
| __m128i u4 = __lsx_vilvl_w(s6, s2); |
| __m128i u5 = __lsx_vilvh_w(s6, s2); |
| __m128i u6 = __lsx_vilvl_w(s7, s3); |
| __m128i u7 = __lsx_vilvh_w(s7, s3); |
| __m128i u8 = __lsx_vilvl_w(sc, s8); |
| __m128i u9 = __lsx_vilvh_w(sc, s8); |
| __m128i ua = __lsx_vilvl_w(sd, s9); |
| __m128i ub = __lsx_vilvh_w(sd, s9); |
| __m128i uc = __lsx_vilvl_w(se, sa); |
| __m128i ud = __lsx_vilvh_w(se, sa); |
| __m128i ue = __lsx_vilvl_w(sf, sb); |
| __m128i uf = __lsx_vilvh_w(sf, sb); |
| |
| kernel.packet[0] = __lsx_vilvl_d(u8, u0); |
| kernel.packet[1] = __lsx_vilvh_d(u8, u0); |
| kernel.packet[2] = __lsx_vilvl_d(u9, u1); |
| kernel.packet[3] = __lsx_vilvh_d(u9, u1); |
| kernel.packet[4] = __lsx_vilvl_d(ua, u2); |
| kernel.packet[5] = __lsx_vilvh_d(ua, u2); |
| kernel.packet[6] = __lsx_vilvl_d(ub, u3); |
| kernel.packet[7] = __lsx_vilvh_d(ub, u3); |
| kernel.packet[8] = __lsx_vilvl_d(uc, u4); |
| kernel.packet[9] = __lsx_vilvh_d(uc, u4); |
| kernel.packet[10] = __lsx_vilvl_d(ud, u5); |
| kernel.packet[11] = __lsx_vilvh_d(ud, u5); |
| kernel.packet[12] = __lsx_vilvl_d(ue, u6); |
| kernel.packet[13] = __lsx_vilvh_d(ue, u6); |
| kernel.packet[14] = __lsx_vilvl_d(uf, u7); |
| kernel.packet[15] = __lsx_vilvh_d(uf, u7); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]); |
| |
| __m128i s0 = __lsx_vilvl_h(t2, t0); |
| __m128i s1 = __lsx_vilvh_h(t2, t0); |
| __m128i s2 = __lsx_vilvl_h(t3, t1); |
| __m128i s3 = __lsx_vilvh_h(t3, t1); |
| __m128i s4 = __lsx_vilvl_h(t6, t4); |
| __m128i s5 = __lsx_vilvh_h(t6, t4); |
| __m128i s6 = __lsx_vilvl_h(t7, t5); |
| __m128i s7 = __lsx_vilvh_h(t7, t5); |
| |
| kernel.packet[0] = __lsx_vilvl_w(s4, s0); |
| kernel.packet[1] = __lsx_vilvh_w(s4, s0); |
| kernel.packet[2] = __lsx_vilvl_w(s5, s1); |
| kernel.packet[3] = __lsx_vilvh_w(s5, s1); |
| kernel.packet[4] = __lsx_vilvl_w(s6, s2); |
| kernel.packet[5] = __lsx_vilvh_w(s6, s2); |
| kernel.packet[6] = __lsx_vilvl_w(s7, s3); |
| kernel.packet[7] = __lsx_vilvh_w(s7, s3); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) { |
| __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_h(t2, t0); |
| kernel.packet[1] = __lsx_vilvh_h(t2, t0); |
| kernel.packet[2] = __lsx_vilvl_h(t3, t1); |
| kernel.packet[3] = __lsx_vilvh_h(t3, t1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) { |
| __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]); |
| __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]); |
| __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]); |
| __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]); |
| |
| __m128i s0 = __lsx_vilvl_w(t2, t0); |
| __m128i s1 = __lsx_vilvh_w(t2, t0); |
| __m128i s2 = __lsx_vilvl_w(t3, t1); |
| __m128i s3 = __lsx_vilvh_w(t3, t1); |
| __m128i s4 = __lsx_vilvl_w(t6, t4); |
| __m128i s5 = __lsx_vilvh_w(t6, t4); |
| __m128i s6 = __lsx_vilvl_w(t7, t5); |
| __m128i s7 = __lsx_vilvh_w(t7, t5); |
| |
| kernel.packet[0] = __lsx_vilvl_d(s4, s0); |
| kernel.packet[1] = __lsx_vilvh_d(s4, s0); |
| kernel.packet[2] = __lsx_vilvl_d(s5, s1); |
| kernel.packet[3] = __lsx_vilvh_d(s5, s1); |
| kernel.packet[4] = __lsx_vilvl_d(s6, s2); |
| kernel.packet[5] = __lsx_vilvh_d(s6, s2); |
| kernel.packet[6] = __lsx_vilvl_d(s7, s3); |
| kernel.packet[7] = __lsx_vilvh_d(s7, s3); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) { |
| __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]); |
| __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]); |
| __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_w(t2, t0); |
| kernel.packet[1] = __lsx_vilvh_w(t2, t0); |
| kernel.packet[2] = __lsx_vilvl_w(t3, t1); |
| kernel.packet[3] = __lsx_vilvh_w(t3, t1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) { |
| __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]); |
| __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]); |
| __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]); |
| __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]); |
| |
| kernel.packet[0] = __lsx_vilvl_d(T2, T0); |
| kernel.packet[1] = __lsx_vilvh_d(T2, T0); |
| kernel.packet[2] = __lsx_vilvl_d(T3, T1); |
| kernel.packet[3] = __lsx_vilvh_d(T3, T1); |
| } |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) { |
| __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]); |
| kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]); |
| kernel.packet[1] = tmp; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { |
| return __lsx_vfrsqrt_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { |
| return __lsx_vfrsqrt_d(a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { |
| return __lsx_vfrintrm_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { |
| return __lsx_vfrintrm_d(a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { |
| return __lsx_vfrintrp_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { |
| return __lsx_vfrintrp_d(a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { |
| const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u)); |
| const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu)); |
| return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a)); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { |
| const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull)); |
| const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull)); |
| return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a)); |
| } |
| |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { |
| return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) { |
| return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) { |
| int8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1), |
| *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2), |
| *(from + 3), *(from + 3), *(from + 3), *(from + 3)}; |
| return __lsx_vld(tmp, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) { |
| uint8_t tmp[16] = {*from, *from, *from, *from, *(from + 1), *(from + 1), |
| *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2), |
| *(from + 3), *(from + 3), *(from + 3), *(from + 3)}; |
| return __lsx_vld(tmp, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) { |
| int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)}; |
| return __lsx_vld(tmp, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) { |
| uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)}; |
| return __lsx_vld(tmp, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { |
| int32_t tmp[4] = {*from, *from, *from, *from}; |
| return __lsx_vld(tmp, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { |
| uint32_t tmp[4] = {*from, *from, *from, *from}; |
| return __lsx_vld(tmp, 0); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) { |
| return __lsx_vmsub_b(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) { |
| return __lsx_vmsub_h(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) { |
| return __lsx_vmsub_w(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) { |
| return __lsx_vmsub_d(pnegate(c), a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) { |
| return __lsx_vmadd_b(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) { |
| return __lsx_vmadd_h(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) { |
| return __lsx_vmadd_w(pnegate(c), a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) { |
| return __lsx_vmadd_d(pnegate(c), a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) { |
| return __lsx_vmsub_b(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { |
| return __lsx_vmsub_h(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { |
| return __lsx_vmsub_w(c, a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) { |
| return __lsx_vmsub_d(c, a, b); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) { |
| return pexp_float(_x); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) { |
| return pexp_double(_x); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) { |
| return pldexp_generic(a, exponent); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) { |
| return pfrexp_generic(a, exponent); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) { |
| return pfrexp_generic(a, exponent); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) { |
| Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f}; |
| return v; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) { |
| Packet4f v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return pmin<Packet4f>(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) { |
| return pmax<Packet4f>(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { |
| return (__m128)__lsx_vldrepl_w(from, 0); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) { |
| return (__m128)__lsx_vsrai_w((__m128i)a, 31); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { |
| return __lsx_vfrintrne_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) { |
| return __lsx_vfrintrz_s(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) { |
| return __lsx_vfrecip_s(a); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) { |
| Packet2d v = {0.0, 0.0}; |
| return v; |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return pmin<Packet2d>(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) { |
| return pmax<Packet2d>(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) { |
| return (__m128d)(__lsx_vsrai_d((__m128i)a, 63)); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { |
| return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { |
| return __lsx_vfrintrne_d(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) { |
| return __lsx_vfrintrz_d(a); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) { |
| return pldexp_generic(a, exponent); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) { |
| Packet16c v = psub(a, b); |
| return pabs(v); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) { |
| Packet8s v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) { |
| Packet4i v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| return __lsx_vdiv_bu(a, b); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { |
| Packet16uc v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, |
| const Packet16uc& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { |
| __m128i res = {0, 0}; |
| __m128i add = {0x0808080808080808, 0x0808080808080808}; |
| for (int i = 0; i < 4; i++) { |
| const __m128i temp = __lsx_vor_v(res, add); |
| const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp)); |
| res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a)); |
| add = __lsx_vsrli_b(add, 1); |
| } |
| return res; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) { |
| Packet8us v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { |
| __m128i res = {0, 0}; |
| __m128i add = {0x0080008000800080, 0x0080008000800080}; |
| for (int i = 0; i < 4; i++) { |
| const __m128i temp = __lsx_vor_v(res, add); |
| const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp)); |
| res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a)); |
| add = __lsx_vsrli_h(add, 1); |
| } |
| return res; |
| } |
| |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { |
| Packet4ui v = psub(a, b); |
| return pabs(v); |
| } |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| template <> |
| EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { |
| __m128i res = {0, 0}; |
| __m128i add = {0x0000800000008000, 0x0000800000008000}; |
| for (int i = 0; i < 4; i++) { |
| const __m128i temp = __lsx_vor_v(res, add); |
| const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp)); |
| res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a)); |
| add = __lsx_vsrli_w(add, 1); |
| } |
| return res; |
| } |
| |
| template <> |
| EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) { |
| return __lsx_vbitsel_v(b, a, mask); |
| } |
| |
| } // namespace internal |
| } // namespace Eigen |
| #endif |