1)provide a better generic paddsub op implementation 2)make paddsub op support the Packet2cf/Packet4f/Packet2f in NEON 3)make paddsub op support the Packet2cf/Packet4f in SSE

commit: 38ae5353ab6f1050aed64821ac56a1561096cdce [log] [tgz]
author: Guoqiang QI <425418567@qq.com> Wed Jan 13 22:54:03 2021 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> Wed Jan 13 22:54:03 2021 +0000
tree: bc2cbea514d3843d300695559b96abeac4ceefc3
parent: 352f1422d3ceea19a04cab297c6339e0870e1c6c [diff]
diff --git a/Eigen/Geometry b/Eigen/Geometry
index 16b4bd6..e93fa72 100644
--- a/Eigen/Geometry
+++ b/Eigen/Geometry

@@ -50,8 +50,8 @@
 #include "src/Geometry/Umeyama.h"
 
 // Use the SSE optimized version whenever possible.
-#if defined EIGEN_VECTORIZE_SSE
-#include "src/Geometry/arch/Geometry_SSE.h"
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
 
 #include "src/Core/util/ReenableStupidWarnings.h"

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 671ed3c..ec7d20e 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h

@@ -407,6 +407,12 @@
 template<> EIGEN_DEVICE_FUNC inline unsigned long long
 pabs(const unsigned long long& a) { return a; }
 
+/** \internal \returns the addsub value of \a a,b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+ }
+
 /** \internal \returns the phase angle of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 parg(const Packet& a) { using numext::arg; return arg(a); }

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 9253d8c..f400934 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h

@@ -19,6 +19,12 @@
 namespace Eigen {
 namespace internal {
 
+template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet
+pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */) {
+  EIGEN_STATIC_ASSERT(unpacket_traits<Packet>::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE);
+  return pload<Packet>(a);
+}
+
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pfrexp_float(const Packet& a, Packet& exponent) {
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 491f1c9..a623f54 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h

@@ -17,6 +17,10 @@
 // implemented in GenericPacketMathFunctions.h
 // This is needed to workaround a circular dependency.
 
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */
+template<typename Packet, int N> EIGEN_DEVICE_FUNC inline Packet
+pset(const typename unpacket_traits<Packet>::type (&a)[N] /* a */);
+
 template<typename Packet> EIGEN_STRONG_INLINE Packet
 pfrexp_float(const Packet& a, Packet& exponent);
 

diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h
index a889ab1..1aa361b 100644
--- a/Eigen/src/Core/arch/NEON/Complex.h
+++ b/Eigen/src/Core/arch/NEON/Complex.h

@@ -124,6 +124,13 @@
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
+template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f};
+  return Packet2cf(padd(a.v, pxor(mask, b.v)));  
+}
+
 template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
 

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 14f3dbd..1f34faa 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h

@@ -84,12 +84,18 @@
 
 #endif // EIGEN_COMP_MSVC
 
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
+  return res;
+}
+
 // fuctionally equivalent to _mm_shuffle_ps in SSE when interleave
 // == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
 // interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
-// to enable a shared implementation for fast inversion of matrices of size 4.
-template<bool interleave>
-EIGEN_STRONG_INLINE Packet4f shuffle(const Packet4f &m, const Packet4f &n, int mask)
+// to enable a shared implementation for fast inversion of matrices of size 4. 
+template<bool interleave> 
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int mask)
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
@@ -97,8 +103,8 @@
   return res;
 }
 
-template<>
-EIGEN_STRONG_INLINE Packet4f shuffle<true>(const Packet4f &m, const Packet4f &n, int mask)
+template<> 
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n, int mask) 
 {
   const float* a = reinterpret_cast<const float*>(&m);
   const float* b = reinterpret_cast<const float*>(&n);
@@ -108,25 +114,29 @@
 
 EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {return ((s)<<6|(r)<<4|(q)<<2|(p));}
 
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s)
+{ 
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
 EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s)
-{
-  return shuffle<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
+{ 
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(p, q, r, s));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
+  return shuffle2<false>(a,b,eigen_neon_shuffle_mask(0, 1, 0, 1));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
+  return shuffle2<false>(b,a,eigen_neon_shuffle_mask(2, 3, 2, 3));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(0, 0, 1, 1));
 }
 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
 {
-  return shuffle<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
+  return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
 }
 #define vec4f_duplane(a, p) \
   vdupq_lane_f32(vget_low_f32(a), p)
@@ -851,6 +861,17 @@
 template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
+  Packet2f mask = {-0.0f, 0.0f};
+  return padd(a, pxor(mask, b));
+}
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f};
+  return padd(a, pxor(mask, b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
@@ -3717,6 +3738,12 @@
 
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
+  const Packet2d mask = {-0.0,0.0};
+  return padd(a, pxor(mask, b));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index 58cdb5d..f6f1b8c 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h

@@ -66,6 +66,13 @@
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b);
+template<> EIGEN_STRONG_INLINE Packet2cf paddsub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0));
+  return Packet2cf(padd(a.v, pxor(mask, b.v)));
+}
+
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
 {
   const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 4e733c7..4c5b664 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h

@@ -301,6 +301,28 @@
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
 
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a,b);
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
+  return padd(a, pxor(mask, b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
+template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) 
+{
+#ifdef EIGEN_VECTORIZE_SSE3  
+  return _mm_addsub_pd(a,b); 
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0)); 
+  return padd(a, pxor(mask, b));
+#endif
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
 {
   const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));

diff --git a/Eigen/src/Core/util/StaticAssert.h b/Eigen/src/Core/util/StaticAssert.h
index 95107ff..2ef19a4 100644
--- a/Eigen/src/Core/util/StaticAssert.h
+++ b/Eigen/src/Core/util/StaticAssert.h

@@ -105,7 +105,8 @@
         CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
         SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
         INVALID_TEMPLATE_PARAMETER=1,
-        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
+        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1,
+        THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE=1
       };
     };
 

diff --git a/Eigen/src/Geometry/arch/Geometry_SSE.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h
similarity index 79%
rename from Eigen/src/Geometry/arch/Geometry_SSE.h
rename to Eigen/src/Geometry/arch/Geometry_SIMD.h
index 108cc9f..9c15bfb 100644
--- a/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h

@@ -8,15 +8,15 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_GEOMETRY_SSE_H
-#define EIGEN_GEOMETRY_SSE_H
+#ifndef EIGEN_GEOMETRY_SIMD_H
+#define EIGEN_GEOMETRY_SIMD_H
 
 namespace Eigen { 
 
 namespace internal {
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
+struct quat_product<Architecture::Target, Derived, OtherDerived, float>
 {
   enum {
     AAlignment = traits<Derived>::Alignment,
@@ -28,7 +28,8 @@
     evaluator<typename Derived::Coefficients> ae(_a.coeffs());
     evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    float arr[4] = {0.f, 0.f, 0.f, -0.f};
+    const Packet4f mask = pset<Packet4f>(arr);
     Packet4f a = ae.template packet<AAlignment,Packet4f>(0);
     Packet4f b = be.template packet<BAlignment,Packet4f>(0);
     Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
@@ -45,7 +46,7 @@
 };
 
 template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, float>
+struct quat_conj<Architecture::Target, Derived, float>
 {
   enum {
     ResAlignment = traits<Quaternion<float> >::Alignment
@@ -54,7 +55,8 @@
   {
     evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
+    float arr[4] = {-0.f,-0.f,-0.f,0.f};
+    const Packet4f mask = pset<Packet4f>(arr);
     pstoret<float,Packet4f,ResAlignment>(&res.x(), pxor(mask, qe.template packet<traits<Derived>::Alignment,Packet4f>(0)));
     return res;
   }
@@ -62,7 +64,7 @@
 
 
 template<typename VectorLhs,typename VectorRhs>
-struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
+struct cross3_impl<Architecture::Target,VectorLhs,VectorRhs,float,true>
 {
   enum {
     ResAlignment = traits<typename plain_matrix_type<VectorLhs>::type>::Alignment
@@ -84,9 +86,10 @@
 
 
 
+#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64)
 
 template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
+struct quat_product<Architecture::Target, Derived, OtherDerived, double>
 {
   enum {
     BAlignment = traits<OtherDerived>::Alignment,
@@ -95,8 +98,6 @@
 
   static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-
   Quaternion<double> res;
 
   evaluator<typename Derived::Coefficients> ae(_a.coeffs());
@@ -120,12 +121,7 @@
    */
   t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
   t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.x(), padd(t1, pxor(mask,preverse(t2))));
-#endif
+  pstoret<double,Packet2d,ResAlignment>(&res.x(), paddsub(t1, preverse(t2)));
   
   /*
    * t1 = ww*zw - yy*xy
@@ -134,19 +130,14 @@
    */
   t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
   t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
-#else
-  pstoret<double,Packet2d,ResAlignment>(&res.z(), psub(t1, pxor(mask,preverse(t2))));
-#endif
+  pstoret<double,Packet2d,ResAlignment>(&res.z(), preverse(paddsub(preverse(t1), t2)));
 
   return res;
 }
 };
 
 template<class Derived>
-struct quat_conj<Architecture::SSE, Derived, double>
+struct quat_conj<Architecture::Target, Derived, double>
 {
   enum {
     ResAlignment = traits<Quaternion<double> >::Alignment
@@ -155,16 +146,20 @@
   {
     evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<double> res;
-    const Packet2d mask0 = _mm_setr_pd(-0.,-0.);
-    const Packet2d mask2 = _mm_setr_pd(-0.,0.);
+    double arr1[2] = {-0.0, -0.0};
+    double arr2[2] = {-0.0,  0.0};
+    const Packet2d mask0 = pset<Packet2d>(arr1);
+    const Packet2d mask2 = pset<Packet2d>(arr2);
     pstoret<double,Packet2d,ResAlignment>(&res.x(), pxor(mask0, qe.template packet<traits<Derived>::Alignment,Packet2d>(0)));
     pstoret<double,Packet2d,ResAlignment>(&res.z(), pxor(mask2, qe.template packet<traits<Derived>::Alignment,Packet2d>(2)));
     return res;
   }
 };
 
+#endif // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64
+
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_GEOMETRY_SSE_H
+#endif // EIGEN_GEOMETRY_SIMD_H
commit	38ae5353ab6f1050aed64821ac56a1561096cdce	[log] [tgz]
author	Guoqiang QI <425418567@qq.com>	Wed Jan 13 22:54:03 2021 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	Wed Jan 13 22:54:03 2021 +0000
tree	bc2cbea514d3843d300695559b96abeac4ceefc3
parent	352f1422d3ceea19a04cab297c6339e0870e1c6c [diff]