Fix a bug in pcmp_lt_or_nan and add sqrt support for SVE

commit: 072ec9d95403b449547cbdd8186dd07f9248e2de [log] [tgz]
author: qile lin <dgrlql2022@gmail.com> Wed Sep 04 21:45:39 2024 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> Wed Sep 04 21:45:39 2024 +0000
tree: ca03b57fe1a136f98f63c6d3fb3c093276d22fb0
parent: 9315389795d8da2401aec5adf064bf7d7a06d100 [diff]
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 924f897..51bbfe0 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h

@@ -358,7 +358,7 @@
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,
     HasExp = 1,
-    HasSqrt = 0,
+    HasSqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH
   };
@@ -478,12 +478,12 @@
   return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
 }
 
-// Do a predicate inverse (svnot_b_x) on the predicate resulted from the
+// Do a predicate inverse (svnot_b_z) on the predicate resulting from the
 // greater/equal comparison (svcmpge_f32). Then fill a float vector with the
 // active elements.
 template <>
 EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_x(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
 }
 
 template <>
@@ -660,6 +660,11 @@
   return pldexp_generic(a, exponent);
 }
 
+template <>
+EIGEN_STRONG_INLINE PacketXf psqrt<PacketXf>(const PacketXf& a) {
+  return svsqrt_f32_x(svptrue_b32(), a);
+}
+
 }  // namespace internal
 }  // namespace Eigen
commit	072ec9d95403b449547cbdd8186dd07f9248e2de	[log] [tgz]
author	qile lin <dgrlql2022@gmail.com>	Wed Sep 04 21:45:39 2024 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	Wed Sep 04 21:45:39 2024 +0000
tree	ca03b57fe1a136f98f63c6d3fb3c093276d22fb0
parent	9315389795d8da2401aec5adf064bf7d7a06d100 [diff]