Vectorize erfc(x) for double and improve erfc(x) for float.

commit: 0d366f65322564086afdbd2e46793ff04ba7ba88 [log] [tgz]
author: Rasmus Munk Larsen <rmlarsen@google.com> Fri Nov 08 17:21:11 2024 +0000
committer: Rasmus Munk Larsen <rmlarsen@google.com> Fri Nov 08 17:21:11 2024 +0000
tree: 882a59d9f761b1b0654cc74774199b584f3f4e68
parent: 8adf43640e6957d939bd76eaf9d2420c33083886 [diff]
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 58fdb08..1980e92 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h

@@ -124,6 +124,7 @@
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasBlend = 1
   };
 };
@@ -144,6 +145,7 @@
 #endif
     HasTanh = EIGEN_FAST_MATH,
     HasLog = 1,
+    HasErfc = 1,
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 0681a0a..78d17d5 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h

@@ -133,6 +133,7 @@
     HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasCmp = 1,
     HasDiv = 1
   };
@@ -154,6 +155,7 @@
     HasExp = 1,
     HasATan = 1,
     HasTanh = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasATanh = 1,
     HasCmp = 1,
     HasDiv = 1

diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7401c0b..da26cd4 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h

@@ -193,6 +193,7 @@
 #endif
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
@@ -3182,6 +3183,7 @@
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasATanh = 1,
     HasATan = 0,
     HasLog = 0,

diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a184931..d3c067e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h

@@ -1659,6 +1659,13 @@
   p_lo = pmsub(x, y, p_hi);
 }
 
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  return pmsub(x, y, xy);
+}
+
 #else
 
 // This function implements the Veltkamp splitting. Given a floating point
@@ -1694,6 +1701,21 @@
   p_lo = pmadd(x_lo, y_lo, p_lo);
 }
 
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+  return p_lo;
+}
+
 #endif  // EIGEN_VECTORIZE_FMA
 
 // This function implements Dekker's algorithm for the addition

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 414f0f5..2f401fd 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h

@@ -209,6 +209,7 @@
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasBessel = 0,  // Issues with accuracy.
     HasNdtri = 0
   };
@@ -5140,7 +5141,8 @@
     HasSqrt = 1,
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf = 0
+    HasErf = 0,
+    HasErfc = EIGEN_FAST_MATH
   };
 };
 

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 37f6048..c749763 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h

@@ -197,6 +197,7 @@
     HasRsqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
     HasBlend = 1,
     HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
@@ -216,6 +217,7 @@
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasLog = 1,
+    HasErfc = EIGEN_FAST_MATH,
     HasExp = 1,
     HasSqrt = 1,
     HasRsqrt = 1,

diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 51bbfe0..952d756 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h

@@ -360,7 +360,8 @@
     HasExp = 1,
     HasSqrt = 1,
     HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
   };
 };
 

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 5169f1c..0b266f9 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h

@@ -345,11 +345,17 @@
 /***************************************************************************
  * Implementation of erfc, requires C++11/C99                               *
  ****************************************************************************/
+template <typename Scalar>
+struct generic_fast_erfc {
+  template <typename T>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+template <>
 template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
-  const T x_abs = pmin(pabs(x), pset1<T>(10.0f));
-  const T one = pset1<T>(1.0f);
-  const T x_abs_gt_one_mask = pcmp_lt(one, x_abs);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<float>::run(const T& x_in) {
+  constexpr float kClamp = 11.0f;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
 
   // erfc(x) = 1 + x * S(x^2), |x| <= 1.
   //
@@ -360,10 +366,12 @@
                              2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
                              3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
   const T x2 = pmul(x, x);
+  const T one = pset1<T>(1.0);
   const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
 
   // Return early if we don't need the more expensive approximation for any
   // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
   if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
 
   // erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9.
@@ -377,23 +385,103 @@
   constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f,
                              1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f,
                              9.5662862062454223632812500e-02f};
-  const T z = pexp(pnegate(x2));
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error
+  // from 34 ulps to below 5 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
   const T q2 = preciprocal(x2);
   const T num = ppolevl<T, 3>::run(q2, gamma);
-  const T denom = pmul(x_abs, ppolevl<T, 4>::run(q2, delta));
+  const T denom = pmul(x, ppolevl<T, 4>::run(q2, delta));
   const T r = pdiv(num, denom);
-  // If x < -1 then use erfc(x) = 2 - erfc(|x|).
-  const T x_negative = pcmp_lt(x, pset1<T>(0.0f));
-  const T erfc_large = pselect(x_negative, pnmadd(z, r, pset1<T>(2.0f)), pmul(z, r));
-
+  const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+  const T erfc_large = pmadd(z, r, maybe_two);
   return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
 }
 
-template <typename Scalar>
-struct erfc_impl {
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+  // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
+  // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+  constexpr double kClamp = 28.0;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+  // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
+  //
+  // Coefficients for S and T generated with Rminimax command:
+  //  ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
+  //  --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
+  constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
+                              -1.8272566210022942682217328425053892715368419885635375976562500e-03,
+                              -4.5303363351690106863856044583371840417385101318359375000000000e-02,
+                              -1.4215015503619179981775744181504705920815467834472656250000000e-01,
+                              -1.1283791670955125585606992899556644260883331298828125000000000e+00};
+  constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
+                             6.8117805899186819641732970609382391558028757572174072265625000e-04,
+                             1.0582026056098614921752165685120417037978768348693847656250000e-02,
+                             9.3252603143757495374188692949246615171432495117187500000000000e-02,
+                             4.5931062818368939559832142549566924571990966796875000000000000e-01,
+                             1.0};
+  const T x2 = pmul(x, x);
+  const T num_small = ppolevl<T, 4>::run(x2, alpha);
+  const T denom_small = ppolevl<T, 5>::run(x2, beta);
+  const T one = pset1<T>(1.0);
+  const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+  // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
+  //
+  // Coefficients for P and Q generated with Rminimax command:
+  //  ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"  --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+  //  --denF="[D]" --log --dispCoeff="dec"
+  constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
+                              1.0909912393738931124520519233556115068495273590087890625000000e-02,
+                              1.0628604636755033252537572252549580298364162445068359375000000e-01,
+                              3.3492472973137982217295416376146022230386734008789062500000000e-01,
+                              4.5065776215933289750026347064704168587923049926757812500000000e-01,
+                              2.9433039130294824659017649537418037652969360351562500000000000e-01,
+                              9.8792676360600226170838311645638896152377128601074218750000000e-02,
+                              1.7095935395503719655962981960328761488199234008789062500000000e-02,
+                              1.4249109729504577659398023570247460156679153442382812500000000e-03,
+                              4.4567378313647954771875570045835956989321857690811157226562500e-05};
+  constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03,
+                              5.316030659946043707142493417450168635696172714233398437500000e-02,
+                              3.426242193784684864077405563875799998641014099121093750000000e-01,
+                              8.565637124308049799026321124983951449394226074218750000000000e-01,
+                              1.000000000000000000000000000000000000000000000000000000000000e+00,
+                              5.968805280570776972126623149961233139038085937500000000000000e-01,
+                              1.890922854723317836356244470152887515723705291748046875000000e-01,
+                              3.152505418656005586885981983868987299501895904541015625000000e-02,
+                              2.565085751861882583380047861965067568235099315643310546875000e-03,
+                              7.899362131678837697403017248376499992446042597293853759765625e-05};
+
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error
+  // from 258 ulps to below 7 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+  const T q2 = preciprocal(x2);
+  const T num_large = ppolevl<T, 9>::run(q2, gamma);
+  const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
+  const T r = pdiv(num_large, denom_large);
+  const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+  const T erfc_large = pmadd(z, r, maybe_two);
+  return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
+}
+
+template <typename T>
+struct erfc_impl {
+  typedef typename unpacket_traits<T>::type Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc<Scalar>::run(x); }
 };
 
 template <typename Scalar>
@@ -408,7 +496,7 @@
 #if defined(SYCL_DEVICE_ONLY)
     return cl::sycl::erfc(x);
 #else
-    return generic_fast_erfc_float(x);
+    return generic_fast_erfc<float>::run(x);
 #endif
   }
 };
@@ -419,7 +507,7 @@
 #if defined(SYCL_DEVICE_ONLY)
     return cl::sycl::erfc(x);
 #else
-    return ::erfc(x);
+    return generic_fast_erfc<double>::run(x);
 #endif
   }
 };

diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
index 1ec5c3d..044ce67 100644
--- a/unsupported/test/special_packetmath.cpp
+++ b/unsupported/test/special_packetmath.cpp

@@ -117,6 +117,8 @@
 #if EIGEN_HAS_C99_MATH
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+  // FIXME(rmlarsen): This test occasionally fails due to difference in tiny subnormal results
+  // near the underflow boundary. I am not sure which version is correct.
   CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
 #endif
 }
commit	0d366f65322564086afdbd2e46793ff04ba7ba88	[log] [tgz]
author	Rasmus Munk Larsen <rmlarsen@google.com>	Fri Nov 08 17:21:11 2024 +0000
committer	Rasmus Munk Larsen <rmlarsen@google.com>	Fri Nov 08 17:21:11 2024 +0000
tree	882a59d9f761b1b0654cc74774199b584f3f4e68
parent	8adf43640e6957d939bd76eaf9d2420c33083886 [diff]