Vectorize erfc(x) for double and improve erfc(x) for float.
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 58fdb08..1980e92 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -124,6 +124,7 @@
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasBlend = 1
};
};
@@ -144,6 +145,7 @@
#endif
HasTanh = EIGEN_FAST_MATH,
HasLog = 1,
+ HasErfc = 1,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 0681a0a..78d17d5 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -133,6 +133,7 @@
HasReciprocal = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasCmp = 1,
HasDiv = 1
};
@@ -154,6 +155,7 @@
HasExp = 1,
HasATan = 1,
HasTanh = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasCmp = 1,
HasDiv = 1
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7401c0b..da26cd4 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -193,6 +193,7 @@
#endif
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
#else
HasSqrt = 0,
HasRsqrt = 0,
@@ -3182,6 +3183,7 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasATanh = 1,
HasATan = 0,
HasLog = 0,
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a184931..d3c067e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -1659,6 +1659,13 @@
p_lo = pmsub(x, y, p_hi);
}
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+ return pmsub(x, y, xy);
+}
+
#else
// This function implements the Veltkamp splitting. Given a floating point
@@ -1694,6 +1701,21 @@
p_lo = pmadd(x_lo, y_lo, p_lo);
}
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+ Packet x_hi, x_lo, y_hi, y_lo;
+ veltkamp_splitting(x, x_hi, x_lo);
+ veltkamp_splitting(y, y_hi, y_lo);
+
+ Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+ p_lo = pmadd(x_hi, y_lo, p_lo);
+ p_lo = pmadd(x_lo, y_hi, p_lo);
+ p_lo = pmadd(x_lo, y_lo, p_lo);
+ return p_lo;
+}
+
#endif // EIGEN_VECTORIZE_FMA
// This function implements Dekker's algorithm for the addition
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 414f0f5..2f401fd 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -209,6 +209,7 @@
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasBessel = 0, // Issues with accuracy.
HasNdtri = 0
};
@@ -5140,7 +5141,8 @@
HasSqrt = 1,
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasErf = 0
+ HasErf = 0,
+ HasErfc = EIGEN_FAST_MATH
};
};
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 37f6048..c749763 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -197,6 +197,7 @@
HasRsqrt = 1,
HasTanh = EIGEN_FAST_MATH,
HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH,
HasBlend = 1,
HasSign = 0 // The manually vectorized version is slightly slower for SSE.
};
@@ -216,6 +217,7 @@
HasCos = EIGEN_FAST_MATH,
HasTanh = EIGEN_FAST_MATH,
HasLog = 1,
+ HasErfc = EIGEN_FAST_MATH,
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h
index 51bbfe0..952d756 100644
--- a/Eigen/src/Core/arch/SVE/PacketMath.h
+++ b/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -360,7 +360,8 @@
HasExp = 1,
HasSqrt = 1,
HasTanh = EIGEN_FAST_MATH,
- HasErf = EIGEN_FAST_MATH
+ HasErf = EIGEN_FAST_MATH,
+ HasErfc = EIGEN_FAST_MATH
};
};
diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 5169f1c..0b266f9 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -345,11 +345,17 @@
/***************************************************************************
* Implementation of erfc, requires C++11/C99 *
****************************************************************************/
+template <typename Scalar>
+struct generic_fast_erfc {
+ template <typename T>
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+template <>
template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc_float(const T& x) {
- const T x_abs = pmin(pabs(x), pset1<T>(10.0f));
- const T one = pset1<T>(1.0f);
- const T x_abs_gt_one_mask = pcmp_lt(one, x_abs);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<float>::run(const T& x_in) {
+ constexpr float kClamp = 11.0f;
+ const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
// erfc(x) = 1 + x * S(x^2), |x| <= 1.
//
@@ -360,10 +366,12 @@
2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
const T x2 = pmul(x, x);
+ const T one = pset1<T>(1.0);
const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
// Return early if we don't need the more expensive approximation for any
// entry in a.
+ const T x_abs_gt_one_mask = pcmp_lt(one, x2);
if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9.
@@ -377,23 +385,103 @@
constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f,
1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f,
9.5662862062454223632812500e-02f};
- const T z = pexp(pnegate(x2));
+ const T x2_lo = twoprod_low(x, x, x2);
+ // Here we use that
+ // exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+ // since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error
+ // from 34 ulps to below 5 ulps.
+ const T exp2_hi = pexp(pnegate(x2));
+ const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
const T q2 = preciprocal(x2);
const T num = ppolevl<T, 3>::run(q2, gamma);
- const T denom = pmul(x_abs, ppolevl<T, 4>::run(q2, delta));
+ const T denom = pmul(x, ppolevl<T, 4>::run(q2, delta));
const T r = pdiv(num, denom);
- // If x < -1 then use erfc(x) = 2 - erfc(|x|).
- const T x_negative = pcmp_lt(x, pset1<T>(0.0f));
- const T erfc_large = pselect(x_negative, pnmadd(z, r, pset1<T>(2.0f)), pmul(z, r));
-
+ const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+ const T erfc_large = pmadd(z, r, maybe_two);
return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
}
-template <typename Scalar>
-struct erfc_impl {
- EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+ // Clamp x to [-27:27] beyond which erfc(x) is either two or zero (below the underflow threshold).
+ // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+ constexpr double kClamp = 28.0;
+ const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
- EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+ // erfc(x) = 1 + x * S(x^2) / T(x^2), |x| <= 1.
+ //
+ // Coefficients for S and T generated with Rminimax command:
+ // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[9,10]
+ // --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
+ constexpr double alpha[] = {-1.9493725660006057018823477644531294572516344487667083740234375e-04,
+ -1.8272566210022942682217328425053892715368419885635375976562500e-03,
+ -4.5303363351690106863856044583371840417385101318359375000000000e-02,
+ -1.4215015503619179981775744181504705920815467834472656250000000e-01,
+ -1.1283791670955125585606992899556644260883331298828125000000000e+00};
+ constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
+ 6.8117805899186819641732970609382391558028757572174072265625000e-04,
+ 1.0582026056098614921752165685120417037978768348693847656250000e-02,
+ 9.3252603143757495374188692949246615171432495117187500000000000e-02,
+ 4.5931062818368939559832142549566924571990966796875000000000000e-01,
+ 1.0};
+ const T x2 = pmul(x, x);
+ const T num_small = ppolevl<T, 4>::run(x2, alpha);
+ const T denom_small = ppolevl<T, 5>::run(x2, beta);
+ const T one = pset1<T>(1.0);
+ const T erfc_small = pmadd(x, pdiv(num_small, denom_small), one);
+
+ // Return early if we don't need the more expensive approximation for any
+ // entry in a.
+ const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+ if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+ // erfc(x) = exp(-x^2) * 1/x * P(x) / Q(x), 1 < x < 27.
+ //
+ // Coefficients for P and Q generated with Rminimax command:
+ // ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)" --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+ // --denF="[D]" --log --dispCoeff="dec"
+ constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
+ 1.0909912393738931124520519233556115068495273590087890625000000e-02,
+ 1.0628604636755033252537572252549580298364162445068359375000000e-01,
+ 3.3492472973137982217295416376146022230386734008789062500000000e-01,
+ 4.5065776215933289750026347064704168587923049926757812500000000e-01,
+ 2.9433039130294824659017649537418037652969360351562500000000000e-01,
+ 9.8792676360600226170838311645638896152377128601074218750000000e-02,
+ 1.7095935395503719655962981960328761488199234008789062500000000e-02,
+ 1.4249109729504577659398023570247460156679153442382812500000000e-03,
+ 4.4567378313647954771875570045835956989321857690811157226562500e-05};
+ constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03,
+ 5.316030659946043707142493417450168635696172714233398437500000e-02,
+ 3.426242193784684864077405563875799998641014099121093750000000e-01,
+ 8.565637124308049799026321124983951449394226074218750000000000e-01,
+ 1.000000000000000000000000000000000000000000000000000000000000e+00,
+ 5.968805280570776972126623149961233139038085937500000000000000e-01,
+ 1.890922854723317836356244470152887515723705291748046875000000e-01,
+ 3.152505418656005586885981983868987299501895904541015625000000e-02,
+ 2.565085751861882583380047861965067568235099315643310546875000e-03,
+ 7.899362131678837697403017248376499992446042597293853759765625e-05};
+
+ const T x2_lo = twoprod_low(x, x, x2);
+ // Here we use that
+ // exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+ // since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error
+ // from 258 ulps to below 7 ulps.
+ const T exp2_hi = pexp(pnegate(x2));
+ const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+ const T q2 = preciprocal(x2);
+ const T num_large = ppolevl<T, 9>::run(q2, gamma);
+ const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
+ const T r = pdiv(num_large, denom_large);
+ const T maybe_two = pand(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0));
+ const T erfc_large = pmadd(z, r, maybe_two);
+ return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
+}
+
+template <typename T>
+struct erfc_impl {
+ typedef typename unpacket_traits<T>::type Scalar;
+ EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc<Scalar>::run(x); }
};
template <typename Scalar>
@@ -408,7 +496,7 @@
#if defined(SYCL_DEVICE_ONLY)
return cl::sycl::erfc(x);
#else
- return generic_fast_erfc_float(x);
+ return generic_fast_erfc<float>::run(x);
#endif
}
};
@@ -419,7 +507,7 @@
#if defined(SYCL_DEVICE_ONLY)
return cl::sycl::erfc(x);
#else
- return ::erfc(x);
+ return generic_fast_erfc<double>::run(x);
#endif
}
};
diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp
index 1ec5c3d..044ce67 100644
--- a/unsupported/test/special_packetmath.cpp
+++ b/unsupported/test/special_packetmath.cpp
@@ -117,6 +117,8 @@
#if EIGEN_HAS_C99_MATH
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma);
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf);
+ // FIXME(rmlarsen): This test occasionally fails due to difference in tiny subnormal results
+ // near the underflow boundary. I am not sure which version is correct.
CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc);
#endif
}