Vectorize acos, asin, and atan for float.
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index bddd6aa..43aea54 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -34,6 +34,24 @@
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+pasin<Packet8f>(const Packet8f& _x) {
+ return pasin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+pacos<Packet8f>(const Packet8f& _x) {
+ return pacos_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+patan<Packet8f>(const Packet8f& _x) {
+ return patan_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
plog<Packet8f>(const Packet8f& _x) {
return plog_float(_x);
}
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 7608776..8f346f3 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -73,6 +73,9 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 6afcb4e..79a9043 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -259,6 +259,24 @@
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+pacos<Packet16f>(const Packet16f& _x) {
+ return pacos_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+pasin<Packet16f>(const Packet16f& _x) {
+ return pasin_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+patan<Packet16f>(const Packet16f& _x) {
+ return patan_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
ptanh<Packet16f>(const Packet16f& _x) {
return internal::generic_fast_tanh_float(_x);
}
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 0158b12..7b07149 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -122,6 +122,9 @@
HasBlend = 0,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
#if EIGEN_HAS_AVX512_MATH
HasLog = 1,
HasLog1p = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index c85ca13..fffd2e5 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -42,6 +42,24 @@
return pcos_float(_x);
}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f pacos<Packet4f>(const Packet4f& _x)
+{
+ return pacos_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f pasin<Packet4f>(const Packet4f& _x)
+{
+ return pasin_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patan<Packet4f>(const Packet4f& _x)
+{
+ return patan_float(_x);
+}
+
#ifdef __VSX__
#ifndef EIGEN_COMP_CLANG
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 7460bdc..f030189 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -168,6 +168,9 @@
HasAbs = 1,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
HasLog = 1,
HasExp = 1,
#ifdef __VSX__
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 694ccfc..9dc26f4 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -719,6 +719,151 @@
return psincos_float<false>(x);
}
+// Generic implementation of acos(x).
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pacos_float(const Packet& x_in) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+ const Packet cst_one = pset1<Packet>(Scalar(1));
+ const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
+ const Packet p6 = pset1<Packet>(Scalar(2.26911413483321666717529296875e-3));
+ const Packet p5 = pset1<Packet>(Scalar(-1.1063250713050365447998046875e-2));
+ const Packet p4 = pset1<Packet>(Scalar(2.680264413356781005859375e-2));
+ const Packet p3 = pset1<Packet>(Scalar(-4.87488098442554473876953125e-2));
+ const Packet p2 = pset1<Packet>(Scalar(8.874166011810302734375e-2));
+ const Packet p1 = pset1<Packet>(Scalar(-0.2145837843418121337890625));
+ const Packet p0 = pset1<Packet>(Scalar(1.57079613208770751953125));
+
+ // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
+ // function, by a 6'th order polynomial.
+ // For x in [-1:0) we use that acos(-x) = pi - acos(x).
+ const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
+ Packet x = pabs(x_in);
+ const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
+
+ // Evaluate the polynomial using Horner's rule:
+ // P(x) = p0 + x * (p1 + x * (p2 + ... (p5 + x * p6)) ... ) .
+ // We evaluate even and odd terms independently to increase
+ // instruction level parallelism.
+ Packet x2 = pmul(x_in,x_in);
+ Packet p_even = pmadd(p6, x2, p4);
+ Packet p_odd = pmadd(p5, x2, p3);
+ p_even = pmadd(p_even, x2, p2);
+ p_odd = pmadd(p_odd, x2, p1);
+ p_even = pmadd(p_even, x2, p0);
+ Packet p = pmadd(p_odd, x, p_even);
+
+ // The polynomial approximates acos(x)/sqrt(1-x), so
+ // multiply by sqrt(1-x) to get acos(x).
+ Packet denom = psqrt(psub(cst_one, x));
+ Packet result = pmul(denom, p);
+
+ // Undo mapping for negative arguments.
+ result = pselect(neg_mask, psub(cst_pi, result), result);
+ // Return NaN for arguments outside [-1:1].
+ return pselect(invalid_mask,
+ pset1<Packet>(std::numeric_limits<float>::quiet_NaN()),
+ result);
+}
+
+// Generic implementation of asin(x).
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pasin_float(const Packet& x_in) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+ // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
+ // even terms only.
+ const Packet p9 = pset1<Packet>(Scalar(5.08838854730129241943359375e-2f));
+ const Packet p7 = pset1<Packet>(Scalar(3.95139865577220916748046875e-2f));
+ const Packet p5 = pset1<Packet>(Scalar(7.550220191478729248046875e-2f));
+ const Packet p3 = pset1<Packet>(Scalar(0.16664917767047882080078125f));
+ const Packet p1 = pset1<Packet>(Scalar(1.00000011920928955078125f));
+
+ const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
+ Packet x = pabs(x_in);
+ const Packet invalid_mask = pcmp_lt(pset1<Packet>(1.0f), x);
+ // For arguments |x| > 0.5, we map x back to [0:0.5] using
+ // the transformation x_large = sqrt(0.5*(1-x)), and use the
+ // identity
+ // asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
+ const Packet cst_half = pset1<Packet>(Scalar(0.5f));
+ const Packet cst_two = pset1<Packet>(Scalar(2));
+ Packet x_large = psqrt(pnmadd(cst_half, x, cst_half));
+ const Packet large_mask = pcmp_lt(cst_half, x);
+ x = pselect(large_mask, x_large, x);
+
+ // Compute polynomial.
+ // x * (p1 + x^2*(p3 + x^2*(p5 + x^2*(p7 + x^2*p9))))
+ Packet x2 = pmul(x, x);
+ Packet p = pmadd(p9, x2, p7);
+ p = pmadd(p, x2, p5);
+ p = pmadd(p, x2, p3);
+ p = pmadd(p, x2, p1);
+ p = pmul(p, x);
+
+ constexpr float kPiOverTwo = static_cast<float>(0.5*EIGEN_PI);
+ Packet p_large = pnmadd(cst_two, p, pset1<Packet>(kPiOverTwo));
+ p = pselect(large_mask, p_large, p);
+ // Flip the sign for negative arguments.
+ p = pselect(neg_mask, pnegate(p), p);
+
+ // Return NaN for arguments outside [-1:1].
+ return pselect(invalid_mask, pset1<Packet>(std::numeric_limits<float>::quiet_NaN()), p);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan_float(const Packet& x_in) {
+ typedef typename unpacket_traits<Packet>::type Scalar;
+ static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+ const Packet cst_one = pset1<Packet>(1.0f);
+ constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI/2);
+ const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+ constexpr float kPiOverFour = static_cast<float>(EIGEN_PI/4);
+ const Packet cst_pi_over_four = pset1<Packet>(kPiOverFour);
+ const Packet cst_large = pset1<Packet>(2.4142135623730950488016887f); // tan(3*pi/8);
+ const Packet cst_medium = pset1<Packet>(0.4142135623730950488016887f); // tan(pi/8);
+ const Packet q0 = pset1<Packet>(-0.333329379558563232421875f);
+ const Packet q2 = pset1<Packet>(0.19977366924285888671875f);
+ const Packet q4 = pset1<Packet>(-0.13874518871307373046875f);
+ const Packet q6 = pset1<Packet>(8.044691383838653564453125e-2f);
+
+ const Packet neg_mask = pcmp_lt(x_in, pzero(x_in));
+ Packet x = pabs(x_in);
+
+ // Use the same range reduction strategy (to [0:tan(pi/8)]) as the
+ // Cephes library:
+ // "Large": For x >= tan(3*pi/8), use atan(1/x) = pi/2 - atan(x).
+ // "Medium": For x in [tan(pi/8) : tan(3*pi/8)),
+ // use atan(x) = pi/4 + atan((x-1)/(x+1)).
+ // "Small": For x < pi/8, approximate atan(x) directly by a polynomial
+ // calculated using Sollya.
+ const Packet large_mask = pcmp_lt(cst_large, x);
+ x = pselect(large_mask, preciprocal(x), x);
+ const Packet medium_mask = pandnot(pcmp_lt(cst_medium, x), large_mask);
+ x = pselect(medium_mask, pdiv(psub(x, cst_one), padd(x, cst_one)), x);
+
+ // Approximate atan(x) on [0:tan(pi/8)] by a polynomial of the form
+ // P(x) = x + x^3 * Q(x^2),
+ // where Q(x^2) is a cubic polynomial in x^2.
+ const Packet x2 = pmul(x, x);
+ const Packet x4 = pmul(x2, x2);
+ Packet q_odd = pmadd(q6, x4, q2);
+ Packet q_even = pmadd(q4, x4, q0);
+ const Packet q = pmadd(q_odd, x2, q_even);
+ Packet p = pmadd(q, pmul(x, x2), x);
+
+ // Apply transformations according to the range reduction masks.
+ p = pselect(large_mask, psub(cst_pi_over_two, p), p);
+ p = pselect(medium_mask, padd(cst_pi_over_four, p), p);
+ return pselect(neg_mask, pnegate(p), p);
+}
+
template<typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pdiv_complex(const Packet& x, const Packet& y) {
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 962cb14..de6fd95 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -89,6 +89,21 @@
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
Packet pcos_float(const Packet& x);
+/** \internal \returns asin(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pasin_float(const Packet& x);
+
+/** \internal \returns acos(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pacos_float(const Packet& x);
+
+/** \internal \returns atan(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan_float(const Packet& x);
+
/** \internal \returns sqrt(x) for complex types */
template<typename Packet>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index c424cb2..445572f 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -34,6 +34,21 @@
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos<Packet4f>(const Packet4f& x)
{ return pcos_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pacos<Packet2f>(const Packet2f& x)
+{ return pacos_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pacos<Packet4f>(const Packet4f& x)
+{ return pacos_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f pasin<Packet2f>(const Packet2f& x)
+{ return pasin_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pasin<Packet4f>(const Packet4f& x)
+{ return pasin_float(x); }
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patan<Packet2f>(const Packet2f& x)
+{ return patan_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patan<Packet4f>(const Packet4f& x)
+{ return patan_float(x); }
+
// Hyperbolic Tangent function.
template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f ptanh<Packet2f>(const Packet2f& x)
{ return internal::generic_fast_tanh_float(x); }
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 33af653..9ef83d4 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -198,6 +198,9 @@
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
HasLog = 1,
HasExp = 1,
HasSqrt = 1,
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index ff6653b..8e8a0a4 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -75,6 +75,24 @@
return pcos_float(_x);
}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f pacos<Packet4f>(const Packet4f& _x)
+{
+ return pacos_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f pasin<Packet4f>(const Packet4f& _x)
+{
+ return pasin_float(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patan<Packet4f>(const Packet4f& _x)
+{
+ return patan_float(_x);
+}
+
// Notice that for newer processors, it is counterproductive to use Newton
// iteration for square root. In particular, Skylake and Zen2 processors
// have approximately doubled throughput of the _mm_sqrt_ps instruction
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index bd3424f..0fa4394 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -139,6 +139,9 @@
HasReciprocal = EIGEN_FAST_MATH,
HasSin = EIGEN_FAST_MATH,
HasCos = EIGEN_FAST_MATH,
+ HasACos = 1,
+ HasASin = 1,
+ HasATan = 1,
HasLog = 1,
HasLog1p = 1,
HasExpm1 = 1,