Vectorize atanh & add a missing definition and unit test for atan.
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 499a94b..8b5f030 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -82,6 +82,7 @@
     HasASin   = 0,
     HasACos   = 0,
     HasATan   = 0,
+    HasATanh  = 0,
     HasSinh   = 0,
     HasCosh   = 0,
     HasTanh   = 0,
@@ -857,9 +858,6 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pacos(const Packet& a) { EIGEN_USING_STD(acos); return acos(a); }
 
-/** \internal \returns the arc tangent of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
 
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -869,10 +867,18 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pcosh(const Packet& a) { EIGEN_USING_STD(cosh); return cosh(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patan(const Packet& a) { EIGEN_USING_STD(atan); return atan(a); }
+
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet ptanh(const Packet& a) { EIGEN_USING_STD(tanh); return tanh(a); }
 
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh(const Packet& a) { EIGEN_USING_STD(atanh); return atanh(a); }
+
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pexp(const Packet& a) { EIGEN_USING_STD(exp); return exp(a); }
diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index cb7d7b8..8ce181e 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -58,6 +58,12 @@
 
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
+patanh<Packet8f>(const Packet8f& _x) {
+  return patanh_float(_x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f
 plog<Packet8f>(const Packet8f& _x) {
   return plog_float(_x);
 }
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 60e1ff4..89963eb 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -76,6 +76,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index af47a85..be0e4dd 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -276,6 +276,12 @@
 }
 
 template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f
+patanh<Packet16f>(const Packet16f& _x) {
+  return patanh_float(_x);
+}
+
+template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d
 patan<Packet8d>(const Packet8d& _x) {
   return patan_double(_x);
diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 916babf..4628b21 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -125,6 +125,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
 #if EIGEN_HAS_AVX512_MATH
     HasLog = 1,
     HasLog1p  = 1,
diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 45761e2..7648b40 100644
--- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -60,6 +60,12 @@
   return patan_float(_x);
 }
 
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patanh<Packet4f>(const Packet4f& _x)
+{
+  return patanh_float(_x);
+}
+
 #ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 4cebd8f..f1676af 100644
--- a/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -171,6 +171,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasExp = 1,
 #ifdef EIGEN_VECTORIZE_VSX
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a322bc4..52d076a 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -969,6 +969,34 @@
 
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  // For |x| in [0:0.5] we use a polynomial approximation of the form
+  // P(x) = x + x^3*(c3 + x^2 * (c5 + x^2 * (... x^2 * c11) ... )).
+  const Packet C3 = pset1<Packet>(0.3333373963832855224609375f);
+  const Packet C5 = pset1<Packet>(0.1997792422771453857421875f);
+  const Packet C7 = pset1<Packet>(0.14672131836414337158203125f);
+  const Packet C9 = pset1<Packet>(8.2311116158962249755859375e-2f);
+  const Packet C11 = pset1<Packet>(0.1819281280040740966796875f);
+  const Packet x2 = pmul(x,x);
+  Packet p = pmadd(C11, x2, C9);
+  p = pmadd(x2, p, C7);
+  p = pmadd(x2, p, C5);
+  p = pmadd(x2, p, C3);
+  p = pmadd(pmul(x,x2), p, x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
+  return pselect(x_gt_half, r, p);
+}
+
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pdiv_complex(const Packet& x, const Packet& y) {
   typedef typename unpacket_traits<Packet>::as_real RealPacket;
   // In the following we annotate the code for the case where the inputs
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 179c55c..d733579 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -109,6 +109,11 @@
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet patan_double(const Packet& x);
 
+/** \internal \returns atan(x) for single precision float */
+template<typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet patanh_float(const Packet& x);
+
 /** \internal \returns sqrt(x) for complex types */
 template<typename Packet>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 6ff3e20..c8ca33a 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -779,6 +779,12 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) {
   return half(::acosf(float(a)));
 }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) {
+  return half(::atanf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) {
+  return half(::atanhf(float(a)));
+}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
 #if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
diff --git a/Eigen/src/Core/arch/NEON/MathFunctions.h b/Eigen/src/Core/arch/NEON/MathFunctions.h
index aea5149..f4ea6f9 100644
--- a/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -55,6 +55,12 @@
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x)
 { return internal::generic_fast_tanh_float(x); }
 
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2f patanh<Packet2f>(const Packet2f& x)
+{ return patanh_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f patanh<Packet4f>(const Packet4f& x)
+{ return patanh_float(x); }
+
+
 #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 template <>
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index aa873e5..e31f717 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -215,6 +215,7 @@
     HasACos  = 1,
     HasASin  = 1,
     HasATan  = 1,
+    HasATanh = 1,
     HasLog  = 1,
     HasExp  = 1,
     HasSqrt = 1,
diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index f98fb7a..b0cd13a 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -81,11 +81,6 @@
   return pacos_float(_x);
 }
 
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet2d patan<Packet2d>(const Packet2d& _x) {
-  return patan_double(_x);
-}
-
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f pasin<Packet4f>(const Packet4f& _x)
 {
@@ -98,6 +93,17 @@
   return patan_float(_x);
 }
 
+template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d patan<Packet2d>(const Packet2d& _x) {
+  return patan_double(_x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f patanh<Packet4f>(const Packet4f& _x)
+{
+  return patanh_float(_x);
+}
+
 // Notice that for newer processors, it is counterproductive to use Newton
 // iteration for square root. In particular, Skylake and Zen2 processors
 // have approximately doubled throughput of the _mm_sqrt_ps instruction
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index 62646a1..7d608bb 100644
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -142,6 +142,7 @@
     HasACos = 1,
     HasASin = 1,
     HasATan = 1,
+    HasATanh = 1,
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index e518f98..1faed10 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -622,11 +622,16 @@
 template <typename Scalar>
 struct scalar_atanh_op {
   EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const { return patanh(x); }
 };
 
 template <typename Scalar>
 struct functor_traits<scalar_atanh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasATanh
+  };
 };
 
 /** \internal
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 6090576..44a6a50 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -93,27 +93,91 @@
   VERIFY(all_pass);
 }
 
+#define BINARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x, const auto& y) { return (Eigen::fun)(x, y); },    \
+      [](const auto& x, const auto& y) { return (std::fun)(x, y); }
+
+
 template <typename Scalar>
 void binary_ops_test() {
-  binary_op_test<Scalar>("pow",
-                         [](const auto& x, const auto& y) { return Eigen::pow(x, y); },
-                         [](const auto& x, const auto& y) { return std::pow(x, y); });
-  binary_op_test<Scalar>("atan2",
-                         [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
-                         [](const auto& x, const auto& y) {
-                            auto t = std::atan2(x, y);
-#if EIGEN_COMP_MSVC
-                            // Work around MSVC return value on underflow.
-                            // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec.
-                            // MSVC otherwise returns denorm_min.
-                            if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits<decltype(t)>::denorm_min())) {
-                              return x/y;
-                            }
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(pow));
+#ifndef EIGEN_COMP_MSVC
+  binary_op_test<Scalar>(BINARY_FUNCTOR_TEST_ARGS(atan2));
+#else
+  binary_op_test<Scalar>(
+      "atan2", [](const auto& x, const auto& y) { return Eigen::atan2(x, y); },
+      [](Scalar x, Scalar y) {
+        auto t = Scalar(std::atan2(x, y));
+        // Work around MSVC return value on underflow.
+        // |atan(y/x)| is bounded above by |y/x|, so on underflow return y/x according to POSIX spec.
+        // MSVC otherwise returns denorm_min.
+        if (EIGEN_PREDICT_FALSE(std::abs(t) == std::numeric_limits<decltype(t)>::denorm_min())) {
+          return x / y;
+        }
+        return t;
+      });
 #endif
-                            return t;
-                          });
 }
 
+
+template <typename Scalar, typename Fn, typename RefFn>
+void unary_op_test(std::string name, Fn fun, RefFn ref) {
+  const Scalar tol = test_precision<Scalar>();
+  auto values = special_values<Scalar>();
+  Map<Array<Scalar, Dynamic, 1>> x(values.data(), values.size());
+
+  Array<Scalar, Dynamic, Dynamic> actual = fun(x);
+  bool all_pass = true;
+  for (Index i = 0; i < x.size(); ++i) {
+    Scalar e = static_cast<Scalar>(ref(x(i)));
+    Scalar a = actual(i);
+    bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
+                   ((numext::isnan)(a) && (numext::isnan)(e));
+    if ((a == a) && (e == e)) success &= (bool)numext::signbit(e) == (bool)numext::signbit(a);
+    all_pass &= success;
+    if (!success) {
+      std::cout << name << "(" << x(i) << ") = " << a << " !=  " << e << std::endl;
+    }
+  }
+  VERIFY(all_pass);
+}
+
+#define UNARY_FUNCTOR_TEST_ARGS(fun) #fun, \
+      [](const auto& x) { return (Eigen::fun)(x); },    \
+      [](const auto& x) { return (std::fun)(x); }
+
+template <typename Scalar>
+void unary_ops_test() {
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sqrt));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(exp));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(log));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asin));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acos));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atan));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(sinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(cosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(tanh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(asinh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(acosh));
+  unary_op_test<Scalar>(UNARY_FUNCTOR_TEST_ARGS(atanh));
+  /* FIXME: Enable when the behavior of rsqrt on denormals for half and double is fixed. 
+  unary_op_test<Scalar>("rsqrt",
+                        [](const auto& x) { return Eigen::rsqrt(x); }, 
+                        [](Scalar x) {
+                          if (x >= 0 && x < (std::numeric_limits<Scalar>::min)()) {
+                            // rsqrt return +inf for positive subnormals.
+                            return NumTraits<Scalar>::infinity();
+                          } else {
+                            return  Scalar(std::sqrt(Scalar(1)/x));
+                          }
+                        });
+  */
+}
+
+
 template <typename Scalar>
 void pow_scalar_exponent_test() {
   using Int_t = typename internal::make_integer<Scalar>::type;
@@ -630,6 +694,7 @@
   VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
   VERIFY_IS_APPROX(m1.sinh().asinh(), asinh(sinh(m1)));
   VERIFY_IS_APPROX(m1.cosh().acosh(), acosh(cosh(m1)));
+  VERIFY_IS_APPROX(m1.tanh().atanh(), atanh(tanh(m1)));
   VERIFY_IS_APPROX(m1.logistic(), logistic(m1));
 
   VERIFY_IS_APPROX(m1.arg(), arg(m1));
@@ -722,6 +787,7 @@
   VERIFY_IS_APPROX(m3.pow(RealScalar(-2)), m3.square().inverse());
 
   // Test pow and atan2 on special IEEE values.
+  unary_ops_test<Scalar>();
   binary_ops_test<Scalar>();
   pow_scalar_exponent_test<Scalar>();
 
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index f082985..76332d7 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -879,6 +879,9 @@
   }
   CHECK_CWISE1_IF(PacketTraits::HasASin, std::asin, internal::pasin);
   CHECK_CWISE1_IF(PacketTraits::HasACos, std::acos, internal::pacos);
+  CHECK_CWISE1_IF(PacketTraits::HasATan, std::atan, internal::patan);
+  CHECK_CWISE1_IF(PacketTraits::HasATanh, std::atanh, internal::patanh);
+
 
   for (int i = 0; i < size; ++i) {
     data1[i] = Scalar(internal::random<double>(-87, 88));