Add reciprocal packet op and fast specializations for float with SSE, AVX, and AVX512.

diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index 7223428..4f1ff6b 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h

@@ -65,6 +65,7 @@
     HasCmp       = 0,
 
     HasDiv    = 0,
+    HasReciprocal = 0,
     HasSqrt   = 0,
     HasRsqrt  = 0,
     HasExp    = 0,
@@ -816,13 +817,6 @@
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet psqrt(const Packet& a) { return numext::sqrt(a); }
 
-/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet prsqrt(const Packet& a) {
-  typedef typename internal::unpacket_traits<Packet>::type Scalar;
-  return pdiv(pset1<Packet>(Scalar(1)), psqrt(a));
-}
-
 /** \internal \returns the rounded value of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pround(const Packet& a) { using numext::round; return round(a); }
@@ -1035,6 +1029,19 @@
   return ifPacket.select[0] ? thenPacket : elsePacket;
 }
 
+/** \internal \returns 1 / a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet preciprocal(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  return pdiv(pset1<Packet>(Scalar(1)), a);
+}
+
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet prsqrt(const Packet& a) {
+  return preciprocal<Packet>(psqrt(a));
+}
+
 } // end namespace internal
 
 } // end namespace Eigen

diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 2c9bbb5..182dd37 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h

@@ -17,6 +17,35 @@
 
 namespace internal {
 
+/** \internal Fast reciprocal using Newton-Raphson's method.
+ We assume that the starting guess provided in approx_a_recip has at least
+ half the leading mantissa bits in the correct result, such that a single
+ Newton-Raphson step is sufficient to get within 1-2 ulps of the currect result.
+*/
+template <typename Packet, int Steps>
+struct generic_reciprocal_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE  Packet
+  run(const Packet& a, const Packet& approx_a_recip) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet two = pset1<Packet>(Scalar(2));
+    const Packet neg_a = pnegate(a);
+    // Refine the approximation using one Newton-Raphson step:
+    //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
+    const Packet x =
+        generic_reciprocal_newton_step<Packet,Steps - 1>::run(a, approx_a_recip);
+    return pmul(x, pmadd(neg_a, x, two));
+  }
+};
+
+template<typename Packet>
+struct generic_reciprocal_newton_step<Packet, 0> {
+   EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet
+   run(const Packet& /*unused*/, const Packet& approx_a_recip) {
+    return approx_a_recip;
+  }
+};
+
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
     Doesn't do anything fancy, just a 13/6-degree rational interpolant which
     is accurate up to a couple of ulps in the (approximate) range [-8, 8],

diff --git a/Eigen/src/Core/arch/AVX/MathFunctions.h b/Eigen/src/Core/arch/AVX/MathFunctions.h
index d517dff..9f61af6 100644
--- a/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX/MathFunctions.h

@@ -166,19 +166,12 @@
   return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
-#else
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
-  EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
-  return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(_x));
+template<> EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
+  return generic_reciprocal_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rcp_ps(a));
 }
+
 #endif
 
-template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4d prsqrt<Packet4d>(const Packet4d& _x) {
-  EIGEN_DECLARE_CONST_Packet4d(one, 1.0);
-  return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(_x));
-}
 
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
@@ -190,6 +183,7 @@
 F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
 F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, preciprocal)
 
 template <>
 EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
@@ -214,6 +208,7 @@
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
 BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, preciprocal)
 
 template <>
 EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 6b20da6..bf832c9 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h

@@ -78,6 +78,7 @@
 
     HasCmp  = 1,
     HasDiv = 1,
+    HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,

diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 54be6cf..6caed2d 100644
--- a/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h

@@ -253,13 +253,6 @@
   // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
   return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
 }
-#else
-
-template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
-  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
-}
 #endif
 
 F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
@@ -304,12 +297,17 @@
   // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
   return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
 }
+
+template<> EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return _mm512_rcp28_ps(a));
 #else
-template <>
-EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
-  EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
-  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
+  return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
+#endif
 }
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
 #endif
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index ff0dd29..8a00c62 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h

@@ -120,6 +120,7 @@
     HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
+    HasReciprocal = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
     HasErf = EIGEN_FAST_MATH,
 #endif

diff --git a/Eigen/src/Core/arch/SSE/MathFunctions.h b/Eigen/src/Core/arch/SSE/MathFunctions.h
index 5a063d3..10bc107 100644
--- a/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/Eigen/src/Core/arch/SSE/MathFunctions.h

@@ -148,21 +148,13 @@
   return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
 }
 
-#else
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f prsqrt<Packet4f>(const Packet4f& x) {
-  // Unfortunately we can't use the much faster mm_rsqrt_ps since it only provides an approximation.
-  return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x));
-}
-
 #endif
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d prsqrt<Packet2d>(const Packet2d& x) {
-  return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x));
+template<> EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
+  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(a, _mm_rcp_ps(a));
 }
 
+
 // Hyperbolic Tangent function.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index a843226..4de3d47 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h

@@ -136,6 +136,7 @@
 
     HasCmp  = 1,
     HasDiv = 1,
+    HasReciprocal = EIGEN_FAST_MATH,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasLog = 1,

diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index d56aae5..e7bccaf 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h

@@ -723,13 +723,18 @@
   EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
   template<typename Packet>
   EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
+  { return internal::preciprocal(a); }
 };
 template <typename Scalar>
 struct functor_traits<scalar_inverse_op<Scalar> > {
   enum {
     PacketAccess = packet_traits<Scalar>::HasDiv,
-    Cost = scalar_div_cost<Scalar, PacketAccess>::value
+    // If packet_traits<Scalar>::HasReciprocal then the Estimated cost is that
+    // of computing an approximation plus a single Newton-Raphson step, which
+    // consists of 1 pmul + 1 pmadd.
+    Cost = (packet_traits<Scalar>::HasReciprocal
+                ? 4 * NumTraits<Scalar>::MulCost
+                : scalar_div_cost<Scalar, PacketAccess>::value)
   };
 };
 
@@ -1033,7 +1038,7 @@
   }
 };
 
-// TODO(rmlarsen): Enable the following on host when integer_packet is defined 
+// TODO(rmlarsen): Enable the following on host when integer_packet is defined
 // for the relevant packet types.
 #ifdef EIGEN_GPU_CC
 

diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
index 220da03..69f5e79 100644
--- a/Eigen/src/LU/arch/InverseSize4.h
+++ b/Eigen/src/LU/arch/InverseSize4.h

@@ -58,10 +58,10 @@
 
     const float* data = matrix.data();
     const Index stride = matrix.innerStride();
-    Packet4f L1_ = ploadt<Packet4f,MatrixAlignment>(data);
-    Packet4f L2_ = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
-    Packet4f L3_ = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
-    Packet4f L4_ = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
+    Packet4f L1 = ploadt<Packet4f,MatrixAlignment>(data);
+    Packet4f L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+    Packet4f L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+    Packet4f L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
 
     // Four 2x2 sub-matrices of the input matrix
     // input = [[A, B],
@@ -70,17 +70,17 @@
 
     if (!StorageOrdersMatch)
     {
-      A = vec4f_unpacklo(L1_, L2_);
-      B = vec4f_unpacklo(L3_, L4_);
-      C = vec4f_unpackhi(L1_, L2_);
-      D = vec4f_unpackhi(L3_, L4_);
+      A = vec4f_unpacklo(L1, L2);
+      B = vec4f_unpacklo(L3, L4);
+      C = vec4f_unpackhi(L1, L2);
+      D = vec4f_unpackhi(L3, L4);
     }
     else
     {
-      A = vec4f_movelh(L1_, L2_);
-      B = vec4f_movehl(L2_, L1_);
-      C = vec4f_movelh(L3_, L4_);
-      D = vec4f_movehl(L4_, L3_);
+      A = vec4f_movelh(L1, L2);
+      B = vec4f_movehl(L2, L1);
+      C = vec4f_movelh(L3, L4);
+      D = vec4f_movehl(L4, L3);
     }
 
     Packet4f AB, DC;
@@ -120,7 +120,7 @@
     Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
 
     // reciprocal of the determinant of the input matrix, rd = 1/det
-    Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+    Packet4f rd = preciprocal(det);
 
     // Four sub-matrices of the inverse
     Packet4f iA, iB, iC, iD;

diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index fe0a9f6..455ecab 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp

@@ -28,6 +28,10 @@
   return a / b;
 }
 template <typename T>
+inline T REF_RECIPROCAL(const T& a) {
+  return T(1) / a;
+}
+template <typename T>
 inline T REF_ABS_DIFF(const T& a, const T& b) {
   return a > b ? a - b : b - a;
 }
@@ -464,9 +468,11 @@
   CHECK_CWISE2_IF(PacketTraits::HasMul, REF_MUL, internal::pmul);
   CHECK_CWISE2_IF(PacketTraits::HasDiv, REF_DIV, internal::pdiv);
 
-  if (PacketTraits::HasNegate) CHECK_CWISE1(internal::negate, internal::pnegate);
+  CHECK_CWISE1_IF(PacketTraits::HasNegate, internal::negate, internal::pnegate);
+  CHECK_CWISE1_IF(PacketTraits::HasReciprocal, REF_RECIPROCAL, internal::preciprocal);
   CHECK_CWISE1(numext::conj, internal::pconj);
 
+
   for (int offset = 0; offset < 3; ++offset) {
     for (int i = 0; i < PacketSize; ++i) ref[i] = data1[offset];
     internal::pstore(data2, internal::pset1<Packet>(data1[offset]));

diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp
index 86f0571..3eb061d 100644
--- a/test/prec_inverse_4x4.cpp
+++ b/test/prec_inverse_4x4.cpp

@@ -19,9 +19,7 @@
   {
     MatrixType m = PermutationMatrix<4>(indices);
     MatrixType inv = m.inverse();
-    double error = double( (m*inv-MatrixType::Identity()).norm() / NumTraits<Scalar>::epsilon() );
-    EIGEN_DEBUG_VAR(error)
-    VERIFY(error == 0.0);
+    VERIFY_IS_APPROX(m*inv, MatrixType::Identity());
     std::next_permutation(indices.data(),indices.data()+4);
   }
 }

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index e618042..c1609f1 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h

@@ -601,13 +601,12 @@
     ScalarType(6.79019408009981274425e-9)
   };
   const T eight = pset1<T>(ScalarType(8.0));
-  const T one = pset1<T>(ScalarType(1));
   const T neg_two = pset1<T>(ScalarType(-2));
   T x, x0, x1, z;
 
   x = psqrt(pmul(neg_two, plog(b)));
   x0 = psub(x, pdiv(plog(x), x));
-  z = pdiv(one, x);
+  z = preciprocal(x);
   x1 = pmul(
       z, pselect(
           pcmp_lt(x, eight),

diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp
index 27c2845..ddc8132 100644
--- a/unsupported/test/cxx11_tensor_expr.cpp
+++ b/unsupported/test/cxx11_tensor_expr.cpp

@@ -130,7 +130,7 @@
   Tensor<float, 3, RowMajor> mat4(2,3,7);
   mat4 = mat2 * 3.14f;
   Tensor<float, 3> mat5(2,3,7);
-  mat5 = mat1.inverse().log();
+  mat5 = (mat1 + mat1.constant(1)).inverse().log();
   Tensor<float, 3, RowMajor> mat6(2,3,7);
   mat6 = mat2.pow(0.5f) * 3.14f;
   Tensor<float, 3> mat7(2,3,7);
@@ -150,7 +150,7 @@
       for (int k = 0; k < 7; ++k) {
         VERIFY_IS_APPROX(mat3(i,j,k), val + val);
         VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
-        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
+        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/(val + 1)));
         VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
         VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
         VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);