Add an additional step of Newton-Raphson for `psqrt<double>` on Arm, which otherwise has an error of ~1000 ulps.
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index 90ffee7..5883eca 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3896,7 +3896,8 @@
   // Do a single step of Newton's iteration.
   //the number 1.5f was set reference to Quake3's fast inverse square root
   x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
-  // Do one more Newton's iteration to get more accurate result.
+  // Do two more Newton's iteration to get a result accurate to 1 ulp.
+  x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   x = vmulq_f64(x, psub(pset1<Packet2d>(1.5), pmul(half, pmul(x, x))));
   // Flush results for denormals to zero.
   return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(pmul(_x, x)), denormal_mask));