Fix rint SSE/NEON again, using optimization barrier.

This is a new version of !423, which failed for MSVC.

Defined `EIGEN_OPTIMIZATION_BARRIER(X)` that uses inline assembly to
prevent operations involving `X` from crossing that barrier. Should
work on most `GNUC` compatible compilers (MSVC doesn't seem to need
this). This is a modified version adapted from what was used in
`psincos_float` and tested on more platforms
(see #1674, https://godbolt.org/z/73ezTG).

Modified `rint` to use the barrier to prevent the add/subtract rounding
trick from being optimized away.

Also fixed an edge case for large inputs that get bumped up a power of two
and ends up rounding away more than just the fractional part.  If we are
over `2^digits` then just return the input.  This edge case was missed in
the test since the test was comparing approximate equality, which was still
satisfied.  Adding a strict equality option catches it.
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index b1d4be3..411640e 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -630,14 +630,6 @@
 #endif
 Packet psincos_float(const Packet& _x)
 {
-// Workaround -ffast-math aggressive optimizations
-// See bug 1674
-#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
-#define EIGEN_SINCOS_DONT_OPT(X) __asm__  ("" : "+x" (X));
-#else
-#define EIGEN_SINCOS_DONT_OPT(X)
-#endif
-
   typedef typename unpacket_traits<Packet>::integer_packet PacketI;
 
   const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
@@ -652,7 +644,7 @@
 
   // Rounding trick:
   Packet y_round = padd(y, cst_rounding_magic);
-  EIGEN_SINCOS_DONT_OPT(y_round)
+  EIGEN_OPTIMIZATION_BARRIER(y_round)
   PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
   y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
 
@@ -674,9 +666,9 @@
   // and 2 ULP up to:
   const float huge_th = ComputeSine ? 25966.f : 18838.f;
   x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
-  EIGEN_SINCOS_DONT_OPT(x)
+  EIGEN_OPTIMIZATION_BARRIER(x)
   x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
-  EIGEN_SINCOS_DONT_OPT(x)
+  EIGEN_OPTIMIZATION_BARRIER(x)
   x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
   x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
 
@@ -753,8 +745,6 @@
 
   // Update the sign and filter huge inputs
   return pxor(y, sign_bit);
-
-#undef EIGEN_SINCOS_DONT_OPT
 }
 
 template<typename Packet>
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ec6ea90..7d69de6 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3207,20 +3207,30 @@
 
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet2f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2f>(-static_cast<float>(1<<23)),
-      pset1<Packet2f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  Packet2f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b9821ad..d7b8bc8 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -646,20 +646,30 @@
 #else
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  Packet4f r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
   // Adds and subtracts signum(a) * 2^52 to force rounding.
-  const Packet2d offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2d>(-static_cast<double>(1ull<<52)),
-      pset1<Packet2d>(+static_cast<double>(1ull<<52)));
-  return psub(padd(a, offset), offset);
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  Packet2d r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index ac514cb..43890ea 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -51,7 +51,11 @@
 
 #ifndef EIGEN_STACK_ALLOCATION_LIMIT
 // 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#if defined(__AVX512F__)
+  #define EIGEN_STACK_ALLOCATION_LIMIT 0
+#else
+  #define EIGEN_STACK_ALLOCATION_LIMIT 16384
+#endif
 #endif
 
 //------------------------------------------------------------------------------------------
@@ -1063,6 +1067,64 @@
 #endif
 
 
+// Acts as a barrier preventing operations involving `X` from crossing. This
+// occurs, for example, in the fast rounding trick where a magic constant is
+// added then subtracted, which is otherwise compiled away with -ffast-math.
+//
+// See bug 1674
+#if !defined(EIGEN_OPTIMIZATION_BARRIER)
+  #if EIGEN_COMP_GNUC
+    // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+    //   X: Any operand whatsoever.
+    //   r: A register operand is allowed provided that it is in a general
+    //      register.
+    //   g: Any register, memory or immediate integer operand is allowed, except
+    //      for registers that are not general registers.
+    //   w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
+    //      register or SVE vector register.
+    //   x: (SSE) Any SSE register.
+    //      (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
+    //   v: (PowerPC) An Altivec vector register.
+    //   wa:(PowerPC) A VSX register.
+    //
+    // "X" (uppercase) should work for all cases, though this seems to fail for
+    // some versions of GCC for arm/aarch64 with
+    //   "error: inconsistent operand constraints in an 'asm'"
+    // Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
+    // vectors, otherwise
+    //   "error: non-trivial scalar-to-vector conversion, possible invalid
+    //    constraint for vector type"
+    //
+    // GCC for ppc64le generates an internal compiler error with x/X/g.
+    // GCC for AVX generates an internal compiler error with X.
+    //
+    // Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
+    //           gcc for arm, aarch64,
+    //           gcc for ppc64le,
+    // both vectors and scalars.
+    //
+    // Note that this is restricted to plain types - this will not work
+    // directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
+    // you will need to apply to the underlying POD type.
+    #if EIGEN_ARCH_PPC
+      // General, Altivec, VSX.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+r,v,wa" (X));
+    #elif EIGEN_ARCH_ARM_OR_ARM64
+      // General, NEON.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,w" (X));
+    #elif EIGEN_ARCH_i386_OR_x86_64
+      // General, SSE.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)  __asm__  ("" : "+g,x" (X));
+    #else
+      // Not implemented for other architectures.
+      #define EIGEN_OPTIMIZATION_BARRIER(X)
+    #endif
+  #else
+    // Not implemented for other compilers.
+    #define EIGEN_OPTIMIZATION_BARRIER(X)
+  #endif
+#endif
+
 #if EIGEN_COMP_MSVC
   // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
   // This workaround is ugly, but it does the job.
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 76ac475..4ff193e 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -299,6 +299,29 @@
   CHECK_CWISE2_IF(internal::packet_traits<Scalar>::HasAdd, REF_ADD, internal::padd);
 }
 
+// Ensure optimization barrier compiles and doesn't modify contents.
+// Only applies to raw types, so will not work for std::complex, Eigen::half
+// or Eigen::bfloat16. For those you would need to refer to an underlying
+// storage element.
+template<typename Packet, typename EnableIf = void>
+struct eigen_optimization_barrier_test {
+  static void run() {}
+};
+
+template<typename Packet>
+struct eigen_optimization_barrier_test<Packet, typename internal::enable_if<
+    !NumTraits<Packet>::IsComplex &&
+    !internal::is_same<Packet, Eigen::half>::value &&
+    !internal::is_same<Packet, Eigen::bfloat16>::value
+  >::type> {
+  static void run() {
+    typedef typename internal::unpacket_traits<Packet>::type Scalar;
+    Scalar s = internal::random<Scalar>();
+    Packet barrier = internal::pset1<Packet>(s);
+    EIGEN_OPTIMIZATION_BARRIER(barrier);
+    eigen_assert(s == internal::pfirst(barrier) && "EIGEN_OPTIMIZATION_BARRIER");
+  }
+};
 
 template <typename Scalar, typename Packet>
 void packetmath() {
@@ -317,6 +340,10 @@
   EIGEN_ALIGN_MAX Scalar data3[size];
   EIGEN_ALIGN_MAX Scalar ref[size];
   RealScalar refvalue = RealScalar(0);
+
+  eigen_optimization_barrier_test<Packet>::run();
+  eigen_optimization_barrier_test<Scalar>::run();
+
   for (int i = 0; i < size; ++i) {
     data1[i] = internal::random<Scalar>() / RealScalar(PacketSize);
     data2[i] = internal::random<Scalar>() / RealScalar(PacketSize);
@@ -543,10 +570,10 @@
   CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
   CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
 
-  CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-  CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
-  CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
-  CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
   
   // Rounding edge cases.
   if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
@@ -583,10 +610,10 @@
     
     for (size_t k=0; k<values.size(); ++k) {
       data1[0] = values[k];
-      CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-      CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
-      CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
-      CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
     }
   }
 
@@ -644,7 +671,7 @@
   if (PacketTraits::HasExp) {
     data1[0] = Scalar(-1);
     // underflow to zero
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-55);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // overflow to inf
     data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index 027715a..8624fe2 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -78,13 +78,18 @@
   return internal::isMuchSmallerThan(a-b, refvalue);
 }
 
+template<typename Scalar>
+inline void print_mismatch(const Scalar* ref, const Scalar* vec, int size) {
+  std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(ref,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(vec,size) << "]\n";
+}
+
 template<typename Scalar> bool areApproxAbs(const Scalar* a, const Scalar* b, int size, const typename NumTraits<Scalar>::Real& refvalue)
 {
   for (int i=0; i<size; ++i)
   {
     if (!isApproxAbs(a[i],b[i],refvalue))
     {
-      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
+      print_mismatch(a, b, size);
       return false;
     }
   }
@@ -95,13 +100,23 @@
 {
   for (int i=0; i<size; ++i)
   {
-    if (a[i]!=b[i] && !internal::isApprox(a[i],b[i]))
+    if ( a[i]!=b[i] && !internal::isApprox(a[i],b[i]) 
+         && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
     {
-      if((numext::isnan)(a[i]) && (numext::isnan)(b[i]))
-      {
-        continue;
-      }
-      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
+      print_mismatch(a, b, size);
+      return false;
+    }
+  }
+  return true;
+}
+
+template<typename Scalar> bool areEqual(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if ( (a[i] != b[i]) && !((numext::isnan)(a[i]) && (numext::isnan)(b[i])) )
+    {
+      print_mismatch(a, b, size);
       return false;
     }
   }
@@ -178,6 +193,14 @@
   VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
 }
 
+#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areEqual(ref, data2, PacketSize) && #POP); \
+}
+
 #define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
   test::packet_helper<COND,Packet> h; \
   for (int i=0; i<PacketSize; ++i) \