Fix rint for SSE/NEON.

It seems *sometimes* with aggressive optimizations the combination
`psub(padd(a, b), b)` trick to force rounding is compiled away. Here
we replace with inline assembly to prevent this (I tried `volatile`,
but that leads to additional loads from memory).

Also fixed an edge case for large inputs `a` where adding `b` bumps
the value up a power of two and ends up rounding away more than
just the fractional part.  If we are over `2^digits` then just return
the input.  This edge case was missed in the test since the test was
comparing approximate equality, which was still satisfied.  Adding
a strict equality option catches it.
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index ec6ea90..51cebaf 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -3207,20 +3207,34 @@
 
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the
+  // addition and subtraction.
+  // Packet4f r = psub(padd(abs_a, limit), limit);
+  Packet4f r = abs_a;
+  __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t"
+           "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit));
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet2f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2f>(-static_cast<float>(1<<23)),
-      pset1<Packet2f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet2f limit = pset1<Packet2f>(static_cast<float>(1<<23));
+  const Packet2f abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the
+  // addition and subtraction.
+  // Packet4f r = psub(padd(abs_a, limit), limit);
+  Packet2f r = abs_a;
+  __asm__ ("vadd.f32 %[r], %[r], %[limit]\n\t"
+           "vsub.f32 %[r], %[r], %[limit]" : [r] "+x" (r) : [limit] "x" (limit));
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index b9821ad..652ad1d 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -646,20 +646,35 @@
 #else
 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
   // Adds and subtracts signum(a) * 2^23 to force rounding.
-  const Packet4f offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet4f>(-static_cast<float>(1<<23)),
-      pset1<Packet4f>(+static_cast<float>(1<<23)));
-  return psub(padd(a, offset), offset);
+  const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
+  const Packet4f abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the
+  // addition and subtraction.
+  // Packet4f r = psub(padd(abs_a, limit), limit);
+  Packet4f r = abs_a;
+  __asm__ ("addps %[limit], %[r]\n\t"
+           "subps %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit));
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
   // Adds and subtracts signum(a) * 2^52 to force rounding.
-  const Packet2d offset = 
-    pselect(pcmp_lt(a, pzero(a)), 
-      pset1<Packet2d>(-static_cast<double>(1ull<<52)),
-      pset1<Packet2d>(+static_cast<double>(1ull<<52)));
-  return psub(padd(a, offset), offset);
+  const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
+  const Packet2d abs_a = pabs(a);
+  // Inline asm to prevent the compiler from optimizing away the
+  // addition and subtraction.
+  // Packet2d r = psub(padd(abs_a, limit), limit);
+  Packet2d r = abs_a;
+  asm("addpd %[limit], %[r] \n\t"
+      "subpd %[limit], %[r]" : [r] "+x" (r) : [limit] "x" (limit));
+
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit),
+              pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 76ac475..e69120a 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -543,10 +543,10 @@
   CHECK_CWISE1_IF(PacketTraits::HasCos, std::cos, internal::pcos);
   CHECK_CWISE1_IF(PacketTraits::HasTan, std::tan, internal::ptan);
 
-  CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-  CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
-  CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
-  CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+  CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
   
   // Rounding edge cases.
   if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) {
@@ -583,10 +583,10 @@
     
     for (size_t k=0; k<values.size(); ++k) {
       data1[0] = values[k];
-      CHECK_CWISE1_IF(PacketTraits::HasRound, numext::round, internal::pround);
-      CHECK_CWISE1_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
-      CHECK_CWISE1_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
-      CHECK_CWISE1_IF(PacketTraits::HasRint, numext::rint, internal::print);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRound, numext::round, internal::pround);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor);
+      CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print);
     }
   }
 
@@ -644,7 +644,7 @@
   if (PacketTraits::HasExp) {
     data1[0] = Scalar(-1);
     // underflow to zero
-    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-10);
+    data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::min_exponent-55);
     CHECK_CWISE2_IF(PacketTraits::HasExp, REF_LDEXP, internal::pldexp);
     // overflow to inf
     data1[PacketSize] = Scalar(std::numeric_limits<Scalar>::max_exponent+10);
diff --git a/test/packetmath_test_shared.h b/test/packetmath_test_shared.h
index 027715a..8fbe0d2 100644
--- a/test/packetmath_test_shared.h
+++ b/test/packetmath_test_shared.h
@@ -108,6 +108,23 @@
   return true;
 }
 
+template<typename Scalar> bool areEqual(const Scalar* a, const Scalar* b, int size)
+{
+  for (int i=0; i<size; ++i)
+  {
+    if (a[i] != b[i])
+    {
+      if((numext::isnan)(a[i]) && (numext::isnan)(b[i]))
+      {
+        continue;
+      }
+      std::cout << "ref: [" << Map<const Matrix<Scalar,1,Dynamic> >(a,size) << "]" << " != vec: [" << Map<const Matrix<Scalar,1,Dynamic> >(b,size) << "]\n";
+      return false;
+    }
+  }
+  return true;
+}
+
 #define CHECK_CWISE1(REFOP, POP) { \
   for (int i=0; i<PacketSize; ++i) \
     ref[i] = REFOP(data1[i]); \
@@ -178,6 +195,14 @@
   VERIFY(test::areApprox(ref, data2, PacketSize) && #POP); \
 }
 
+#define CHECK_CWISE1_EXACT_IF(COND, REFOP, POP) if(COND) { \
+  test::packet_helper<COND,Packet> h; \
+  for (int i=0; i<PacketSize; ++i) \
+    ref[i] = Scalar(REFOP(data1[i])); \
+  h.store(data2, POP(h.load(data1))); \
+  VERIFY(test::areEqual(ref, data2, PacketSize) && #POP); \
+}
+
 #define CHECK_CWISE2_IF(COND, REFOP, POP) if(COND) { \
   test::packet_helper<COND,Packet> h; \
   for (int i=0; i<PacketSize; ++i) \