Fix new generic nearest integer ops on GPU.
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 537dffe..1c46ba4 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -2470,7 +2470,7 @@
 };
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
   using Scalar = typename unpacket_traits<Packet>::type;
   using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
   // Adds and subtracts signum(a) * 2^kMantissaBits to force rounding.
@@ -2490,7 +2490,7 @@
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
   using Scalar = typename unpacket_traits<Packet>::type;
   const Packet cst_1 = pset1<Packet>(Scalar(1));
   Packet rint_a = generic_rint(a);
@@ -2502,7 +2502,7 @@
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
   using Scalar = typename unpacket_traits<Packet>::type;
   const Packet cst_1 = pset1<Packet>(Scalar(1));
   Packet rint_a = generic_rint(a);
@@ -2514,7 +2514,7 @@
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
   Packet abs_a = pabs(a);
   Packet sign_a = pandnot(a, abs_a);
   Packet floor_abs_a = generic_floor(abs_a);
@@ -2523,7 +2523,7 @@
 }
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
   using Scalar = typename unpacket_traits<Packet>::type;
   const Packet cst_half = pset1<Packet>(Scalar(0.5));
   const Packet cst_1 = pset1<Packet>(Scalar(1));
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
index 41dc068..1bf1128 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -134,19 +134,19 @@
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& x);
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a);
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a);
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a);
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a);
 
 template <typename Packet>
-EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
 
 // Macros for instantiating these generic functions for different backends.
 #define EIGEN_PACKET_FUNCTION(METHOD, SCALAR, PACKET)                                             \
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index 352c8f5..6d4230a 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -74,7 +74,6 @@
     HasGammaSampleDerAlpha = 1,
     HasIGammac = 1,
     HasBetaInc = 1,
-
     HasBlend = 0
   };
 };
@@ -106,9 +105,7 @@
     HasGammaSampleDerAlpha = 1,
     HasIGammac = 1,
     HasBetaInc = 1,
-
     HasBlend = 0,
-    HasFloor = 1,
   };
 };
 
@@ -518,6 +515,33 @@
   return make_double2(floor(a.x), floor(a.y));
 }
 
+template <>
+EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
+  return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
+  return make_double2(ceil(a.x), ceil(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
+  return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
+  return make_double2(rint(a.x), rint(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
+  return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
+  return make_double2(trunc(a.x), trunc(a.y));
+}
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
   float tmp = kernel.packet[0].y;
   kernel.packet[0].y = kernel.packet[1].x;
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 5709257..4d10eec 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -964,6 +964,10 @@
 // added then subtracted, which is otherwise compiled away with -ffast-math.
 //
 // See bug 1674
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_OPTIMIZATION_BARRIER(X)
+#endif
+
 #if !defined(EIGEN_OPTIMIZATION_BARRIER)
 #if EIGEN_COMP_GNUC
    // According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html: