Eigen/GPU [1/5]: Raise CUDA/HIP minimum versions and remove legacy guards

libeigen/eigen!2411
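
Raise the minimum supported CUDA toolkit to 11.4 (compute capability
7.0, Volta) and ROCm/HIP to 5.6 (GFX906), and drop the preprocessor
guards that only existed to support older SDKs and architectures.
With these floors in place, SDK- and architecture-version checks
collapse into the generic macros; an illustrative before/after taken
from Half.h:

    // before
    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
        (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
    // after
    #if defined(EIGEN_GPU_COMPILE_PHASE)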

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index acd0e71..57edb5d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -672,7 +672,7 @@
   endif()
 
   set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.")
-  set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
+  set(EIGEN_CUDA_COMPUTE_ARCH 70 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code")
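+  # 70 (sm_70, Volta) is the new minimum supported CUDA compute capability.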
 
   option(EIGEN_TEST_SYCL "Add Sycl support." OFF)
   if(EIGEN_TEST_SYCL)
@@ -817,4 +817,3 @@
 message(STATUS "")
 message(STATUS "Configured Eigen ${EIGEN_VERSION_STRING}")
 message(STATUS "")
-
diff --git a/Eigen/Core b/Eigen/Core
index fa33b96..653c266 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -50,9 +50,9 @@
 #include "src/Core/util/AOCL_Support.h"
 
 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-#define EIGEN_HAS_GPU_FP16
-#endif
+// EIGEN_HAS_GPU_FP16 has been removed: FP16 support is unconditional when
+// compiling with CUDA or HIP. Use EIGEN_GPUCC (compiling with a GPU compiler)
+// or EIGEN_GPU_COMPILE_PHASE (device compile phase) instead.
+// TODO: Remove EIGEN_HAS_GPU_BF16 similarly once HIP bf16 guards are cleaned up.
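+// Downstream migration sketch (mirrors the substitutions made in this patch):
+//   #if defined(EIGEN_HAS_GPU_FP16)   ->   #if defined(EIGEN_GPUCC)
+//   #if defined(EIGEN_HAS_CUDA_FP16)  ->   #if defined(EIGEN_CUDACC)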
 
 #if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
 #define EIGEN_HAS_GPU_BF16
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 27dcc80..313c506 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h
@@ -294,7 +294,7 @@
 // of the functions, while the latter can only deal with one of them.
 #if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for bfloat16 floats
 
-#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
 // We need to provide emulated *host-side* BF16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
@@ -858,16 +858,8 @@
 }  // namespace std
 #endif
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::bfloat16.
+// HIP uses non-sync __shfl variants; CUDA has native __nv_bfloat16 support in __shfl_sync.
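+// For illustration, inside a HIP kernel (variable names hypothetical):
+//   Eigen::bfloat16 nxt = __shfl_down(x, 1);  // value held by lane laneId + 1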
 // Note that the following are __device__ - only functions.
 #if defined(EIGEN_HIPCC)
 
diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index 70ee8f9..d1acabe 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h
@@ -45,7 +45,7 @@
 // Eigen with GPU support.
 // Any functions that require `numext::bit_cast` may also not be constexpr,
 // including any native types when setting via raw bit values.
-#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#if defined(EIGEN_GPUCC) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
 #define _EIGEN_MAYBE_CONSTEXPR
 #else
 #define _EIGEN_MAYBE_CONSTEXPR constexpr
@@ -121,12 +121,12 @@
 //
 // Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
 // this error, and hence the following convoluted #if condition
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
 
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
   struct construct_from_rep_tag {};
-#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+#if (defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE))
   // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
   // The element type for shared memory cannot have non-trivial constructors
   // and hence the following special casing (which skips the zero-initilization).
@@ -152,16 +152,12 @@
 #endif
 };
 
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
 // HIP GPU compile phase: nothing to do here.
 // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
+#elif defined(EIGEN_CUDACC)
 
 // CUDA GPU compile phase.
-#if EIGEN_CUDA_SDK_VER < 90000
-// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
-typedef __half __half_raw;
-#endif  // defined(EIGEN_HAS_CUDA_FP16)
 
 #elif defined(SYCL_DEVICE_ONLY)
 typedef cl::sycl::half __half_raw;
@@ -175,15 +171,13 @@
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
 #endif
 #endif
-#endif
 };
 
 }  // namespace half_impl
@@ -192,36 +186,29 @@
 struct half : public half_impl::half_base {
   // Writing this out as separate #if-else blocks to make the code easier to follow
   // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+#if !defined(EIGEN_GPUCC) || !defined(EIGEN_GPU_COMPILE_PHASE)
   // Use the same base class for the following two scenarios
   // * when compiling without GPU support enabled
   // * during host compile phase when compiling with GPU support enabled
   typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
+#elif defined(EIGEN_HIPCC)
   // Nothing to do here
   // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-// #if defined(EIGEN_HAS_CUDA_FP16) is needed
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-  typedef half_impl::__half_raw __half_raw;
-#endif
+#elif defined(EIGEN_CUDACC)
+  // Nothing to do here.
 #endif
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
 
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
 
-#if defined(EIGEN_HAS_GPU_FP16)
-#if defined(EIGEN_HAS_HIP_FP16)
+#if defined(EIGEN_GPUCC)
+#if defined(EIGEN_HIPCC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
-#elif defined(EIGEN_HAS_CUDA_FP16)
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+#elif defined(EIGEN_CUDACC)
   EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
 #endif
 #endif
-#endif
 
 #if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
   explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
@@ -248,7 +235,7 @@
     return half_impl::half_to_float(*this);
   }
 
-#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_GPUCC) && !defined(EIGEN_GPU_COMPILE_PHASE)
   EIGEN_DEVICE_FUNC operator __half() const {
     ::__half_raw hr;
     hr.x = x;
@@ -380,8 +367,7 @@
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 // Note: We deliberately do *not* define this to 1 even if we have Arm's native
 // fp16 type since GPU half types are rather different from native CPU half types.
 #define EIGEN_HAS_NATIVE_GPU_FP16
@@ -393,24 +379,10 @@
 // conversion steps back and forth.
 
 #if defined(EIGEN_HAS_NATIVE_GPU_FP16)
-EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
-  return __hadd(a, b);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { return __hadd(::__half(a), ::__half(b)); }
 EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
-EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-#endif
-}
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { return __hdiv(a, b); }
 EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
 EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -505,7 +477,7 @@
 // We need to provide emulated *host-side* FP16 operators for clang.
 #pragma push_macro("EIGEN_DEVICE_FUNC")
 #undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
 #define EIGEN_DEVICE_FUNC __host__
 #else  // both host and device need emulated ops.
 #define EIGEN_DEVICE_FUNC __host__ __device__
@@ -636,7 +608,7 @@
   // because this is constexpr function.
   // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
   // of this catch22 by having separate bodies for GPU / non GPU
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
   __half_raw h;
   h.x = x;
   return h;
@@ -661,8 +633,7 @@
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
@@ -735,8 +706,7 @@
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __half2float(h);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return static_cast<float>(h.x);
@@ -778,8 +748,7 @@
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __hisnan(a);
 #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
   return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
@@ -810,16 +779,14 @@
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp(a));
 #else
   return half(::expf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hexp2(a));
 #else
   return half(::exp2f(float(a)));
@@ -827,9 +794,7 @@
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return half(hlog(a));
 #else
   return half(::logf(float(a)));
@@ -842,8 +807,7 @@
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hsqrt(a));
 #else
   return half(::sqrtf(float(a)));
@@ -864,16 +828,14 @@
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-    defined(EIGEN_HIP_DEVICE_COMPILE)
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -1007,20 +969,12 @@
 }  // namespace numext
 }  // namespace Eigen
 
-// Add the missing shfl* intrinsics.
-// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
-//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
-//
-// HIP and CUDA prior to SDK 9.0 define
-//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
-// CUDA since 9.0 deprecates those and instead defines
-//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
-//    with native support for __half and __nv_bfloat16
-//
+// Warp shuffle overloads for Eigen::half.
+// CUDA uses __shfl_*_sync (with mask); HIP uses __shfl_* (no mask).
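+// For illustration, a butterfly warp sum over Eigen::half using these overloads:
+//   for (int m = warpSize / 2; m > 0; m /= 2)
+//     h = h + __shfl_xor_sync(0xFFFFFFFFu, h, m);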
 // Note that the following are __device__ - only functions.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 
-#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+#if defined(EIGEN_CUDACC)
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
                                                        int width = warpSize) {
@@ -1046,7 +1000,7 @@
   return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
 }
 
-#else  // HIP or CUDA SDK < 9.0
+#else  // HIP
 
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
   const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
@@ -1072,7 +1026,7 @@
 #endif  // __shfl*
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
 }
@@ -1095,8 +1049,7 @@
 template <>
 struct cast_impl<float, half> {
   EIGEN_DEVICE_FUNC static inline half run(const float& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(a);
 #else
     return half(a);
@@ -1107,8 +1060,7 @@
 template <>
 struct cast_impl<int, half> {
   EIGEN_DEVICE_FUNC static inline half run(const int& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __float2half(static_cast<float>(a));
 #else
     return half(static_cast<float>(a));
@@ -1119,8 +1071,7 @@
 template <>
 struct cast_impl<half, float> {
   EIGEN_DEVICE_FUNC static inline float run(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
     return __half2float(a);
 #else
     return static_cast<float>(a);
diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index edafb66..9649843 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -17,19 +17,8 @@
 
 namespace internal {
 
-// Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
-#define EIGEN_GPU_HAS_LDG 1
-#endif
-
-// FP16 math available.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
-#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
-#endif
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
-#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
-#endif
+// Read-only data cached load (__ldg) and native FP16 arithmetic are available
+// on all supported GPU architectures (sm_70+ for CUDA, GFX906+ for HIP).
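+// e.g. ploadt_ro<float4, Aligned> below calls __ldg whenever
+// EIGEN_GPU_COMPILE_PHASE is defined, with a plain load as the host-side fallback.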
 
 // We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
 // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
@@ -56,92 +45,84 @@
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
+  using type = float4;
+  using half = float4;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 4;
 
-    HasDiv = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSin = 0;
+  static constexpr int HasCos = 0;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 
-    HasFloor = 1,
-    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
-  };
+  static constexpr int HasFloor = 1;
+  static constexpr int HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS;
 };
 
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
+  using type = double2;
+  using half = double2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 2;
 
-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-  };
+  static constexpr int HasDiv = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasLGamma = 1;
+  static constexpr int HasDiGamma = 1;
+  static constexpr int HasZeta = 1;
+  static constexpr int HasPolygamma = 1;
+  static constexpr int HasErf = 1;
+  static constexpr int HasErfc = 1;
+  static constexpr int HasNdtri = 1;
+  static constexpr int HasBessel = 1;
+  static constexpr int HasIGamma = 1;
+  static constexpr int HasIGammaDerA = 1;
+  static constexpr int HasGammaSampleDerAlpha = 1;
+  static constexpr int HasIGammac = 1;
+  static constexpr int HasBetaInc = 1;
 };
 
 template <>
 struct unpacket_traits<float4> {
-  typedef float type;
-  enum {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef float4 half;
+  using type = float;
+  static constexpr int size = 4;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = float4;
 };
 template <>
 struct unpacket_traits<double2> {
-  typedef double type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef double2 half;
+  using type = double;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = double2;
 };
 
 template <>
@@ -403,7 +384,7 @@
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -411,7 +392,7 @@
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
@@ -420,7 +401,7 @@
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
@@ -428,7 +409,7 @@
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return make_double2(__ldg(from + 0), __ldg(from + 1));
 #else
   return make_double2(from[0], from[1]);
@@ -591,23 +572,20 @@
 
 #endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
 
-// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
-// on device. There is no benefit to using them on the host anyways, since they are
-// emulated.
-#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+// Half-packet functions are only available during GPU device compilation; they
+// use intrinsics (__half2, etc.) that would only be emulated on the host.
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 
-typedef ulonglong2 Packet4h2;
+using Packet4h2 = ulonglong2;
 template <>
 struct unpacket_traits<Packet4h2> {
-  typedef Eigen::half type;
-  enum {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef Packet4h2 half;
+  using type = Eigen::half;
+  static constexpr int size = 8;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = Packet4h2;
 };
 template <>
 struct is_arithmetic<Packet4h2> {
@@ -616,15 +594,13 @@
 
 template <>
 struct unpacket_traits<half2> {
-  typedef Eigen::half type;
-  enum {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-  typedef half2 half;
+  using type = Eigen::half;
+  static constexpr int size = 2;
+  static constexpr int alignment = Aligned16;
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+  using half = half2;
 };
 template <>
 struct is_arithmetic<half2> {
@@ -633,23 +609,21 @@
 
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h2 type;
-  typedef Packet4h2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasExp = 1,
-    HasExpm1 = 1,
-    HasLog = 1,
-    HasLog1p = 1
-  };
+  using type = Packet4h2;
+  using half = Packet4h2;
+  static constexpr int Vectorizable = 1;
+  static constexpr int AlignedOnScalar = 1;
+  static constexpr int size = 8;
+  static constexpr int HasAdd = 1;
+  static constexpr int HasSub = 1;
+  static constexpr int HasMul = 1;
+  static constexpr int HasDiv = 1;
+  static constexpr int HasSqrt = 1;
+  static constexpr int HasRsqrt = 1;
+  static constexpr int HasExp = 1;
+  static constexpr int HasExpm1 = 1;
+  static constexpr int HasLog = 1;
+  static constexpr int HasLog1p = 1;
 };
 
 template <>
@@ -690,7 +664,7 @@
 }
 
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   // Input is guaranteed to be properly aligned.
   return __ldg(reinterpret_cast<const half2*>(from));
 #else
@@ -699,7 +673,7 @@
 }
 
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   return __halves2half2(__ldg(from + 0), __ldg(from + 1));
 #else
   return __halves2half2(*(from + 0), *(from + 1));
@@ -745,12 +719,7 @@
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
@@ -837,89 +806,21 @@
   return __halves2half2(result1, result2);
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { return __hadd2(a, b); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { return __hsub2(a, b); }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { return __hneg2(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { return __hmul2(a, b); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
-  return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { return __h2div(a, b); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
   float a1 = __low2float(a);
@@ -942,47 +843,23 @@
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 + a2));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   __half first = __low2half(a);
   __half second = __high2half(a);
   return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   __half first = __low2half(a);
   __half second = __high2half(a);
   return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 * a2));
-#endif
 }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@@ -1001,8 +878,6 @@
   return __floats2half2_rn(r1, r2);
 }
 
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
@@ -1010,41 +885,6 @@
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
-
-#else
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-#endif
 }  // namespace
 
 template <>
@@ -1091,19 +931,17 @@
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_GPU_HAS_LDG)
   Packet4h2 r;
+#if defined(EIGEN_GPU_COMPILE_PHASE)
   r = __ldg(reinterpret_cast<const Packet4h2*>(from));
-  return r;
 #else
-  Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
   r_alias[0] = ploadt_ro_aligned(from + 0);
   r_alias[1] = ploadt_ro_aligned(from + 2);
   r_alias[2] = ploadt_ro_aligned(from + 4);
   r_alias[3] = ploadt_ro_aligned(from + 6);
-  return r;
 #endif
+  return r;
 }
 
 template <>
@@ -1272,7 +1110,7 @@
   p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
   p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
   return r;
-#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#elif defined(EIGEN_CUDA_ARCH)
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
 
@@ -1290,16 +1128,6 @@
   r_alias[3] = plset(__high2half(c));
 
   return r;
-
-#else
-  float f = __half2float(a);
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
-  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
-  return r;
 #endif
 }
 
@@ -1533,7 +1361,7 @@
   half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
   __half first = predux_max(m0);
   __half second = predux_max(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
   return (__hgt(first, second) ? first : second);
 #else
   float ffirst = __half2float(first);
@@ -1549,7 +1377,7 @@
   half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
   __half first = predux_min(m0);
   __half second = predux_min(m1);
-#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#if defined(EIGEN_CUDA_ARCH)
   return (__hlt(first, second) ? first : second);
 #else
   float ffirst = __half2float(first);
@@ -1641,47 +1469,17 @@
 // the implementation of GPU half reduction.
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
-#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
   return __h2div(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-#endif
 }
 
 template <>
@@ -1706,11 +1504,7 @@
   return __halves2half2(r1, r2);
 }
 
-#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
-
-#undef EIGEN_GPU_HAS_LDG
-#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
-#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+#endif  // defined(EIGEN_GPU_COMPILE_PHASE)
 
 }  // end namespace internal
 
diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h
index ae43f8e..27cbbbe 100644
--- a/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ b/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -17,8 +17,7 @@
 
 namespace internal {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 
 template <>
 struct type_casting_traits<Eigen::half, float> {
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1cffb1d..4f31ae5 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -541,12 +541,6 @@
 #if defined EIGEN_CUDACC
 #define EIGEN_VECTORIZE_GPU
 #include <vector_types.h>
-#if EIGEN_CUDA_SDK_VER >= 70500
-#define EIGEN_HAS_CUDA_FP16
-#endif
-#endif
-
-#if defined(EIGEN_HAS_CUDA_FP16)
 #include <cuda_runtime_api.h>
 #include <cuda_fp16.h>
 #endif
@@ -554,7 +548,6 @@
 #if defined(EIGEN_HIPCC)
 #define EIGEN_VECTORIZE_GPU
 #include <hip/hip_vector_types.h>
-#define EIGEN_HAS_HIP_FP16
 #include <hip/hip_fp16.h>
 #define EIGEN_HAS_HIP_BF16
 #include <hip/hip_bfloat16.h>
diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h
index ab0c542..b2bc18a 100644
--- a/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -84,8 +84,7 @@
 #endif
 
 #if defined __NVCC__ && defined __CUDACC__
-// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
-// we instead use Microsoft's __pragma extension.
+// MSVC does not support the _Pragma keyword, so we use Microsoft's __pragma extension.
 #if defined _MSC_VER
 #define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
 #else
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 34f6872..109a3b3 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -148,13 +148,8 @@
 #endif
 
 #if defined(__NVCC__)
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+// CUDA 11.4+ always defines __CUDACC_VER_MAJOR__.
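+// e.g. nvcc 12.2 yields EIGEN_COMP_NVCC == 12 * 10000 + 2 * 100 == 120200.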
 #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-#define EIGEN_COMP_NVCC __CUDACC_VER__
-#else
-#error "NVCC did not define compiler version."
-#endif
 #else
 #define EIGEN_COMP_NVCC 0
 #endif
@@ -575,6 +570,10 @@
 #define EIGEN_CUDA_SDK_VER 0
 #endif
 
+#if defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER > 0 && EIGEN_CUDA_SDK_VER < 110400
+#error "Eigen requires CUDA 11.4 or later."
+#endif
+
 #if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
 // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
 #define EIGEN_HIPCC __HIPCC__
@@ -584,22 +583,20 @@
 // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
 #include <hip/hip_runtime.h>
 
+// Eigen requires ROCm/HIP >= 5.6 (GFX906 minimum architecture).
+// This floor lets shared CUDA/HIP preprocessor guards be simplified:
+// __HIP_ARCH_HAS_WARP_SHUFFLE__, __HIP_ARCH_HAS_FP16__, etc. are always true on GFX906+.
+#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR < 5 || (HIP_VERSION_MAJOR == 5 && HIP_VERSION_MINOR < 6))
+#error "Eigen requires ROCm/HIP >= 5.6."
+#endif
+
 #if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
 // analogous to EIGEN_CUDA_ARCH, but for HIP
 #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif
 
-// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
-// value to 1024. The compiler assigns a default value of 256 when the attribute is not
-// specified. This results in failures on the HIP platform, for cases when a GPU kernel
-// without an explicit launch_bounds attribute is called with a threads_per_block value
-// greater than 256.
-//
-// This is a regression in functionality and is expected to be fixed within the next
-// couple of ROCm releases (compiler will go back to using 1024 value as the default)
-//
-// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
-// attribute.
+// HIP compilers default to launch_bounds(256), which causes failures when kernels
+// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.
 
 #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
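+// Usage sketch (kernel name hypothetical):
+//   __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void my_kernel(float* out) { ... }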
 
diff --git a/ci/build.linux.gitlab-ci.yml b/ci/build.linux.gitlab-ci.yml
index 13c33e8..9934302 100644
--- a/ci/build.linux.gitlab-ci.yml
+++ b/ci/build.linux.gitlab-ci.yml
@@ -215,7 +215,7 @@
     # Additional flags passed to the cuda compiler.
     EIGEN_CI_CUDA_CXX_FLAGS: ""
     # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
     EIGEN_CI_BUILD_TARGET: buildtests_gpu
     EIGEN_CI_TEST_CUDA_CLANG: "off"
     EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -250,12 +250,12 @@
 # Note: these are currently build-only, until we get an AMD-supported runner.
 
 # ROCm HIP
-build:linux:rocm-latest:gcc-10:
+build:linux:rocm-latest:gcc-11:
   extends: .build:linux:cross
-  image: rocm/dev-ubuntu-24.04:latest
+  image: rocm/dev-ubuntu-24.04:6.3.1
   variables:
-    EIGEN_CI_C_COMPILER: gcc-10
-    EIGEN_CI_CXX_COMPILER: g++-10
+    EIGEN_CI_C_COMPILER: gcc-11
+    EIGEN_CI_CXX_COMPILER: g++-11
     EIGEN_CI_BUILD_TARGET: buildtests_gpu
     EIGEN_CI_ADDITIONAL_ARGS: -DEIGEN_TEST_HIP=on
   cache: [] # Disable cache for ROCm, since it fails whenever the image updates.
diff --git a/ci/build.windows.gitlab-ci.yml b/ci/build.windows.gitlab-ci.yml
index ff9c403..69ced1e 100644
--- a/ci/build.windows.gitlab-ci.yml
+++ b/ci/build.windows.gitlab-ci.yml
@@ -55,7 +55,7 @@
   extends: .build:windows
   variables:
     # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "50;75"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;75"
     EIGEN_CI_BUILD_TARGET: buildtests_gpu
     EIGEN_CI_ADDITIONAL_ARGS:
       -DEIGEN_TEST_CUDA=on
@@ -66,8 +66,8 @@
     - x86-64
     - cuda
 
-# MSVC 14.29 + CUDA 11.4
-build:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+build:windows:x86-64:cuda-12.2:msvc-14.29:
   extends: .build:windows:cuda
   variables:
-    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V11_4
\ No newline at end of file
+    EIGEN_CI_BEFORE_SCRIPT: $$env:CUDA_PATH=$$env:CUDA_PATH_V12_2
diff --git a/ci/test.windows.gitlab-ci.yml b/ci/test.windows.gitlab-ci.yml
index 34b1d74..e3c666c 100644
--- a/ci/test.windows.gitlab-ci.yml
+++ b/ci/test.windows.gitlab-ci.yml
@@ -71,7 +71,7 @@
     - x86-64
     - cuda
 
-# MSVC 14.29 + CUDA 11.4
-test:windows:x86-64:cuda-11.4:msvc-14.29:
+# MSVC 14.29 + CUDA 12.2
+test:windows:x86-64:cuda-12.2:msvc-14.29:
   extends: .test:windows:cuda
-  needs: [ build:windows:x86-64:cuda-11.4:msvc-14.29 ]
\ No newline at end of file
+  needs: [ build:windows:x86-64:cuda-12.2:msvc-14.29 ]
diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake
index d72d88a..1103ba2 100644
--- a/cmake/EigenConfigureTesting.cmake
+++ b/cmake/EigenConfigureTesting.cmake
@@ -20,7 +20,8 @@
 
 # Convenience target for only building GPU tests.
 add_custom_target(buildtests_gpu)
-add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure"
+add_custom_target(check_gpu COMMAND "ctest" ${EIGEN_CTEST_ARGS}
+                                            "--output-on-failure"
                                             "--no-compress-output"
                                             "--build-no-clean"
                                             "-T" "test"
@@ -71,4 +72,3 @@
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS /D_SCL_SECURE_NO_WARNINGS")
 endif()
 
-
diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake
index e5aefa3..9102148 100644
--- a/cmake/EigenTesting.cmake
+++ b/cmake/EigenTesting.cmake
@@ -8,6 +8,12 @@
   endif()
 endmacro()
 
+if(EIGEN_TEST_HIP AND NOT DEFINED EIGEN_HIP_ARCHITECTURES)
+  set(EIGEN_HIP_ARCHITECTURES
+      gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151
+      CACHE STRING "HIP GPU architectures to build Eigen's HIP tests for.")
+endif()
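+# The cache variable can be overridden at configure time, e.g.
+#   cmake -DEIGEN_HIP_ARCHITECTURES="gfx90a;gfx942" <source-dir>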
+
 #internal. See documentation of ei_add_test for details.
 macro(ei_add_test_internal testname testname_with_suffix)
   set(targetname ${testname_with_suffix})
@@ -30,7 +36,7 @@
       hip_reset_flags()
       hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS -std=c++14)
       target_compile_definitions(${targetname} PRIVATE -DEIGEN_USE_HIP)
-      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES gfx900 gfx906 gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+      set_property(TARGET ${targetname} PROPERTY HIP_ARCHITECTURES "${EIGEN_HIP_ARCHITECTURES}")
     elseif(EIGEN_TEST_CUDA_CLANG)
       set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
 
@@ -134,6 +140,7 @@
   if (is_gpu_test)
     # Add gpu tag for testing only GPU tests.
     set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu")
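+    # Exit code 77 marks a skipped test; it matches the value returned by
+    # maybe_skip_gpu_tests() in test/main.h when no GPU device is visible.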
+    set_property(TEST ${testname_with_suffix} PROPERTY SKIP_RETURN_CODE 77)
   endif()
 
   if(EIGEN_SYCL)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 87648b9..e50f24f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -433,7 +433,7 @@
   message(WARNING "EIGEN_TEST_CUDA_NVC is set, but CMAKE_CXX_COMPILER does not appear to be nvc++.")
 endif()
 
-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
   # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -502,6 +502,9 @@
   endif()
 
   find_package(HIP REQUIRED)
+  if (HIP_FOUND AND HIP_VERSION VERSION_LESS "5.6")
+    message(FATAL_ERROR "Eigen requires ROCm/HIP >= 5.6, found ${HIP_VERSION}")
+  endif()
   if (HIP_FOUND)
     execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
 
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index aa6c1bb..c3a188e 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -7,12 +7,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-// workaround issue between gcc >= 4.7 and cuda 5.5
-#if (defined __GNUC__) && (__GNUC__ > 4 || __GNUC_MINOR__ >= 7)
-#undef _GLIBCXX_ATOMIC_BUILTINS
-#undef _GLIBCXX_USE_INT128
-#endif
-
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
diff --git a/test/gpu_test_helper.h b/test/gpu_test_helper.h
index 3b2ec9c..80960eb 100644
--- a/test/gpu_test_helper.h
+++ b/test/gpu_test_helper.h
@@ -6,10 +6,8 @@
 // Allow gpu** macros for generic tests.
 #include <Eigen/src/Core/util/GpuHipCudaDefines.inc>
 
-// std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that
-// doesn't allow std::tuple to compile for host code either. In these cases,
-// use our custom implementation.
-#if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000)
+// std::tuple cannot be used on device, so use our custom implementation there.
+#if defined(EIGEN_GPU_COMPILE_PHASE)
 #define EIGEN_USE_CUSTOM_TUPLE 1
 #else
 #define EIGEN_USE_CUSTOM_TUPLE 0
@@ -42,6 +40,12 @@
 #undef EIGEN_USE_CUSTOM_TUPLE
 }  // namespace test_detail
 
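+// Helper aliases used below: decay_t<T> strips references and cv-qualifiers,
+// and kernel_result_t<Func, Args...> names the type Func returns when invoked
+// with Args; spelling these once keeps the SFINAE signatures readable.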
+template <typename T>
+using decay_t = typename std::decay<T>::type;
+
+template <typename Func, typename... Args>
+using kernel_result_t = decltype(std::declval<Func>()(std::declval<Args>()...));
+
 template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts>
 struct extract_output_indices_helper;
 
@@ -90,14 +94,15 @@
   // Non-void return value.
   template <typename Func, typename... Args>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
+      -> std::enable_if_t<!std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value,
+                          kernel_result_t<Func&&, Args&&...>> {
     return func(std::forward<Args>(args)...);
   }
 
   // Void return value.
   template <typename Func, typename... Args>
   static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
-      -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
+      -> std::enable_if_t<std::is_same<kernel_result_t<Func&&, Args&&...>, void>::value, Void> {
     func(std::forward<Args>(args)...);
     return Void{};
   }
@@ -135,18 +140,18 @@
   const uint8_t* read_end = buffer + capacity;
   read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
   // Create value-type instances to populate.
-  auto args = make_tuple(typename std::decay<Args>::type{}...);
+  auto args = make_tuple(decay_t<Args>{}...);
   EIGEN_UNUSED_VARIABLE(args);  // Avoid NVCC compile warning.
   // NVCC 9.1 requires us to spell out the template parameters explicitly.
-  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);
+  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, decay_t<Args>...>(args)...);
 
   // Call function, with void->Void conversion so we are guaranteed a complete
   // output type.
-  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);
+  auto result = void_helper::call(kernel, get<Indices, decay_t<Args>...>(args)...);
 
   // Determine required output size.
   size_t output_size = Eigen::serialize_size(capacity);
-  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+  output_size += Eigen::serialize_size(get<OutputIndices, decay_t<Args>...>(args)...);
   output_size += Eigen::serialize_size(result);
 
   // Always serialize required buffer size.
@@ -157,7 +162,7 @@
   // Serialize outputs if they fit in the buffer.
   if (output_size <= capacity) {
     // Collect outputs and result.
-    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
+    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, decay_t<Args>...>(args)...);
     write_ptr = Eigen::serialize(write_ptr, write_end, result);
   }
 }
@@ -282,7 +287,7 @@
  * \return kernel(args...).
  */
 template <typename Kernel, typename... Args>
-auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_cpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
   return kernel(std::forward<Args>(args)...);
 }
 
@@ -301,7 +306,7 @@
  * \return kernel(args...).
  */
 template <typename Kernel, typename... Args>
-auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
   return internal::run_serialized_on_gpu<Kernel, Args...>(
       /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
       internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
@@ -322,7 +327,8 @@
  * \sa run_on_gpu
  */
 template <typename Kernel, typename... Args>
-auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
   return internal::run_serialized_on_gpu<Kernel, Args...>(
       buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
       kernel, std::forward<Args>(args)...);
@@ -409,7 +415,7 @@
  * \return kernel(args...).
  */
 template <typename Kernel, typename... Args>
-auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run(Kernel kernel, Args&&... args) -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
   return run_on_gpu(kernel, std::forward<Args>(args)...);
 #else
@@ -432,7 +438,8 @@
  * \sa run
  */
 template <typename Kernel, typename... Args>
-auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
+auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args)
+    -> internal::kernel_result_t<Kernel, Args&&...> {
 #ifdef EIGEN_GPUCC
   return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
 #else
diff --git a/test/main.h b/test/main.h
index 6cedca5..b429572 100644
--- a/test/main.h
+++ b/test/main.h
@@ -76,10 +76,8 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-#if CUDA_VERSION >= 7050
 #include <cuda_fp16.h>
 #endif
-#endif
 
 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
 #define EIGEN_TEST_NO_LONGDOUBLE
@@ -949,6 +947,37 @@
   g_seed = static_cast<decltype(g_seed)>(ns);
 }
 
+#if defined(EIGEN_USE_GPU)
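+// Returns 77, the conventional "skipped" exit status (matched by the CTest
+// SKIP_RETURN_CODE property set in EigenTesting.cmake), when no usable GPU
+// device is visible; returns 0 otherwise.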
+inline int maybe_skip_gpu_tests() {
+#if defined(EIGEN_USE_HIP)
+  int device_count = 0;
+  hipError_t status = hipGetDeviceCount(&device_count);
+  if (status != hipSuccess) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device. hipGetDeviceCount failed with: "
+              << hipGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: HIP GPU tests require a visible ROCm device." << std::endl;
+    return 77;
+  }
+#elif defined(EIGEN_CUDACC)
+  int device_count = 0;
+  cudaError_t status = cudaGetDeviceCount(&device_count);
+  if (status != cudaSuccess) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device. cudaGetDeviceCount failed with: "
+              << cudaGetErrorString(status) << std::endl;
+    return 77;
+  }
+  if (device_count <= 0) {
+    std::cout << "SKIP: CUDA GPU tests require a visible CUDA device." << std::endl;
+    return 77;
+  }
+#endif
+  return 0;
+}
+#endif
+
 int main(int argc, char* argv[]) {
   g_has_set_repeat = false;
   g_has_set_seed = false;
@@ -997,6 +1026,13 @@
   srand(g_seed);
   std::cout << "Repeating each test " << g_repeat << " times" << std::endl;
 
+#if defined(EIGEN_USE_GPU)
+  {
+    const int skip_code = maybe_skip_gpu_tests();
+    if (skip_code != 0) return skip_code;
+  }
+#endif
+
   VERIFY(EigenTest::all().size() > 0);
 
   for (std::size_t i = 0; i < EigenTest::all().size(); ++i) {
diff --git a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
index 87bf008..79ad6c2 100644
--- a/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorContractionGpu.h
@@ -393,7 +393,8 @@
   // the sum across all big k blocks of the product of little k block of index (x, y)
   // with block of index (y, z). To compute the final output, we need to reduce
   // the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+  // HIP uses non-sync warp shuffles; CUDA requires the _sync variants.
+#if defined(EIGEN_HIPCC)
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
 #else
 #define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
@@ -622,7 +623,7 @@
       x1 = rhs_pf0.x;
       x2 = rhs_pf0.z;
     }
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#if defined(EIGEN_HIPCC)
     x1 = __shfl_xor(x1, 4);
     x2 = __shfl_xor(x2, 4);
 #else
@@ -1377,13 +1378,6 @@
                   this->m_right_contracting_strides, this->m_k_strides);
 
     OutputMapper output(buffer, m);
-
-#if defined(EIGEN_USE_HIP)
-    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
-#else
-    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
-#endif
-
     LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
                                                                                         this->m_device);
   }
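The shuffleInc macro drives a butterfly reduction: xor-ing lane ids with masks 4, 2 and 1 pairs each thread with a partner so that, after three steps, all eight participating lanes hold the same sum. The pattern in isolation (CUDA _sync form, illustrative name):

    // Sums v across groups of 8 lanes; every lane ends up with the group total.
    __device__ float sum_over_8_lanes(float v) {
      v += __shfl_xor_sync(0xFFFFFFFF, v, 4);  // combine lanes 4 apart
      v += __shfl_xor_sync(0xFFFFFFFF, v, 2);  // then 2 apart
      v += __shfl_xor_sync(0xFFFFFFFF, v, 1);  // then adjacent lanes
      return v;
    }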
diff --git a/unsupported/Eigen/src/Tensor/TensorConvolution.h b/unsupported/Eigen/src/Tensor/TensorConvolution.h
index 021f7cd..0d0ec25 100644
--- a/unsupported/Eigen/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/src/Tensor/TensorConvolution.h
@@ -89,7 +89,7 @@
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (static_cast<size_t>(i + 1) < offset) {
+        if (i + 1 < static_cast<int>(offset)) {
           m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
           m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
         } else {
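The rewritten test keeps the comparison in signed arithmetic instead of promoting i + 1 to size_t. In this loop i + 1 is always at least 1, so the change mostly silences sign-compare warnings, but the general hazard with mixed comparisons is real: a negative signed operand wraps to a huge unsigned value. A self-contained illustration with hypothetical values:

    #include <cstddef>
    #include <iostream>

    int main() {
      int i = -1;
      std::size_t offset = 2;
      // Mixed comparison: i converts to size_t, so -1 wraps to SIZE_MAX.
      std::cout << (static_cast<std::size_t>(i) < offset) << "\n";  // prints 0
      // Signed comparison behaves as expected (offset is known to fit in int).
      std::cout << (i < static_cast<int>(offset)) << "\n";          // prints 1
    }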
diff --git a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
index d2e0d08..e0f74a6 100644
--- a/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorDeviceGpu.h
@@ -342,19 +342,6 @@
 
 #endif
 
-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status);
-  gpu_assert(status == gpuSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config);
-#endif
-}
-#endif
-
 }  // end namespace Eigen
 
 // undefine all the gpu* macros we defined at the beginning of the file
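The deleted helper set a device-wide shared-memory bank size, which the removed FIXME already flagged as wrong in scope and which is effectively a no-op on current architectures. If a kernel-specific knob is ever wanted again, the per-function attribute API is the modern shape of the idea; a hedged sketch, not something this patch adds:

    #include <cuda_runtime.h>

    __global__ void some_kernel() { /* illustrative placeholder */ }

    void configure() {
      // Ask for a shared-memory carveout hint (in percent) for one kernel only.
      cudaFuncSetAttribute(reinterpret_cast<const void*>(some_kernel),
                           cudaFuncAttributePreferredSharedMemoryCarveout, 50);
    }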
diff --git a/unsupported/Eigen/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
index a09ce7a..03c957c 100644
--- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -175,7 +175,7 @@
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_CUDA_ARCH)
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
   return __ldg(address);
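With the compute-capability guard gone, the __ldg specialization is compiled whenever EIGEN_CUDA_ARCH is defined; every architecture the library still builds for has the read-only data cache behind __ldg. The shape of the pattern in isolation (illustrative names, written as an overload rather than a specialization):

    // Generic fallback: an ordinary global-memory load.
    template <typename T>
    __device__ __forceinline__ T load_constant(const T* address) {
      return *address;
    }

    // Route float loads through the read-only (texture) cache path.
    __device__ __forceinline__ float load_constant(const float* address) {
      return __ldg(address);
    }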
diff --git a/unsupported/Eigen/src/Tensor/TensorMeta.h b/unsupported/Eigen/src/Tensor/TensorMeta.h
index 5762b85..5dd0145 100644
--- a/unsupported/Eigen/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/src/Tensor/TensorMeta.h
@@ -49,7 +49,7 @@
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPU_COMPILE_PHASE)
 
 typedef ulonglong2 Packet4h2;
 template <>
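The Packet4h2 typedef banks on ulonglong2 having exactly the size of four half2 pairs, i.e. eight fp16 values in 16 bytes. A compile-time sanity sketch of that assumption (to be compiled with nvcc):

    #include <cuda_fp16.h>
    #include <vector_types.h>

    static_assert(sizeof(ulonglong2) == 4 * sizeof(__half2),
                  "Packet4h2 assumes four half2 lanes in one ulonglong2");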
diff --git a/unsupported/Eigen/src/Tensor/TensorReduction.h b/unsupported/Eigen/src/Tensor/TensorReduction.h
index 3686535..751516e 100644
--- a/unsupported/Eigen/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/src/Tensor/TensorReduction.h
@@ -453,7 +453,7 @@
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                  unsigned int*);
 
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
 template <typename S, typename R, typename I_>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
     R, const S, I_, internal::packet_traits<half>::type*);
@@ -883,7 +883,7 @@
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
   template <int B, int N, typename S, typename R, typename I_>
   KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_GPUCC)
   template <typename S, typename R, typename I_>
   KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                      internal::packet_traits<Eigen::half>::type*);
diff --git a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
index f3e5db6..a4ec458 100644
--- a/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/src/Tensor/TensorReductionGpu.h
@@ -25,7 +25,6 @@
 // updated the content of the output address it will try again.
 template <typename T, typename R>
 __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   if (sizeof(T) == 4) {
     unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
     unsigned int newval = oldval;
@@ -61,12 +60,6 @@
   } else {
     gpu_assert(0 && "Wordsize not supported");
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  EIGEN_UNUSED_VARIABLE(reducer);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
 // We extend atomicExch to support extra data types
@@ -75,13 +68,34 @@
   return atomicExch(address, val);
 }
 
+// Full-warp lane mask: wavefronts are 64 lanes wide on HIP, warps 32 on CUDA.
+EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR auto reduction_shuffle_mask() {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  return 0xFFFFFFFFFFFFFFFFull;
+#else
+  return 0xFFFFFFFFu;
+#endif
+}
+
+// Warp shuffle-down that hides the HIP/CUDA API difference: HIP provides
+// only the non-sync __shfl_down, while CUDA 9+ requires the _sync variant
+// with an explicit full-warp lane mask.
+template <typename T>
+__device__ EIGEN_ALWAYS_INLINE T reduction_shuffle_down(T value, int offset) {
+#if defined(EIGEN_HIPCC)
+  return __shfl_down(value, offset, warpSize);
+#else
+  return __shfl_down_sync(reduction_shuffle_mask(), value, offset, warpSize);
+#endif
+}
+
 template <>
 __device__ inline double atomicExchCustom(double* address, double val) {
   unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
   return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
@@ -111,17 +125,10 @@
   }
 }
 #endif  // EIGEN_GPU_COMPILE_PHASE
-#endif  // EIGEN_HAS_GPU_FP16
 
 template <>
 __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(accum);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
 template <typename CoeffType, typename Index>
@@ -138,7 +145,6 @@
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                                  typename Self::CoeffReturnType* output,
                                                                  unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
   const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
   if (gridDim.x == 1) {
@@ -179,20 +185,7 @@
 
 #pragma unroll
   for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-    // use std::is_floating_point to determine the type of reduced_val
-    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-    // and list the float and int versions of __shfl_down as the candidate functions.
-    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
-      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
-    } else {
-      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
-    }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-#else
-    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
-#endif
+    reducer.reduce(reduction_shuffle_down(accum, offset), &accum);
   }
 
   if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -206,17 +199,9 @@
     __threadfence_system();
 #endif
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  EIGEN_UNUSED_VARIABLE(semaphore);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
                                                                                    Index num_coeffs, half* scratch) {
@@ -319,14 +304,6 @@
       hr[i] = wka_out.h;
     }
     reducer.reducePacket(r1, &accum);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down(hacc[i], offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
 #else
     PacketType r1;
     half2* hr = reinterpret_cast<half2*>(&r1);
@@ -377,8 +354,6 @@
   }
 }
 
-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
@@ -409,7 +384,7 @@
   }
 };
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct FullReductionLauncher<Self, Op, Eigen::half, false> {
   static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
@@ -443,24 +418,18 @@
     }
   }
 };
-#endif  // EIGEN_HAS_GPU_FP16
 
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple cases
   // of doubles, floats and half floats
-#ifdef EIGEN_HAS_GPU_FP16
+  // fp16 support is unconditional under EIGEN_GPUCC, so Eigen::half is always part of the optimized path.
   static constexpr bool HasOptimizedImplementation =
       !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                            internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                            (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                             reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
@@ -481,7 +450,6 @@
                                                                   Index num_coeffs_to_reduce,
                                                                   Index num_preserved_coeffs,
                                                                   typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -534,20 +502,7 @@
 
 #pragma unroll
       for (int offset = warpSize / 2; offset > 0; offset /= 2) {
-#if defined(EIGEN_HIPCC)
-        // use std::is_floating_point to determine the type of reduced_val
-        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
-        // and list the float and int versions of __shfl_down as the candidate functions.
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-#else
-        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
-#endif
+        reducer.reduce(reduction_shuffle_down(reduced_val, offset), &reduced_val);
       }
 
       if ((threadIdx.x & (warpSize - 1)) == 0) {
@@ -555,17 +510,9 @@
       }
     }
   }
-#else   // EIGEN_CUDA_ARCH >= 300
-  EIGEN_UNUSED_VARIABLE(reducer);
-  EIGEN_UNUSED_VARIABLE(input);
-  EIGEN_UNUSED_VARIABLE(num_coeffs_to_reduce);
-  EIGEN_UNUSED_VARIABLE(num_preserved_coeffs);
-  EIGEN_UNUSED_VARIABLE(output);
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif  // EIGEN_CUDA_ARCH >= 300
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
@@ -688,19 +635,6 @@
         }
         reducer.reducePacket(r1, &reduced_val1);
         reducer.reducePacket(r2, &reduced_val2);
-#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
-          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
 #else
         PacketType r1;
         PacketType r2;
@@ -741,8 +675,6 @@
   }
 }
 
-#endif  // EIGEN_HAS_GPU_FP16
-
 template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
@@ -786,7 +718,7 @@
   }
 };
 
-#ifdef EIGEN_HAS_GPU_FP16
+// Half-float reduction specializations.
 template <typename Self, typename Op>
 struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
   static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
@@ -826,24 +758,18 @@
     return false;
   }
 };
-#endif  // EIGEN_HAS_GPU_FP16
 
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
+  // fp16 support is unconditional under EIGEN_GPUCC, so Eigen::half is always part of the optimized path.
   static constexpr bool HasOptimizedImplementation =
       !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
                                            internal::is_same<typename Self::CoeffReturnType, double>::value ||
                                            (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
                                             reducer_traits<Op, GpuDevice>::PacketAccess));
-#else   // EIGEN_HAS_GPU_FP16
-  static constexpr bool HasOptimizedImplementation =
-      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif  // EIGEN_HAS_GPU_FP16
 
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
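All of the kernels above now funnel their warp reductions through reduction_shuffle_down, which removes the per-call-site CUDA-SDK-version branches and the hipcc double-ambiguity workaround. The loop they share is the standard shuffle-down tree; in isolation (CUDA _sync form, illustrative name):

    // After the loop, lane 0 of each warp holds the sum of all 32 lanes.
    __device__ float warp_sum(float v) {
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        v += __shfl_down_sync(0xFFFFFFFF, v, offset, warpSize);
      }
      return v;
    }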
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 4160fad..e90ca8c 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -237,7 +237,7 @@
   ei_add_test(cxx11_tensor_uint128)
 endif()
 
-find_package(CUDA 9.0)
+find_package(CUDA 11.4)
 if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
   # and -fno-check-new flags since they trigger thousands of compilation warnings
@@ -281,26 +281,11 @@
   ei_add_test(cxx11_tensor_argmax_gpu)
   ei_add_test(cxx11_tensor_cast_float16_gpu)
   ei_add_test(cxx11_tensor_scan_gpu)
-
-  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
-  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
-    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
-      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
-    endif()
-  endforeach()
-
-  # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
-    ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_gpu)
-    ei_add_test(cxx11_tensor_contract_gpu)
-    ei_add_test(cxx11_tensor_of_float16_gpu)
-  endif()
-
-  # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_gpu)
-  endif()
+  ei_add_test(cxx11_tensor_device)
+  ei_add_test(cxx11_tensor_gpu)
+  ei_add_test(cxx11_tensor_contract_gpu)
+  ei_add_test(cxx11_tensor_of_float16_gpu)
+  ei_add_test(cxx11_tensor_random_gpu)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
@@ -341,7 +326,6 @@
       ei_add_test(cxx11_tensor_cast_float16_gpu)
       ei_add_test(cxx11_tensor_scan_gpu)
       ei_add_test(cxx11_tensor_device)
-
       ei_add_test(cxx11_tensor_gpu)
       ei_add_test(cxx11_tensor_contract_gpu)
       ei_add_test(cxx11_tensor_of_float16_gpu)
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index d96e1b2..28d27ab 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -850,6 +850,7 @@
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -862,14 +863,11 @@
     }
   }
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igamma_s[][6] = {
-      {0.0, nan, nan, nan, nan, nan},
-      {0.0, 0.6321205588285578, 0.7768698398515702, 0.9816843611112658, 9.999500016666262e-05, 1.0},
-      {0.0, 0.4275932955291202, 0.608374823728911, 0.9539882943107686, 7.522076445089201e-07, 1.0},
-      {0.0, 0.01898815687615381, 0.06564245437845008, 0.5665298796332909, 4.166333347221828e-18, 1.0},
-      {0.0, 0.9999780593618628, 0.9999899967080838, 0.9999996219837988, 0.9991370418689945, 1.0},
-      {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igamma(a(i, j), x(i, j));
+    }
+  }
 
   std::size_t bytes = a.size() * sizeof(Scalar);
 
@@ -897,10 +895,10 @@
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igamma_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
         VERIFY((std::isnan)(out(i, j)));
       } else {
-        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
       }
     }
   }
@@ -915,6 +913,7 @@
   Tensor<Scalar, 2> a(6, 6);
   Tensor<Scalar, 2> x(6, 6);
   Tensor<Scalar, 2> out(6, 6);
+  Tensor<Scalar, 2> expected_out(6, 6);
   out.setZero();
 
   Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
@@ -927,14 +926,11 @@
     }
   }
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-  Scalar igammac_s[][6] = {
-      {nan, nan, nan, nan, nan, nan},
-      {1.0, 0.36787944117144233, 0.22313016014842982, 0.018315638888734182, 0.9999000049998333, 0.0},
-      {1.0, 0.5724067044708798, 0.3916251762710878, 0.04601170568923136, 0.9999992477923555, 0.0},
-      {1.0, 0.9810118431238462, 0.9343575456215499, 0.4334701203667089, 1.0, 0.0},
-      {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, 3.7801620118431334e-07, 0.0008629581310054535, 0.0},
-      {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      expected_out(i, j) = numext::igammac(a(i, j), x(i, j));
+    }
+  }
 
   std::size_t bytes = a.size() * sizeof(Scalar);
 
@@ -962,10 +958,10 @@
 
   for (int i = 0; i < 6; ++i) {
     for (int j = 0; j < 6; ++j) {
-      if ((std::isnan)(igammac_s[i][j])) {
+      if ((std::isnan)(expected_out(i, j))) {
         VERIFY((std::isnan)(out(i, j)));
       } else {
-        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+        VERIFY_IS_APPROX(out(i, j), expected_out(i, j));
       }
     }
   }
@@ -1068,15 +1064,9 @@
   in_x(7) = Scalar(0.99);
   in_x(8) = Scalar(0.01);
 
-  expected_out(0) = std::numeric_limits<Scalar>::infinity();
-  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
-  expected_out(2) = Scalar(0.0);
-  expected_out(3) = Scalar(-0.8416212335729142);
-  expected_out(4) = Scalar(0.8416212335729142);
-  expected_out(5) = Scalar(1.2815515655446004);
-  expected_out(6) = Scalar(-1.2815515655446004);
-  expected_out(7) = Scalar(2.3263478740408408);
-  expected_out(8) = Scalar(-2.3263478740408408);
+  for (int i = 0; i < 9; ++i) {
+    expected_out(i) = numext::ndtri(in_x(i));
+  }
 
   std::size_t bytes = in_x.size() * sizeof(Scalar);
 
@@ -1090,15 +1080,15 @@
   Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
-  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
 
   gpu_out.device(gpu_device) = gpu_in_x.ndtri();
 
   assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
   assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
-  for (int i = 0; i < 6; ++i) {
+  for (int i = 0; i < 9; ++i) {
     VERIFY_IS_CWISE_APPROX(out(i), expected_out(i));
   }
 
@@ -1115,12 +1105,9 @@
   Tensor<Scalar, 1> expected_out(125);
   out.setZero();
 
-  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
-
   Array<Scalar, 1, Dynamic> x(125);
   Array<Scalar, 1, Dynamic> a(125);
   Array<Scalar, 1, Dynamic> b(125);
-  Array<Scalar, 1, Dynamic> v(125);
 
   a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
@@ -1160,25 +1147,11 @@
       0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
       1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
 
-  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
-      nan, nan, nan, nan, nan, nan, nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
-      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, 0.999995949033062, 0.9999999999993698,
-      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.006827081192655869, 0.0210336989586256, 0.04813160422599567, nan, nan, 0.20014344256217678,
-      0.5000000000000001, 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, 0.9999999999999999, nan,
-      nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
-      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, nan, 7.864342668429763e-23,
-      3.015969667594166e-10, 0.0008598571564165444, nan, nan, 6.031987710123844e-08, 0.5000000000000007,
-      0.9999999396801229, nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan,
-      nan, nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
-      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
-      2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
-
   for (int i = 0; i < 125; ++i) {
     in_x(i) = x(i);
     in_a(i) = a(i);
     in_b(i) = b(i);
-    expected_out(i) = v(i);
+    expected_out(i) = numext::betainc(a(i), b(i), x(i));
   }
 
   std::size_t bytes = in_x.size() * sizeof(Scalar);
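Computing the expected values on the host with the same numext functions keeps the reference and the device path in lockstep, and it removes the large hard-coded tables. The explicit isnan branch in the checks stays necessary because NaN compares unequal to everything, itself included, so VERIFY_IS_APPROX can never accept a NaN entry. In isolation:

    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
      double nan = std::numeric_limits<double>::quiet_NaN();
      assert(nan != nan);       // NaN is unequal to everything, even itself
      assert(std::isnan(nan));  // hence tests branch on isnan instead of ==
    }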
diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
index 909f804..3d39c5a 100644
--- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu
@@ -53,8 +53,6 @@
   gpu_device.deallocate(d_res_float);
 }
 
-#ifdef EIGEN_HAS_GPU_FP16
-
 template <typename>
 void test_gpu_conversion() {
   Eigen::GpuStreamDevice stream;
@@ -445,12 +443,10 @@
   gpu_device.deallocate(d_res_half2);
   gpu_device.deallocate(d_res_float);
 }
-#endif
 
 EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) {
   CALL_SUBTEST_1(test_gpu_numext<void>());
 
-#ifdef EIGEN_HAS_GPU_FP16
   CALL_SUBTEST_1(test_gpu_conversion<void>());
   CALL_SUBTEST_1(test_gpu_unary<void>());
   CALL_SUBTEST_1(test_gpu_elementwise<void>());
@@ -459,7 +455,4 @@
   CALL_SUBTEST_3(test_gpu_reductions<void>());
   CALL_SUBTEST_4(test_gpu_full_reductions<void>());
   CALL_SUBTEST_5(test_gpu_forced_evals<void>());
-#else
-  std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl;
-#endif
 }