Fix some CUDA warnings. Added `EIGEN_HAS_STD_HASH` macro, checking for C++11 support and not running on GPU. `std::hash<float>` is not a device function, so cannot be used by `std::hash<bfloat16>`. Removed `EIGEN_DEVICE_FUNC` and only define if `EIGEN_HAS_STD_HASH`. Same for `half`. Added `EIGEN_CUDA_HAS_FP16_ARITHMETIC` to improve readability, eliminate warnings about `EIGEN_CUDA_ARCH` not being defined. Replaced a couple C-style casts with `reinterpret_cast` for aligned loading of `half*` to `half2*`. This eliminates `-Wcast-align` warnings in clang. Although not ideal due to potential type aliasing, this is how CUDA handles these conversions internally.

commit: db5691ff2b537ef003b192b5cbd871f0eb9309ba [log] [tgz]
author: Antonio Sanchez <cantonios@google.com> Fri Feb 19 08:52:31 2021 -0800
committer: Rasmus Munk Larsen <rmlarsen@google.com> Wed Feb 24 00:16:31 2021 +0000
tree: 90c6b8faf224d63ea8b7f692ecb5b0bf789b95db
parent: 88d4c6d4c870f53d129ab5f8b43e01812d9b500e [diff]
diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h
index 72a489b..81af56a 100644
--- a/Eigen/src/Core/arch/Default/BFloat16.h
+++ b/Eigen/src/Core/arch/Default/BFloat16.h

@@ -655,20 +655,6 @@
 
 } // namespace Eigen
 
-namespace std {
-
-#if __cplusplus > 199711L
-template <>
-struct hash<Eigen::bfloat16> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
-    return hash<float>()(static_cast<float>(a));
-  }
-};
-#endif
-
-} // namespace std
-
-
 namespace Eigen {
 namespace numext {
 
@@ -703,4 +689,16 @@
 }  // namespace numext
 }  // namespace Eigen
 
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::bfloat16> {
+  EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // namespace std
+#endif
+
+
 #endif // EIGEN_BFLOAT16_H

diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h
index b273abe..c91b0ce 100644
--- a/Eigen/src/Core/arch/Default/Half.h
+++ b/Eigen/src/Core/arch/Default/Half.h

@@ -757,19 +757,6 @@
   #pragma pop_macro("EIGEN_CONSTEXPR")
 #endif
 
-namespace std {
-
-#if __cplusplus > 199711L
-template <>
-struct hash<Eigen::half> {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
-    return static_cast<std::size_t>(a.x);
-  }
-};
-#endif
-
-} // end namespace std
-
 namespace Eigen {
 namespace numext {
 
@@ -870,4 +857,15 @@
 }
 #endif // __ldg
 
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+} // end namespace std
+#endif
+
 #endif // EIGEN_HALF_H

diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h
index c16f95e..689110d 100644
--- a/Eigen/src/Core/arch/GPU/PacketMath.h
+++ b/Eigen/src/Core/arch/GPU/PacketMath.h

@@ -15,12 +15,16 @@
 namespace internal {
 
 // Read-only data cached load available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
 #define EIGEN_GPU_HAS_LDG 1
 #endif
 
 // FP16 math available.
-#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
+#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
 #define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
 #endif
 
@@ -603,7 +607,8 @@
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
     const Eigen::half* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const half2*)from);
+  // Input is guaranteed to be properly aligned.
+  return __ldg(reinterpret_cast<const half2*>(from));
 #else
   return combine_half(*(from+0), *(from+1));
 #endif
@@ -922,7 +927,7 @@
   return __floats2half2_rn(r1, r2);
 }
 
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
   defined(EIGEN_HIP_DEVICE_COMPILE)
 
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1033,7 +1038,7 @@
 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
   Packet4h2 r;
-  r = __ldg((const Packet4h2*)from);
+  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
   return r;
 #else
   Packet4h2 r;
@@ -1226,7 +1231,7 @@
   p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
                               __hadd(a, __float2half(7.0f)));
   return r;
-#elif EIGEN_CUDA_ARCH >= 530
+#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   Packet4h2 r;
   half2* r_alias = reinterpret_cast<half2*>(&r);
 
@@ -1478,7 +1483,7 @@
                             predux_max(a_alias[3]));
   __half first  = predux_max(m0);
   __half second = predux_max(m1);
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   return (__hgt(first, second) ? first : second);
 #else
   float ffirst  = __half2float(first);
@@ -1497,7 +1502,7 @@
                             predux_min(a_alias[3]));
   __half first  = predux_min(m0);
   __half second = predux_min(m1);
-#if EIGEN_CUDA_ARCH >= 530
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
   return (__hlt(first, second) ? first : second);
 #else
   float ffirst  = __half2float(first);
@@ -1669,6 +1674,7 @@
 #endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
 
 #undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
 #undef EIGEN_GPU_HAS_FP16_ARITHMETIC
 
 } // end namespace internal

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 0c28825..d4f91a9 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h

@@ -668,6 +668,17 @@
 #endif
 #endif
 
+// Does the compiler support std::hash?
+#ifndef EIGEN_HAS_STD_HASH
+// The std::hash struct is defined in C++11 but is not labelled as a __device__
+// function and is not constexpr, so cannot be used on device.
+#if EIGEN_HAS_CXX11 && !EIGEN_GPU_COMPILE_PHASE
+#define EIGEN_HAS_STD_HASH 1
+#else
+#define EIGEN_HAS_STD_HASH 0
+#endif
+#endif  // EIGEN_HAS_STD_HASH
+
 #ifndef EIGEN_HAS_ALIGNAS
 #if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
       (     __has_feature(cxx_alignas)            \
commit	db5691ff2b537ef003b192b5cbd871f0eb9309ba	[log] [tgz]
author	Antonio Sanchez <cantonios@google.com>	Fri Feb 19 08:52:31 2021 -0800
committer	Rasmus Munk Larsen <rmlarsen@google.com>	Wed Feb 24 00:16:31 2021 +0000
tree	90c6b8faf224d63ea8b7f692ecb5b0bf789b95db
parent	88d4c6d4c870f53d129ab5f8b43e01812d9b500e [diff]