Apply __launch_bounds__(1024) on CUDA, not just HIP

libeigen/eigen!2431

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 109a3b3..57433e4 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -596,14 +596,21 @@
 #endif
 
 // HIP compilers default to launch_bounds(256), which causes failures when kernels
-// are called with more than 256 threads per block. Explicitly set to 1024 for HIP.
+// are called with more than 256 threads per block. On CUDA, without an explicit
+// launch_bounds the compiler may allocate too many registers per thread, causing
+// cudaErrorLaunchOutOfResources for kernels launched with 1024 threads (e.g. 3D
+// convolution). Set to 1024 for HIP here and for CUDA in the fallback block below.
 
 #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
 
 #endif
 
 #if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#if defined(EIGEN_CUDACC)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+#else  // !defined(EIGEN_CUDACC)
 #define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif  // defined(EIGEN_CUDACC)
 #endif  // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
 
 // Unify CUDA/HIPCC