Pulled latest updates from trunk
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index 4f052dc..5138650 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -264,6 +264,30 @@ #endif } +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 plog<half2>(const half2& a) { + return h2log(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 pexp<half2>(const half2& a) { + return h2exp(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 psqrt<half2>(const half2& a) { + return h2sqrt(a); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +half2 prsqrt<half2>(const half2& a) { + return h2rsqrt(a); +} + +#else + template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { float a1 = __low2float(a); float a2 = __high2float(a); @@ -296,6 +320,8 @@ return __floats2half2_rn(r1, r2); } +#endif + #elif defined EIGEN_VECTORIZE_AVX typedef struct {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 4f2dfcb..7368768 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -291,7 +291,7 @@ #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value; @@ -475,12 +475,6 @@ template <typename Self, typename Op> struct InnerReductionLauncher { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. - static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - template <typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { assert(false && "Should only be called to reduce floats and half floats on a gpu device"); @@ -561,7 +555,7 @@ #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value); + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && internal::is_same<typename Self::CoeffReturnType, float>::value;