// Eigen/src/Core/arch/CUDA/PacketMathHalf.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
 #define EIGEN_PACKET_MATH_HALF_CUDA_H

 #if defined(EIGEN_HAS_CUDA_FP16)

 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
 #if defined(__CUDACC__) && defined(EIGEN_USE_GPU)

 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

 namespace Eigen {
 namespace internal {

 // Marks the CUDA half2 type as arithmetic by specializing is_arithmetic
 // with value == true.
 template<>
 struct is_arithmetic<half2>
 {
   enum { value = true };
 };

 // Packet traits for the scalar type `half`: the associated packet type is
 // half2 and holds `size` == 2 coefficients. Any flag not listed here keeps
 // the value inherited from default_packet_traits.
 template<>
 struct packet_traits<half> : default_packet_traits
 {
   typedef half2 type;  // full packet type
   typedef half2 half;  // "half packet" alias; same type as `type`

   enum {
     // Packet layout.
     Vectorizable    = 1,
     AlignedOnScalar = 1,
     size            = 2,
     HasHalfPacket   = 0,

     // Operations flagged as available.
     HasDiv     = 1,
     HasLog     = 1,
     HasExp     = 1,
     HasSqrt    = 1,
     HasRsqrt   = 1,
     HasLGamma  = 1,
     HasDiGamma = 1,
     HasErf     = 1,
     HasErfc    = 1,

     // Operations flagged as unavailable.
     HasBlend   = 0,
   };
 };


 // Reverse mapping from the half2 packet to its scalar type: two `half`
 // coefficients per packet, alignment set to Aligned16.
 template<>
 struct unpacket_traits<half2>
 {
   // Scalar type. Declared before the `half` alias below so that `half` here
   // still names the scalar type from the enclosing scope.
   typedef half type;
   enum {
     size = 2,
     alignment = Aligned16
   };
   // "Half packet" alias; same type as the packet itself.
   typedef half2 half;
 };

 // Builds a half2 packet from the single scalar `from` using the
 // __half2half2 intrinsic.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
   const half2 broadcast = __half2half2(from);
   return broadcast;
 }

 // Aligned load: reinterprets `from` as a pointer to half2 and reads one
 // packet through it.
 // NOTE(review): presumably `from` must be suitably aligned for half2 --
 // confirm against callers.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
   const half2* packed = reinterpret_cast<const half2*>(from);
   return *packed;
 }

 // Unaligned load: reads from[0] and from[1] as individual scalars and packs
 // them with __halves2half2 (from[0] is passed first).
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
   const half first = from[0];
   const half second = from[1];
   return __halves2half2(first, second);
 }

 template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half*  from) {
   return __halves2half2(from[0], from[0]);
 }

 // Aligned store: reinterprets `to` as a pointer to half2 and writes the
 // whole packet `from` through it in one assignment.
 // NOTE(review): presumably `to` must be suitably aligned for half2 --
 // confirm against callers.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
   half2* destination = reinterpret_cast<half2*>(to);
   *destination = from;
 }

 // Unaligned store: writes the packet one scalar at a time, the value from
 // __low2half to to[0] and the value from __high2half to to[1].
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
   *(to + 0) = __low2half(from);
   *(to + 1) = __high2half(from);
 }

 // Aligned load variant named "_ro" (presumably read-only). On
 // __CUDA_ARCH__ >= 320 the whole packet is fetched with __ldg through a
 // half2 pointer; otherwise the two scalars are read individually and packed.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
 #if __CUDA_ARCH__ >= 320
   return __ldg(reinterpret_cast<const half2*>(from));
 #else
   return __halves2half2(from[0], from[1]);
 #endif
 }

 // Unaligned load variant named "_ro" (presumably read-only). Each scalar is
 // read separately: through __ldg on __CUDA_ARCH__ >= 320, through a plain
 // dereference otherwise. The two values are then packed with
 // __halves2half2.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
 #if __CUDA_ARCH__ >= 320
   return __halves2half2(__ldg(from), __ldg(from + 1));
 #else
   return __halves2half2(from[0], from[1]);
 #endif
 }

 // Strided gather: packs the scalars at offsets 0 and `stride` from `from`.
 template<>
 EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
   const half first = from[0];
   const half second = from[stride];
   return __halves2half2(first, second);
 }

 // Strided scatter: writes the value from __low2half to offset 0 and the
 // value from __high2half to offset `stride` of `to`.
 template<>
 EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
   to[0] = __low2half(from);
   to[stride] = __high2half(from);
 }

 // Returns the first coefficient of the packet, extracted with __low2half.
 template<>
 EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
   const half first = __low2half(a);
   return first;
 }

 // Masks the packet's `x` member with 0x7FFF7FFF, i.e. clears bit 15 and
 // bit 31 and keeps every other bit.
 // NOTE(review): this assumes half2 exposes both 16-bit values through one
 // 32-bit member `x`, with the cleared bits being the sign bits -- confirm
 // against the CUDA headers in use.
 template<>
 EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
   half2 magnitude;
   magnitude.x = a.x & 0x7FFF7FFF;
   return magnitude;
 }


 // In-place transpose of a 2x2 block held in two half2 packets: after the
 // call packet[0] holds the low values of both inputs and packet[1] holds the
 // high values of both inputs.
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<half2,2>& kernel) {
   // Snapshot both rows before overwriting anything.
   const half2 row0 = kernel.packet[0];
   const half2 row1 = kernel.packet[1];
   kernel.packet[0] = __halves2half2(__low2half(row0), __low2half(row1));
   kernel.packet[1] = __halves2half2(__high2half(row0), __high2half(row1));
 }

 // The following operations require arch >= 5.3
 #if  __CUDA_ARCH__ >= 530
 // Builds the packet (a, a + 1): the second entry is computed in half
 // precision with __hadd and a 1.0f constant converted by __float2half.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
   const half one = __float2half(1.0f);
   const half next = __hadd(a, one);
   return __halves2half2(a, next);
 }

 // Packet addition, delegated to the __hadd2 intrinsic.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
   const half2 sum = __hadd2(a, b);
   return sum;
 }

 // Packet subtraction (a - b), delegated to the __hsub2 intrinsic.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
   const half2 difference = __hsub2(a, b);
   return difference;
 }

 // Packet negation, delegated to the __hneg2 intrinsic.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
   const half2 negated = __hneg2(a);
   return negated;
 }

 // Conjugation is the identity for this packet type: returns `a` unchanged.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) {
   return a;
 }

 // Packet multiplication, delegated to the __hmul2 intrinsic.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
   const half2 product = __hmul2(a, b);
   return product;
 }

  // Multiply-add on packets: forwards (a, b, c) unchanged to the __hfma2
  // intrinsic.
  template<>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
    const half2 fused = __hfma2(a, b, c);
    return fused;
  }

 // Packet division computed in single precision: both values of `a` and `b`
 // are widened to float, divided, and the two quotients are converted back
 // to a half2 with __floats2half2_rn.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
   const float quotient_lo = __low2float(a) / __low2float(b);
   const float quotient_hi = __high2float(a) / __high2float(b);
   return __floats2half2_rn(quotient_lo, quotient_hi);
 }

 // Per-position minimum. The comparison is done on float conversions; the
 // returned values are the original half values. At each position the value
 // from `a` is chosen only when it compares strictly less than the one from
 // `b`; otherwise the value from `b` is returned.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
   const bool pick_a_lo = __low2float(a) < __low2float(b);
   const bool pick_a_hi = __high2float(a) < __high2float(b);
   const half lo = pick_a_lo ? __low2half(a) : __low2half(b);
   const half hi = pick_a_hi ? __high2half(a) : __high2half(b);
   return __halves2half2(lo, hi);
 }

 // Per-position maximum. The comparison is done on float conversions; the
 // returned values are the original half values. At each position the value
 // from `a` is chosen only when it compares strictly greater than the one
 // from `b`; otherwise the value from `b` is returned.
 template<>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
   const bool pick_a_lo = __low2float(a) > __low2float(b);
   const bool pick_a_hi = __high2float(a) > __high2float(b);
   const half lo = pick_a_lo ? __low2half(a) : __low2half(b);
   const half hi = pick_a_hi ? __high2half(a) : __high2half(b);
   return __halves2half2(lo, hi);
 }

 // Horizontal sum: adds the two values of the packet with __hadd.
 template<>
 EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
   const half lo = __low2half(a);
   const half hi = __high2half(a);
   return __hadd(lo, hi);
 }

 // Horizontal maximum: returns the low value when __hgt reports it greater
 // than the high value, and the high value otherwise.
 template<>
 EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
   const half lo = __low2half(a);
   const half hi = __high2half(a);
   if (__hgt(lo, hi)) {
     return lo;
   }
   return hi;
 }

 // Horizontal minimum: returns the low value when __hlt reports it less than
 // the high value, and the high value otherwise.
 template<>
 EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
   const half lo = __low2half(a);
   const half hi = __high2half(a);
   if (__hlt(lo, hi)) {
     return lo;
   }
   return hi;
 }

 // Horizontal product: multiplies the two values of the packet with __hmul.
 template<>
 EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
   const half lo = __low2half(a);
   const half hi = __high2half(a);
   return __hmul(lo, hi);
 }
 #endif

 } // end namespace internal

 } // end namespace Eigen

 #endif
 #endif
 #endif
 #endif // EIGEN_PACKET_MATH_HALF_CUDA_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
	#define EIGEN_PACKET_MATH_HALF_CUDA_H

	#if defined(EIGEN_HAS_CUDA_FP16)

	// Make sure this is only available when targeting a GPU: we don't want to
	// introduce conflicts between these packet_traits definitions and the ones
	// we'll use on the host side (SSE, AVX, ...)
	#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)

	#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300

	namespace Eigen {
	namespace internal {

	// Marks the CUDA half2 type as arithmetic by specializing is_arithmetic
	// with value == true.
	template<>
	struct is_arithmetic<half2>
	{
	  enum { value = true };
	};

	// Packet traits for the scalar type `half`: the associated packet type is
	// half2 and holds `size` == 2 coefficients. Any flag not listed here keeps
	// the value inherited from default_packet_traits.
	template<>
	struct packet_traits<half> : default_packet_traits
	{
	  typedef half2 type;  // full packet type
	  typedef half2 half;  // "half packet" alias; same type as `type`

	  enum {
	    // Packet layout.
	    Vectorizable    = 1,
	    AlignedOnScalar = 1,
	    size            = 2,
	    HasHalfPacket   = 0,

	    // Operations flagged as available.
	    HasDiv     = 1,
	    HasLog     = 1,
	    HasExp     = 1,
	    HasSqrt    = 1,
	    HasRsqrt   = 1,
	    HasLGamma  = 1,
	    HasDiGamma = 1,
	    HasErf     = 1,
	    HasErfc    = 1,

	    // Operations flagged as unavailable.
	    HasBlend   = 0,
	  };
	};


	// Reverse mapping from the half2 packet to its scalar type: two `half`
	// coefficients per packet, alignment set to Aligned16.
	template<>
	struct unpacket_traits<half2>
	{
	  // Scalar type. Declared before the `half` alias below so that `half` here
	  // still names the scalar type from the enclosing scope.
	  typedef half type;
	  enum {
	    size = 2,
	    alignment = Aligned16
	  };
	  // "Half packet" alias; same type as the packet itself.
	  typedef half2 half;
	};

	// Builds a half2 packet from the single scalar `from` using the
	// __half2half2 intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
	  const half2 broadcast = __half2half2(from);
	  return broadcast;
	}

	// Aligned load: reinterprets `from` as a pointer to half2 and reads one
	// packet through it. The cast must target a pointer type and the result must
	// be dereferenced; casting the pointer straight to `const half2` is
	// ill-formed.
	// NOTE(review): presumably `from` must be suitably aligned for half2 --
	// confirm against callers.
	template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
	  return *reinterpret_cast<const half2*>(from);
	}

	// Unaligned load: reads from[0] and from[1] as individual scalars and packs
	// them with __halves2half2 (from[0] is passed first).
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
	  const half first = from[0];
	  const half second = from[1];
	  return __halves2half2(first, second);
	}

	template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
	return __halves2half2(from[0], from[0]);
	}

	// Aligned store: reinterprets `to` as a pointer to half2 and writes the
	// whole packet `from` through it in one assignment. The cast must target a
	// pointer type and the result must be dereferenced; casting the pointer
	// straight to `half2` and assigning to it is ill-formed.
	// NOTE(review): presumably `to` must be suitably aligned for half2 --
	// confirm against callers.
	template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
	  *reinterpret_cast<half2*>(to) = from;
	}

	// Unaligned store: writes the packet one scalar at a time, the value from
	// __low2half to to[0] and the value from __high2half to to[1].
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
	  *(to + 0) = __low2half(from);
	  *(to + 1) = __high2half(from);
	}

	// Aligned load variant named "_ro" (presumably read-only). On
	// __CUDA_ARCH__ >= 320 the whole packet is fetched with __ldg through a
	// half2 pointer; otherwise the two scalars are read individually and packed.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
	#if __CUDA_ARCH__ >= 320
	  return __ldg((const half2*)from);
	#else
	  // __halves2half2 takes scalar values, so the pointers must be dereferenced.
	  return __halves2half2(*(from+0), *(from+1));
	#endif
	}

	// Unaligned load variant named "_ro" (presumably read-only). Each scalar is
	// read separately: through __ldg on __CUDA_ARCH__ >= 320, through a plain
	// dereference otherwise. The two values are then packed with
	// __halves2half2.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
	#if __CUDA_ARCH__ >= 320
	  return __halves2half2(__ldg(from+0), __ldg(from+1));
	#else
	  // __halves2half2 takes scalar values, so the pointers must be dereferenced.
	  return __halves2half2(*(from+0), *(from+1));
	#endif
	}

	// Strided gather: packs the scalars at offsets 0*stride and 1*stride from
	// `from`. The multiplication operator is required in the index expressions;
	// `0stride` / `1stride` are not valid tokens.
	template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
	  return __halves2half2(from[0*stride], from[1*stride]);
	}

	// Strided scatter: writes the value from __low2half to offset 0 and the
	// value from __high2half to offset `stride` of `to`.
	template<>
	EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
	  to[0] = __low2half(from);
	  to[stride] = __high2half(from);
	}

	// Returns the first coefficient of the packet, extracted with __low2half.
	template<>
	EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
	  const half first = __low2half(a);
	  return first;
	}

	// Masks the packet's `x` member with 0x7FFF7FFF, i.e. clears bit 15 and
	// bit 31 and keeps every other bit.
	// NOTE(review): this assumes half2 exposes both 16-bit values through one
	// 32-bit member `x`, with the cleared bits being the sign bits -- confirm
	// against the CUDA headers in use.
	template<>
	EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
	  half2 magnitude;
	  magnitude.x = a.x & 0x7FFF7FFF;
	  return magnitude;
	}


	// In-place transpose of a 2x2 block held in two half2 packets: after the
	// call packet[0] holds the low values of both inputs and packet[1] holds the
	// high values of both inputs.
	EIGEN_DEVICE_FUNC inline void
	ptranspose(PacketBlock<half2,2>& kernel) {
	  // Snapshot both rows before overwriting anything.
	  const half2 row0 = kernel.packet[0];
	  const half2 row1 = kernel.packet[1];
	  kernel.packet[0] = __halves2half2(__low2half(row0), __low2half(row1));
	  kernel.packet[1] = __halves2half2(__high2half(row0), __high2half(row1));
	}

	// The following operations require arch >= 5.3
	#if __CUDA_ARCH__ >= 530
	// Builds the packet (a, a + 1): the second entry is computed in half
	// precision with __hadd and a 1.0f constant converted by __float2half.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
	  const half one = __float2half(1.0f);
	  const half next = __hadd(a, one);
	  return __halves2half2(a, next);
	}

	// Packet addition, delegated to the __hadd2 intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
	  const half2 sum = __hadd2(a, b);
	  return sum;
	}

	// Packet subtraction (a - b), delegated to the __hsub2 intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
	  const half2 difference = __hsub2(a, b);
	  return difference;
	}

	// Packet negation, delegated to the __hneg2 intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
	  const half2 negated = __hneg2(a);
	  return negated;
	}

	// Conjugation is the identity for this packet type: returns `a` unchanged.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) {
	  return a;
	}

	// Packet multiplication, delegated to the __hmul2 intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
	  const half2 product = __hmul2(a, b);
	  return product;
	}

	// Multiply-add on packets: forwards (a, b, c) unchanged to the __hfma2
	// intrinsic.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
	  const half2 fused = __hfma2(a, b, c);
	  return fused;
	}

	// Packet division computed in single precision: both values of `a` and `b`
	// are widened to float, divided, and the two quotients are converted back
	// to a half2 with __floats2half2_rn.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
	  const float quotient_lo = __low2float(a) / __low2float(b);
	  const float quotient_hi = __high2float(a) / __high2float(b);
	  return __floats2half2_rn(quotient_lo, quotient_hi);
	}

	// Per-position minimum. The comparison is done on float conversions; the
	// returned values are the original half values. At each position the value
	// from `a` is chosen only when it compares strictly less than the one from
	// `b`; otherwise the value from `b` is returned.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
	  const bool pick_a_lo = __low2float(a) < __low2float(b);
	  const bool pick_a_hi = __high2float(a) < __high2float(b);
	  const half lo = pick_a_lo ? __low2half(a) : __low2half(b);
	  const half hi = pick_a_hi ? __high2half(a) : __high2half(b);
	  return __halves2half2(lo, hi);
	}

	// Per-position maximum. The comparison is done on float conversions; the
	// returned values are the original half values. At each position the value
	// from `a` is chosen only when it compares strictly greater than the one
	// from `b`; otherwise the value from `b` is returned.
	template<>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
	  const bool pick_a_lo = __low2float(a) > __low2float(b);
	  const bool pick_a_hi = __high2float(a) > __high2float(b);
	  const half lo = pick_a_lo ? __low2half(a) : __low2half(b);
	  const half hi = pick_a_hi ? __high2half(a) : __high2half(b);
	  return __halves2half2(lo, hi);
	}

	// Horizontal sum: adds the two values of the packet with __hadd.
	template<>
	EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
	  const half lo = __low2half(a);
	  const half hi = __high2half(a);
	  return __hadd(lo, hi);
	}

	// Horizontal maximum: returns the low value when __hgt reports it greater
	// than the high value, and the high value otherwise.
	template<>
	EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
	  const half lo = __low2half(a);
	  const half hi = __high2half(a);
	  if (__hgt(lo, hi)) {
	    return lo;
	  }
	  return hi;
	}

	// Horizontal minimum: returns the low value when __hlt reports it less than
	// the high value, and the high value otherwise.
	template<>
	EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
	  const half lo = __low2half(a);
	  const half hi = __high2half(a);
	  if (__hlt(lo, hi)) {
	    return lo;
	  }
	  return hi;
	}

	// Horizontal product: multiplies the two values of the packet with __hmul.
	template<>
	EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
	  const half lo = __low2half(a);
	  const half hi = __high2half(a);
	  return __hmul(lo, hi);
	}
	#endif

	} // end namespace internal

	} // end namespace Eigen

	#endif
	#endif
	#endif
	#endif // EIGEN_PACKET_MATH_HALF_CUDA_H