unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

 #ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
 #define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 namespace Eigen {
 namespace internal {

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
 #if defined(EIGEN_GPU_COMPILE_PHASE)
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
   gpu_assert(threadIdx.z == 0);
   return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
 #else
   // Rely on Eigen's random implementation.
   return random<uint64_t>();
 #endif
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
   // TODO: Unify with the implementation in the non blocking thread pool.
   uint64_t current = *state;
   // Update the internal state
   *state = current * 6364136223846793005ULL + (stream << 1 | 1);
   // Generate the random output (using the PCG-XSH-RS scheme)
   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
   seed = seed ? seed : get_random_seed();
   return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
 }

 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
   return static_cast<T>(rnd);
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
   // Generate 10 random bits for the mantissa, merge with exponent.
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
   Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
   // Return the final result
   return result - Eigen::half(1.0f);
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
                                                                                            uint64_t stream) {
   // Generate 7 random bits for the mantissa, merge with exponent.
   unsigned rnd = PCG_XSH_RS_generator(state, stream);
   const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
   Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
   // Return the final result
   return result - Eigen::bfloat16(1.0f);
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint32_t raw;
     float fp;
   } internal;
   internal result;
   // Generate 23 random bits for the mantissa mantissa
   const unsigned rnd = PCG_XSH_RS_generator(state, stream);
   result.raw = rnd & 0x7fffffu;
   // Set the exponent
   result.raw |= (static_cast<uint32_t>(127) << 23);
   // Return the final result
   return result.fp - 1.0f;
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
   typedef union {
     uint64_t raw;
     double dp;
   } internal;
   internal result;
   result.raw = 0;
   // Generate 52 random bits for the mantissa
   // First generate the upper 20 bits
   unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
   // The generate the lower 32 bits
   unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
   // Set the exponent
   result.raw |= (static_cast<uint64_t>(1023) << 52);
   // Return the final result
   return result.dp - 1.0;
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
                                                                                                     uint64_t stream) {
   return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
                                                                                                       uint64_t stream) {
   return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
 }

 template <typename T>
 class UniformRandomGenerator {
  public:
   static constexpr bool PacketAccess = true;

   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
 #ifdef EIGEN_USE_SYCL
     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
     // Therefore, we need two steps to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
     // and we get the clock seed here from the CPU. However, This seed is
     // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
     // and only  available on the Operator() function (which is called on the GPU).
     // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
     // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
     // thread adds the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
     // construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
     // However when the operator() is called the thread ID will be available. So inside the operator,
     // we add the thrreadID, BlockId,... (which is equivalent of i)
     // to the seed and construct the unique m_state per thead similar to cuda.
     m_exec_once = false;
 #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
     m_state = other.m_state;
 #ifdef EIGEN_USE_SYCL
     m_exec_once = other.m_exec_once;
 #endif
   }

   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
 #ifdef EIGEN_USE_SYCL
     if (!m_exec_once) {
       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
       // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
       m_state += (i * 6364136223846793005ULL);
       m_exec_once = true;
     }
 #endif
     T result = RandomToTypeUniform<T>(&m_state, i);
     return result;
   }

   template <typename Packet, typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
 #ifdef EIGEN_USE_SYCL
     if (!m_exec_once) {
       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
       m_state += (i * 6364136223846793005ULL);
       m_exec_once = true;
     }
 #endif
     EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeUniform<T>(&m_state, i);
     }
     return internal::pload<Packet>(values);
   }

  private:
   mutable uint64_t m_state;
 #ifdef EIGEN_USE_SYCL
   mutable bool m_exec_once;
 #endif
 };

 template <typename Scalar>
 struct functor_traits<UniformRandomGenerator<Scalar> > {
   enum {
     // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
     Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
     PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
   };
 };

 template <typename T>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
   // Use the ratio of uniform method to generate numbers following a normal
   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
   // details.
   T u, v, q;
   do {
     u = RandomToTypeUniform<T>(state, stream);
     v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
     const T x = u - T(0.449871);
     const T y = numext::abs(v) + T(0.386595);
     q = x * x + y * (T(0.196) * y - T(0.25472) * x);
   } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));

   return v / u;
 }

 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
                                                                                                    uint64_t stream) {
   return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
                                                                                                      uint64_t stream) {
   return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
 }

 template <typename T>
 class NormalRandomGenerator {
  public:
   static constexpr bool PacketAccess = true;

   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
 #ifdef EIGEN_USE_SYCL
     // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
     // Therefore, we need two steps to initializate the m_state.
     // IN SYCL, the constructor of the functor is s called on the CPU
     // and we get the clock seed here from the CPU. However, This seed is
     // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
     // and only  available on the Operator() function (which is called on the GPU).
     // Therefore, the thread Id injection is not available at this stage. However when the operator()
     // is called the thread ID will be available. So inside the operator,
     // we add the thrreadID, BlockId,... (which is equivalent of i)
     // to the seed and construct the unique m_state per thead similar to cuda.
     m_exec_once = false;
 #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
     m_state = other.m_state;
 #ifdef EIGEN_USE_SYCL
     m_exec_once = other.m_exec_once;
 #endif
   }

   template <typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
 #ifdef EIGEN_USE_SYCL
     if (!m_exec_once) {
       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
       m_state += (i * 6364136223846793005ULL);
       m_exec_once = true;
     }
 #endif
     T result = RandomToTypeNormal<T>(&m_state, i);
     return result;
   }

   template <typename Packet, typename Index>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
 #ifdef EIGEN_USE_SYCL
     if (!m_exec_once) {
       // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
       m_state += (i * 6364136223846793005ULL);
       m_exec_once = true;
     }
 #endif
     EIGEN_UNROLL_LOOP
     for (int j = 0; j < packetSize; ++j) {
       values[j] = RandomToTypeNormal<T>(&m_state, i);
     }
     return internal::pload<Packet>(values);
   }

  private:
   mutable uint64_t m_state;
 #ifdef EIGEN_USE_SYCL
   mutable bool m_exec_once;
 #endif
 };

 template <typename Scalar>
 struct functor_traits<NormalRandomGenerator<Scalar> > {
   enum {
     // On average, we need to generate about 3 random numbers
     // 15 mul, 8 add, 1.5 logs
     Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
            8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
     PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
   };
 };

 }  // end namespace internal
 }  // end namespace Eigen

 #endif  // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
	// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

	#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
	#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

	// IWYU pragma: private
	#include "./InternalHeaderCheck.h"

	namespace Eigen {
	namespace internal {

	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
	#if defined(EIGEN_GPU_COMPILE_PHASE)
	// We don't support 3d kernels since we currently only use 1 and
	// 2d kernels.
	gpu_assert(threadIdx.z == 0);
	return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
	#else
	// Rely on Eigen's random implementation.
	return random<uint64_t>();
	#endif
	}

	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
	// TODO: Unify with the implementation in the non blocking thread pool.
	uint64_t current = *state;
	// Update the internal state
	state = current 6364136223846793005ULL + (stream << 1 \| 1);
	// Generate the random output (using the PCG-XSH-RS scheme)
	return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
	}

	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
	seed = seed ? seed : get_random_seed();
	return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
	}

	template <typename T>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
	unsigned rnd = PCG_XSH_RS_generator(state, stream);
	return static_cast<T>(rnd);
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
	// Generate 10 random bits for the mantissa, merge with exponent.
	unsigned rnd = PCG_XSH_RS_generator(state, stream);
	const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) \| (static_cast<uint16_t>(15) << 10);
	Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
	// Return the final result
	return result - Eigen::half(1.0f);
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
	uint64_t stream) {
	// Generate 7 random bits for the mantissa, merge with exponent.
	unsigned rnd = PCG_XSH_RS_generator(state, stream);
	const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) \| (static_cast<uint16_t>(127) << 7);
	Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
	// Return the final result
	return result - Eigen::bfloat16(1.0f);
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
	typedef union {
	uint32_t raw;
	float fp;
	} internal;
	internal result;
	// Generate 23 random bits for the mantissa mantissa
	const unsigned rnd = PCG_XSH_RS_generator(state, stream);
	result.raw = rnd & 0x7fffffu;
	// Set the exponent
	result.raw \|= (static_cast<uint32_t>(127) << 23);
	// Return the final result
	return result.fp - 1.0f;
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
	typedef union {
	uint64_t raw;
	double dp;
	} internal;
	internal result;
	result.raw = 0;
	// Generate 52 random bits for the mantissa
	// First generate the upper 20 bits
	unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
	// The generate the lower 32 bits
	unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
	result.raw = (static_cast<uint64_t>(rnd1) << 32) \| rnd2;
	// Set the exponent
	result.raw \|= (static_cast<uint64_t>(1023) << 52);
	// Return the final result
	return result.dp - 1.0;
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
	uint64_t stream) {
	return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
	}
	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
	uint64_t stream) {
	return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
	}

	template <typename T>
	class UniformRandomGenerator {
	public:
	static constexpr bool PacketAccess = true;

	// Uses the given "seed" if non-zero, otherwise uses a random seed.
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
	m_state = PCG_XSH_RS_state(seed);
	#ifdef EIGEN_USE_SYCL
	// In SYCL it is not possible to build PCG_XSH_RS_state in one step.
	// Therefore, we need two steps to initializate the m_state.
	// IN SYCL, the constructor of the functor is s called on the CPU
	// and we get the clock seed here from the CPU. However, This seed is
	// the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
	// and only available on the Operator() function (which is called on the GPU).
	// Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
	// thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
	// thread adds the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
	// construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
	// However when the operator() is called the thread ID will be available. So inside the operator,
	// we add the thrreadID, BlockId,... (which is equivalent of i)
	// to the seed and construct the unique m_state per thead similar to cuda.
	m_exec_once = false;
	#endif
	}
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
	m_state = other.m_state;
	#ifdef EIGEN_USE_SYCL
	m_exec_once = other.m_exec_once;
	#endif
	}

	template <typename Index>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
	#ifdef EIGEN_USE_SYCL
	if (!m_exec_once) {
	// This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
	// The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
	m_state += (i * 6364136223846793005ULL);
	m_exec_once = true;
	}
	#endif
	T result = RandomToTypeUniform<T>(&m_state, i);
	return result;
	}

	template <typename Packet, typename Index>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
	const int packetSize = internal::unpacket_traits<Packet>::size;
	EIGEN_ALIGN_MAX T values[packetSize];
	#ifdef EIGEN_USE_SYCL
	if (!m_exec_once) {
	// This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
	m_state += (i * 6364136223846793005ULL);
	m_exec_once = true;
	}
	#endif
	EIGEN_UNROLL_LOOP
	for (int j = 0; j < packetSize; ++j) {
	values[j] = RandomToTypeUniform<T>(&m_state, i);
	}
	return internal::pload<Packet>(values);
	}

	private:
	mutable uint64_t m_state;
	#ifdef EIGEN_USE_SYCL
	mutable bool m_exec_once;
	#endif
	};

	template <typename Scalar>
	struct functor_traits<UniformRandomGenerator<Scalar> > {
	enum {
	// Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
	Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
	PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
	};
	};

	template <typename T>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
	// Use the ratio of uniform method to generate numbers following a normal
	// distribution. See for example Numerical Recipes chapter 7.3.9 for the
	// details.
	T u, v, q;
	do {
	u = RandomToTypeUniform<T>(state, stream);
	v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
	const T x = u - T(0.449871);
	const T y = numext::abs(v) + T(0.386595);
	q = x * x + y * (T(0.196) * y - T(0.25472) * x);
	} while (q > T(0.27597) && (q > T(0.27846) \|\| v * v > T(-4) * numext::log(u) * u * u));

	return v / u;
	}

	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
	uint64_t stream) {
	return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
	}
	template <>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
	uint64_t stream) {
	return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
	}

	template <typename T>
	class NormalRandomGenerator {
	public:
	static constexpr bool PacketAccess = true;

	// Uses the given "seed" if non-zero, otherwise uses a random seed.
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
	m_state = PCG_XSH_RS_state(seed);
	#ifdef EIGEN_USE_SYCL
	// In SYCL it is not possible to build PCG_XSH_RS_state in one step.
	// Therefore, we need two steps to initializate the m_state.
	// IN SYCL, the constructor of the functor is s called on the CPU
	// and we get the clock seed here from the CPU. However, This seed is
	// the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
	// and only available on the Operator() function (which is called on the GPU).
	// Therefore, the thread Id injection is not available at this stage. However when the operator()
	// is called the thread ID will be available. So inside the operator,
	// we add the thrreadID, BlockId,... (which is equivalent of i)
	// to the seed and construct the unique m_state per thead similar to cuda.
	m_exec_once = false;
	#endif
	}
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
	m_state = other.m_state;
	#ifdef EIGEN_USE_SYCL
	m_exec_once = other.m_exec_once;
	#endif
	}

	template <typename Index>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
	#ifdef EIGEN_USE_SYCL
	if (!m_exec_once) {
	// This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
	m_state += (i * 6364136223846793005ULL);
	m_exec_once = true;
	}
	#endif
	T result = RandomToTypeNormal<T>(&m_state, i);
	return result;
	}

	template <typename Packet, typename Index>
	EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
	const int packetSize = internal::unpacket_traits<Packet>::size;
	EIGEN_ALIGN_MAX T values[packetSize];
	#ifdef EIGEN_USE_SYCL
	if (!m_exec_once) {
	// This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
	m_state += (i * 6364136223846793005ULL);
	m_exec_once = true;
	}
	#endif
	EIGEN_UNROLL_LOOP
	for (int j = 0; j < packetSize; ++j) {
	values[j] = RandomToTypeNormal<T>(&m_state, i);
	}
	return internal::pload<Packet>(values);
	}

	private:
	mutable uint64_t m_state;
	#ifdef EIGEN_USE_SYCL
	mutable bool m_exec_once;
	#endif
	};

	template <typename Scalar>
	struct functor_traits<NormalRandomGenerator<Scalar> > {
	enum {
	// On average, we need to generate about 3 random numbers
	// 15 mul, 8 add, 1.5 logs
	Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
	8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
	PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
	};
	};

	} // end namespace internal
	} // end namespace Eigen

	#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H