| // This file is part of Eigen, a lightweight C++ template library | 
 | // for linear algebra. | 
 | // | 
 | // Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com> | 
 | // | 
 | // This Source Code Form is subject to the terms of the Mozilla | 
 | // Public License v. 2.0. If a copy of the MPL was not distributed | 
 | // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. | 
 |  | 
 | #ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H | 
 | #define EIGEN_CORE_THREAD_POOL_DEVICE_H | 
 |  | 
 | namespace Eigen { | 
 |  | 
// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
// ThreadPool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
// delegating the task to a thread.
 | /* | 
 |                  a | 
 |                 / \ | 
 |                /   \ | 
 |               /     \ | 
 |              /       \ | 
 |             /         \ | 
 |            /           \ | 
 |           /             \ | 
 |          a               e | 
 |         / \             / \ | 
 |        /   \           /   \ | 
 |       /     \         /     \ | 
 |      a       c       e       g | 
 |     / \     / \     / \     / \ | 
 |    /   \   /   \   /   \   /   \ | 
 |   a     b c     d e     f g     h | 
 | */ | 
 | // Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the | 
 | // left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number | 
 | // of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The | 
 | // primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the | 
 | // tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression. | 
 |  | 
 | struct CoreThreadPoolDevice { | 
 |   using Task = std::function<void()>; | 
 |   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f) | 
 |       : m_pool(pool) { | 
 |     eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative"); | 
 |     m_costFactor = threadCostThreshold; | 
 |   } | 
 |  | 
 |   template <int PacketSize> | 
 |   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const { | 
 |     eigen_assert(cost >= 0.0f && "cost must be non-negative"); | 
 |     Index numOps = size / PacketSize; | 
 |     int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads(); | 
 |     float totalCost = static_cast<float>(numOps) * cost; | 
 |     float idealThreads = totalCost * m_costFactor; | 
 |     if (idealThreads < static_cast<float>(actualThreads)) { | 
 |       idealThreads = numext::maxi(idealThreads, 1.0f); | 
 |       actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads)); | 
 |     } | 
 |     int maxLevel = internal::log2_ceil(actualThreads); | 
 |     return maxLevel; | 
 |   } | 
 |  | 
 | // MSVC does not like inlining parallelForImpl | 
 | #if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG | 
 | #define EIGEN_PARALLEL_FOR_INLINE | 
 | #else | 
 | #define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE | 
 | #endif | 
 |  | 
 |   template <typename UnaryFunctor, int PacketSize> | 
 |   EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f, | 
 |                                                                    Barrier& barrier, int level) { | 
 |     while (level > 0) { | 
 |       level--; | 
 |       Index size = end - begin; | 
 |       eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize"); | 
 |       Index mid = begin + numext::round_down(size >> 1, PacketSize); | 
 |       Task right = [this, mid, end, &f, &barrier, level]() { | 
 |         parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level); | 
 |       }; | 
 |       m_pool.Schedule(std::move(right)); | 
 |       end = mid; | 
 |     } | 
 |     for (Index i = begin; i < end; i += PacketSize) f(i); | 
 |     barrier.Notify(); | 
 |   } | 
 |  | 
 |   template <typename BinaryFunctor, int PacketSize> | 
 |   EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin, | 
 |                                                                    Index innerEnd, BinaryFunctor& f, Barrier& barrier, | 
 |                                                                    int level) { | 
 |     while (level > 0) { | 
 |       level--; | 
 |       Index outerSize = outerEnd - outerBegin; | 
 |       if (outerSize > 1) { | 
 |         Index outerMid = outerBegin + (outerSize >> 1); | 
 |         Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() { | 
 |           parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level); | 
 |         }; | 
 |         m_pool.Schedule(std::move(right)); | 
 |         outerEnd = outerMid; | 
 |       } else { | 
 |         Index innerSize = innerEnd - innerBegin; | 
 |         eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize"); | 
 |         Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize); | 
 |         Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() { | 
 |           parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level); | 
 |         }; | 
 |         m_pool.Schedule(std::move(right)); | 
 |         innerEnd = innerMid; | 
 |       } | 
 |     } | 
 |     for (Index outer = outerBegin; outer < outerEnd; outer++) | 
 |       for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner); | 
 |     barrier.Notify(); | 
 |   } | 
 |  | 
 | #undef EIGEN_PARALLEL_FOR_INLINE | 
 |  | 
 |   template <typename UnaryFunctor, int PacketSize> | 
 |   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) { | 
 |     Index size = end - begin; | 
 |     int maxLevel = calculateLevels<PacketSize>(size, cost); | 
 |     Barrier barrier(1 << maxLevel); | 
 |     parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel); | 
 |     barrier.Wait(); | 
 |   } | 
 |  | 
 |   template <typename BinaryFunctor, int PacketSize> | 
 |   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin, | 
 |                                                          Index innerEnd, BinaryFunctor& f, float cost) { | 
 |     Index outerSize = outerEnd - outerBegin; | 
 |     Index innerSize = innerEnd - innerBegin; | 
 |     Index size = outerSize * innerSize; | 
 |     int maxLevel = calculateLevels<PacketSize>(size, cost); | 
 |     Barrier barrier(1 << maxLevel); | 
 |     parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel); | 
 |     barrier.Wait(); | 
 |   } | 
 |  | 
 |   ThreadPool& m_pool; | 
 |   // costFactor is the cost of delegating a task to a thread | 
 |   // the inverse is used to avoid a floating point division | 
 |   float m_costFactor; | 
 | }; | 
 |  | 
 | // specialization of coefficient-wise assignment loops for CoreThreadPoolDevice | 
 |  | 
 | namespace internal { | 
 |  | 
// Aggregates the functor_cost estimates of an assignment Kernel's source and destination
// expressions; the specializations below use the sum as the per-coefficient cost estimate
// fed to CoreThreadPoolDevice::parallelFor.
template <typename Kernel>
struct cost_helper {
  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
  using SrcXprType = typename SrcEvaluatorType::XprType;
  using DstXprType = typename DstEvaluatorType::XprType;
  // Combined cost of the source and destination expression functors.
  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
};
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> { | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost; | 
 |   struct AssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) { | 
 |       this->assignCoeffByOuterInner(outer, inner); | 
 |     } | 
 |   }; | 
 |  | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index innerSize = kernel.innerSize(); | 
 |     const Index outerSize = kernel.outerSize(); | 
 |     constexpr float cost = static_cast<float>(XprEvaluationCost); | 
 |     AssignmentFunctor functor(kernel); | 
 |     device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> { | 
 |   using DstXprType = typename Kernel::DstEvaluatorType::XprType; | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime; | 
 |   struct AssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) { | 
 |       copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer); | 
 |     } | 
 |   }; | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index outerSize = kernel.outerSize(); | 
 |     AssignmentFunctor functor(kernel); | 
 |     constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize); | 
 |     device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> { | 
 |   using PacketType = typename Kernel::PacketType; | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size, | 
 |                          SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, | 
 |                          DstAlignment = Kernel::AssignmentTraits::DstAlignment; | 
 |   struct AssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) { | 
 |       this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner); | 
 |     } | 
 |   }; | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index innerSize = kernel.innerSize(); | 
 |     const Index outerSize = kernel.outerSize(); | 
 |     const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize); | 
 |     AssignmentFunctor functor(kernel); | 
 |     device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> { | 
 |   using PacketType = typename Kernel::PacketType; | 
 |   using DstXprType = typename Kernel::DstEvaluatorType::XprType; | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size, | 
 |                          SrcAlignment = Kernel::AssignmentTraits::SrcAlignment, | 
 |                          DstAlignment = Kernel::AssignmentTraits::DstAlignment, | 
 |                          InnerSize = DstXprType::InnerSizeAtCompileTime; | 
 |   struct AssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) { | 
 |       copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer); | 
 |     } | 
 |   }; | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index outerSize = kernel.outerSize(); | 
 |     constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize); | 
 |     AssignmentFunctor functor(kernel); | 
 |     device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> { | 
 |   using Scalar = typename Kernel::Scalar; | 
 |   using PacketType = typename Kernel::PacketType; | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size; | 
 |   struct PacketAssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) { | 
 |       this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner); | 
 |     } | 
 |   }; | 
 |   struct ScalarAssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) { | 
 |       const Index innerSize = this->innerSize(); | 
 |       const Index packetAccessSize = numext::round_down(innerSize, PacketSize); | 
 |       for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner); | 
 |     } | 
 |   }; | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index outerSize = kernel.outerSize(); | 
 |     const Index innerSize = kernel.innerSize(); | 
 |     const Index packetAccessSize = numext::round_down(innerSize, PacketSize); | 
 |     constexpr float packetCost = static_cast<float>(XprEvaluationCost); | 
 |     const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize); | 
 |     PacketAssignmentFunctor packetFunctor(kernel); | 
 |     ScalarAssignmentFunctor scalarFunctor(kernel); | 
 |     device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor, | 
 |                                                                      packetCost); | 
 |     device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost); | 
 |   }; | 
 | }; | 
 |  | 
 | template <typename Kernel> | 
 | struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> { | 
 |   static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost; | 
 |   struct AssignmentFunctor : public Kernel { | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {} | 
 |     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); } | 
 |   }; | 
 |   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) { | 
 |     const Index size = kernel.size(); | 
 |     constexpr float cost = static_cast<float>(XprEvaluationCost); | 
 |     AssignmentFunctor functor(kernel); | 
 |     device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost); | 
 |   } | 
 | }; | 
 |  | 
// Linear vectorized assignment: peel an unaligned scalar head, parallelize the aligned packet
// body, then finish the scalar tail sequentially.
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
                         PacketSize = unpacket_traits<PacketType>::size,
                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
                                                                               : Kernel::AssignmentTraits::DstAlignment,
                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
  // Unary functor assigning one packet at a flat index with the statically chosen alignments.
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index size = kernel.size();
    // If the destination is not statically aligned, locate the first flat index from which
    // aligned packet access is possible.
    const Index alignedStart =
        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
    // End of the packet-sized body; [alignedStart, alignedEnd) is a multiple of PacketSize long.
    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);

    // Scalar head [0, alignedStart), handled on the calling thread.
    unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);

    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    // Parallel packet body [alignedStart, alignedEnd).
    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);

    // Scalar tail [alignedEnd, size), handled on the calling thread.
    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
  }
};
 |  | 
 | }  // namespace internal | 
 |  | 
 | }  // namespace Eigen | 
 |  | 
 | #endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H |