// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
#define EIGEN_CORE_THREAD_POOL_DEVICE_H

namespace Eigen {

// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
// ThreadPool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
// delegating the task to a thread.
/*
               a
              / \
             /   \
            /     \
           /       \
          /         \
         /           \
        /             \
       a               e
      / \             / \
     /   \           /   \
    /     \         /     \
   a       c       e       g
  / \     / \     / \     / \
 /   \   /   \   /   \   /   \
a     b c     d e     f g     h
*/
// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the
// number of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
// primary task 'a' creates tasks 'e', 'c', and 'b', and executes its portion of the expression at the bottom of the
// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.

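// A minimal usage sketch (illustrative only): assuming the `.device(...)` entry point provided by DeviceWrapper.h
// and the ThreadPool from the Eigen/ThreadPool module, a Core expression can be evaluated in parallel like so:
//
//   Eigen::ThreadPool pool(4);                     // worker threads
//   Eigen::CoreThreadPoolDevice device(pool);      // default threadCostThreshold = 3e-5f
//   Eigen::MatrixXf x(1024, 1024), y(1024, 1024), z(1024, 1024);
//   z.device(device) = x + y;                      // the assignment loop is split across the pool
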
struct CoreThreadPoolDevice {
  using Task = std::function<void()>;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
      : m_pool(pool) {
    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
    m_costFactor = threadCostThreshold;
  }

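  // calculateLevels returns the depth of the task tree, i.e. how many times the range will be halved. The numbers
  // below are illustrative only and assume the default threshold of 3e-5f and a 16-thread pool:
  //   size = 1'000'000, PacketSize = 4, cost = 10: numOps = 250'000, totalCost = 2.5e6,
  //     idealThreads = 2.5e6 * 3e-5 = 75 >= 16, so actualThreads stays 16 and maxLevel = log2_ceil(16) = 4
  //     (16 leaf tasks).
  //   size = 4'096, PacketSize = 4, cost = 10: numOps = 1'024, totalCost = 1.024e4,
  //     idealThreads ~= 0.31 < 16, clamped up to 1, so maxLevel = 0 and the work stays on the calling thread.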
  template <int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
    eigen_assert(cost >= 0.0f && "cost must be non-negative");
    Index numOps = size / PacketSize;
    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
    float totalCost = static_cast<float>(numOps) * cost;
    float idealThreads = totalCost * m_costFactor;
    if (idealThreads < static_cast<float>(actualThreads)) {
      idealThreads = numext::maxi(idealThreads, 1.0f);
      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
    }
    int maxLevel = internal::log2_ceil(actualThreads);
    return maxLevel;
  }

// MSVC does not like inlining parallelForImpl
#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
#define EIGEN_PARALLEL_FOR_INLINE
#else
#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
#endif

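  // Unary overload: recursively halves [begin, end) `level` times. Each split schedules the right half on the
  // pool at the reduced level and keeps descending into the left half, so every task (including the caller) ends
  // up executing exactly one leaf chunk before notifying the barrier. Split points are rounded down to multiples
  // of PacketSize.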
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
                                                                   Barrier& barrier, int level) {
    while (level > 0) {
      level--;
      Index size = end - begin;
      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
      Index mid = begin + numext::round_down(size >> 1, PacketSize);
      Task right = [this, mid, end, &f, &barrier, level]() {
        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
      };
      m_pool.Schedule(std::move(right));
      end = mid;
    }
    for (Index i = begin; i < end; i += PacketSize) f(i);
    barrier.Notify();
  }

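  // Binary (outer/inner) overload: splits along the outer dimension while more than one outer index remains,
  // then falls back to splitting the inner range, keeping the inner split points multiples of PacketSize.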
  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
                                                                   int level) {
    while (level > 0) {
      level--;
      Index outerSize = outerEnd - outerBegin;
      if (outerSize > 1) {
        Index outerMid = outerBegin + (outerSize >> 1);
        Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        outerEnd = outerMid;
      } else {
        Index innerSize = innerEnd - innerBegin;
        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
        Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        innerEnd = innerMid;
      }
    }
    for (Index outer = outerBegin; outer < outerEnd; outer++)
      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
    barrier.Notify();
  }

#undef EIGEN_PARALLEL_FOR_INLINE

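  // The parallelFor entry points compute the tree depth for the requested range and cost, then block on a
  // Barrier that expects one notification per leaf task (2^maxLevel of them, matching Barrier(1 << maxLevel)).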
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
    Index size = end - begin;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
    barrier.Wait();
  }

  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
                                                         Index innerEnd, BinaryFunctor& f, float cost) {
    Index outerSize = outerEnd - outerBegin;
    Index innerSize = innerEnd - innerBegin;
    Index size = outerSize * innerSize;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
    barrier.Wait();
  }

  ThreadPool& m_pool;
  // m_costFactor converts the total evaluation cost into an ideal thread count: it acts as the reciprocal of the
  // cost of delegating a task to a thread, and is multiplied instead of divided to avoid a floating point division.
  float m_costFactor;
};

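// The assignment-loop specializations below feed parallelFor with functors derived from the assignment Kernel,
// but the device can also drive a hand-written functor directly. A hypothetical sketch (the functor, buffer, and
// cost value are illustrative, not part of Eigen):
//
//   struct Scale {
//     float* data;
//     float factor;
//     void operator()(Eigen::Index i) const { data[i] *= factor; }
//   };
//   Scale f{buffer.data(), 0.5f};
//   device.parallelFor<Scale, /*PacketSize=*/1>(0, n, f, /*cost per element*/ 1.0f);
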
// Specializations of the coefficient-wise assignment loops for CoreThreadPoolDevice.

namespace internal {

template <typename Kernel>
struct cost_helper {
  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
  using SrcXprType = typename SrcEvaluatorType::XprType;
  using DstXprType = typename DstEvaluatorType::XprType;
  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
};

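// DefaultTraversal, NoUnrolling: every (outer, inner) coefficient is assigned individually, so the 2-D
// parallelFor is used with a PacketSize of 1.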
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->assignCoeffByOuterInner(outer, inner);
    }
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
  }
};

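// DefaultTraversal, InnerUnrolling: the inner loop is fully unrolled at compile time, so only the outer loop is
// parallelized and the per-call cost scales with InnerSizeAtCompileTime.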
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    AssignmentFunctor functor(kernel);
    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
  }
};

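// InnerVectorizedTraversal, NoUnrolling: one packet is assigned per functor call; parallelForImpl keeps the
// inner split points multiples of PacketSize so no packet straddles a split.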
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
                         DstAlignment = Kernel::AssignmentTraits::DstAlignment;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
  }
};

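// InnerVectorizedTraversal, InnerUnrolling: each functor call writes a whole inner vector with packets, so only
// the outer loop is distributed across threads.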
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
  using PacketType = typename Kernel::PacketType;
  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
                         InnerSize = DstXprType::InnerSizeAtCompileTime;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
  }
};

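// SliceVectorizedTraversal, NoUnrolling: two passes per assignment, a packet pass over the portion of each inner
// vector that is a multiple of PacketSize, then a scalar pass over the remaining tail coefficients.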
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
  struct PacketAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
    }
  };
  struct ScalarAssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
      const Index innerSize = this->innerSize();
      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index outerSize = kernel.outerSize();
    const Index innerSize = kernel.innerSize();
    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
    const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
    PacketAssignmentFunctor packetFunctor(kernel);
    ScalarAssignmentFunctor scalarFunctor(kernel);
    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
                                                                     packetCost);
    device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
  }
};

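// LinearTraversal, NoUnrolling: the expression is addressed through a single flat index and assigned coefficient
// by coefficient.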
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index size = kernel.size();
    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
  }
};

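// LinearVectorizedTraversal, NoUnrolling: the possibly unaligned head and the leftover tail are assigned serially
// by unaligned_dense_assignment_loop, while the aligned middle section is assigned packet-wise in parallel.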
template <typename Kernel>
struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
  using Scalar = typename Kernel::Scalar;
  using PacketType = typename Kernel::PacketType;
  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
                         PacketSize = unpacket_traits<PacketType>::size,
                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
                                                                               : Kernel::AssignmentTraits::DstAlignment,
                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
  struct AssignmentFunctor : public Kernel {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
    }
  };
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
    const Index size = kernel.size();
    const Index alignedStart =
        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);

    unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);

    constexpr float cost = static_cast<float>(XprEvaluationCost);
    AssignmentFunctor functor(kernel);
    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);

    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
  }
};

}  // namespace internal

}  // namespace Eigen

#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H