Avoid atomic false sharing in RunQueue
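
False sharing occurs when independent variables written by different threads land on the same cache line: every store invalidates that line in the other cores' caches, so the threads contend even though they never access the same data. The fix applied here is to over-align each hot field to the destructive interference boundary. A minimal standalone sketch of the pattern (illustrative names, not part of the patch):

    #include <atomic>
    #include <cstddef>
    #include <new>  // std::hardware_destructive_interference_size

    #ifdef __cpp_lib_hardware_interference_size
    inline constexpr std::size_t kNoSharing = std::hardware_destructive_interference_size;
    #else
    inline constexpr std::size_t kNoSharing = 128;  // covers 128-byte lines (e.g. Apple M1)
    #endif

    struct Counters {
      // Each counter gets its own cache line, so a thread bumping `pushed`
      // does not invalidate the line holding `popped`.
      alignas(kNoSharing) std::atomic<unsigned> pushed{0};
      alignas(kNoSharing) std::atomic<unsigned> popped{0};
    };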
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1c72173..47ddd4f 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -38,6 +38,15 @@
 #define EIGEN_ALIGNOF(x) alignof(x)
 #endif
 
+// Align to the boundary that avoids false sharing.
+// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+#ifdef __cpp_lib_hardware_interference_size
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
+#else
+// Overalign for the cache line size of 128 bytes (Apple M1)
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
+#endif
+
 // If the user explicitly disable vectorization, then we also disable alignment
 #if defined(EIGEN_DONT_VECTORIZE)
 #if defined(EIGEN_GPUCC)
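
The new macro reuses EIGEN_ALIGN_TO_BOUNDARY, which already maps to the compiler's alignment attribute (alignas and friends), so it can be dropped in front of any member that should start a fresh cache line. A hypothetical sanity check, assuming Eigen/Core is included so the macro is in scope (Probe is not a real Eigen type):

    #include <atomic>
    #include <new>

    struct Probe {
      EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<int> x;
    };

    // A struct is at least as aligned as its most-aligned member.
    #ifdef __cpp_lib_hardware_interference_size
    static_assert(alignof(Probe) >= std::hardware_destructive_interference_size, "");
    #else
    static_assert(alignof(Probe) >= 128, "");
    #endif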
diff --git a/Eigen/src/ThreadPool/EventCount.h b/Eigen/src/ThreadPool/EventCount.h
index 0117b4b..6eda6f4 100644
--- a/Eigen/src/ThreadPool/EventCount.h
+++ b/Eigen/src/ThreadPool/EventCount.h
@@ -57,6 +57,9 @@
     eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
   }
 
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+
   ~EventCount() {
     // Ensure there are no waiters.
     eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@
     }
   }
 
-  class Waiter {
-    friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter
-    // objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
-    EIGEN_MUTEX mu;
-    EIGEN_CONDVAR cv;
-    uint64_t epoch = 0;
-    unsigned state = kNotSignaled;
-    enum {
-      kNotSignaled,
-      kWaiting,
-      kSignaled,
-    };
-  };
-
  private:
   // State_ layout:
   // - low kWaiterBits is a stack of waiters committed wait
@@ -192,9 +179,25 @@
   static const uint64_t kEpochBits = 64 - kEpochShift;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
-  std::atomic<uint64_t> state_;
-  MaxSizeVector<Waiter>& waiters_;
+ public:
+  class Waiter {
+    friend class EventCount;
+
+    enum State {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+
+    EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
+    EIGEN_MUTEX mu;
+    EIGEN_CONDVAR cv;
+    uint64_t epoch{0};
+    unsigned state{kNotSignaled};
+  };
+
+ private:
 
   static void CheckState(uint64_t state, bool waiter = false) {
     static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
     const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@
     }
   }
 
-  EventCount(const EventCount&) = delete;
-  void operator=(const EventCount&) = delete;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
 };
 
 } // namespace Eigen
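
Each Waiter is a per-thread record stored contiguously in a MaxSizeVector<Waiter>, so over-aligning its first member is enough: the struct inherits the member's alignment, and its size is rounded up to a multiple of it, which keeps neighboring elements on distinct cache lines. A sketch of that guarantee (WaiterLike is illustrative, not the patched class):

    #include <atomic>
    #include <cstdint>

    struct WaiterLike {
      alignas(128) std::atomic<std::uint64_t> next{0};
      std::uint64_t epoch{0};
    };

    static_assert(alignof(WaiterLike) == 128, "struct inherits member alignment");
    static_assert(sizeof(WaiterLike) % 128 == 0, "array stride keeps elements on separate lines");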
diff --git a/Eigen/src/ThreadPool/RunQueue.h b/Eigen/src/ThreadPool/RunQueue.h
index 9f40e9d..9046b18 100644
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -154,16 +154,18 @@
  private:
   static const unsigned kMask = kSize - 1;
   static const unsigned kMask2 = (kSize << 1) - 1;
-  struct Elem {
-    std::atomic<uint8_t> state;
-    Work w;
-  };
-  enum {
+
+  enum State {
     kEmpty,
     kBusy,
     kReady,
   };
-  EIGEN_MUTEX mutex_;
+
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
   // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@
   // position, these conditions would be indistinguishable); (2) obtain
   // consistent snapshot of front_/back_ for Size operation using the
   // modification counters.
-  std::atomic<unsigned> front_;
-  std::atomic<unsigned> back_;
-  Elem array_[kSize];
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+  EIGEN_MUTEX mutex_; // guards `PushBack` and `PopBack` (accesses `back_`)
+
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
 
   // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
   // only whether the size is 0 is guaranteed to be correct.
@@ -208,12 +212,12 @@
   EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
     int size = (front & kMask2) - (back & kMask2);
     // Fix overflow.
-    if (size < 0) size += 2 * kSize;
+    if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
     // Order of modification in push/pop is crafted to make the queue look
     // larger than it is during concurrent modifications. E.g. push can
     // increment size before the corresponding pop has decremented it.
     // So the computed size can be up to kSize + 1, fix it.
-    if (size > static_cast<int>(kSize)) size = kSize;
+    if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
     return static_cast<unsigned>(size);
   }
 
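
In the work-stealing queue the owner thread hammers front_ lock-free, while other threads take mutex_ and touch back_, so giving each group its own aligned region stops the two sides from ping-ponging one cache line; array_ starts a fresh line as well so element state updates do not drag the indices along. EIGEN_PREDICT_FALSE marks the two fix-up branches as cold paths (on GCC/Clang it boils down to __builtin_expect). A sketch of the layout guarantee the alignment buys (QueueHotFields is illustrative, not the Eigen class):

    #include <atomic>
    #include <cstddef>

    struct QueueHotFields {
      alignas(128) std::atomic<unsigned> front;  // owner side
      alignas(128) std::atomic<unsigned> back;   // thief side
    };

    // `back` can start no earlier than one full line after `front`.
    static_assert(offsetof(QueueHotFields, back) - offsetof(QueueHotFields, front) >= 128,
                  "front and back never share a cache line");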