Avoid atomic false sharing in RunQueue
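
Hot atomics that different threads write concurrently should not share a
cache line. This patch adds EIGEN_ALIGN_TO_AVOID_FALSE_SHARING to
ConfigureVectorization.h (std::hardware_destructive_interference_size when
the feature-test macro is available, a 128-byte boundary otherwise) and
applies it to RunQueue::front_, RunQueue::back_, the element array, and
EventCount::Waiter::next. EventCount::Waiter moves below the state-mask
constants and its `next` field now defaults to kStackMask; the size-clamping
branches in RunQueue::CalculateSize are marked EIGEN_PREDICT_FALSE.

The sketch below is not part of the patch (`QueueIndices` and `kNoSharing`
are illustrative names); it shows the pattern the new macro enables: each
contended atomic gets its own cache line, so a thread hammering `back_` does
not keep invalidating the line that `front_` is read from. The patch itself
goes through EIGEN_ALIGN_TO_BOUNDARY rather than plain alignas, but the
effect is the same.

    #include <atomic>
    #include <cstddef>
    #include <new>  // defines __cpp_lib_hardware_interference_size

    #ifdef __cpp_lib_hardware_interference_size
    inline constexpr std::size_t kNoSharing =
        std::hardware_destructive_interference_size;
    #else
    inline constexpr std::size_t kNoSharing = 128;  // e.g. Apple M1 cache line
    #endif

    struct QueueIndices {
      // Over-align each hot atomic so the two indices land on separate
      // cache lines.
      alignas(kNoSharing) std::atomic<unsigned> front_{0};  // mostly touched by the queue owner
      alignas(kNoSharing) std::atomic<unsigned> back_{0};   // touched by other threads
    };

    // Each member is over-aligned, so the struct spans at least two lines.
    static_assert(sizeof(QueueIndices) >= 2 * kNoSharing,
                  "front_ and back_ would share a cache line");

The cost is padding: every over-aligned member rounds the enclosing object up
to a multiple of the cache-line size, which is presumably why only the
contended members are annotated rather than every field.
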
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1c72173..47ddd4f 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -38,6 +38,15 @@
#define EIGEN_ALIGNOF(x) alignof(x)
#endif
+// Align to the boundary that avoids false sharing.
+// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+#ifdef __cpp_lib_hardware_interference_size
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
+#else
+// Fall back to overaligning to 128 bytes, the cache line size on Apple M1.
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
+#endif
+
// If the user explicitly disable vectorization, then we also disable alignment
#if defined(EIGEN_DONT_VECTORIZE)
#if defined(EIGEN_GPUCC)
diff --git a/Eigen/src/ThreadPool/EventCount.h b/Eigen/src/ThreadPool/EventCount.h
index 0117b4b..6eda6f4 100644
--- a/Eigen/src/ThreadPool/EventCount.h
+++ b/Eigen/src/ThreadPool/EventCount.h
@@ -57,6 +57,9 @@
eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
}
+ EventCount(const EventCount&) = delete;
+ void operator=(const EventCount&) = delete;
+
~EventCount() {
// Ensure there are no waiters.
eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@
}
}
- class Waiter {
- friend class EventCount;
- // Align to 128 byte boundary to prevent false sharing with other Waiter
- // objects in the same vector.
- EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
- EIGEN_MUTEX mu;
- EIGEN_CONDVAR cv;
- uint64_t epoch = 0;
- unsigned state = kNotSignaled;
- enum {
- kNotSignaled,
- kWaiting,
- kSignaled,
- };
- };
-
private:
// State_ layout:
// - low kWaiterBits is a stack of waiters committed wait
@@ -192,9 +179,25 @@
static const uint64_t kEpochBits = 64 - kEpochShift;
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
- std::atomic<uint64_t> state_;
- MaxSizeVector<Waiter>& waiters_;
+ public:
+ class Waiter {
+ friend class EventCount;
+
+ enum State {
+ kNotSignaled,
+ kWaiting,
+ kSignaled,
+ };
+
+ EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
+ EIGEN_MUTEX mu;
+ EIGEN_CONDVAR cv;
+ uint64_t epoch{0};
+ unsigned state{kNotSignaled};
+ };
+
+ private:
static void CheckState(uint64_t state, bool waiter = false) {
static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@
}
}
- EventCount(const EventCount&) = delete;
- void operator=(const EventCount&) = delete;
+ std::atomic<uint64_t> state_;
+ MaxSizeVector<Waiter>& waiters_;
};
} // namespace Eigen
diff --git a/Eigen/src/ThreadPool/RunQueue.h b/Eigen/src/ThreadPool/RunQueue.h
index 9f40e9d..9046b18 100644
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -154,16 +154,18 @@
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
- struct Elem {
- std::atomic<uint8_t> state;
- Work w;
- };
- enum {
+
+ enum State {
kEmpty,
kBusy,
kReady,
};
- EIGEN_MUTEX mutex_;
+
+ struct Elem {
+ std::atomic<uint8_t> state;
+ Work w;
+ };
+
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
// front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@
// position, these conditions would be indistinguishable); (2) obtain
// consistent snapshot of front_/back_ for Size operation using the
// modification counters.
- std::atomic<unsigned> front_;
- std::atomic<unsigned> back_;
- Elem array_[kSize];
+ EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+ EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+ EIGEN_MUTEX mutex_; // guards `PushBack` and `PopBack` (accesses `back_`)
+
+ EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
// SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
// only whether the size is 0 is guaranteed to be correct.
@@ -208,12 +212,12 @@
EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
int size = (front & kMask2) - (back & kMask2);
// Fix overflow.
- if (size < 0) size += 2 * kSize;
+ if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
// Order of modification in push/pop is crafted to make the queue look
// larger than it is during concurrent modifications. E.g. push can
// increment size before the corresponding pop has decremented it.
// So the computed size can be up to kSize + 1, fix it.
- if (size > static_cast<int>(kSize)) size = kSize;
+ if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
return static_cast<unsigned>(size);
}