Avoid atomic false sharing in RunQueue
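
False sharing occurs when independent variables written by different threads land on the same cache line: every store invalidates that line in the other cores' caches, so the threads contend even though they never access the same data. The fix applied here is to over-align each hot field to the destructive interference boundary. A minimal standalone sketch of the pattern (illustrative names, not part of the patch):

    #include <atomic>
    #include <cstddef>
    #include <new>  // std::hardware_destructive_interference_size

    #ifdef __cpp_lib_hardware_interference_size
    inline constexpr std::size_t kNoSharing = std::hardware_destructive_interference_size;
    #else
    inline constexpr std::size_t kNoSharing = 128;  // covers 128-byte lines (e.g. Apple M1)
    #endif

    struct Counters {
      // Each counter gets its own cache line, so a thread bumping `pushed`
      // does not invalidate the line holding `popped`.
      alignas(kNoSharing) std::atomic<unsigned> pushed{0};
      alignas(kNoSharing) std::atomic<unsigned> popped{0};
    };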
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 1c72173..47ddd4f 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -38,6 +38,15 @@
 #define EIGEN_ALIGNOF(x) alignof(x)
 #endif
 
+// Align to the boundary that avoids false sharing.
+// https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+#ifdef __cpp_lib_hardware_interference_size
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
+#else
+// Overalign for the cache line size of 128 bytes (Apple M1)
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
+#endif
+
 // If the user explicitly disable vectorization, then we also disable alignment
 #if defined(EIGEN_DONT_VECTORIZE)
 #if defined(EIGEN_GPUCC)
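
The new macro reuses EIGEN_ALIGN_TO_BOUNDARY, which already maps to the compiler's alignment attribute (alignas and friends), so it can be dropped in front of any member that should start a fresh cache line. A hypothetical sanity check, assuming Eigen/Core is included so the macro is in scope (Probe is not a real Eigen type):

    #include <atomic>
    #include <new>

    struct Probe {
      EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<int> x;
    };

    // A struct is at least as aligned as its most-aligned member.
    #ifdef __cpp_lib_hardware_interference_size
    static_assert(alignof(Probe) >= std::hardware_destructive_interference_size, "");
    #else
    static_assert(alignof(Probe) >= 128, "");
    #endif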
diff --git a/Eigen/src/ThreadPool/EventCount.h b/Eigen/src/ThreadPool/EventCount.h
index 0117b4b..6eda6f4 100644
--- a/Eigen/src/ThreadPool/EventCount.h
+++ b/Eigen/src/ThreadPool/EventCount.h
@@ -57,6 +57,9 @@
     eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
   }
 
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+
   ~EventCount() {
     // Ensure there are no waiters.
     eigen_plain_assert(state_.load() == kStackMask);
@@ -155,22 +158,6 @@
     }
   }
 
-  class Waiter {
-    friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter
-    // objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
-    EIGEN_MUTEX mu;
-    EIGEN_CONDVAR cv;
-    uint64_t epoch = 0;
-    unsigned state = kNotSignaled;
-    enum {
-      kNotSignaled,
-      kWaiting,
-      kSignaled,
-    };
-  };
-
  private:
   // State_ layout:
   // - low kWaiterBits is a stack of waiters committed wait
@@ -192,9 +179,25 @@
   static const uint64_t kEpochBits = 64 - kEpochShift;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
-  std::atomic<uint64_t> state_;
-  MaxSizeVector<Waiter>& waiters_;
+ public:
+  class Waiter {
+    friend class EventCount;
+
+    enum State {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+
+    EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
+    EIGEN_MUTEX mu;
+    EIGEN_CONDVAR cv;
+    uint64_t epoch{0};
+    unsigned state{kNotSignaled};
+  };
+
+ private:
 
   static void CheckState(uint64_t state, bool waiter = false) {
     static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
     const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
@@ -229,8 +232,8 @@
     }
   }
 
-  EventCount(const EventCount&) = delete;
-  void operator=(const EventCount&) = delete;
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
 };
 
 } // namespace Eigen
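
Each Waiter is a per-thread record stored contiguously in a MaxSizeVector<Waiter>, so over-aligning its first member is enough: the struct inherits the member's alignment, and its size is rounded up to a multiple of it, which keeps neighboring elements on distinct cache lines. A sketch of that guarantee (WaiterLike is illustrative, not the patched class):

    #include <atomic>
    #include <cstdint>

    struct WaiterLike {
      alignas(128) std::atomic<std::uint64_t> next{0};
      std::uint64_t epoch{0};
    };

    static_assert(alignof(WaiterLike) == 128, "struct inherits member alignment");
    static_assert(sizeof(WaiterLike) % 128 == 0, "array stride keeps elements on separate lines");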
diff --git a/Eigen/src/ThreadPool/RunQueue.h b/Eigen/src/ThreadPool/RunQueue.h
index 9f40e9d..9046b18 100644
--- a/Eigen/src/ThreadPool/RunQueue.h
+++ b/Eigen/src/ThreadPool/RunQueue.h
@@ -154,16 +154,18 @@
  private:
   static const unsigned kMask = kSize - 1;
   static const unsigned kMask2 = (kSize << 1) - 1;
-  struct Elem {
-    std::atomic<uint8_t> state;
-    Work w;
-  };
-  enum {
+
+  enum State {
     kEmpty,
     kBusy,
     kReady,
   };
-  EIGEN_MUTEX mutex_;
+
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
   // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
@@ -171,9 +173,11 @@
   // position, these conditions would be indistinguishable); (2) obtain
   // consistent snapshot of front_/back_ for Size operation using the
   // modification counters.
-  std::atomic<unsigned> front_;
-  std::atomic<unsigned> back_;
-  Elem array_[kSize];
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+  EIGEN_MUTEX mutex_; // guards `PushBack` and `PopBack` (accesses `back_`)
+
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
 
   // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
   // only whether the size is 0 is guaranteed to be correct.
@@ -208,12 +212,12 @@
   EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
     int size = (front & kMask2) - (back & kMask2);
     // Fix overflow.
-    if (size < 0) size += 2 * kSize;
+    if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
     // Order of modification in push/pop is crafted to make the queue look
     // larger than it is during concurrent modifications. E.g. push can
     // increment size before the corresponding pop has decremented it.
     // So the computed size can be up to kSize + 1, fix it.
-    if (size > static_cast<int>(kSize)) size = kSize;
+    if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
     return static_cast<unsigned>(size);
   }
 
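
In the work-stealing queue the owner thread hammers front_ lock-free, while other threads take mutex_ and touch back_, so giving each group its own aligned region stops the two sides from ping-ponging one cache line; array_ starts a fresh line as well so element state updates do not drag the indices along. EIGEN_PREDICT_FALSE marks the two fix-up branches as cold paths (on GCC/Clang it boils down to __builtin_expect). A sketch of the layout guarantee the alignment buys (QueueHotFields is illustrative, not the Eigen class):

    #include <atomic>
    #include <cstddef>

    struct QueueHotFields {
      alignas(128) std::atomic<unsigned> front;  // owner side
      alignas(128) std::atomic<unsigned> back;   // thief side
    };

    // `back` can start no earlier than one full line after `front`.
    static_assert(offsetof(QueueHotFields, back) - offsetof(QueueHotFields, front) >= 128,
                  "front and back never share a cache line");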