Rewrite TensorCostModel with principled roofline model

libeigen/eigen!2255

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h
index 0e102c2..2ea7136 100644
--- a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h
@@ -1507,7 +1507,7 @@
     int num_threads = 1;
     double min_cost = total_parallel_cost;
     double kPerThreadOverHead = 3000;
-    double kFixedOverHead = 100000;
+    double kFixedOverHead = 20000;
     for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
       double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
       double parallel_cost = total_parallel_cost / nt + sequential_cost;
diff --git a/unsupported/Eigen/src/Tensor/TensorCostModel.h b/unsupported/Eigen/src/Tensor/TensorCostModel.h
index 6134151..b914ac4 100644
--- a/unsupported/Eigen/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/src/Tensor/TensorCostModel.h
@@ -60,6 +60,7 @@
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_bytes() const { return bytes_loaded_ + bytes_stored_; }
   constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
                                                                     double compute_cost) const {
     return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
@@ -126,15 +127,16 @@
   double compute_cycles_;
 };
 
-// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
-// in [1:max_threads] instead of just switching multi-threading off for small
-// work units.
 /**
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expression.
  *
+ * Uses a roofline model: cost = max(memory_time, compute_time) instead of
+ * summing them. This avoids overestimating cost for balanced workloads.
+ * Memory-bound operations are capped at a limited number of threads to
+ * avoid wasting cycles competing for shared memory bandwidth.
  */
 template <typename Device>
 class TensorCostModel {
@@ -142,21 +144,73 @@
   // Scaling from Eigen compute cost to device cycles.
   static constexpr int kDeviceCyclesPerComputeCycle = 1;
 
-  // Costs in device cycles.
-  static constexpr int kStartupCycles = 100000;
-  static constexpr int kPerThreadCycles = 100000;
+  // Thread overhead in device cycles. ~8us at 3GHz.
+  // Minimum total work to justify thread pool dispatch.
+  static constexpr int kStartupCycles = 25000;
+  // Minimum work per thread to amortize dispatch and synchronization overhead.
+  static constexpr int kPerThreadCycles = 25000;
   static constexpr int kTaskSize = 40000;
 
+  // Memory bandwidth saturation: on typical multi-socket servers, 2-6 cores
+  // saturate DRAM bandwidth. 4 is a conservative default.
+  static constexpr int kMemBandwidthSaturationThreads = 4;
+
+  // If memory_time / compute_time exceeds this ratio, the op is memory-bound.
+  // With vectorized costs (AVX2, PacketSize=8), typical ratios are:
+  //   Add/Mul (2 loads + 1 store): mem/comp = 6.0
+  //   FMA (3 loads + 1 store):     mem/comp = 4.0
+  //   ReLU max(x,0) (1 load + 1 store): mem/comp = 4.0
+  //   Polynomial 3rd-order (3 loads + 1 store, 6 ops): mem/comp = 1.3
+  //   Exp (1 load + 1 store, ~8 ops): mem/comp = 0.5
+  // Threshold of 2.0 cleanly separates memory-bound (>=4) from compute-bound.
+  static constexpr double kMemBoundThreshold = 2.0;
+
+  // Data sets larger than this are assumed to be DRAM-resident.
+  // Below this threshold, data is likely L2-cache-resident and benefits from
+  // high per-core L2 bandwidth, so no bandwidth saturation cap is applied.
+  static constexpr double kDramThresholdBytes = 1024.0 * 1024.0;
+
   // Returns the number of threads in [1:max_threads] to use for
   // evaluating an expression with the given output size and cost per
   // coefficient.
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
                                                               int max_threads) {
-    double cost = totalCost(output_size, cost_per_coeff);
-    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    // Make sure we don't invoke undefined behavior when we convert to an int.
+    if (max_threads <= 1) return 1;
+
+    double mem = memoryTime(cost_per_coeff);
+    double comp = computeTime(cost_per_coeff);
+    double per_coeff = numext::maxi(mem, comp);
+    double total = output_size * per_coeff;
+
+    // Not enough total work to justify thread pool dispatch.
+    if (total < kStartupCycles) return 1;
+
+    // Each thread needs at least kPerThreadCycles of work to
+    // amortize dispatch and synchronization overhead.
+    double threads = total / kPerThreadCycles;
+    // Guard against integer overflow.
     threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
-    return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
+    int candidate = numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
+
+    // Memory-bound ops on DRAM-resident data: cap at bandwidth saturation.
+    // Cache-resident data has high per-core L2 bandwidth, so no cap needed.
+    //
+    // The total-traffic proxy below overestimates the working set whenever an
+    // operand is reused (e.g. broadcasted): every output coefficient charges a
+    // fresh load, so a 1xN broadcast can show up as MxN bytes even though the
+    // live data is N. A working-set-aware estimate would need each evaluator to
+    // surface its unique-operand footprint; until that exists we accept a small
+    // bias toward over-capping for reuse-heavy expressions.
+    if (candidate > kMemBandwidthSaturationThreads) {
+      bool is_memory_bound = (comp > 0) ? (mem / comp > kMemBoundThreshold) : (mem > 0);
+      if (is_memory_bound) {
+        double total_bytes = output_size * cost_per_coeff.total_bytes();
+        if (total_bytes > kDramThresholdBytes) {
+          candidate = numext::mini(candidate, kMemBandwidthSaturationThreads);
+        }
+      }
+    }
+    return candidate;
   }
 
   // taskSize assesses parallel task size.
@@ -166,21 +220,27 @@
     return totalCost(output_size, cost_per_coeff) / kTaskSize;
   }
 
+  // Roofline model: cost is the max of memory time and compute time.
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
                                                                 const TensorOpCost& cost_per_coeff) {
-    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
-    // 11 is L2 cache latency on Haswell.
-    // We don't know whether data is in L1, L2 or L3. But we are most interested
-    // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not interesting
-    // either because we are probably using all available threads already).
-    // And for the target time range, L2 seems to be what matters. Data set
-    // fitting into L1 is too small to take noticeable time. Data set fitting
-    // only into L3 presumably will take more than 10ms to load and process.
-    const double kLoadCycles = 1.0 / 64 * 11;
-    const double kStoreCycles = 1.0 / 64 * 11;
-    // Scaling from Eigen compute cost to device cycles.
-    return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
+    double mem_cost = memoryTime(cost_per_coeff);
+    double comp_cost = computeTime(cost_per_coeff);
+    return output_size * numext::maxi(mem_cost, comp_cost);
+  }
+
+ private:
+  // Effective sustained bandwidth cost in cycles/byte.
+  // ~1/16 = 0.0625 cycles/byte represents L3/DRAM streaming bandwidth
+  // on modern CPUs (~16 bytes/cycle single-core sustained throughput).
+  // For L1/L2-resident data, compute typically dominates anyway.
+  static constexpr double kByteCost = 1.0 / 16.0;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double memoryTime(const TensorOpCost& cost) {
+    return cost.total_bytes() * kByteCost;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double computeTime(const TensorOpCost& cost) {
+    return cost.compute_cycles() * kDeviceCyclesPerComputeCycle;
   }
 };
 
diff --git a/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h
index 85c1dc8..dacf93d 100644
--- a/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h
@@ -55,11 +55,10 @@
     ::memcpy(dst, src, n);
 #else
     // TODO(rmlarsen): Align blocks on cache lines.
-    // We have observed that going beyond 4 threads usually just wastes
-    // CPU cycles due to the threads competing for memory bandwidth, so we
-    // statically schedule at most 4 block copies here.
     const size_t kMinBlockSize = 32768;
-    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    // Pure memory op (zero compute) — the cost model's bandwidth saturation
+    // cap will limit threads appropriately.
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), static_cast<int>(numThreads()));
     if (n <= kMinBlockSize || num_threads < 2) {
       ::memcpy(dst, src, n);
     } else {
diff --git a/unsupported/Eigen/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
index 03c957c..8f50171 100644
--- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -360,7 +360,12 @@
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size);
+    // NullaryOps (constants, zero, identity, random) generate values from
+    // registers or minimal state — they do not load from memory.  Report
+    // zero bytes_loaded so the cost model correctly classifies expressions
+    // containing many constants (e.g. Horner polynomials) as compute-bound
+    // rather than memory-bound.
+    return TensorOpCost(0, 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size);
   }
 
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
index 06ee4f9..1c57b1e 100644
--- a/unsupported/benchmarks/Tensor/CMakeLists.txt
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -10,3 +10,4 @@
 eigen_add_benchmark(bench_reverse bench_reverse.cpp)
 eigen_add_benchmark(bench_roll bench_roll.cpp)
 eigen_add_benchmark(bench_layout_swap bench_layout_swap.cpp)
+eigen_add_benchmark(bench_chained_expressions bench_chained_expressions.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
index 01aed20..754fe96 100644
--- a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
+++ b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
@@ -1,8 +1,11 @@
 // Benchmarks for Eigen Tensor broadcasting.
 // Tests broadcasting along various dimensions and ranks.
 
+#define EIGEN_USE_THREADS
+
 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
 
 using namespace Eigen;
 
@@ -87,6 +90,56 @@
   state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar));
 }
 
+// --- ThreadPool variants ---
+
+static void BM_BroadcastRow_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> row(1, N);
+  Tensor<Scalar, 2> result(M, N);
+  row.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> bcast = {M, 1};
+
+  for (auto _ : state) {
+    result.device(dev) = row.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
+  state.counters["threads"] = threads;
+}
+
+static void BM_BroadcastAdd_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> mat(M, N);
+  Tensor<Scalar, 2> bias(1, N);
+  Tensor<Scalar, 2> result(M, N);
+  mat.setRandom();
+  bias.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> bcast = {M, 1};
+
+  for (auto _ : state) {
+    result.device(dev) = mat + bias.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
 static void BroadcastSizes(::benchmark::Benchmark* b) {
   for (int m : {64, 256, 1024}) {
     for (int n : {64, 256, 1024}) {
@@ -105,7 +158,17 @@
   }
 }
 
+static void BroadcastThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int threads : {1, 2, 4, 8, 12, 16}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
 BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
 BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
 BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
 BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);
+BENCHMARK(BM_BroadcastRow_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_BroadcastAdd_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
new file mode 100644
index 0000000..0069195
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
@@ -0,0 +1,157 @@
+// Benchmarks for chained tensor expressions with ThreadPool.
+// Tests realistic compound expressions spanning memory-bound to compute-bound.
+
+#define EIGEN_USE_THREADS
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Pure memory-bound baseline (dst = src) ---
+static void BM_Copy_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> src(M, N);
+  Tensor<Scalar, 2> dst(M, N);
+  src.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    dst.device(dev) = src;
+    benchmark::DoNotOptimize(dst.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+// --- Near-memory-bound: bias + ReLU ---
+// Pattern: (x + bias.broadcast()).cwiseMax(0)
+static void BM_BiasReLU_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> x(M, N);
+  Tensor<Scalar, 2> bias(1, N);
+  Tensor<Scalar, 2> result(M, N);
+  x.setRandom();
+  bias.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> bcast = {M, 1};
+
+  for (auto _ : state) {
+    result.device(dev) = (x + bias.broadcast(bcast)).cwiseMax(Scalar(0));
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+// --- Compute-bound: Horner polynomial ((a*x+b)*x+c)*x+d ---
+static void BM_Polynomial_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> x(M, N);
+  Tensor<Scalar, 2> result(M, N);
+  x.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  const Scalar a = 0.5f, b = 1.2f, c = -0.3f, d = 0.7f;
+
+  for (auto _ : state) {
+    result.device(dev) = ((x.constant(a) * x + x.constant(b)) * x + x.constant(c)) * x + x.constant(d);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+// --- Compute-bound: exp (expensive transcendental) ---
+static void BM_ExpNormalize_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> x(M, N);
+  Tensor<Scalar, 2> result(M, N);
+  x.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    result.device(dev) = x.exp();
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+// --- Batch normalization: gamma * (x - mean) / sqrt(var + eps) + beta ---
+static void BM_BatchNorm_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> x(M, N);
+  Tensor<Scalar, 2> result(M, N);
+  Tensor<Scalar, 2> gamma(1, N);
+  Tensor<Scalar, 2> beta(1, N);
+  Tensor<Scalar, 2> mean(1, N);
+  Tensor<Scalar, 2> var(1, N);
+  x.setRandom();
+  gamma.setRandom();
+  beta.setRandom();
+  mean.setRandom();
+  var.setRandom();
+  var = var.abs() + var.constant(Scalar(0.1));
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> bcast = {M, 1};
+  const Scalar eps = 1e-5f;
+
+  for (auto _ : state) {
+    result.device(dev) =
+        gamma.broadcast(bcast) * (x - mean.broadcast(bcast)) * (var.broadcast(bcast) + x.constant(eps)).rsqrt() +
+        beta.broadcast(bcast);
+    benchmark::DoNotOptimize(result.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+static void ChainedSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024, 4096}) {
+    for (int threads : {1, 2, 4, 8, 12, 16}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
+BENCHMARK(BM_Copy_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
+BENCHMARK(BM_BiasReLU_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
+BENCHMARK(BM_Polynomial_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
+BENCHMARK(BM_ExpNormalize_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
+BENCHMARK(BM_BatchNorm_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
index aed4828..b6e4c14 100644
--- a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
+++ b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
@@ -1,8 +1,11 @@
 // Benchmarks for Eigen Tensor coefficient-wise operations.
 // Covers activation functions, normalization, and element-wise arithmetic.
 
+#define EIGEN_USE_THREADS
+
 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
 
 using namespace Eigen;
 
@@ -24,6 +27,26 @@
     state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
   }
 
+// Macro for ThreadPool variant of a unary tensor operation.
+#define BENCH_TENSOR_UNARY_THREADPOOL(NAME, EXPR)                             \
+  static void BM_##NAME##_ThreadPool(benchmark::State& state) {               \
+    const int M = state.range(0);                                             \
+    const int N = state.range(1);                                             \
+    const int threads = state.range(2);                                       \
+    Tensor<Scalar, 2> a(M, N);                                                \
+    a.setRandom();                                                            \
+    Tensor<Scalar, 2> b(M, N);                                                \
+    ThreadPool tp(threads);                                                   \
+    ThreadPoolDevice dev(&tp, threads);                                       \
+    for (auto _ : state) {                                                    \
+      b.device(dev) = EXPR;                                                   \
+      benchmark::DoNotOptimize(b.data());                                     \
+      benchmark::ClobberMemory();                                             \
+    }                                                                         \
+    state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \
+    state.counters["threads"] = threads;                                      \
+  }
+
 BENCH_TENSOR_UNARY(Exp, a.exp())
 BENCH_TENSOR_UNARY(Log, a.abs().log())
 BENCH_TENSOR_UNARY(Tanh, a.tanh())
@@ -31,6 +54,10 @@
 BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0)))
 BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt())
 
+BENCH_TENSOR_UNARY_THREADPOOL(Exp, a.exp())
+BENCH_TENSOR_UNARY_THREADPOOL(Tanh, a.tanh())
+BENCH_TENSOR_UNARY_THREADPOOL(ReLU, a.cwiseMax(Scalar(0)))
+
 // --- Element-wise binary operations ---
 static void BM_Add(benchmark::State& state) {
   const int M = state.range(0);
@@ -89,6 +116,78 @@
   state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4);
 }
 
+// --- ThreadPool binary operations ---
+static void BM_Add_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  a.setRandom();
+  b.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    c.device(dev) = a + b;
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);
+  state.counters["threads"] = threads;
+}
+
+static void BM_Mul_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  a.setRandom();
+  b.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    c.device(dev) = a * b;
+    benchmark::DoNotOptimize(c.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3);
+  state.counters["threads"] = threads;
+}
+
+static void BM_FMA_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> a(M, N);
+  Tensor<Scalar, 2> b(M, N);
+  Tensor<Scalar, 2> c(M, N);
+  Tensor<Scalar, 2> d(M, N);
+  a.setRandom();
+  b.setRandom();
+  c.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  for (auto _ : state) {
+    d.device(dev) = a * b + c;
+    benchmark::DoNotOptimize(d.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4);
+  state.counters["threads"] = threads;
+}
+
 // --- Rank-4 coefficient-wise (CNN feature maps) ---
 static void BM_ReLU_Rank4(benchmark::State& state) {
   const int batch = state.range(0);
@@ -113,6 +212,14 @@
   }
 }
 
+static void CwiseThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int threads : {1, 2, 4, 8, 12, 16}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
 static void Rank4Sizes(::benchmark::Benchmark* b) {
   b->Args({32, 64, 16});
   b->Args({8, 128, 32});
@@ -129,3 +236,9 @@
 BENCHMARK(BM_Mul)->Apply(CwiseSizes);
 BENCHMARK(BM_FMA)->Apply(CwiseSizes);
 BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);
+BENCHMARK(BM_Add_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Mul_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_FMA_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Exp_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Tanh_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_ReLU_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_morphing.cpp b/unsupported/benchmarks/Tensor/bench_morphing.cpp
index 8d226e7..ff7e17f 100644
--- a/unsupported/benchmarks/Tensor/bench_morphing.cpp
+++ b/unsupported/benchmarks/Tensor/bench_morphing.cpp
@@ -1,7 +1,10 @@
 // Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride.
 
+#define EIGEN_USE_THREADS
+
 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
 
 using namespace Eigen;
 
@@ -107,6 +110,64 @@
   state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
 }
 
+// --- ThreadPool variants ---
+
+static void BM_Slice_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  int sliceM = M / 2;
+  int sliceN = N / 2;
+  Eigen::array<Index, 2> offsets = {0, 0};
+  Eigen::array<Index, 2> extents = {sliceM, sliceN};
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Tensor<Scalar, 2> B(sliceM, sliceN);
+
+  for (auto _ : state) {
+    B.device(dev) = A.slice(offsets, extents);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * sliceM * sliceN * sizeof(Scalar));
+  state.counters["threads"] = threads;
+}
+
+static void BM_Pad_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  Eigen::array<std::pair<int, int>, 2> paddings;
+  paddings[0] = {4, 4};
+  paddings[1] = {4, 4};
+
+  int outM = M + 8;
+  int outN = N + 8;
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Tensor<Scalar, 2> B(outM, outN);
+
+  for (auto _ : state) {
+    B.device(dev) = A.pad(paddings);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar));
+  state.counters["threads"] = threads;
+}
+
 static void MorphSizes(::benchmark::Benchmark* b) {
   for (int size : {256, 1024}) {
     b->Args({size, size});
@@ -135,8 +196,18 @@
   }
 }
 
+static void MorphThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int threads : {1, 2, 4, 8, 12, 16}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
 BENCHMARK(BM_Reshape)->Apply(MorphSizes);
 BENCHMARK(BM_Slice)->Apply(MorphSizes);
 BENCHMARK(BM_Chip)->Apply(ChipSizes);
 BENCHMARK(BM_Pad)->Apply(PadSizes);
 BENCHMARK(BM_Stride)->Apply(StrideSizes);
+BENCHMARK(BM_Slice_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Pad_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_shuffling.cpp b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
index 6296acc..6de76a8 100644
--- a/unsupported/benchmarks/Tensor/bench_shuffling.cpp
+++ b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
@@ -1,7 +1,10 @@
 // Benchmarks for Eigen Tensor shuffling (transpose / permutation).
 
+#define EIGEN_USE_THREADS
+
 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/ThreadPool>
 
 using namespace Eigen;
 
@@ -85,6 +88,56 @@
   state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2);
 }
 
+// --- ThreadPool variants ---
+
+static void BM_Shuffle2D_ThreadPool(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int threads = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  Tensor<Scalar, 2> B(N, M);
+  A.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  Eigen::array<int, 2> perm = {1, 0};
+
+  for (auto _ : state) {
+    B.device(dev) = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
+static void BM_Shuffle4D_NCHW_to_NHWC_ThreadPool(benchmark::State& state) {
+  const int N = state.range(0);
+  const int C = state.range(1);
+  const int H = state.range(2);
+  const int threads = state.range(3);
+
+  Tensor<Scalar, 4> A(N, C, H, H);
+  Tensor<Scalar, 4> B(N, H, H, C);
+  A.setRandom();
+
+  ThreadPool tp(threads);
+  ThreadPoolDevice dev(&tp, threads);
+
+  // NCHW -> NHWC: permute (0, 2, 3, 1)
+  Eigen::array<int, 4> perm = {0, 2, 3, 1};
+
+  for (auto _ : state) {
+    B.device(dev) = A.shuffle(perm);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2);
+  state.counters["threads"] = threads;
+}
+
 static void Shuffle2DSizes(::benchmark::Benchmark* b) {
   for (int size : {256, 1024}) {
     b->Args({size, size});
@@ -109,7 +162,29 @@
   }
 }
 
+static void Shuffle2DThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int size : {256, 1024}) {
+    for (int threads : {1, 2, 4, 8, 12, 16}) {
+      b->Args({size, size, threads});
+    }
+  }
+}
+
+static void Shuffle4DThreadPoolSizes(::benchmark::Benchmark* b) {
+  for (int batch : {1, 8}) {
+    for (int c : {64}) {
+      for (int h : {32, 64}) {
+        for (int threads : {1, 2, 4, 8, 12, 16}) {
+          b->Args({batch, c, h, threads});
+        }
+      }
+    }
+  }
+}
+
 BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
 BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
 BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
 BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);
+BENCHMARK(BM_Shuffle2D_ThreadPool)->Apply(Shuffle2DThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC_ThreadPool)->Apply(Shuffle4DThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index e7d628f..fb79127 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -186,6 +186,7 @@
 ei_add_test(tensor_concatenation)
 ei_add_test(tensor_const)
 ei_add_test(tensor_contraction)
+ei_add_test(tensor_cost_model)
 ei_add_test(tensor_convolution)
 ei_add_test(tensor_custom_index)
 ei_add_test(tensor_custom_op)
diff --git a/unsupported/test/tensor_cost_model.cpp b/unsupported/test/tensor_cost_model.cpp
new file mode 100644
index 0000000..f74bde8
--- /dev/null
+++ b/unsupported/test/tensor_cost_model.cpp
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Tensor>
+
+using Eigen::DefaultDevice;
+using Eigen::Tensor;
+using Eigen::TensorEvaluator;
+
+// Regression: TensorCwiseNullaryOp must report bytes_loaded == 0.
+// NullaryOps (constants, Zero, Identity, Random, sequence generators)
+// produce values from registers or minimal state without loading from
+// memory. If they reported nonzero bytes_loaded, expressions dominated
+// by constants (e.g. Horner-form polynomials) would be misclassified as
+// memory-bound and the threadpool cost model would over-restrict
+// parallelism. See TensorEvaluator.h, the CwiseNullaryOp specialization
+// of costPerCoeff().
+template <typename Scalar>
+static void test_nullary_zero_bytes_loaded() {
+  Tensor<Scalar, 1> shape(/*size=*/16);
+  auto zeros = shape.constant(Scalar(0));
+  auto sevens = shape.constant(Scalar(7));
+
+  using ZeroEval = TensorEvaluator<const decltype(zeros), DefaultDevice>;
+  using ConstEval = TensorEvaluator<const decltype(sevens), DefaultDevice>;
+
+  DefaultDevice device;
+  ZeroEval zero_eval(zeros, device);
+  ConstEval const_eval(sevens, device);
+
+  for (bool vectorized : {false, true}) {
+    const auto zero_cost = zero_eval.costPerCoeff(vectorized);
+    const auto const_cost = const_eval.costPerCoeff(vectorized);
+    VERIFY_IS_EQUAL(zero_cost.bytes_loaded(), 0.0);
+    VERIFY_IS_EQUAL(zero_cost.bytes_stored(), 0.0);
+    VERIFY_IS_EQUAL(const_cost.bytes_loaded(), 0.0);
+    VERIFY_IS_EQUAL(const_cost.bytes_stored(), 0.0);
+  }
+}
+
+EIGEN_DECLARE_TEST(tensor_cost_model) {
+  CALL_SUBTEST(test_nullary_zero_bytes_loaded<float>());
+  CALL_SUBTEST(test_nullary_zero_bytes_loaded<double>());
+  CALL_SUBTEST(test_nullary_zero_bytes_loaded<int>());
+}