Rewrite TensorCostModel with principled roofline model libeigen/eigen!2255 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h index 0e102c2..2ea7136 100644 --- a/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/src/Tensor/TensorContractionThreadPool.h
@@ -1507,7 +1507,7 @@ int num_threads = 1; double min_cost = total_parallel_cost; double kPerThreadOverHead = 3000; - double kFixedOverHead = 100000; + double kFixedOverHead = 20000; for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); double parallel_cost = total_parallel_cost / nt + sequential_cost;
diff --git a/unsupported/Eigen/src/Tensor/TensorCostModel.h b/unsupported/Eigen/src/Tensor/TensorCostModel.h index 6134151..b914ac4 100644 --- a/unsupported/Eigen/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/src/Tensor/TensorCostModel.h
@@ -60,6 +60,7 @@ constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; } + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_bytes() const { return bytes_loaded_ + bytes_stored_; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const { return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_; @@ -126,15 +127,16 @@ double compute_cycles_; }; -// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads -// in [1:max_threads] instead of just switching multi-threading off for small -// work units. /** * \ingroup CXX11_Tensor_Module * * \brief A cost model used to limit the number of threads used for evaluating * tensor expression. * + * Uses a roofline model: cost = max(memory_time, compute_time) instead of + * summing them. This avoids overestimating cost for balanced workloads. + * Memory-bound operations are capped at a limited number of threads to + * avoid wasting cycles competing for shared memory bandwidth. */ template <typename Device> class TensorCostModel { @@ -142,21 +144,73 @@ // Scaling from Eigen compute cost to device cycles. static constexpr int kDeviceCyclesPerComputeCycle = 1; - // Costs in device cycles. - static constexpr int kStartupCycles = 100000; - static constexpr int kPerThreadCycles = 100000; + // Thread overhead in device cycles. ~8us at 3GHz. + // Minimum total work to justify thread pool dispatch. + static constexpr int kStartupCycles = 25000; + // Minimum work per thread to amortize dispatch and synchronization overhead. + static constexpr int kPerThreadCycles = 25000; static constexpr int kTaskSize = 40000; + // Memory bandwidth saturation: on typical multi-socket servers, 2-6 cores + // saturate DRAM bandwidth. 4 is a conservative default. + static constexpr int kMemBandwidthSaturationThreads = 4; + + // If memory_time / compute_time exceeds this ratio, the op is memory-bound. + // With vectorized costs (AVX2, PacketSize=8), typical ratios are: + // Add/Mul (2 loads + 1 store): mem/comp = 6.0 + // FMA (3 loads + 1 store): mem/comp = 4.0 + // ReLU max(x,0) (1 load + 1 store): mem/comp = 4.0 + // Polynomial 3rd-order (3 loads + 1 store, 6 ops): mem/comp = 1.3 + // Exp (1 load + 1 store, ~8 ops): mem/comp = 0.5 + // Threshold of 2.0 cleanly separates memory-bound (>=4) from compute-bound. + static constexpr double kMemBoundThreshold = 2.0; + + // Data sets larger than this are assumed to be DRAM-resident. + // Below this threshold, data is likely L2-cache-resident and benefits from + // high per-core L2 bandwidth, so no bandwidth saturation cap is applied. + static constexpr double kDramThresholdBytes = 1024.0 * 1024.0; + // Returns the number of threads in [1:max_threads] to use for // evaluating an expression with the given output size and cost per // coefficient. static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { - double cost = totalCost(output_size, cost_per_coeff); - double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - // Make sure we don't invoke undefined behavior when we convert to an int. + if (max_threads <= 1) return 1; + + double mem = memoryTime(cost_per_coeff); + double comp = computeTime(cost_per_coeff); + double per_coeff = numext::maxi(mem, comp); + double total = output_size * per_coeff; + + // Not enough total work to justify thread pool dispatch. + if (total < kStartupCycles) return 1; + + // Each thread needs at least kPerThreadCycles of work to + // amortize dispatch and synchronization overhead. + double threads = total / kPerThreadCycles; + // Guard against integer overflow. threads = numext::mini<double>(threads, GenericNumTraits<int>::highest()); - return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads))); + int candidate = numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads))); + + // Memory-bound ops on DRAM-resident data: cap at bandwidth saturation. + // Cache-resident data has high per-core L2 bandwidth, so no cap needed. + // + // The total-traffic proxy below overestimates the working set whenever an + // operand is reused (e.g. broadcasted): every output coefficient charges a + // fresh load, so a 1xN broadcast can show up as MxN bytes even though the + // live data is N. A working-set-aware estimate would need each evaluator to + // surface its unique-operand footprint; until that exists we accept a small + // bias toward over-capping for reuse-heavy expressions. + if (candidate > kMemBandwidthSaturationThreads) { + bool is_memory_bound = (comp > 0) ? (mem / comp > kMemBoundThreshold) : (mem > 0); + if (is_memory_bound) { + double total_bytes = output_size * cost_per_coeff.total_bytes(); + if (total_bytes > kDramThresholdBytes) { + candidate = numext::mini(candidate, kMemBandwidthSaturationThreads); + } + } + } + return candidate; } // taskSize assesses parallel task size. @@ -166,21 +220,27 @@ return totalCost(output_size, cost_per_coeff) / kTaskSize; } + // Roofline model: cost is the max of memory time and compute time. static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost& cost_per_coeff) { - // Cost of memory fetches from L2 cache. 64 is typical cache line size. - // 11 is L2 cache latency on Haswell. - // We don't know whether data is in L1, L2 or L3. But we are most interested - // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not interesting - // either because we are probably using all available threads already). - // And for the target time range, L2 seems to be what matters. Data set - // fitting into L1 is too small to take noticeable time. Data set fitting - // only into L3 presumably will take more than 10ms to load and process. - const double kLoadCycles = 1.0 / 64 * 11; - const double kStoreCycles = 1.0 / 64 * 11; - // Scaling from Eigen compute cost to device cycles. - return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle); + double mem_cost = memoryTime(cost_per_coeff); + double comp_cost = computeTime(cost_per_coeff); + return output_size * numext::maxi(mem_cost, comp_cost); + } + + private: + // Effective sustained bandwidth cost in cycles/byte. + // ~1/16 = 0.0625 cycles/byte represents L3/DRAM streaming bandwidth + // on modern CPUs (~16 bytes/cycle single-core sustained throughput). + // For L1/L2-resident data, compute typically dominates anyway. + static constexpr double kByteCost = 1.0 / 16.0; + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double memoryTime(const TensorOpCost& cost) { + return cost.total_bytes() * kByteCost; + } + + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double computeTime(const TensorOpCost& cost) { + return cost.compute_cycles() * kDeviceCyclesPerComputeCycle; } };
diff --git a/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h index 85c1dc8..dacf93d 100644 --- a/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/src/Tensor/TensorDeviceThreadPool.h
@@ -55,11 +55,10 @@ ::memcpy(dst, src, n); #else // TODO(rmlarsen): Align blocks on cache lines. - // We have observed that going beyond 4 threads usually just wastes - // CPU cycles due to the threads competing for memory bandwidth, so we - // statically schedule at most 4 block copies here. const size_t kMinBlockSize = 32768; - const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); + // Pure memory op (zero compute) — the cost model's bandwidth saturation + // cap will limit threads appropriately. + const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), static_cast<int>(numThreads())); if (n <= kMinBlockSize || num_threads < 2) { ::memcpy(dst, src, n); } else {
diff --git a/unsupported/Eigen/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/src/Tensor/TensorEvaluator.h index 03c957c..8f50171 100644 --- a/unsupported/Eigen/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/src/Tensor/TensorEvaluator.h
@@ -360,7 +360,12 @@ } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size); + // NullaryOps (constants, zero, identity, random) generate values from + // registers or minimal state — they do not load from memory. Report + // zero bytes_loaded so the cost model correctly classifies expressions + // containing many constants (e.g. Horner polynomials) as compute-bound + // rather than memory-bound. + return TensorOpCost(0, 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size); } EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt index 06ee4f9..1c57b1e 100644 --- a/unsupported/benchmarks/Tensor/CMakeLists.txt +++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -10,3 +10,4 @@ eigen_add_benchmark(bench_reverse bench_reverse.cpp) eigen_add_benchmark(bench_roll bench_roll.cpp) eigen_add_benchmark(bench_layout_swap bench_layout_swap.cpp) +eigen_add_benchmark(bench_chained_expressions bench_chained_expressions.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp index 01aed20..754fe96 100644 --- a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp +++ b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
@@ -1,8 +1,11 @@ // Benchmarks for Eigen Tensor broadcasting. // Tests broadcasting along various dimensions and ranks. +#define EIGEN_USE_THREADS + #include <benchmark/benchmark.h> #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/ThreadPool> using namespace Eigen; @@ -87,6 +90,56 @@ state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar)); } +// --- ThreadPool variants --- + +static void BM_BroadcastRow_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> row(1, N); + Tensor<Scalar, 2> result(M, N); + row.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array<int, 2> bcast = {M, 1}; + + for (auto _ : state) { + result.device(dev) = row.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar)); + state.counters["threads"] = threads; +} + +static void BM_BroadcastAdd_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> mat(M, N); + Tensor<Scalar, 2> bias(1, N); + Tensor<Scalar, 2> result(M, N); + mat.setRandom(); + bias.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array<int, 2> bcast = {M, 1}; + + for (auto _ : state) { + result.device(dev) = mat + bias.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + static void BroadcastSizes(::benchmark::Benchmark* b) { for (int m : {64, 256, 1024}) { for (int n : {64, 256, 1024}) { @@ -105,7 +158,17 @@ } } +static void BroadcastThreadPoolSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({size, size, threads}); + } + } +} + BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes); BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes); +BENCHMARK(BM_BroadcastRow_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_BroadcastAdd_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp new file mode 100644 index 0000000..0069195 --- /dev/null +++ b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
@@ -0,0 +1,157 @@ +// Benchmarks for chained tensor expressions with ThreadPool. +// Tests realistic compound expressions spanning memory-bound to compute-bound. + +#define EIGEN_USE_THREADS + +#include <benchmark/benchmark.h> +#include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/ThreadPool> + +using namespace Eigen; + +typedef float Scalar; + +// --- Pure memory-bound baseline (dst = src) --- +static void BM_Copy_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> src(M, N); + Tensor<Scalar, 2> dst(M, N); + src.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + dst.device(dev) = src; + benchmark::DoNotOptimize(dst.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +// --- Near-memory-bound: bias + ReLU --- +// Pattern: (x + bias.broadcast()).cwiseMax(0) +static void BM_BiasReLU_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> x(M, N); + Tensor<Scalar, 2> bias(1, N); + Tensor<Scalar, 2> result(M, N); + x.setRandom(); + bias.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array<int, 2> bcast = {M, 1}; + + for (auto _ : state) { + result.device(dev) = (x + bias.broadcast(bcast)).cwiseMax(Scalar(0)); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +// --- Compute-bound: Horner polynomial ((a*x+b)*x+c)*x+d --- +static void BM_Polynomial_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> x(M, N); + Tensor<Scalar, 2> result(M, N); + x.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + const Scalar a = 0.5f, b = 1.2f, c = -0.3f, d = 0.7f; + + for (auto _ : state) { + result.device(dev) = ((x.constant(a) * x + x.constant(b)) * x + x.constant(c)) * x + x.constant(d); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +// --- Compute-bound: exp (expensive transcendental) --- +static void BM_ExpNormalize_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> x(M, N); + Tensor<Scalar, 2> result(M, N); + x.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + result.device(dev) = x.exp(); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +// --- Batch normalization: gamma * (x - mean) / sqrt(var + eps) + beta --- +static void BM_BatchNorm_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> x(M, N); + Tensor<Scalar, 2> result(M, N); + Tensor<Scalar, 2> gamma(1, N); + Tensor<Scalar, 2> beta(1, N); + Tensor<Scalar, 2> mean(1, N); + Tensor<Scalar, 2> var(1, N); + x.setRandom(); + gamma.setRandom(); + beta.setRandom(); + mean.setRandom(); + var.setRandom(); + var = var.abs() + var.constant(Scalar(0.1)); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array<int, 2> bcast = {M, 1}; + const Scalar eps = 1e-5f; + + for (auto _ : state) { + result.device(dev) = + gamma.broadcast(bcast) * (x - mean.broadcast(bcast)) * (var.broadcast(bcast) + x.constant(eps)).rsqrt() + + beta.broadcast(bcast); + benchmark::DoNotOptimize(result.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +static void ChainedSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024, 4096}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({size, size, threads}); + } + } +} + +BENCHMARK(BM_Copy_ThreadPool)->Apply(ChainedSizes)->UseRealTime(); +BENCHMARK(BM_BiasReLU_ThreadPool)->Apply(ChainedSizes)->UseRealTime(); +BENCHMARK(BM_Polynomial_ThreadPool)->Apply(ChainedSizes)->UseRealTime(); +BENCHMARK(BM_ExpNormalize_ThreadPool)->Apply(ChainedSizes)->UseRealTime(); +BENCHMARK(BM_BatchNorm_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp index aed4828..b6e4c14 100644 --- a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp +++ b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
@@ -1,8 +1,11 @@ // Benchmarks for Eigen Tensor coefficient-wise operations. // Covers activation functions, normalization, and element-wise arithmetic. +#define EIGEN_USE_THREADS + #include <benchmark/benchmark.h> #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/ThreadPool> using namespace Eigen; @@ -24,6 +27,26 @@ state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \ } +// Macro for ThreadPool variant of a unary tensor operation. +#define BENCH_TENSOR_UNARY_THREADPOOL(NAME, EXPR) \ + static void BM_##NAME##_ThreadPool(benchmark::State& state) { \ + const int M = state.range(0); \ + const int N = state.range(1); \ + const int threads = state.range(2); \ + Tensor<Scalar, 2> a(M, N); \ + a.setRandom(); \ + Tensor<Scalar, 2> b(M, N); \ + ThreadPool tp(threads); \ + ThreadPoolDevice dev(&tp, threads); \ + for (auto _ : state) { \ + b.device(dev) = EXPR; \ + benchmark::DoNotOptimize(b.data()); \ + benchmark::ClobberMemory(); \ + } \ + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); \ + state.counters["threads"] = threads; \ + } + BENCH_TENSOR_UNARY(Exp, a.exp()) BENCH_TENSOR_UNARY(Log, a.abs().log()) BENCH_TENSOR_UNARY(Tanh, a.tanh()) @@ -31,6 +54,10 @@ BENCH_TENSOR_UNARY(ReLU, a.cwiseMax(Scalar(0))) BENCH_TENSOR_UNARY(Sqrt, a.abs().sqrt()) +BENCH_TENSOR_UNARY_THREADPOOL(Exp, a.exp()) +BENCH_TENSOR_UNARY_THREADPOOL(Tanh, a.tanh()) +BENCH_TENSOR_UNARY_THREADPOOL(ReLU, a.cwiseMax(Scalar(0))) + // --- Element-wise binary operations --- static void BM_Add(benchmark::State& state) { const int M = state.range(0); @@ -89,6 +116,78 @@ state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4); } +// --- ThreadPool binary operations --- +static void BM_Add_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> a(M, N); + Tensor<Scalar, 2> b(M, N); + Tensor<Scalar, 2> c(M, N); + a.setRandom(); + b.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + c.device(dev) = a + b; + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); + state.counters["threads"] = threads; +} + +static void BM_Mul_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> a(M, N); + Tensor<Scalar, 2> b(M, N); + Tensor<Scalar, 2> c(M, N); + a.setRandom(); + b.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + c.device(dev) = a * b; + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 3); + state.counters["threads"] = threads; +} + +static void BM_FMA_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> a(M, N); + Tensor<Scalar, 2> b(M, N); + Tensor<Scalar, 2> c(M, N); + Tensor<Scalar, 2> d(M, N); + a.setRandom(); + b.setRandom(); + c.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + for (auto _ : state) { + d.device(dev) = a * b + c; + benchmark::DoNotOptimize(d.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 4); + state.counters["threads"] = threads; +} + // --- Rank-4 coefficient-wise (CNN feature maps) --- static void BM_ReLU_Rank4(benchmark::State& state) { const int batch = state.range(0); @@ -113,6 +212,14 @@ } } +static void CwiseThreadPoolSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({size, size, threads}); + } + } +} + static void Rank4Sizes(::benchmark::Benchmark* b) { b->Args({32, 64, 16}); b->Args({8, 128, 32}); @@ -129,3 +236,9 @@ BENCHMARK(BM_Mul)->Apply(CwiseSizes); BENCHMARK(BM_FMA)->Apply(CwiseSizes); BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes); +BENCHMARK(BM_Add_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_Mul_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_FMA_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_Exp_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_Tanh_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_ReLU_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_morphing.cpp b/unsupported/benchmarks/Tensor/bench_morphing.cpp index 8d226e7..ff7e17f 100644 --- a/unsupported/benchmarks/Tensor/bench_morphing.cpp +++ b/unsupported/benchmarks/Tensor/bench_morphing.cpp
@@ -1,7 +1,10 @@ // Benchmarks for Eigen Tensor morphing operations: reshape, slice, chip, pad, stride. +#define EIGEN_USE_THREADS + #include <benchmark/benchmark.h> #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/ThreadPool> using namespace Eigen; @@ -107,6 +110,64 @@ state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar)); } +// --- ThreadPool variants --- + +static void BM_Slice_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> A(M, N); + A.setRandom(); + + int sliceM = M / 2; + int sliceN = N / 2; + Eigen::array<Index, 2> offsets = {0, 0}; + Eigen::array<Index, 2> extents = {sliceM, sliceN}; + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Tensor<Scalar, 2> B(sliceM, sliceN); + + for (auto _ : state) { + B.device(dev) = A.slice(offsets, extents); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * sliceM * sliceN * sizeof(Scalar)); + state.counters["threads"] = threads; +} + +static void BM_Pad_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> A(M, N); + A.setRandom(); + + Eigen::array<std::pair<int, int>, 2> paddings; + paddings[0] = {4, 4}; + paddings[1] = {4, 4}; + + int outM = M + 8; + int outN = N + 8; + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Tensor<Scalar, 2> B(outM, outN); + + for (auto _ : state) { + B.device(dev) = A.pad(paddings); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * outM * outN * sizeof(Scalar)); + state.counters["threads"] = threads; +} + static void MorphSizes(::benchmark::Benchmark* b) { for (int size : {256, 1024}) { b->Args({size, size}); @@ -135,8 +196,18 @@ } } +static void MorphThreadPoolSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({size, size, threads}); + } + } +} + BENCHMARK(BM_Reshape)->Apply(MorphSizes); BENCHMARK(BM_Slice)->Apply(MorphSizes); BENCHMARK(BM_Chip)->Apply(ChipSizes); BENCHMARK(BM_Pad)->Apply(PadSizes); BENCHMARK(BM_Stride)->Apply(StrideSizes); +BENCHMARK(BM_Slice_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_Pad_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_shuffling.cpp b/unsupported/benchmarks/Tensor/bench_shuffling.cpp index 6296acc..6de76a8 100644 --- a/unsupported/benchmarks/Tensor/bench_shuffling.cpp +++ b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
@@ -1,7 +1,10 @@ // Benchmarks for Eigen Tensor shuffling (transpose / permutation). +#define EIGEN_USE_THREADS + #include <benchmark/benchmark.h> #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/ThreadPool> using namespace Eigen; @@ -85,6 +88,56 @@ state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2); } +// --- ThreadPool variants --- + +static void BM_Shuffle2D_ThreadPool(benchmark::State& state) { + const int M = state.range(0); + const int N = state.range(1); + const int threads = state.range(2); + + Tensor<Scalar, 2> A(M, N); + Tensor<Scalar, 2> B(N, M); + A.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + Eigen::array<int, 2> perm = {1, 0}; + + for (auto _ : state) { + B.device(dev) = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + +static void BM_Shuffle4D_NCHW_to_NHWC_ThreadPool(benchmark::State& state) { + const int N = state.range(0); + const int C = state.range(1); + const int H = state.range(2); + const int threads = state.range(3); + + Tensor<Scalar, 4> A(N, C, H, H); + Tensor<Scalar, 4> B(N, H, H, C); + A.setRandom(); + + ThreadPool tp(threads); + ThreadPoolDevice dev(&tp, threads); + + // NCHW -> NHWC: permute (0, 2, 3, 1) + Eigen::array<int, 4> perm = {0, 2, 3, 1}; + + for (auto _ : state) { + B.device(dev) = A.shuffle(perm); + benchmark::DoNotOptimize(B.data()); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * N * C * H * H * sizeof(Scalar) * 2); + state.counters["threads"] = threads; +} + static void Shuffle2DSizes(::benchmark::Benchmark* b) { for (int size : {256, 1024}) { b->Args({size, size}); @@ -109,7 +162,29 @@ } } +static void Shuffle2DThreadPoolSizes(::benchmark::Benchmark* b) { + for (int size : {256, 1024}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({size, size, threads}); + } + } +} + +static void Shuffle4DThreadPoolSizes(::benchmark::Benchmark* b) { + for (int batch : {1, 8}) { + for (int c : {64}) { + for (int h : {32, 64}) { + for (int threads : {1, 2, 4, 8, 12, 16}) { + b->Args({batch, c, h, threads}); + } + } + } + } +} + BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes); BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes); BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes); BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes); +BENCHMARK(BM_Shuffle2D_ThreadPool)->Apply(Shuffle2DThreadPoolSizes)->UseRealTime(); +BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC_ThreadPool)->Apply(Shuffle4DThreadPoolSizes)->UseRealTime();
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index e7d628f..fb79127 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt
@@ -186,6 +186,7 @@ ei_add_test(tensor_concatenation) ei_add_test(tensor_const) ei_add_test(tensor_contraction) +ei_add_test(tensor_cost_model) ei_add_test(tensor_convolution) ei_add_test(tensor_custom_index) ei_add_test(tensor_custom_op)
diff --git a/unsupported/test/tensor_cost_model.cpp b/unsupported/test/tensor_cost_model.cpp new file mode 100644 index 0000000..f74bde8 --- /dev/null +++ b/unsupported/test/tensor_cost_model.cpp
@@ -0,0 +1,53 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 The Eigen Authors. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/Tensor> + +using Eigen::DefaultDevice; +using Eigen::Tensor; +using Eigen::TensorEvaluator; + +// Regression: TensorCwiseNullaryOp must report bytes_loaded == 0. +// NullaryOps (constants, Zero, Identity, Random, sequence generators) +// produce values from registers or minimal state without loading from +// memory. If they reported nonzero bytes_loaded, expressions dominated +// by constants (e.g. Horner-form polynomials) would be misclassified as +// memory-bound and the threadpool cost model would over-restrict +// parallelism. See TensorEvaluator.h, the CwiseNullaryOp specialization +// of costPerCoeff(). +template <typename Scalar> +static void test_nullary_zero_bytes_loaded() { + Tensor<Scalar, 1> shape(/*size=*/16); + auto zeros = shape.constant(Scalar(0)); + auto sevens = shape.constant(Scalar(7)); + + using ZeroEval = TensorEvaluator<const decltype(zeros), DefaultDevice>; + using ConstEval = TensorEvaluator<const decltype(sevens), DefaultDevice>; + + DefaultDevice device; + ZeroEval zero_eval(zeros, device); + ConstEval const_eval(sevens, device); + + for (bool vectorized : {false, true}) { + const auto zero_cost = zero_eval.costPerCoeff(vectorized); + const auto const_cost = const_eval.costPerCoeff(vectorized); + VERIFY_IS_EQUAL(zero_cost.bytes_loaded(), 0.0); + VERIFY_IS_EQUAL(zero_cost.bytes_stored(), 0.0); + VERIFY_IS_EQUAL(const_cost.bytes_loaded(), 0.0); + VERIFY_IS_EQUAL(const_cost.bytes_stored(), 0.0); + } +} + +EIGEN_DECLARE_TEST(tensor_cost_model) { + CALL_SUBTEST(test_nullary_zero_bytes_loaded<float>()); + CALL_SUBTEST(test_nullary_zero_bytes_loaded<double>()); + CALL_SUBTEST(test_nullary_zero_bytes_loaded<int>()); +}