blob: 6084f24b49ce2462e21b81c6f2a45785c311b610 [file]
// Benchmarks for chained tensor expressions with ThreadPool.
// Tests realistic compound expressions spanning memory-bound to compute-bound.
// SPDX-FileCopyrightText: The Eigen Authors
// SPDX-License-Identifier: MPL-2.0
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/Tensor>
#include <unsupported/Eigen/ThreadPool>
using namespace Eigen;
typedef float Scalar;
// --- Pure memory-bound baseline (dst = src) ---
// Plain tensor copy through the ThreadPoolDevice: no arithmetic at all, so
// this pins the memory-bandwidth ceiling the other benchmarks are compared to.
static void BM_Copy_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int num_threads = state.range(2);

  Tensor<Scalar, 2> src(rows, cols);
  Tensor<Scalar, 2> dst(rows, cols);
  src.setRandom();

  // Pool and device are built once, outside the timed region.
  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  for (auto _ : state) {
    dst.device(device) = src;
    benchmark::DoNotOptimize(dst.data());
    benchmark::ClobberMemory();
  }

  // One read of src plus one write of dst per iteration.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
  state.counters["threads"] = num_threads;
}
// --- Near-memory-bound: bias + ReLU ---
// Pattern: (x + bias.broadcast()).cwiseMax(0) — one add and one max per
// element, so still dominated by memory traffic.
static void BM_BiasReLU_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int num_threads = state.range(2);

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> bias_row(1, cols);
  Tensor<Scalar, 2> out(rows, cols);
  input.setRandom();
  bias_row.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  // Replicate the (1, cols) bias across every row.
  const Eigen::array<int, 2> replicate = {rows, 1};

  for (auto _ : state) {
    out.device(device) = (input + bias_row.broadcast(replicate)).cwiseMax(Scalar(0));
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }

  // Counts the main-tensor read and the result write; the (1, cols) bias
  // read is negligible by comparison.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
  state.counters["threads"] = num_threads;
}
// --- Compute-bound: Horner polynomial ((a*x+b)*x+c)*x+d ---
// Three multiply-add stages per element on top of one read and one write,
// shifting the bottleneck toward arithmetic.
static void BM_Polynomial_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int num_threads = state.range(2);

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> out(rows, cols);
  input.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  // Arbitrary fixed coefficients; the values do not affect the timing.
  const Scalar a = 0.5f, b = 1.2f, c = -0.3f, d = 0.7f;

  for (auto _ : state) {
    // Horner form, with scalars lifted into constant tensor expressions.
    out.device(device) =
        ((input.constant(a) * input + input.constant(b)) * input + input.constant(c)) * input + input.constant(d);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }

  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
  state.counters["threads"] = num_threads;
}
// --- Compute-bound: exp (expensive transcendental) ---
// NOTE(review): despite the "Normalize" in the name, this only evaluates
// x.exp() — no normalization step exists. The name is kept because the
// BENCHMARK() registration below refers to it.
static void BM_ExpNormalize_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int num_threads = state.range(2);

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> out(rows, cols);
  input.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  for (auto _ : state) {
    out.device(device) = input.exp();
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }

  // One read of input plus one write of the result per iteration.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
  state.counters["threads"] = num_threads;
}
// --- Batch normalization: gamma * (x - mean) / sqrt(var + eps) + beta ---
// Four per-column (1, cols) parameters are broadcast across rows; the
// division is expressed through rsqrt().
static void BM_BatchNorm_ThreadPool(benchmark::State& state) {
  const int rows = state.range(0);
  const int cols = state.range(1);
  const int num_threads = state.range(2);

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> out(rows, cols);
  Tensor<Scalar, 2> gamma(1, cols);
  Tensor<Scalar, 2> beta(1, cols);
  Tensor<Scalar, 2> mean(1, cols);
  Tensor<Scalar, 2> var(1, cols);
  input.setRandom();
  gamma.setRandom();
  beta.setRandom();
  mean.setRandom();
  var.setRandom();
  // Keep variances strictly positive so rsqrt() stays finite.
  var = var.abs() + var.constant(Scalar(0.1));

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  const Eigen::array<int, 2> replicate = {rows, 1};
  const Scalar eps = 1e-5f;

  for (auto _ : state) {
    out.device(device) =
        gamma.broadcast(replicate) * (input - mean.broadcast(replicate)) *
            (var.broadcast(replicate) + input.constant(eps)).rsqrt() +
        beta.broadcast(replicate);
    benchmark::DoNotOptimize(out.data());
    benchmark::ClobberMemory();
  }

  // Counts only the dominant (rows x cols) read and write; the (1, cols)
  // parameter reads are negligible by comparison.
  state.SetBytesProcessed(state.iterations() * rows * cols * sizeof(Scalar) * 2);
  state.counters["threads"] = num_threads;
}
// clang-format off
// Shared size/thread sweep: square problems of side 256, 1024, and 4096,
// each run with 1, 2, 4, 8, 12, and 16 pool threads. (Comments cannot go
// inside the macro body because of the line continuations.)
#define CHAINED_SIZES \
->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16}) \
->Args({4096, 4096, 1})->Args({4096, 4096, 2})->Args({4096, 4096, 4}) \
->Args({4096, 4096, 8})->Args({4096, 4096, 12})->Args({4096, 4096, 16})
// clang-format on
// UseRealTime(): report wall-clock time, since the timed work runs on the
// pool's worker threads rather than on the benchmark's own thread.
BENCHMARK(BM_Copy_ThreadPool) CHAINED_SIZES->UseRealTime();
BENCHMARK(BM_BiasReLU_ThreadPool) CHAINED_SIZES->UseRealTime();
BENCHMARK(BM_Polynomial_ThreadPool) CHAINED_SIZES->UseRealTime();
BENCHMARK(BM_ExpNormalize_ThreadPool) CHAINED_SIZES->UseRealTime();
BENCHMARK(BM_BatchNorm_ThreadPool) CHAINED_SIZES->UseRealTime();