blob: 0069195bdeadb589e3d3fc55e17952040ac54a61 [file]
// Benchmarks for chained tensor expressions with ThreadPool.
// Tests realistic compound expressions spanning memory-bound to compute-bound.
#define EIGEN_USE_THREADS
#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>
// Eigen names (Tensor, ThreadPool, ThreadPoolDevice) are used unqualified below.
using namespace Eigen;
// Element type of every tensor allocated by the benchmarks in this file.
typedef float Scalar;
// --- Memory-bound baseline: plain tensor assignment (dst = src) ---
// Benchmark arguments: range(0) = first dimension, range(1) = second
// dimension, range(2) = number of threads given to the pool and the device.
static void BM_Copy_ThreadPool(benchmark::State& state) {
  const int rows = static_cast<int>(state.range(0));
  const int cols = static_cast<int>(state.range(1));
  const int num_threads = static_cast<int>(state.range(2));

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> output(rows, cols);
  input.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  for (auto _ : state) {
    output.device(device) = input;
    // Keep the compiler from discarding the store to `output`.
    benchmark::DoNotOptimize(output.data());
    benchmark::ClobberMemory();
  }

  // Each iteration is accounted as one read plus one write of every element.
  const auto bytes_per_iteration = static_cast<int64_t>(rows) * cols * sizeof(Scalar) * 2;
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.counters["threads"] = num_threads;
}
// --- Near-memory-bound: bias add followed by ReLU ---
// Evaluates (x + bias.broadcast()).cwiseMax(0), where bias has shape (1, N)
// and is replicated M times along the first dimension to match x.
// Benchmark arguments: range(0) = M, range(1) = N, range(2) = thread count.
static void BM_BiasReLU_ThreadPool(benchmark::State& state) {
  const int rows = static_cast<int>(state.range(0));
  const int cols = static_cast<int>(state.range(1));
  const int num_threads = static_cast<int>(state.range(2));

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> bias(1, cols);
  Tensor<Scalar, 2> activated(rows, cols);
  input.setRandom();
  bias.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  // Replication factors per dimension: `rows` copies of the single bias row,
  // second dimension left as is.
  const Eigen::array<int, 2> replication = {rows, 1};

  for (auto _ : state) {
    activated.device(device) = (input + bias.broadcast(replication)).cwiseMax(Scalar(0));
    // Keep the compiler from discarding the store to `activated`.
    benchmark::DoNotOptimize(activated.data());
    benchmark::ClobberMemory();
  }

  // Accounted as one read plus one write per element; the read of the bias
  // row is not included in this figure.
  const auto bytes_per_iteration = static_cast<int64_t>(rows) * cols * sizeof(Scalar) * 2;
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.counters["threads"] = num_threads;
}
// --- Compute-bound: cubic polynomial in Horner form ---
// Evaluates ((a*x + b)*x + c)*x + d element-wise with fixed coefficients.
// Benchmark arguments: range(0) = first dimension, range(1) = second
// dimension, range(2) = number of threads given to the pool and the device.
static void BM_Polynomial_ThreadPool(benchmark::State& state) {
  const int rows = static_cast<int>(state.range(0));
  const int cols = static_cast<int>(state.range(1));
  const int num_threads = static_cast<int>(state.range(2));

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> output(rows, cols);
  input.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  // Coefficients from the highest power (x^3) down to the constant term.
  const Scalar coeff3 = 0.5f;
  const Scalar coeff2 = 1.2f;
  const Scalar coeff1 = -0.3f;
  const Scalar coeff0 = 0.7f;

  for (auto _ : state) {
    output.device(device) =
        ((input.constant(coeff3) * input + input.constant(coeff2)) * input + input.constant(coeff1)) * input +
        input.constant(coeff0);
    // Keep the compiler from discarding the store to `output`.
    benchmark::DoNotOptimize(output.data());
    benchmark::ClobberMemory();
  }

  // Accounted as one read plus one write of every element per iteration.
  const auto bytes_per_iteration = static_cast<int64_t>(rows) * cols * sizeof(Scalar) * 2;
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.counters["threads"] = num_threads;
}
// --- Compute-bound: element-wise exp (expensive transcendental) ---
// Note: despite "Normalize" in the name, the timed expression is only
// x.exp(); no normalization step is performed here.
// Benchmark arguments: range(0) = first dimension, range(1) = second
// dimension, range(2) = number of threads given to the pool and the device.
static void BM_ExpNormalize_ThreadPool(benchmark::State& state) {
  const int rows = static_cast<int>(state.range(0));
  const int cols = static_cast<int>(state.range(1));
  const int num_threads = static_cast<int>(state.range(2));

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> output(rows, cols);
  input.setRandom();

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  for (auto _ : state) {
    output.device(device) = input.exp();
    // Keep the compiler from discarding the store to `output`.
    benchmark::DoNotOptimize(output.data());
    benchmark::ClobberMemory();
  }

  // Accounted as one read plus one write of every element per iteration.
  const auto bytes_per_iteration = static_cast<int64_t>(rows) * cols * sizeof(Scalar) * 2;
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.counters["threads"] = num_threads;
}
// --- Batch normalization: gamma * (x - mean) / sqrt(var + eps) + beta ---
// The division by sqrt(var + eps) is written as a multiplication by
// (var + eps).rsqrt(). gamma, beta, mean and var have shape (1, N) and are
// broadcast M times along the first dimension to match x.
// Benchmark arguments: range(0) = M, range(1) = N, range(2) = thread count.
static void BM_BatchNorm_ThreadPool(benchmark::State& state) {
  const int rows = static_cast<int>(state.range(0));
  const int cols = static_cast<int>(state.range(1));
  const int num_threads = static_cast<int>(state.range(2));

  Tensor<Scalar, 2> input(rows, cols);
  Tensor<Scalar, 2> normalized(rows, cols);
  Tensor<Scalar, 2> scale(1, cols);
  Tensor<Scalar, 2> shift(1, cols);
  Tensor<Scalar, 2> mu(1, cols);
  Tensor<Scalar, 2> variance(1, cols);

  input.setRandom();
  scale.setRandom();
  shift.setRandom();
  mu.setRandom();
  variance.setRandom();
  // Force the variance to be at least 0.1 so the rsqrt argument stays positive.
  variance = variance.abs() + variance.constant(Scalar(0.1));

  ThreadPool pool(num_threads);
  ThreadPoolDevice device(&pool, num_threads);

  // Replication factors per dimension: `rows` copies of each (1, N) row.
  const Eigen::array<int, 2> replication = {rows, 1};
  const Scalar epsilon = 1e-5f;

  for (auto _ : state) {
    normalized.device(device) = scale.broadcast(replication) * (input - mu.broadcast(replication)) *
                                    (variance.broadcast(replication) + input.constant(epsilon)).rsqrt() +
                                shift.broadcast(replication);
    // Keep the compiler from discarding the store to `normalized`.
    benchmark::DoNotOptimize(normalized.data());
    benchmark::ClobberMemory();
  }

  // Accounted as one read plus one write per element; reads of the four
  // per-column parameter rows are not included in this figure.
  const auto bytes_per_iteration = static_cast<int64_t>(rows) * cols * sizeof(Scalar) * 2;
  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
  state.counters["threads"] = num_threads;
}
// Registers the argument grid shared by all benchmarks in this file: square
// problems (both dimensions equal) crossed with a set of thread counts.
// Each Args() triple is {first dimension, second dimension, thread count}.
static void ChainedSizes(::benchmark::Benchmark* b) {
  constexpr int kSideLengths[] = {256, 1024, 4096};
  constexpr int kThreadCounts[] = {1, 2, 4, 8, 12, 16};

  for (const int side : kSideLengths) {
    for (const int thread_count : kThreadCounts) {
      b->Args({side, side, thread_count});
    }
  }
}
// Register every benchmark over the ChainedSizes argument grid.
// UseRealTime() selects wall-clock time for measurement; the tensor work is
// executed through a ThreadPoolDevice rather than directly in the loop body.
BENCHMARK(BM_Copy_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
BENCHMARK(BM_BiasReLU_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
BENCHMARK(BM_Polynomial_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
BENCHMARK(BM_ExpNormalize_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
BENCHMARK(BM_BatchNorm_ThreadPool)->Apply(ChainedSizes)->UseRealTime();