blob: 572eef75943fdda0e9d7405c7dd606e3373d2b21 [file]
// SPDX-FileCopyrightText: The Eigen Authors
// SPDX-License-Identifier: MPL-2.0
#define EIGEN_USE_THREADS 1
#include <benchmark/benchmark.h>
#include <Eigen/Sparse>
#include <Eigen/SparseCore>
#include <random>
#include <set>
using namespace Eigen;
using Scalar = double;
using SpMatColMajor = SparseMatrix<Scalar, ColMajor>;
using SpMatRowMajor = SparseMatrix<Scalar, RowMajor>;
using DenseVec = Matrix<Scalar, Dynamic, 1>;
// Three matrix shapes, chosen to expose different parallel-SpMV regimes.
enum MatrixKind { kBanded = 0, kRandom = 1, kHub = 2 };
template <typename SpMat>
static SpMat make_banded(int n, int bandwidth) {
std::mt19937 gen(12345);
std::uniform_real_distribution<double> dist(-1.0, 1.0);
std::vector<Triplet<Scalar>> triplets;
triplets.reserve(static_cast<std::size_t>(n) * (2 * bandwidth + 1));
for (int i = 0; i < n; ++i) {
for (int j = std::max(0, i - bandwidth); j < std::min(n, i + bandwidth + 1); ++j) {
triplets.emplace_back(i, j, dist(gen));
}
}
SpMat m(n, n);
m.setFromTriplets(triplets.begin(), triplets.end());
m.makeCompressed();
return m;
}
template <typename SpMat>
static SpMat make_random(int n, int nnz_per_row) {
std::mt19937 gen(67890);
std::uniform_int_distribution<int> col_dist(0, n - 1);
std::uniform_real_distribution<double> val_dist(-1.0, 1.0);
std::vector<Triplet<Scalar>> triplets;
triplets.reserve(static_cast<std::size_t>(n) * nnz_per_row);
for (int i = 0; i < n; ++i) {
std::set<int> cols;
while (static_cast<int>(cols.size()) < nnz_per_row) cols.insert(col_dist(gen));
for (int j : cols) triplets.emplace_back(i, j, val_dist(gen));
}
SpMat m(n, n);
m.setFromTriplets(triplets.begin(), triplets.end());
m.makeCompressed();
return m;
}
// Hub matrix: a few extremely dense rows (the "hubs") plus a sparse tail.
// Forces partition imbalance to expose nnz-balance behaviour.
template <typename SpMat>
static SpMat make_hub(int n, int hub_rows, int hub_density_pct, int tail_nnz_per_row) {
std::mt19937 gen(13579);
std::uniform_int_distribution<int> col_dist(0, n - 1);
std::uniform_real_distribution<double> val_dist(-1.0, 1.0);
std::vector<Triplet<Scalar>> triplets;
for (int i = 0; i < n; ++i) {
if (i < hub_rows) {
// Hub row: hub_density_pct% of all columns
int target = (n * hub_density_pct) / 100;
std::set<int> cols;
while (static_cast<int>(cols.size()) < target) cols.insert(col_dist(gen));
for (int j : cols) triplets.emplace_back(i, j, val_dist(gen));
} else {
std::set<int> cols;
while (static_cast<int>(cols.size()) < tail_nnz_per_row) cols.insert(col_dist(gen));
for (int j : cols) triplets.emplace_back(i, j, val_dist(gen));
}
}
SpMat m(n, n);
m.setFromTriplets(triplets.begin(), triplets.end());
m.makeCompressed();
return m;
}
template <typename SpMat>
static SpMat make_matrix(MatrixKind kind, int n) {
switch (kind) {
case kBanded:
return make_banded<SpMat>(n, 32); // ~65 nnz/row, regular
case kRandom:
return make_random<SpMat>(n, 40); // 40 nnz/row, uniform
case kHub:
return make_hub<SpMat>(n, 4, 50, 10); // 4 dense rows + sparse tail
}
return SpMat{};
}
// Baseline: existing single-threaded (or OpenMP, if compiled with it) SpMV.
template <typename SpMat>
static void BM_Baseline_Forward(benchmark::State& state) {
const MatrixKind kind = static_cast<MatrixKind>(state.range(0));
const int n = static_cast<int>(state.range(1));
SpMat A = make_matrix<SpMat>(kind, n);
DenseVec x = DenseVec::Random(n);
DenseVec y(n);
for (auto _ : state) {
y.noalias() = A * x;
benchmark::DoNotOptimize(y.data());
}
state.counters["nnz"] = static_cast<double>(A.nonZeros());
state.counters["n"] = n;
}
template <typename SpMat>
static void BM_Baseline_Adjoint(benchmark::State& state) {
const MatrixKind kind = static_cast<MatrixKind>(state.range(0));
const int n = static_cast<int>(state.range(1));
SpMat A = make_matrix<SpMat>(kind, n);
DenseVec x = DenseVec::Random(n);
DenseVec y(n);
for (auto _ : state) {
y.noalias() = A.adjoint() * x;
benchmark::DoNotOptimize(y.data());
}
state.counters["nnz"] = static_cast<double>(A.nonZeros());
state.counters["n"] = n;
}
// New: ThreadedSparseProduct, pool sized via state.range(2).
template <typename SpMat>
static void BM_Threaded_Forward(benchmark::State& state) {
const MatrixKind kind = static_cast<MatrixKind>(state.range(0));
const int n = static_cast<int>(state.range(1));
const int threads = static_cast<int>(state.range(2));
// Make OpenMP team size match the requested thread count: under
// EIGEN_HAS_OPENMP the operator launches `omp parallel for num_threads(T)`
// where T = Eigen::nbThreads(), not the pool's size. Otherwise the
// reported "T" counter would not match the actual parallelism.
Eigen::setNbThreads(threads);
SpMat A = make_matrix<SpMat>(kind, n);
ThreadPool pool(threads);
ThreadedSparseProduct<SpMat> op(A, &pool);
DenseVec x = DenseVec::Random(n);
DenseVec y(n);
// Warm the mirror (only material for ColMajor forward) and prime the pool.
op.apply(x, y);
for (auto _ : state) {
op.apply(x, y);
benchmark::DoNotOptimize(y.data());
}
state.counters["nnz"] = static_cast<double>(A.nonZeros());
state.counters["n"] = n;
state.counters["T"] = threads;
}
template <typename SpMat>
static void BM_Threaded_Adjoint(benchmark::State& state) {
const MatrixKind kind = static_cast<MatrixKind>(state.range(0));
const int n = static_cast<int>(state.range(1));
const int threads = static_cast<int>(state.range(2));
// Make OpenMP team size match the requested thread count: under
// EIGEN_HAS_OPENMP the operator launches `omp parallel for num_threads(T)`
// where T = Eigen::nbThreads(), not the pool's size. Otherwise the
// reported "T" counter would not match the actual parallelism.
Eigen::setNbThreads(threads);
SpMat A = make_matrix<SpMat>(kind, n);
ThreadPool pool(threads);
ThreadedSparseProduct<SpMat> op(A, &pool);
DenseVec x = DenseVec::Random(n);
DenseVec y(n);
// Warm the mirror outside the timed loop.
op.applyAdjoint(x, y);
for (auto _ : state) {
op.applyAdjoint(x, y);
benchmark::DoNotOptimize(y.data());
}
state.counters["nnz"] = static_cast<double>(A.nonZeros());
state.counters["n"] = n;
state.counters["T"] = threads;
}
// Matrix sizes: spans the threading threshold (20000 nnz) and goes up to a
// few million nnz where threading should clearly win.
// n=200 -> ~13K nnz (random) -> below threshold, expects serial
// n=2000 -> ~80K nnz -> threaded
// n=20000 -> ~800K nnz -> threaded, parallelism matters
// n=100000-> ~4M nnz -> threaded, memory-bandwidth bound
// Common arg grids, expressed declaratively with ArgsProduct: the Cartesian
// product of {kind} x {n} for the baselines, and {kind} x {n} x {threads} for
// the threaded variants. ArgsProduct keeps the grid on the registration itself
// rather than routing through an Apply() callback, which would have to name the
// implementation-detail benchmark::internal::Benchmark* type.
BENCHMARK_TEMPLATE(BM_Baseline_Forward, SpMatRowMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}});
BENCHMARK_TEMPLATE(BM_Baseline_Forward, SpMatColMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}});
BENCHMARK_TEMPLATE(BM_Baseline_Adjoint, SpMatRowMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}});
BENCHMARK_TEMPLATE(BM_Baseline_Adjoint, SpMatColMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}});
// Threaded benchmarks measure wall-clock latency: Google Benchmark's default
// CPU-time mode sums across worker threads and would hide actual speedup.
BENCHMARK_TEMPLATE(BM_Threaded_Forward, SpMatRowMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}, {1, 2, 4, 8}})
->UseRealTime();
BENCHMARK_TEMPLATE(BM_Threaded_Forward, SpMatColMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}, {1, 2, 4, 8}})
->UseRealTime();
BENCHMARK_TEMPLATE(BM_Threaded_Adjoint, SpMatRowMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}, {1, 2, 4, 8}})
->UseRealTime();
BENCHMARK_TEMPLATE(BM_Threaded_Adjoint, SpMatColMajor)
->ArgsProduct({{kBanded, kRandom, kHub}, {200, 2000, 20000, 100000}, {1, 2, 4, 8}})
->UseRealTime();