| // Benchmark for dense general matrix-vector multiplication (GEMV). |
| // |
| // Tests performance of y += op(A) * x for various matrix sizes, aspect ratios, |
| // scalar types, and operation variants (transpose, conjugate, adjoint). |
| // |
| // The Eigen GEMV kernel (Eigen/src/Core/products/GeneralMatrixVector.h) has |
| // two main specializations: |
| // - ColMajor kernel: used for y += A * x with column-major A. |
| // Processes vertical panels, vectorizes along rows. |
| // - RowMajor kernel: used for y += A^T * x with column-major A. |
| // Processes groups of rows, vectorizes the dot product along columns. |
| // |
| // For complex scalars, conjugation flags (ConjugateLhs, ConjugateRhs) select |
| // additional code paths within each kernel via conj_helper. |
| // |
| // Operation mapping (for column-major stored A): |
| // Gemv y += A * x -> ColMajor kernel, no conjugation |
| // GemvTrans y += A^T * x -> RowMajor kernel, no conjugation |
| // GemvConj y += conj(A) * x -> ColMajor kernel, ConjugateLhs=true |
| // GemvAdj y += A^H * x -> RowMajor kernel, ConjugateLhs=true |
| |
| #include <benchmark/benchmark.h> |
| #include <Eigen/Core> |
| |
| using namespace Eigen; |
| |
| // ---------- Benchmark helpers ---------- |
| |
| // GEMV flop count: 2*m*n for real, 8*m*n for complex. |
| template <typename Scalar> |
| double gemvFlops(Index m, Index n) { |
| return (NumTraits<Scalar>::IsComplex ? 8.0 : 2.0) * m * n; |
| } |
| |
| // ---------- y += A * x (ColMajor GEMV kernel, no conjugation) ---------- |
| |
| template <typename Scalar> |
| static void BM_Gemv(benchmark::State& state) { |
| using Mat = Matrix<Scalar, Dynamic, Dynamic>; |
| using Vec = Matrix<Scalar, Dynamic, 1>; |
| const Index m = state.range(0); |
| const Index n = state.range(1); |
| Mat A = Mat::Random(m, n); |
| Vec x = Vec::Random(n); |
| Vec y = Vec::Random(m); |
| for (auto _ : state) { |
| y.noalias() += A * x; |
| benchmark::DoNotOptimize(y.data()); |
| benchmark::ClobberMemory(); |
| } |
| state.counters["GFLOPS"] = benchmark::Counter(gemvFlops<Scalar>(m, n), benchmark::Counter::kIsIterationInvariantRate, |
| benchmark::Counter::kIs1000); |
| } |
| |
| // ---------- y += A^T * x (RowMajor GEMV kernel, no conjugation) ---------- |
| |
| template <typename Scalar> |
| static void BM_GemvTrans(benchmark::State& state) { |
| using Mat = Matrix<Scalar, Dynamic, Dynamic>; |
| using Vec = Matrix<Scalar, Dynamic, 1>; |
| const Index m = state.range(0); |
| const Index n = state.range(1); |
| Mat A = Mat::Random(m, n); |
| Vec x = Vec::Random(m); |
| Vec y = Vec::Random(n); |
| for (auto _ : state) { |
| y.noalias() += A.transpose() * x; |
| benchmark::DoNotOptimize(y.data()); |
| benchmark::ClobberMemory(); |
| } |
| state.counters["GFLOPS"] = benchmark::Counter(gemvFlops<Scalar>(m, n), benchmark::Counter::kIsIterationInvariantRate, |
| benchmark::Counter::kIs1000); |
| } |
| |
| // ---------- y += conj(A) * x (ColMajor kernel, ConjugateLhs=true) ---------- |
| |
| template <typename Scalar> |
| static void BM_GemvConj(benchmark::State& state) { |
| using Mat = Matrix<Scalar, Dynamic, Dynamic>; |
| using Vec = Matrix<Scalar, Dynamic, 1>; |
| const Index m = state.range(0); |
| const Index n = state.range(1); |
| Mat A = Mat::Random(m, n); |
| Vec x = Vec::Random(n); |
| Vec y = Vec::Random(m); |
| for (auto _ : state) { |
| y.noalias() += A.conjugate() * x; |
| benchmark::DoNotOptimize(y.data()); |
| benchmark::ClobberMemory(); |
| } |
| state.counters["GFLOPS"] = benchmark::Counter(gemvFlops<Scalar>(m, n), benchmark::Counter::kIsIterationInvariantRate, |
| benchmark::Counter::kIs1000); |
| } |
| |
| // ---------- y += A^H * x (RowMajor kernel, ConjugateLhs=true) ---------- |
| |
| template <typename Scalar> |
| static void BM_GemvAdj(benchmark::State& state) { |
| using Mat = Matrix<Scalar, Dynamic, Dynamic>; |
| using Vec = Matrix<Scalar, Dynamic, 1>; |
| const Index m = state.range(0); |
| const Index n = state.range(1); |
| Mat A = Mat::Random(m, n); |
| Vec x = Vec::Random(m); |
| Vec y = Vec::Random(n); |
| for (auto _ : state) { |
| y.noalias() += A.adjoint() * x; |
| benchmark::DoNotOptimize(y.data()); |
| benchmark::ClobberMemory(); |
| } |
| state.counters["GFLOPS"] = benchmark::Counter(gemvFlops<Scalar>(m, n), benchmark::Counter::kIsIterationInvariantRate, |
| benchmark::Counter::kIs1000); |
| } |
| |
// ---------- Size configurations ----------
// All sizes refer to the stored matrix A (m rows, n cols).

// ---------- Register benchmarks ----------

// clang-format off
// Square matrices; tall-thin (m >> n); short-wide (m << n).
// The 4096..32768-row x 1..3-col cases straddle the run_small_cols
// "stride*sizeof > L1" threshold where the 8-row inner unroll flips off.
// NOTE: no comments may go inside the macro body below — the backslash line
// continuations would break.
#define GEMV_SIZES \
->Args({8, 8})->Args({32, 32})->Args({128, 128})->Args({512, 512})->Args({1024, 1024}) \
->Args({256, 1})->Args({1024, 1})->Args({256, 16})->Args({1024, 16}) \
->Args({1, 256})->Args({1, 1024})->Args({16, 256})->Args({16, 1024}) \
->Args({4096, 1})->Args({8192, 1})->Args({16384, 1})->Args({32768, 1}) \
->Args({4096, 2})->Args({8192, 2})->Args({16384, 2})

// Real types: Gemv and GemvTrans exercise the two kernel specializations.
// Conjugation is a no-op for real scalars.
// ->Name(...) overrides the default templated name with a stable, readable one.
BENCHMARK(BM_Gemv<float>) GEMV_SIZES ->Name("Gemv_float");
BENCHMARK(BM_Gemv<double>) GEMV_SIZES ->Name("Gemv_double");
BENCHMARK(BM_GemvTrans<float>) GEMV_SIZES ->Name("GemvTrans_float");
BENCHMARK(BM_GemvTrans<double>) GEMV_SIZES ->Name("GemvTrans_double");

// Complex types: all four variants exercise distinct kernel code paths.
// Only cfloat is benchmarked since cdouble exercises the same paths but slower.
BENCHMARK(BM_Gemv<std::complex<float>>) GEMV_SIZES ->Name("Gemv_cfloat");
BENCHMARK(BM_GemvTrans<std::complex<float>>) GEMV_SIZES ->Name("GemvTrans_cfloat");
BENCHMARK(BM_GemvConj<std::complex<float>>) GEMV_SIZES ->Name("GemvConj_cfloat");
BENCHMARK(BM_GemvAdj<std::complex<float>>) GEMV_SIZES ->Name("GemvAdj_cfloat");

// Undefine so the size list cannot leak into other translation units / code
// appended below.
#undef GEMV_SIZES
// clang-format on