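CI: fix bench builds (drop Apply/internal::Benchmark*) + relax eigensolver_selfadjoint tolerance

Replace the per-file size helper functions that took a ::benchmark::Benchmark*
and were attached via ->Apply() with macros expanding to chained
->Arg()/->Args()/->ArgsProduct() calls, so the benchmark sources no longer
name the Benchmark class. Also register the double variants of BesselI0e,
BesselI1e, BesselK0e and BesselK1e, and raise the eigensolver_selfadjoint
residual and unitarity tolerances from 4*max(1,n)*epsilon to
8*max(1,n)*epsilon.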

libeigen/eigen!2507

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/benchmarks/Householder/bench_householder.cpp b/benchmarks/Householder/bench_householder.cpp
index 61d39ed..047b27b 100644
--- a/benchmarks/Householder/bench_householder.cpp
+++ b/benchmarks/Householder/bench_householder.cpp
@@ -263,108 +263,89 @@
 }
 
 // =============================================================================
-// Size configurations
+// Size configurations: chained ->Arg / ->Args macros applied at registration.
 // =============================================================================
 
-static void VectorSizes(::benchmark::Benchmark* b) {
-  for (int n : {8, 16, 32, 64, 128, 256, 512, 1024, 4096}) b->Arg(n);
-}
+// clang-format off
+#define VECTOR_SIZES \
+  ->Arg(8)->Arg(16)->Arg(32)->Arg(64)->Arg(128)->Arg(256)->Arg(512)->Arg(1024)->Arg(4096)
 
-static void SquareSizes(::benchmark::Benchmark* b) {
-  for (int n : {32, 48, 64, 80, 96, 112, 128, 160, 192, 256, 384, 512, 768, 1024}) b->Args({n, n});
-}
+#define SQUARE_SIZES \
+  ->Args({32, 32})->Args({48, 48})->Args({64, 64})->Args({80, 80})->Args({96, 96}) \
+  ->Args({112, 112})->Args({128, 128})->Args({160, 160})->Args({192, 192})->Args({256, 256}) \
+  ->Args({384, 384})->Args({512, 512})->Args({768, 768})->Args({1024, 1024})
 
 // Fine-grained sizes around the blocking threshold to find the crossover point.
-static void SquareSizesFine(::benchmark::Benchmark* b) {
-  for (int n : {32, 40, 48, 56, 64, 72, 80, 88, 96, 112, 128, 160, 192, 256}) b->Args({n, n});
-}
+#define SQUARE_SIZES_FINE \
+  ->Args({32, 32})->Args({40, 40})->Args({48, 48})->Args({56, 56})->Args({64, 64}) \
+  ->Args({72, 72})->Args({80, 80})->Args({88, 88})->Args({96, 96})->Args({112, 112}) \
+  ->Args({128, 128})->Args({160, 160})->Args({192, 192})->Args({256, 256})
 
 // Rectangular: many rows, fewer columns (m_length = cols, dst is rows x rows).
-static void RectApplyRight(::benchmark::Benchmark* b) {
-  // Square
-  for (int n : {48, 64, 96, 128, 256, 512, 1024}) b->Args({n, n});
-  // Wide dst * narrow Q: dst is (rows x rows), Q is (cols x cols), so rows > cols.
-  b->Args({256, 64});
-  b->Args({256, 128});
-  b->Args({512, 64});
-  b->Args({512, 128});
-  b->Args({1024, 64});
-  b->Args({1024, 128});
-  b->Args({1024, 256});
-}
+// Square shapes first, then wide dst * narrow Q: dst is (rows x rows), Q is (cols x cols), so rows > cols.
+#define RECT_APPLY_RIGHT_SIZES \
+  ->Args({48, 48})->Args({64, 64})->Args({96, 96})->Args({128, 128}) \
+  ->Args({256, 256})->Args({512, 512})->Args({1024, 1024}) \
+  ->Args({256, 64})->Args({256, 128}) \
+  ->Args({512, 64})->Args({512, 128}) \
+  ->Args({1024, 64})->Args({1024, 128})->Args({1024, 256})
 
-static void RectSizes(::benchmark::Benchmark* b) {
-  // Square
-  for (int n : {32, 64, 128, 256, 512, 1024}) b->Args({n, n});
-  // Tall-thin
-  b->Args({1000, 32});
-  b->Args({1000, 100});
-  b->Args({10000, 32});
-  b->Args({10000, 100});
-}
+// Square plus tall-thin shapes.
+#define RECT_SIZES \
+  ->Args({32, 32})->Args({64, 64})->Args({128, 128}) \
+  ->Args({256, 256})->Args({512, 512})->Args({1024, 1024}) \
+  ->Args({1000, 32})->Args({1000, 100})->Args({10000, 32})->Args({10000, 100})
 
-static void BlockSizes(::benchmark::Benchmark* b) {
-  for (int n : {64, 128, 256, 512, 1024}) {
-    b->Args({n, n});
-    b->Args({n, 32});
-  }
-}
+#define BLOCK_SIZES \
+  ->Args({64, 64})->Args({64, 32}) \
+  ->Args({128, 128})->Args({128, 32}) \
+  ->Args({256, 256})->Args({256, 32}) \
+  ->Args({512, 512})->Args({512, 32}) \
+  ->Args({1024, 1024})->Args({1024, 32})
+// clang-format on
 
 // =============================================================================
 // Register benchmarks: float
 // =============================================================================
 
-BENCHMARK(BM_MakeHouseholderInPlace<float>)->Apply(VectorSizes)->Name("MakeHouseholderInPlace_float");
-BENCHMARK(BM_MakeHouseholder<float>)->Apply(VectorSizes)->Name("MakeHouseholder_float");
-BENCHMARK(BM_ApplyHouseholderOnTheLeft<float>)->Apply(RectSizes)->Name("ApplyHouseholderOnTheLeft_float");
-BENCHMARK(BM_ApplyHouseholderOnTheRight<float>)->Apply(RectSizes)->Name("ApplyHouseholderOnTheRight_float");
-BENCHMARK(BM_HouseholderSequence_EvalTo<float>)->Apply(SquareSizesFine)->Name("HouseholderSequence_EvalTo_float");
-BENCHMARK(BM_HouseholderSequence_ApplyLeft<float>)->Apply(RectSizes)->Name("HouseholderSequence_ApplyLeft_float");
+BENCHMARK(BM_MakeHouseholderInPlace<float>) VECTOR_SIZES->Name("MakeHouseholderInPlace_float");
+BENCHMARK(BM_MakeHouseholder<float>) VECTOR_SIZES->Name("MakeHouseholder_float");
+BENCHMARK(BM_ApplyHouseholderOnTheLeft<float>) RECT_SIZES->Name("ApplyHouseholderOnTheLeft_float");
+BENCHMARK(BM_ApplyHouseholderOnTheRight<float>) RECT_SIZES->Name("ApplyHouseholderOnTheRight_float");
+BENCHMARK(BM_HouseholderSequence_EvalTo<float>) SQUARE_SIZES_FINE->Name("HouseholderSequence_EvalTo_float");
+BENCHMARK(BM_HouseholderSequence_ApplyLeft<float>) RECT_SIZES->Name("HouseholderSequence_ApplyLeft_float");
 BENCHMARK(BM_HouseholderSequence_ApplyRight<float>)
-    ->Apply(RectApplyRight)
-    ->Name("HouseholderSequence_ApplyRight_float");
+RECT_APPLY_RIGHT_SIZES->Name("HouseholderSequence_ApplyRight_float");
 BENCHMARK(BM_HouseholderSequence_AdjointApplyLeft<float>)
-    ->Apply(RectSizes)
-    ->Name("HouseholderSequence_AdjointApplyLeft_float");
-BENCHMARK(BM_BlockHouseholder_TriangularFactor<float>)
-    ->Apply(VectorSizes)
-    ->Name("BlockHouseholder_TriangularFactor_float");
-BENCHMARK(BM_BlockHouseholder_ApplyLeft<float>)->Apply(BlockSizes)->Name("BlockHouseholder_ApplyLeft_float");
+RECT_SIZES->Name("HouseholderSequence_AdjointApplyLeft_float");
+BENCHMARK(BM_BlockHouseholder_TriangularFactor<float>) VECTOR_SIZES->Name("BlockHouseholder_TriangularFactor_float");
+BENCHMARK(BM_BlockHouseholder_ApplyLeft<float>) BLOCK_SIZES->Name("BlockHouseholder_ApplyLeft_float");
 
 // =============================================================================
 // Register benchmarks: double
 // =============================================================================
 
-BENCHMARK(BM_MakeHouseholderInPlace<double>)->Apply(VectorSizes)->Name("MakeHouseholderInPlace_double");
-BENCHMARK(BM_MakeHouseholder<double>)->Apply(VectorSizes)->Name("MakeHouseholder_double");
-BENCHMARK(BM_ApplyHouseholderOnTheLeft<double>)->Apply(RectSizes)->Name("ApplyHouseholderOnTheLeft_double");
-BENCHMARK(BM_ApplyHouseholderOnTheRight<double>)->Apply(RectSizes)->Name("ApplyHouseholderOnTheRight_double");
-BENCHMARK(BM_HouseholderSequence_EvalTo<double>)->Apply(SquareSizesFine)->Name("HouseholderSequence_EvalTo_double");
-BENCHMARK(BM_HouseholderSequence_ApplyLeft<double>)->Apply(RectSizes)->Name("HouseholderSequence_ApplyLeft_double");
+BENCHMARK(BM_MakeHouseholderInPlace<double>) VECTOR_SIZES->Name("MakeHouseholderInPlace_double");
+BENCHMARK(BM_MakeHouseholder<double>) VECTOR_SIZES->Name("MakeHouseholder_double");
+BENCHMARK(BM_ApplyHouseholderOnTheLeft<double>) RECT_SIZES->Name("ApplyHouseholderOnTheLeft_double");
+BENCHMARK(BM_ApplyHouseholderOnTheRight<double>) RECT_SIZES->Name("ApplyHouseholderOnTheRight_double");
+BENCHMARK(BM_HouseholderSequence_EvalTo<double>) SQUARE_SIZES_FINE->Name("HouseholderSequence_EvalTo_double");
+BENCHMARK(BM_HouseholderSequence_ApplyLeft<double>) RECT_SIZES->Name("HouseholderSequence_ApplyLeft_double");
 BENCHMARK(BM_HouseholderSequence_ApplyRight<double>)
-    ->Apply(RectApplyRight)
-    ->Name("HouseholderSequence_ApplyRight_double");
+RECT_APPLY_RIGHT_SIZES->Name("HouseholderSequence_ApplyRight_double");
 BENCHMARK(BM_HouseholderSequence_AdjointApplyLeft<double>)
-    ->Apply(RectSizes)
-    ->Name("HouseholderSequence_AdjointApplyLeft_double");
-BENCHMARK(BM_BlockHouseholder_TriangularFactor<double>)
-    ->Apply(VectorSizes)
-    ->Name("BlockHouseholder_TriangularFactor_double");
-BENCHMARK(BM_BlockHouseholder_ApplyLeft<double>)->Apply(BlockSizes)->Name("BlockHouseholder_ApplyLeft_double");
+RECT_SIZES->Name("HouseholderSequence_AdjointApplyLeft_double");
+BENCHMARK(BM_BlockHouseholder_TriangularFactor<double>) VECTOR_SIZES->Name("BlockHouseholder_TriangularFactor_double");
+BENCHMARK(BM_BlockHouseholder_ApplyLeft<double>) BLOCK_SIZES->Name("BlockHouseholder_ApplyLeft_double");
 
 // =============================================================================
 // Register benchmarks: std::complex<double>
 // =============================================================================
 
-BENCHMARK(BM_MakeHouseholderInPlace<std::complex<double>>)
-    ->Apply(VectorSizes)
-    ->Name("MakeHouseholderInPlace_complexdouble");
+BENCHMARK(BM_MakeHouseholderInPlace<std::complex<double>>) VECTOR_SIZES->Name("MakeHouseholderInPlace_complexdouble");
 BENCHMARK(BM_ApplyHouseholderOnTheLeft<std::complex<double>>)
-    ->Apply(RectSizes)
-    ->Name("ApplyHouseholderOnTheLeft_complexdouble");
+RECT_SIZES->Name("ApplyHouseholderOnTheLeft_complexdouble");
 BENCHMARK(BM_HouseholderSequence_EvalTo<std::complex<double>>)
-    ->Apply(SquareSizes)
-    ->Name("HouseholderSequence_EvalTo_complexdouble");
+SQUARE_SIZES->Name("HouseholderSequence_EvalTo_complexdouble");
 BENCHMARK(BM_HouseholderSequence_ApplyLeft<std::complex<double>>)
-    ->Apply(SquareSizes)
-    ->Name("HouseholderSequence_ApplyLeft_complexdouble");
+SQUARE_SIZES->Name("HouseholderSequence_ApplyLeft_complexdouble");
diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp
index 9f50fcb..f2d852f 100644
--- a/test/eigensolver_selfadjoint.cpp
+++ b/test/eigensolver_selfadjoint.cpp
@@ -41,7 +41,7 @@
     scaledA /= scaling;
     MatrixType residual =
         scaledA * eiSymm.eigenvectors() - eiSymm.eigenvectors() * (eiSymm.eigenvalues() / scaling).asDiagonal();
-    RealScalar tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
+    RealScalar tol = RealScalar(8) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
     for (Index i = 0; i < n; ++i) {
       VERIFY(residual.col(i).norm() <= tol);
     }
@@ -50,7 +50,7 @@
 
   // Eigenvectors must be unitary. Use a tolerance proportional to n*epsilon,
   // which is the expected rounding error for Householder-based orthogonal transformations.
-  RealScalar unitary_tol = RealScalar(4) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
+  RealScalar unitary_tol = RealScalar(8) * RealScalar(numext::maxi(Index(1), n)) * NumTraits<RealScalar>::epsilon();
   // But don't go below the test_precision floor (matters for float).
   unitary_tol = numext::maxi(unitary_tol, test_precision<RealScalar>());
   VERIFY(eiSymm.eigenvectors().isUnitary(unitary_tol));
diff --git a/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp
index 5105d40..8027a6d 100644
--- a/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp
+++ b/unsupported/benchmarks/IterativeSolvers/bench_iterative_solvers.cpp
@@ -192,18 +192,13 @@
   state.counters["iterations"] = solver.iterations();
 }
 
-static void SolverSizes(::benchmark::Benchmark* b) {
-  for (int n : {1000, 10000, 100000}) {
-    for (int bw : {5, 20}) {
-      b->Args({n, bw});
-    }
-  }
-}
+// {n, bandwidth} as a Cartesian product.
+#define SOLVER_SIZES ->ArgsProduct({{1000, 10000, 100000}, {5, 20}})
 
-BENCHMARK(BM_GMRES)->Apply(SolverSizes);
-BENCHMARK(BM_DGMRES)->Apply(SolverSizes);
-BENCHMARK(BM_MINRES)->Apply(SolverSizes);
-BENCHMARK(BM_IDRS)->Apply(SolverSizes);
-BENCHMARK(BM_BiCGSTABL)->Apply(SolverSizes);
-BENCHMARK(BM_CG_Reference)->Apply(SolverSizes);
-BENCHMARK(BM_BiCGSTAB_Reference)->Apply(SolverSizes);
+BENCHMARK(BM_GMRES) SOLVER_SIZES;
+BENCHMARK(BM_DGMRES) SOLVER_SIZES;
+BENCHMARK(BM_MINRES) SOLVER_SIZES;
+BENCHMARK(BM_IDRS) SOLVER_SIZES;
+BENCHMARK(BM_BiCGSTABL) SOLVER_SIZES;
+BENCHMARK(BM_CG_Reference) SOLVER_SIZES;
+BENCHMARK(BM_BiCGSTAB_Reference) SOLVER_SIZES;
diff --git a/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
index 882887e..2ad7fed 100644
--- a/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
+++ b/unsupported/benchmarks/KroneckerProduct/bench_kronecker.cpp
@@ -63,21 +63,8 @@
   state.counters["output_size"] = na * nb;
 }
 
-static void KroneckerSizes(::benchmark::Benchmark* b) {
-  for (int na : {4, 8, 16}) {
-    for (int nb : {4, 8, 16}) {
-      b->Args({na, nb});
-    }
-  }
-}
+#define KRONECKER_SIZES ->ArgsProduct({{4, 8, 16}, {4, 8, 16}})
+#define KRONECKER_SPARSE_SIZES ->ArgsProduct({{16, 32, 64, 128}, {16, 32, 64, 128}})
 
-static void KroneckerSparseSizes(::benchmark::Benchmark* b) {
-  for (int na : {16, 32, 64, 128}) {
-    for (int nb : {16, 32, 64, 128}) {
-      b->Args({na, nb});
-    }
-  }
-}
-
-BENCHMARK(BM_KroneckerDense)->Apply(KroneckerSizes);
-BENCHMARK(BM_KroneckerSparse)->Apply(KroneckerSparseSizes);
+BENCHMARK(BM_KroneckerDense) KRONECKER_SIZES;
+BENCHMARK(BM_KroneckerSparse) KRONECKER_SPARSE_SIZES;
diff --git a/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp
index a1ebea0..5666937 100644
--- a/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp
+++ b/unsupported/benchmarks/MatrixFunctions/bench_matrix_power.cpp
@@ -85,15 +85,11 @@
   }
 }
 
-static void MatPowerSizes(::benchmark::Benchmark* b) {
-  for (int n : {4, 8, 16, 32, 64}) {
-    b->Arg(n);
-  }
-}
+#define MAT_POWER_SIZES ->Arg(4)->Arg(8)->Arg(16)->Arg(32)->Arg(64)
 
-BENCHMARK(BM_MatrixSqrt)->Apply(MatPowerSizes);
-BENCHMARK(BM_MatrixPow)->Apply(MatPowerSizes);
-BENCHMARK(BM_MatrixCos)->Apply(MatPowerSizes);
-BENCHMARK(BM_MatrixSin)->Apply(MatPowerSizes);
-BENCHMARK(BM_MatrixCosh)->Apply(MatPowerSizes);
-BENCHMARK(BM_MatrixSinh)->Apply(MatPowerSizes);
+BENCHMARK(BM_MatrixSqrt) MAT_POWER_SIZES;
+BENCHMARK(BM_MatrixPow) MAT_POWER_SIZES;
+BENCHMARK(BM_MatrixCos) MAT_POWER_SIZES;
+BENCHMARK(BM_MatrixSin) MAT_POWER_SIZES;
+BENCHMARK(BM_MatrixCosh) MAT_POWER_SIZES;
+BENCHMARK(BM_MatrixSinh) MAT_POWER_SIZES;
diff --git a/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp
index dc6505d..859e41f 100644
--- a/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp
+++ b/unsupported/benchmarks/SpecialFunctions/bench_special_functions.cpp
@@ -84,44 +84,46 @@
   state.SetBytesProcessed(state.iterations() * n * sizeof(Scalar) * 4);
 }
 
-static void SpecialSizes(::benchmark::Benchmark* b) {
-  for (int n : {256, 4096, 65536, 1048576}) b->Arg(n);
-}
+#define SPECIAL_SIZES ->Arg(256)->Arg(4096)->Arg(65536)->Arg(1048576)
 
 // --- Register float ---
-BENCHMARK(BM_Lgamma<float>)->Apply(SpecialSizes)->Name("Lgamma_float");
-BENCHMARK(BM_Digamma<float>)->Apply(SpecialSizes)->Name("Digamma_float");
-BENCHMARK(BM_BesselI0<float>)->Apply(SpecialSizes)->Name("BesselI0_float");
-BENCHMARK(BM_BesselI1<float>)->Apply(SpecialSizes)->Name("BesselI1_float");
-BENCHMARK(BM_BesselI0e<float>)->Apply(SpecialSizes)->Name("BesselI0e_float");
-BENCHMARK(BM_BesselI1e<float>)->Apply(SpecialSizes)->Name("BesselI1e_float");
-BENCHMARK(BM_BesselJ0<float>)->Apply(SpecialSizes)->Name("BesselJ0_float");
-BENCHMARK(BM_BesselJ1<float>)->Apply(SpecialSizes)->Name("BesselJ1_float");
-BENCHMARK(BM_BesselY0<float>)->Apply(SpecialSizes)->Name("BesselY0_float");
-BENCHMARK(BM_BesselY1<float>)->Apply(SpecialSizes)->Name("BesselY1_float");
-BENCHMARK(BM_BesselK0<float>)->Apply(SpecialSizes)->Name("BesselK0_float");
-BENCHMARK(BM_BesselK1<float>)->Apply(SpecialSizes)->Name("BesselK1_float");
-BENCHMARK(BM_BesselK0e<float>)->Apply(SpecialSizes)->Name("BesselK0e_float");
-BENCHMARK(BM_BesselK1e<float>)->Apply(SpecialSizes)->Name("BesselK1e_float");
-BENCHMARK(BM_Igamma<float>)->Apply(SpecialSizes)->Name("Igamma_float");
-BENCHMARK(BM_Igammac<float>)->Apply(SpecialSizes)->Name("Igammac_float");
-BENCHMARK(BM_Betainc<float>)->Apply(SpecialSizes)->Name("Betainc_float");
-BENCHMARK(BM_Zeta<float>)->Apply(SpecialSizes)->Name("Zeta_float");
-BENCHMARK(BM_Polygamma<float>)->Apply(SpecialSizes)->Name("Polygamma_float");
+BENCHMARK(BM_Lgamma<float>) SPECIAL_SIZES->Name("Lgamma_float");
+BENCHMARK(BM_Digamma<float>) SPECIAL_SIZES->Name("Digamma_float");
+BENCHMARK(BM_BesselI0<float>) SPECIAL_SIZES->Name("BesselI0_float");
+BENCHMARK(BM_BesselI1<float>) SPECIAL_SIZES->Name("BesselI1_float");
+BENCHMARK(BM_BesselI0e<float>) SPECIAL_SIZES->Name("BesselI0e_float");
+BENCHMARK(BM_BesselI1e<float>) SPECIAL_SIZES->Name("BesselI1e_float");
+BENCHMARK(BM_BesselJ0<float>) SPECIAL_SIZES->Name("BesselJ0_float");
+BENCHMARK(BM_BesselJ1<float>) SPECIAL_SIZES->Name("BesselJ1_float");
+BENCHMARK(BM_BesselY0<float>) SPECIAL_SIZES->Name("BesselY0_float");
+BENCHMARK(BM_BesselY1<float>) SPECIAL_SIZES->Name("BesselY1_float");
+BENCHMARK(BM_BesselK0<float>) SPECIAL_SIZES->Name("BesselK0_float");
+BENCHMARK(BM_BesselK1<float>) SPECIAL_SIZES->Name("BesselK1_float");
+BENCHMARK(BM_BesselK0e<float>) SPECIAL_SIZES->Name("BesselK0e_float");
+BENCHMARK(BM_BesselK1e<float>) SPECIAL_SIZES->Name("BesselK1e_float");
+BENCHMARK(BM_Igamma<float>) SPECIAL_SIZES->Name("Igamma_float");
+BENCHMARK(BM_Igammac<float>) SPECIAL_SIZES->Name("Igammac_float");
+BENCHMARK(BM_Betainc<float>) SPECIAL_SIZES->Name("Betainc_float");
+BENCHMARK(BM_Zeta<float>) SPECIAL_SIZES->Name("Zeta_float");
+BENCHMARK(BM_Polygamma<float>) SPECIAL_SIZES->Name("Polygamma_float");
 
 // --- Register double ---
-BENCHMARK(BM_Lgamma<double>)->Apply(SpecialSizes)->Name("Lgamma_double");
-BENCHMARK(BM_Digamma<double>)->Apply(SpecialSizes)->Name("Digamma_double");
-BENCHMARK(BM_BesselI0<double>)->Apply(SpecialSizes)->Name("BesselI0_double");
-BENCHMARK(BM_BesselI1<double>)->Apply(SpecialSizes)->Name("BesselI1_double");
-BENCHMARK(BM_BesselJ0<double>)->Apply(SpecialSizes)->Name("BesselJ0_double");
-BENCHMARK(BM_BesselJ1<double>)->Apply(SpecialSizes)->Name("BesselJ1_double");
-BENCHMARK(BM_BesselY0<double>)->Apply(SpecialSizes)->Name("BesselY0_double");
-BENCHMARK(BM_BesselY1<double>)->Apply(SpecialSizes)->Name("BesselY1_double");
-BENCHMARK(BM_BesselK0<double>)->Apply(SpecialSizes)->Name("BesselK0_double");
-BENCHMARK(BM_BesselK1<double>)->Apply(SpecialSizes)->Name("BesselK1_double");
-BENCHMARK(BM_Igamma<double>)->Apply(SpecialSizes)->Name("Igamma_double");
-BENCHMARK(BM_Igammac<double>)->Apply(SpecialSizes)->Name("Igammac_double");
-BENCHMARK(BM_Betainc<double>)->Apply(SpecialSizes)->Name("Betainc_double");
-BENCHMARK(BM_Zeta<double>)->Apply(SpecialSizes)->Name("Zeta_double");
-BENCHMARK(BM_Polygamma<double>)->Apply(SpecialSizes)->Name("Polygamma_double");
+BENCHMARK(BM_Lgamma<double>) SPECIAL_SIZES->Name("Lgamma_double");
+BENCHMARK(BM_Digamma<double>) SPECIAL_SIZES->Name("Digamma_double");
+BENCHMARK(BM_BesselI0<double>) SPECIAL_SIZES->Name("BesselI0_double");
+BENCHMARK(BM_BesselI1<double>) SPECIAL_SIZES->Name("BesselI1_double");
+BENCHMARK(BM_BesselI0e<double>) SPECIAL_SIZES->Name("BesselI0e_double");
+BENCHMARK(BM_BesselI1e<double>) SPECIAL_SIZES->Name("BesselI1e_double");
+BENCHMARK(BM_BesselJ0<double>) SPECIAL_SIZES->Name("BesselJ0_double");
+BENCHMARK(BM_BesselJ1<double>) SPECIAL_SIZES->Name("BesselJ1_double");
+BENCHMARK(BM_BesselY0<double>) SPECIAL_SIZES->Name("BesselY0_double");
+BENCHMARK(BM_BesselY1<double>) SPECIAL_SIZES->Name("BesselY1_double");
+BENCHMARK(BM_BesselK0<double>) SPECIAL_SIZES->Name("BesselK0_double");
+BENCHMARK(BM_BesselK1<double>) SPECIAL_SIZES->Name("BesselK1_double");
+BENCHMARK(BM_BesselK0e<double>) SPECIAL_SIZES->Name("BesselK0e_double");
+BENCHMARK(BM_BesselK1e<double>) SPECIAL_SIZES->Name("BesselK1e_double");
+BENCHMARK(BM_Igamma<double>) SPECIAL_SIZES->Name("Igamma_double");
+BENCHMARK(BM_Igammac<double>) SPECIAL_SIZES->Name("Igammac_double");
+BENCHMARK(BM_Betainc<double>) SPECIAL_SIZES->Name("Betainc_double");
+BENCHMARK(BM_Zeta<double>) SPECIAL_SIZES->Name("Zeta_double");
+BENCHMARK(BM_Polygamma<double>) SPECIAL_SIZES->Name("Polygamma_double");
diff --git a/unsupported/benchmarks/Splines/bench_splines.cpp b/unsupported/benchmarks/Splines/bench_splines.cpp
index e422a8f..852f1c0 100644
--- a/unsupported/benchmarks/Splines/bench_splines.cpp
+++ b/unsupported/benchmarks/Splines/bench_splines.cpp
@@ -77,22 +77,18 @@
   state.counters["Evals/s"] = benchmark::Counter(neval, benchmark::Counter::kIsIterationInvariantRate);
 }
 
-static void SplineSizes(::benchmark::Benchmark* b) {
-  for (int n : {10, 50, 200, 1000}) {
-    b->Arg(n);
-  }
-}
+#define SPLINE_SIZES ->Arg(10)->Arg(50)->Arg(200)->Arg(1000)
 
 // 2D cubic splines
-BENCHMARK(BM_SplineFit<2, 3>)->Apply(SplineSizes)->Name("SplineFit_2D_Cubic");
-BENCHMARK(BM_SplineEval<2, 3>)->Apply(SplineSizes)->Name("SplineEval_2D_Cubic");
-BENCHMARK(BM_SplineDerivatives<2, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_2D_Cubic");
+BENCHMARK(BM_SplineFit<2, 3>) SPLINE_SIZES->Name("SplineFit_2D_Cubic");
+BENCHMARK(BM_SplineEval<2, 3>) SPLINE_SIZES->Name("SplineEval_2D_Cubic");
+BENCHMARK(BM_SplineDerivatives<2, 3>) SPLINE_SIZES->Name("SplineDerivatives_2D_Cubic");
 
 // 3D cubic splines
-BENCHMARK(BM_SplineFit<3, 3>)->Apply(SplineSizes)->Name("SplineFit_3D_Cubic");
-BENCHMARK(BM_SplineEval<3, 3>)->Apply(SplineSizes)->Name("SplineEval_3D_Cubic");
-BENCHMARK(BM_SplineDerivatives<3, 3>)->Apply(SplineSizes)->Name("SplineDerivatives_3D_Cubic");
+BENCHMARK(BM_SplineFit<3, 3>) SPLINE_SIZES->Name("SplineFit_3D_Cubic");
+BENCHMARK(BM_SplineEval<3, 3>) SPLINE_SIZES->Name("SplineEval_3D_Cubic");
+BENCHMARK(BM_SplineDerivatives<3, 3>) SPLINE_SIZES->Name("SplineDerivatives_3D_Cubic");
 
 // 2D quintic splines
-BENCHMARK(BM_SplineFit<2, 5>)->Apply(SplineSizes)->Name("SplineFit_2D_Quintic");
-BENCHMARK(BM_SplineEval<2, 5>)->Apply(SplineSizes)->Name("SplineEval_2D_Quintic");
+BENCHMARK(BM_SplineFit<2, 5>) SPLINE_SIZES->Name("SplineFit_2D_Quintic");
+BENCHMARK(BM_SplineEval<2, 5>) SPLINE_SIZES->Name("SplineEval_2D_Quintic");
diff --git a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
index 754fe96..6900535 100644
--- a/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
+++ b/unsupported/benchmarks/Tensor/bench_broadcasting.cpp
@@ -140,35 +140,22 @@
   state.counters["threads"] = threads;
 }
 
-static void BroadcastSizes(::benchmark::Benchmark* b) {
-  for (int m : {64, 256, 1024}) {
-    for (int n : {64, 256, 1024}) {
-      b->Args({m, n});
-    }
-  }
-}
+// {m, n} and {batch, c, h}: pure Cartesian products.
+#define BROADCAST_SIZES ->ArgsProduct({{64, 256, 1024}, {64, 256, 1024}})
+#define BROADCAST_RANK4_SIZES ->ArgsProduct({{1, 8}, {64, 256}, {16, 32}})
 
-static void Rank4Sizes(::benchmark::Benchmark* b) {
-  for (int batch : {1, 8}) {
-    for (int c : {64, 256}) {
-      for (int h : {16, 32}) {
-        b->Args({batch, c, h});
-      }
-    }
-  }
-}
+// {size, size, threads}: explicit because size is repeated.
+// clang-format off
+#define BROADCAST_THREADPOOL_SIZES \
+  ->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
+  ->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16})
+// clang-format on
 
-static void BroadcastThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int threads : {1, 2, 4, 8, 12, 16}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
-
-BENCHMARK(BM_BroadcastRow)->Apply(BroadcastSizes);
-BENCHMARK(BM_BroadcastCol)->Apply(BroadcastSizes);
-BENCHMARK(BM_BroadcastAdd)->Apply(BroadcastSizes);
-BENCHMARK(BM_BroadcastRank4)->Apply(Rank4Sizes);
-BENCHMARK(BM_BroadcastRow_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_BroadcastAdd_ThreadPool)->Apply(BroadcastThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_BroadcastRow) BROADCAST_SIZES;
+BENCHMARK(BM_BroadcastCol) BROADCAST_SIZES;
+BENCHMARK(BM_BroadcastAdd) BROADCAST_SIZES;
+BENCHMARK(BM_BroadcastRank4) BROADCAST_RANK4_SIZES;
+BENCHMARK(BM_BroadcastRow_ThreadPool) BROADCAST_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_BroadcastAdd_ThreadPool) BROADCAST_THREADPOOL_SIZES->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
index 0069195..b6a9a34 100644
--- a/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
+++ b/unsupported/benchmarks/Tensor/bench_chained_expressions.cpp
@@ -142,16 +142,18 @@
   state.counters["threads"] = threads;
 }
 
-static void ChainedSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024, 4096}) {
-    for (int threads : {1, 2, 4, 8, 12, 16}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
+// clang-format off
+#define CHAINED_SIZES \
+  ->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
+  ->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16}) \
+  ->Args({4096, 4096, 1})->Args({4096, 4096, 2})->Args({4096, 4096, 4}) \
+  ->Args({4096, 4096, 8})->Args({4096, 4096, 12})->Args({4096, 4096, 16})
+// clang-format on
 
-BENCHMARK(BM_Copy_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
-BENCHMARK(BM_BiasReLU_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
-BENCHMARK(BM_Polynomial_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
-BENCHMARK(BM_ExpNormalize_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
-BENCHMARK(BM_BatchNorm_ThreadPool)->Apply(ChainedSizes)->UseRealTime();
+BENCHMARK(BM_Copy_ThreadPool) CHAINED_SIZES->UseRealTime();
+BENCHMARK(BM_BiasReLU_ThreadPool) CHAINED_SIZES->UseRealTime();
+BENCHMARK(BM_Polynomial_ThreadPool) CHAINED_SIZES->UseRealTime();
+BENCHMARK(BM_ExpNormalize_ThreadPool) CHAINED_SIZES->UseRealTime();
+BENCHMARK(BM_BatchNorm_ThreadPool) CHAINED_SIZES->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
index b6e4c14..df8bb11 100644
--- a/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
+++ b/unsupported/benchmarks/Tensor/bench_coefficient_wise.cpp
@@ -206,39 +206,33 @@
   state.SetBytesProcessed(state.iterations() * batch * C * H * H * sizeof(Scalar) * 2);
 }
 
-static void CwiseSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    b->Args({size, size});
-  }
-}
+// clang-format off
+#define CWISE_SIZES \
+  ->Args({256, 256})->Args({1024, 1024})
 
-static void CwiseThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int threads : {1, 2, 4, 8, 12, 16}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
+#define CWISE_THREADPOOL_SIZES \
+  ->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
+  ->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16})
 
-static void Rank4Sizes(::benchmark::Benchmark* b) {
-  b->Args({32, 64, 16});
-  b->Args({8, 128, 32});
-  b->Args({1, 256, 64});
-}
+#define RANK4_SIZES \
+  ->Args({32, 64, 16})->Args({8, 128, 32})->Args({1, 256, 64})
+// clang-format on
 
-BENCHMARK(BM_Exp)->Apply(CwiseSizes);
-BENCHMARK(BM_Log)->Apply(CwiseSizes);
-BENCHMARK(BM_Tanh)->Apply(CwiseSizes);
-BENCHMARK(BM_Sigmoid)->Apply(CwiseSizes);
-BENCHMARK(BM_ReLU)->Apply(CwiseSizes);
-BENCHMARK(BM_Sqrt)->Apply(CwiseSizes);
-BENCHMARK(BM_Add)->Apply(CwiseSizes);
-BENCHMARK(BM_Mul)->Apply(CwiseSizes);
-BENCHMARK(BM_FMA)->Apply(CwiseSizes);
-BENCHMARK(BM_ReLU_Rank4)->Apply(Rank4Sizes);
-BENCHMARK(BM_Add_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_Mul_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_FMA_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_Exp_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_Tanh_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_ReLU_ThreadPool)->Apply(CwiseThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Exp) CWISE_SIZES;
+BENCHMARK(BM_Log) CWISE_SIZES;
+BENCHMARK(BM_Tanh) CWISE_SIZES;
+BENCHMARK(BM_Sigmoid) CWISE_SIZES;
+BENCHMARK(BM_ReLU) CWISE_SIZES;
+BENCHMARK(BM_Sqrt) CWISE_SIZES;
+BENCHMARK(BM_Add) CWISE_SIZES;
+BENCHMARK(BM_Mul) CWISE_SIZES;
+BENCHMARK(BM_FMA) CWISE_SIZES;
+BENCHMARK(BM_ReLU_Rank4) RANK4_SIZES;
+BENCHMARK(BM_Add_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_Mul_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_FMA_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_Exp_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_Tanh_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_ReLU_ThreadPool) CWISE_THREADPOOL_SIZES->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_contraction.cpp b/unsupported/benchmarks/Tensor/bench_contraction.cpp
index 83b1f19..9b1b45d 100644
--- a/unsupported/benchmarks/Tensor/bench_contraction.cpp
+++ b/unsupported/benchmarks/Tensor/bench_contraction.cpp
@@ -120,32 +120,29 @@
       benchmark::Counter(2.0 * M * N * K, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
 }
 
-static void ContractionSizes(::benchmark::Benchmark* b) {
-  for (int size : {32, 64, 128, 256, 512, 1024}) {
-    b->Args({size, size, size});
-  }
-  // Non-square
-  b->Args({256, 256, 1024});
-  b->Args({1024, 64, 64});
-}
+// clang-format off
+#define CONTRACTION_SIZES \
+  ->Args({32, 32, 32})->Args({64, 64, 64})->Args({128, 128, 128}) \
+  ->Args({256, 256, 256})->Args({512, 512, 512})->Args({1024, 1024, 1024}) \
+  ->Args({256, 256, 1024})->Args({1024, 64, 64})
 
-static void ThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int size : {64, 256, 512, 1024}) {
-    for (int threads : {1, 2, 4, 8, 16}) {
-      b->Args({size, size, size, threads});
-    }
-  }
-}
+#define CONTRACTION_THREADPOOL_SIZES \
+  ->Args({64, 64, 64, 1})->Args({64, 64, 64, 2})->Args({64, 64, 64, 4}) \
+  ->Args({64, 64, 64, 8})->Args({64, 64, 64, 16}) \
+  ->Args({256, 256, 256, 1})->Args({256, 256, 256, 2})->Args({256, 256, 256, 4}) \
+  ->Args({256, 256, 256, 8})->Args({256, 256, 256, 16}) \
+  ->Args({512, 512, 512, 1})->Args({512, 512, 512, 2})->Args({512, 512, 512, 4}) \
+  ->Args({512, 512, 512, 8})->Args({512, 512, 512, 16}) \
+  ->Args({1024, 1024, 1024, 1})->Args({1024, 1024, 1024, 2})->Args({1024, 1024, 1024, 4}) \
+  ->Args({1024, 1024, 1024, 8})->Args({1024, 1024, 1024, 16})
 
-static void BatchSizes(::benchmark::Benchmark* b) {
-  for (int batch : {1, 8, 32}) {
-    for (int size : {64, 256}) {
-      b->Args({batch, size, size, size});
-    }
-  }
-}
+#define BATCH_SIZES \
+  ->Args({1, 64, 64, 64})->Args({1, 256, 256, 256}) \
+  ->Args({8, 64, 64, 64})->Args({8, 256, 256, 256}) \
+  ->Args({32, 64, 64, 64})->Args({32, 256, 256, 256})
+// clang-format on
 
-BENCHMARK(BM_Contraction)->Apply(ContractionSizes);
-BENCHMARK(BM_Contraction_RowMajor)->Apply(ContractionSizes);
-BENCHMARK(BM_Contraction_ThreadPool)->Apply(ThreadPoolSizes);
-BENCHMARK(BM_BatchContraction)->Apply(BatchSizes);
+BENCHMARK(BM_Contraction) CONTRACTION_SIZES;
+BENCHMARK(BM_Contraction_RowMajor) CONTRACTION_SIZES;
+BENCHMARK(BM_Contraction_ThreadPool) CONTRACTION_THREADPOOL_SIZES;
+BENCHMARK(BM_BatchContraction) BATCH_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_convolution.cpp b/unsupported/benchmarks/Tensor/bench_convolution.cpp
index 46e44ee..c6a1384 100644
--- a/unsupported/benchmarks/Tensor/bench_convolution.cpp
+++ b/unsupported/benchmarks/Tensor/bench_convolution.cpp
@@ -109,43 +109,21 @@
   state.counters["threads"] = threads;
 }
 
-static void Conv1DSizes(::benchmark::Benchmark* b) {
-  for (int input : {128, 512, 2048}) {
-    for (int kernel : {3, 5, 11}) {
-      b->Args({input, kernel});
-    }
-  }
-}
+// Arg tuples, in macro order: {input, kernel}, {channels, hw, k}, {hw, k, threads}. All are pure Cartesian products.
+#define CONV1D_SIZES ->ArgsProduct({{128, 512, 2048}, {3, 5, 11}})
+#define CONV2D_CHANNEL_SIZES ->ArgsProduct({{3, 64, 128}, {16, 32, 56}, {3, 5}})
+#define CONV2D_THREADPOOL_SIZES ->ArgsProduct({{64, 128, 224}, {3, 5}, {2, 4, 8}})
 
-static void Conv2DSizes(::benchmark::Benchmark* b) {
-  for (int hw : {32, 64, 128, 224}) {
-    for (int k : {3, 5, 7}) {
-      b->Args({hw, hw, k, k});
-    }
-  }
-}
+// Arg tuple {hw, hw, k, k}: listed explicitly because hw and k each appear twice, which ArgsProduct cannot express.
+// clang-format off
+#define CONV2D_SIZES \
+  ->Args({32, 32, 3, 3})->Args({32, 32, 5, 5})->Args({32, 32, 7, 7}) \
+  ->Args({64, 64, 3, 3})->Args({64, 64, 5, 5})->Args({64, 64, 7, 7}) \
+  ->Args({128, 128, 3, 3})->Args({128, 128, 5, 5})->Args({128, 128, 7, 7}) \
+  ->Args({224, 224, 3, 3})->Args({224, 224, 5, 5})->Args({224, 224, 7, 7})
+// clang-format on
 
-static void Conv2DChannelSizes(::benchmark::Benchmark* b) {
-  for (int c : {3, 64, 128}) {
-    for (int hw : {16, 32, 56}) {
-      for (int k : {3, 5}) {
-        b->Args({c, hw, k});
-      }
-    }
-  }
-}
-
-static void Conv2DThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int hw : {64, 128, 224}) {
-    for (int k : {3, 5}) {
-      for (int threads : {2, 4, 8}) {
-        b->Args({hw, k, threads});
-      }
-    }
-  }
-}
-
-BENCHMARK(BM_Convolve1D)->Apply(Conv1DSizes);
-BENCHMARK(BM_Convolve2D)->Apply(Conv2DSizes);
-BENCHMARK(BM_Convolve2D_Channels)->Apply(Conv2DChannelSizes);
-BENCHMARK(BM_Convolve2D_ThreadPool)->Apply(Conv2DThreadPoolSizes);
+BENCHMARK(BM_Convolve1D) CONV1D_SIZES;
+BENCHMARK(BM_Convolve2D) CONV2D_SIZES;
+BENCHMARK(BM_Convolve2D_Channels) CONV2D_CHANNEL_SIZES;
+BENCHMARK(BM_Convolve2D_ThreadPool) CONV2D_THREADPOOL_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_image_patch.cpp b/unsupported/benchmarks/Tensor/bench_image_patch.cpp
index 2ae31f6..fb8a113 100644
--- a/unsupported/benchmarks/Tensor/bench_image_patch.cpp
+++ b/unsupported/benchmarks/Tensor/bench_image_patch.cpp
@@ -198,111 +198,48 @@
   state.counters["threads"] = threads;
 }
 
-// --- Size generators ---
+// --- Size configurations ---
 
-static void PatchSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, W, kH, kW
-  for (int c : {3, 32, 64}) {
-    for (int hw : {32, 64, 128}) {
-      for (int k : {3, 5, 7}) {
-        b->Args({c, hw, hw, k, k});
-      }
-    }
-  }
-}
+// Arg tuple {channels, H, W, kH, kW} with H == W and kH == kW; explicit because of the duplicated dims.
+// clang-format off
+#define PATCH_SIZES /* channels {3, 32, 64} x hw {32, 64, 128} x k {3, 5, 7} */ \
+  ->Args({3, 32, 32, 3, 3})->Args({3, 32, 32, 5, 5})->Args({3, 32, 32, 7, 7}) \
+  ->Args({3, 64, 64, 3, 3})->Args({3, 64, 64, 5, 5})->Args({3, 64, 64, 7, 7}) \
+  ->Args({3, 128, 128, 3, 3})->Args({3, 128, 128, 5, 5})->Args({3, 128, 128, 7, 7}) \
+  ->Args({32, 32, 32, 3, 3})->Args({32, 32, 32, 5, 5})->Args({32, 32, 32, 7, 7}) \
+  ->Args({32, 64, 64, 3, 3})->Args({32, 64, 64, 5, 5})->Args({32, 64, 64, 7, 7}) \
+  ->Args({32, 128, 128, 3, 3})->Args({32, 128, 128, 5, 5})->Args({32, 128, 128, 7, 7}) \
+  ->Args({64, 32, 32, 3, 3})->Args({64, 32, 32, 5, 5})->Args({64, 32, 32, 7, 7}) \
+  ->Args({64, 64, 64, 3, 3})->Args({64, 64, 64, 5, 5})->Args({64, 64, 64, 7, 7}) \
+  ->Args({64, 128, 128, 3, 3})->Args({64, 128, 128, 5, 5})->Args({64, 128, 128, 7, 7})
 
-static void StridedSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, kH, stride
-  for (int c : {3, 64}) {
-    for (int hw : {56, 112, 224}) {
-      for (int k : {3, 5}) {
-        for (int s : {1, 2}) {
-          b->Args({c, hw, k, s});
-        }
-      }
-    }
-  }
-}
+// Arg tuple {channels, H, W, kH} with H == W; explicit because of the duplicated H/W dim.
+#define EXPLICIT_PADDING_SIZES /* channels {3, 64} x hw {32, 64, 128} x k {3, 5} */ \
+  ->Args({3, 32, 32, 3})->Args({3, 32, 32, 5})->Args({3, 64, 64, 3})->Args({3, 64, 64, 5}) \
+  ->Args({3, 128, 128, 3})->Args({3, 128, 128, 5})->Args({64, 32, 32, 3})->Args({64, 32, 32, 5}) \
+  ->Args({64, 64, 64, 3})->Args({64, 64, 64, 5})->Args({64, 128, 128, 3})->Args({64, 128, 128, 5})
 
-static void DilatedSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, kH, dilation
-  for (int c : {3, 64}) {
-    for (int hw : {32, 64}) {
-      for (int k : {3, 5}) {
-        for (int d : {2, 4}) {
-          b->Args({c, hw, k, d});
-        }
-      }
-    }
-  }
-}
+// {channels, spatial, kernel, X}, X = stride / dilation / batch / threads in macro order: pure Cartesian products.
+#define STRIDED_SIZES ->ArgsProduct({{3, 64}, {56, 112, 224}, {3, 5}, {1, 2}})
+#define DILATED_SIZES ->ArgsProduct({{3, 64}, {32, 64}, {3, 5}, {2, 4}})
+#define BATCHED_SIZES ->ArgsProduct({{3, 64}, {32, 56}, {3, 5}, {4, 16, 32}})
+#define THREAD_POOL_SIZES ->ArgsProduct({{64, 128}, {56, 112}, {3, 5}, {2, 4, 8}})
 
-static void ExplicitPaddingSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, W, kH
-  for (int c : {3, 64}) {
-    for (int hw : {32, 64, 128}) {
-      for (int k : {3, 5}) {
-        b->Args({c, hw, hw, k});
-      }
-    }
-  }
-}
+// Realistic CNN layer configurations as {channels, spatial_size, kernel, stride}, one label per entry in order:
+// AlexNet conv1; VGG-style, VGG deeper x2; ResNet, ResNet downsample (stride 2), ResNet x2;
+// MobileNet depthwise; Inception 1x1 (degenerate patch).
+#define IMAGENET_SIZES \
+  ->Args({3, 227, 11, 4}) \
+  ->Args({64, 224, 3, 1})->Args({128, 112, 3, 1})->Args({256, 56, 3, 1}) \
+  ->Args({64, 56, 3, 1})->Args({128, 56, 3, 2})->Args({256, 28, 3, 1})->Args({512, 14, 3, 1}) \
+  ->Args({32, 112, 3, 1})->Args({192, 28, 1, 1})
+// clang-format on
 
-static void BatchedSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, kH, batch
-  for (int c : {3, 64}) {
-    for (int hw : {32, 56}) {
-      for (int k : {3, 5}) {
-        for (int batch : {4, 16, 32}) {
-          b->Args({c, hw, k, batch});
-        }
-      }
-    }
-  }
-}
-
-static void ImageNetSizes(::benchmark::internal::Benchmark* b) {
-  // Realistic CNN layer configurations: channels, spatial_size, kernel, stride
-  // AlexNet conv1: 3x227x227, 11x11, stride 4
-  b->Args({3, 227, 11, 4});
-  // VGG-style: 64x224x224, 3x3, stride 1
-  b->Args({64, 224, 3, 1});
-  // VGG deeper: 128x112x112, 3x3, stride 1
-  b->Args({128, 112, 3, 1});
-  // VGG deeper: 256x56x56, 3x3, stride 1
-  b->Args({256, 56, 3, 1});
-  // ResNet: 64x56x56, 3x3, stride 1
-  b->Args({64, 56, 3, 1});
-  // ResNet downsample: 128x56x56, 3x3, stride 2
-  b->Args({128, 56, 3, 2});
-  // ResNet: 256x28x28, 3x3, stride 1
-  b->Args({256, 28, 3, 1});
-  // ResNet: 512x14x14, 3x3, stride 1
-  b->Args({512, 14, 3, 1});
-  // MobileNet depthwise: 32x112x112, 3x3, stride 1
-  b->Args({32, 112, 3, 1});
-  // Inception 1x1 (degenerate patch): 192x28x28, 1x1, stride 1
-  b->Args({192, 28, 1, 1});
-}
-
-static void ThreadPoolSizes(::benchmark::internal::Benchmark* b) {
-  // channels, H, kH, threads
-  for (int c : {64, 128}) {
-    for (int hw : {56, 112}) {
-      for (int k : {3, 5}) {
-        for (int threads : {2, 4, 8}) {
-          b->Args({c, hw, k, threads});
-        }
-      }
-    }
-  }
-}
-
-BENCHMARK(BM_ImagePatch_Valid)->Apply(PatchSizes);
-BENCHMARK(BM_ImagePatch_Same)->Apply(PatchSizes);
-BENCHMARK(BM_ImagePatch_Strided)->Apply(StridedSizes);
-BENCHMARK(BM_ImagePatch_Dilated)->Apply(DilatedSizes);
-BENCHMARK(BM_ImagePatch_ExplicitPadding)->Apply(ExplicitPaddingSizes);
-BENCHMARK(BM_ImagePatch_Batched)->Apply(BatchedSizes);
-BENCHMARK(BM_ImagePatch_ImageNet)->Apply(ImageNetSizes);
-BENCHMARK(BM_ImagePatch_ThreadPool)->Apply(ThreadPoolSizes);
+BENCHMARK(BM_ImagePatch_Valid) PATCH_SIZES;
+BENCHMARK(BM_ImagePatch_Same) PATCH_SIZES;
+BENCHMARK(BM_ImagePatch_Strided) STRIDED_SIZES;
+BENCHMARK(BM_ImagePatch_Dilated) DILATED_SIZES;
+BENCHMARK(BM_ImagePatch_ExplicitPadding) EXPLICIT_PADDING_SIZES;
+BENCHMARK(BM_ImagePatch_Batched) BATCHED_SIZES;
+BENCHMARK(BM_ImagePatch_ImageNet) IMAGENET_SIZES;
+BENCHMARK(BM_ImagePatch_ThreadPool) THREAD_POOL_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_layout_swap.cpp b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
index 8caeb57..c51d1de 100644
--- a/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
+++ b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
@@ -60,18 +60,10 @@
   state.SetBytesProcessed(state.iterations() * 3ll * static_cast<int64_t>(M) * N * sizeof(Scalar));
 }
 
-static void LayoutSwapSizes(::benchmark::Benchmark* b) {
-  for (int size : {64, 256, 1024}) {
-    b->Args({size, size});
-  }
-}
+// Arg tuples {n, n} (2D and composed benchmarks) and {n, n, n} (3D benchmark): explicit because dims are repeated.
+#define LAYOUT_SWAP_SIZES ->Args({64, 64})->Args({256, 256})->Args({1024, 1024})
+#define LAYOUT_SWAP_3D_SIZES ->Args({32, 32, 32})->Args({64, 64, 64})->Args({128, 128, 128})
 
-static void LayoutSwap3DSizes(::benchmark::Benchmark* b) {
-  b->Args({32, 32, 32});
-  b->Args({64, 64, 64});
-  b->Args({128, 128, 128});
-}
-
-BENCHMARK(BM_LayoutSwap_2D)->Apply(LayoutSwapSizes);
-BENCHMARK(BM_LayoutSwap_3D)->Apply(LayoutSwap3DSizes);
-BENCHMARK(BM_LayoutSwap_Composed)->Apply(LayoutSwapSizes);
+BENCHMARK(BM_LayoutSwap_2D) LAYOUT_SWAP_SIZES;
+BENCHMARK(BM_LayoutSwap_3D) LAYOUT_SWAP_3D_SIZES;
+BENCHMARK(BM_LayoutSwap_Composed) LAYOUT_SWAP_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_morphing.cpp b/unsupported/benchmarks/Tensor/bench_morphing.cpp
index ff7e17f..fa2b5f2 100644
--- a/unsupported/benchmarks/Tensor/bench_morphing.cpp
+++ b/unsupported/benchmarks/Tensor/bench_morphing.cpp
@@ -168,46 +168,32 @@
   state.counters["threads"] = threads;
 }
 
-static void MorphSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    b->Args({size, size});
-  }
-}
+// clang-format off
+#define MORPH_SIZES /* {size, size} */ \
+  ->Args({256, 256})->Args({1024, 1024})
 
-static void ChipSizes(::benchmark::Benchmark* b) {
-  b->Args({32, 256, 256});
-  b->Args({64, 128, 128});
-  b->Args({8, 512, 512});
-}
+#define CHIP_SIZES /* three args per entry; consumed by BM_Chip */ \
+  ->Args({32, 256, 256})->Args({64, 128, 128})->Args({8, 512, 512})
 
-static void PadSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int pad : {1, 4, 16}) {
-      b->Args({size, size, pad});
-    }
-  }
-}
+#define PAD_SIZES /* {size, size, pad} */ \
+  ->Args({256, 256, 1})->Args({256, 256, 4})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 4})->Args({1024, 1024, 16})
 
-static void StrideSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int stride : {2, 4}) {
-      b->Args({size, size, stride});
-    }
-  }
-}
+#define STRIDE_SIZES /* {size, size, stride} */ \
+  ->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({1024, 1024, 2})->Args({1024, 1024, 4})
 
-static void MorphThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int threads : {1, 2, 4, 8, 12, 16}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
+#define MORPH_THREADPOOL_SIZES /* {size, size, threads} */ \
+  ->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
+  ->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16})
+// clang-format on
 
-BENCHMARK(BM_Reshape)->Apply(MorphSizes);
-BENCHMARK(BM_Slice)->Apply(MorphSizes);
-BENCHMARK(BM_Chip)->Apply(ChipSizes);
-BENCHMARK(BM_Pad)->Apply(PadSizes);
-BENCHMARK(BM_Stride)->Apply(StrideSizes);
-BENCHMARK(BM_Slice_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_Pad_ThreadPool)->Apply(MorphThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Reshape) MORPH_SIZES;
+BENCHMARK(BM_Slice) MORPH_SIZES;
+BENCHMARK(BM_Chip) CHIP_SIZES;
+BENCHMARK(BM_Pad) PAD_SIZES;
+BENCHMARK(BM_Stride) STRIDE_SIZES;
+BENCHMARK(BM_Slice_ThreadPool) MORPH_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_Pad_ThreadPool) MORPH_THREADPOOL_SIZES->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_reduction.cpp b/unsupported/benchmarks/Tensor/bench_reduction.cpp
index 795c95c..cc7f5ce 100644
--- a/unsupported/benchmarks/Tensor/bench_reduction.cpp
+++ b/unsupported/benchmarks/Tensor/bench_reduction.cpp
@@ -125,34 +125,22 @@
   state.SetBytesProcessed(state.iterations() * M * N * sizeof(Scalar));
 }
 
-static void ReductionSizes(::benchmark::Benchmark* b) {
-  for (int size : {64, 256, 1024}) {
-    b->Args({size, size});
-  }
-}
+// clang-format off
+#define REDUCTION_SIZES /* {size, size} */ \
+  ->Args({64, 64})->Args({256, 256})->Args({1024, 1024})
 
-static void ThreadPoolReductionSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int threads : {2, 4, 8}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
+#define THREADPOOL_REDUCTION_SIZES /* {size, size, threads} */ \
+  ->Args({256, 256, 2})->Args({256, 256, 4})->Args({256, 256, 8}) \
+  ->Args({1024, 1024, 2})->Args({1024, 1024, 4})->Args({1024, 1024, 8})
 
-static void SpatialSizes(::benchmark::Benchmark* b) {
-  for (int batch : {1, 8, 32}) {
-    for (int c : {64, 128}) {
-      for (int h : {16, 32}) {
-        b->Args({batch, c, h});
-      }
-    }
-  }
-}
+// Arg tuple {batch, channels, h}: pure Cartesian product of the three value lists.
+#define SPATIAL_SIZES ->ArgsProduct({{1, 8, 32}, {64, 128}, {16, 32}})
+// clang-format on
 
-BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>)->Apply(ReductionSizes)->Name("SumReduction");
-BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>)->Apply(ReductionSizes)->Name("MaxReduction_Full");
-BENCHMARK(BM_MaxReduction)->Apply(ReductionSizes);
-BENCHMARK(BM_ReduceInner)->Apply(ReductionSizes);
-BENCHMARK(BM_ReduceOuter)->Apply(ReductionSizes);
-BENCHMARK(BM_ReduceSpatial)->Apply(SpatialSizes);
-BENCHMARK(BM_FullReduction_ThreadPool)->Apply(ThreadPoolReductionSizes);
+BENCHMARK(BM_FullReduction<internal::SumReducer<Scalar>>) REDUCTION_SIZES->Name("SumReduction");
+BENCHMARK(BM_FullReduction<internal::MaxReducer<Scalar>>) REDUCTION_SIZES->Name("MaxReduction_Full");
+BENCHMARK(BM_MaxReduction) REDUCTION_SIZES;
+BENCHMARK(BM_ReduceInner) REDUCTION_SIZES;
+BENCHMARK(BM_ReduceOuter) REDUCTION_SIZES;
+BENCHMARK(BM_ReduceSpatial) SPATIAL_SIZES;
+BENCHMARK(BM_FullReduction_ThreadPool) THREADPOOL_REDUCTION_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_reverse.cpp b/unsupported/benchmarks/Tensor/bench_reverse.cpp
index 852df68..999266c 100644
--- a/unsupported/benchmarks/Tensor/bench_reverse.cpp
+++ b/unsupported/benchmarks/Tensor/bench_reverse.cpp
@@ -85,19 +85,16 @@
 //   64x64    = 16 KB (L1)
 //   256x256  = 256 KB (L2)
 //   1024x1024 = 4 MB (LLC / DRAM)
-static void ReverseSizes(::benchmark::Benchmark* b) {
-  for (int size : {64, 256, 1024}) {
-    b->Args({size, size});
-  }
-}
+// clang-format off
+#define REVERSE_SIZES /* {size, size} */ \
+  ->Args({64, 64})->Args({256, 256})->Args({1024, 1024})
 
-static void Reverse3DSizes(::benchmark::Benchmark* b) {
-  b->Args({32, 32, 32});     // 128 KB
-  b->Args({64, 64, 64});     // 1 MB
-  b->Args({128, 128, 128});  // 8 MB
-}
+// Per-entry sizes, in order: 128 KB / 1 MB / 8 MB.
+#define REVERSE_3D_SIZES \
+  ->Args({32, 32, 32})->Args({64, 64, 64})->Args({128, 128, 128})
+// clang-format on
 
-BENCHMARK(BM_Reverse_Inner)->Apply(ReverseSizes);
-BENCHMARK(BM_Reverse_Outer)->Apply(ReverseSizes);
-BENCHMARK(BM_Reverse_All)->Apply(ReverseSizes);
-BENCHMARK(BM_Reverse_3D_Inner)->Apply(Reverse3DSizes);
+BENCHMARK(BM_Reverse_Inner) REVERSE_SIZES;
+BENCHMARK(BM_Reverse_Outer) REVERSE_SIZES;
+BENCHMARK(BM_Reverse_All) REVERSE_SIZES;
+BENCHMARK(BM_Reverse_3D_Inner) REVERSE_3D_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_roll.cpp b/unsupported/benchmarks/Tensor/bench_roll.cpp
index 219a8ff..6724374 100644
--- a/unsupported/benchmarks/Tensor/bench_roll.cpp
+++ b/unsupported/benchmarks/Tensor/bench_roll.cpp
@@ -83,21 +83,17 @@
   state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
 }
 
-static void RollSizes(::benchmark::Benchmark* b) {
-  for (int size : {64, 256, 1024}) {
-    for (int shift : {1, 13}) {
-      b->Args({size, size, shift});
-    }
-  }
-}
+// clang-format off
+#define ROLL_SIZES /* {size, size, shift} */ \
+  ->Args({64, 64, 1})->Args({64, 64, 13}) \
+  ->Args({256, 256, 1})->Args({256, 256, 13}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 13})
 
-static void Roll3DSizes(::benchmark::Benchmark* b) {
-  b->Args({32, 32, 32});
-  b->Args({64, 64, 64});
-  b->Args({128, 128, 128});
-}
+#define ROLL_3D_SIZES /* three args per entry; consumed by BM_Roll_3D_Inner */ \
+  ->Args({32, 32, 32})->Args({64, 64, 64})->Args({128, 128, 128})
+// clang-format on
 
-BENCHMARK(BM_Roll_Inner)->Apply(RollSizes);
-BENCHMARK(BM_Roll_Outer)->Apply(RollSizes);
-BENCHMARK(BM_Roll_All)->Apply(RollSizes);
-BENCHMARK(BM_Roll_3D_Inner)->Apply(Roll3DSizes);
+BENCHMARK(BM_Roll_Inner) ROLL_SIZES;
+BENCHMARK(BM_Roll_Outer) ROLL_SIZES;
+BENCHMARK(BM_Roll_All) ROLL_SIZES;
+BENCHMARK(BM_Roll_3D_Inner) ROLL_3D_SIZES;
diff --git a/unsupported/benchmarks/Tensor/bench_shuffling.cpp b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
index 6de76a8..b824676 100644
--- a/unsupported/benchmarks/Tensor/bench_shuffling.cpp
+++ b/unsupported/benchmarks/Tensor/bench_shuffling.cpp
@@ -138,53 +138,30 @@
   state.counters["threads"] = threads;
 }
 
-static void Shuffle2DSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    b->Args({size, size});
-  }
-  b->Args({64, 4096});
-  b->Args({4096, 64});
-}
+// clang-format off
+#define SHUFFLE_2D_SIZES /* square {size, size} entries plus {64, 4096} and {4096, 64} */ \
+  ->Args({256, 256})->Args({1024, 1024}) \
+  ->Args({64, 4096})->Args({4096, 64})
 
-static void Shuffle3DSizes(::benchmark::Benchmark* b) {
-  b->Args({64, 64, 64});
-  b->Args({128, 128, 64});
-  b->Args({32, 256, 256});
-}
+#define SHUFFLE_3D_SIZES /* three args per entry; consumed by BM_Shuffle3D */ \
+  ->Args({64, 64, 64})->Args({128, 128, 64})->Args({32, 256, 256})
 
-static void Shuffle4DSizes(::benchmark::Benchmark* b) {
-  for (int batch : {1, 8}) {
-    for (int c : {3, 64}) {
-      for (int h : {32, 64}) {
-        b->Args({batch, c, h});
-      }
-    }
-  }
-}
+// Arg tuple {batch, channels, h}: pure Cartesian product of the three value lists.
+#define SHUFFLE_4D_SIZES ->ArgsProduct({{1, 8}, {3, 64}, {32, 64}})
 
-static void Shuffle2DThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int size : {256, 1024}) {
-    for (int threads : {1, 2, 4, 8, 12, 16}) {
-      b->Args({size, size, threads});
-    }
-  }
-}
+#define SHUFFLE_2D_THREADPOOL_SIZES /* {size, size, threads} */ \
+  ->Args({256, 256, 1})->Args({256, 256, 2})->Args({256, 256, 4}) \
+  ->Args({256, 256, 8})->Args({256, 256, 12})->Args({256, 256, 16}) \
+  ->Args({1024, 1024, 1})->Args({1024, 1024, 2})->Args({1024, 1024, 4}) \
+  ->Args({1024, 1024, 8})->Args({1024, 1024, 12})->Args({1024, 1024, 16})
 
-static void Shuffle4DThreadPoolSizes(::benchmark::Benchmark* b) {
-  for (int batch : {1, 8}) {
-    for (int c : {64}) {
-      for (int h : {32, 64}) {
-        for (int threads : {1, 2, 4, 8, 12, 16}) {
-          b->Args({batch, c, h, threads});
-        }
-      }
-    }
-  }
-}
+// Arg tuple {batch, channels, h, threads}: pure Cartesian product (channels fixed at 64).
+#define SHUFFLE_4D_THREADPOOL_SIZES ->ArgsProduct({{1, 8}, {64}, {32, 64}, {1, 2, 4, 8, 12, 16}})
+// clang-format on
 
-BENCHMARK(BM_Shuffle2D)->Apply(Shuffle2DSizes);
-BENCHMARK(BM_ShuffleIdentity)->Apply(Shuffle2DSizes);
-BENCHMARK(BM_Shuffle3D)->Apply(Shuffle3DSizes);
-BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC)->Apply(Shuffle4DSizes);
-BENCHMARK(BM_Shuffle2D_ThreadPool)->Apply(Shuffle2DThreadPoolSizes)->UseRealTime();
-BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC_ThreadPool)->Apply(Shuffle4DThreadPoolSizes)->UseRealTime();
+BENCHMARK(BM_Shuffle2D) SHUFFLE_2D_SIZES;
+BENCHMARK(BM_ShuffleIdentity) SHUFFLE_2D_SIZES;
+BENCHMARK(BM_Shuffle3D) SHUFFLE_3D_SIZES;
+BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC) SHUFFLE_4D_SIZES;
+BENCHMARK(BM_Shuffle2D_ThreadPool) SHUFFLE_2D_THREADPOOL_SIZES->UseRealTime();
+BENCHMARK(BM_Shuffle4D_NCHW_to_NHWC_ThreadPool) SHUFFLE_4D_THREADPOOL_SIZES->UseRealTime();
diff --git a/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
index 26ff64b..39850eb 100644
--- a/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
+++ b/unsupported/benchmarks/Tensor/bench_tensor_fft.cpp
@@ -69,12 +69,10 @@
       benchmark::Counter(mflops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
 }
 
-static void FFTSizes(::benchmark::Benchmark* b) {
-  for (int n : {64, 256, 1024, 4096}) {
-    b->Arg(n);
-  }
-}
+// clang-format off
+#define FFT_SIZES ->Arg(64)->Arg(256)->Arg(1024)->Arg(4096)  // single argument n
+// clang-format on
 
-BENCHMARK(BM_TensorFFT_1D)->Apply(FFTSizes);
-BENCHMARK(BM_TensorFFT_2D)->Apply(FFTSizes);
-BENCHMARK(BM_TensorIFFT_1D)->Apply(FFTSizes);
+BENCHMARK(BM_TensorFFT_1D) FFT_SIZES;
+BENCHMARK(BM_TensorFFT_2D) FFT_SIZES;
+BENCHMARK(BM_TensorIFFT_1D) FFT_SIZES;