// unsupported/benchmarks/Tensor/bench_convolution.cpp - mirror - Git at Google

 // Benchmarks for Eigen Tensor convolution (1D and 2D).
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 #define EIGEN_USE_THREADS

 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/Tensor>
 #include <unsupported/Eigen/ThreadPool>

 using namespace Eigen;

 // Element type of every tensor created by the benchmarks in this file.
 using Scalar = float;

 // --- 1D convolution ---
 // Times a rank-1 tensor convolution along dimension 0.
 //   state.range(0): input length
 //   state.range(1): kernel length
 // A fresh result tensor is constructed on every iteration, so its
 // construction is part of the measured time.
 static void BM_Convolve1D(benchmark::State& state) {
   const auto signal_len = static_cast<int>(state.range(0));
   const auto taps = static_cast<int>(state.range(1));

   Tensor<Scalar, 1> signal(signal_len);
   Tensor<Scalar, 1> filter(taps);
   signal.setRandom();
   filter.setRandom();

   const Eigen::array<int, 1> conv_axes = {0};

   for (auto _ : state) {
     Tensor<Scalar, 1> out = signal.convolve(filter, conv_axes);
     // Keep the compiler from discarding the computed result.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }

   // Work estimate per iteration: (signal_len - taps + 1) output elements,
   // two floating-point operations per kernel tap for each of them.
   const int out_len = signal_len - taps + 1;
   const double flops_per_iter = 2.0 * out_len * taps;
   const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
   state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
 }

 // --- 2D convolution ---
 // Times a rank-2 tensor convolution over both dimensions (0 and 1).
 //   state.range(0), state.range(1): input extents (H, W)
 //   state.range(2), state.range(3): kernel extents (kH, kW)
 // A fresh result tensor is constructed on every iteration, so its
 // construction is part of the measured time.
 static void BM_Convolve2D(benchmark::State& state) {
   const auto rows = static_cast<int>(state.range(0));
   const auto cols = static_cast<int>(state.range(1));
   const auto k_rows = static_cast<int>(state.range(2));
   const auto k_cols = static_cast<int>(state.range(3));

   Tensor<Scalar, 2> image(rows, cols);
   Tensor<Scalar, 2> filter(k_rows, k_cols);
   image.setRandom();
   filter.setRandom();

   const Eigen::array<int, 2> conv_axes = {0, 1};

   for (auto _ : state) {
     Tensor<Scalar, 2> out = image.convolve(filter, conv_axes);
     // Keep the compiler from discarding the computed result.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }

   // Work estimate per iteration: two floating-point operations per kernel
   // element for each of the (rows - k_rows + 1) x (cols - k_cols + 1) outputs.
   const int out_rows = rows - k_rows + 1;
   const int out_cols = cols - k_cols + 1;
   const double flops_per_iter = 2.0 * out_rows * out_cols * k_rows * k_cols;
   const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
   state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
 }

 // --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
 // Times a convolution of a rank-3 (C x H x H) tensor with a rank-2 (kH x kH)
 // kernel over dimensions 1 and 2; dimension 0 is not convolved.
 //   state.range(0): size of dimension 0 (C)
 //   state.range(1): size of each convolved input dimension (H)
 //   state.range(2): size of each kernel dimension (kH)
 // A fresh result tensor is constructed on every iteration, so its
 // construction is part of the measured time.
 static void BM_Convolve2D_Channels(benchmark::State& state) {
   const auto channels = static_cast<int>(state.range(0));
   const auto side = static_cast<int>(state.range(1));
   const auto k_side = static_cast<int>(state.range(2));

   Tensor<Scalar, 3> volume(channels, side, side);
   Tensor<Scalar, 2> filter(k_side, k_side);
   volume.setRandom();
   filter.setRandom();

   const Eigen::array<int, 2> conv_axes = {1, 2};

   for (auto _ : state) {
     Tensor<Scalar, 3> out = volume.convolve(filter, conv_axes);
     // Keep the compiler from discarding the computed result.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }

   // Work estimate per iteration: two floating-point operations per kernel
   // element for each of the channels x out_side x out_side outputs.
   const int out_side = side - k_side + 1;
   const double flops_per_iter = 2.0 * channels * out_side * out_side * k_side * k_side;
   const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
   state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
 }

 // --- 2D convolution with ThreadPool ---
 // Times a square rank-2 convolution evaluated through a ThreadPoolDevice.
 //   state.range(0): size of each input dimension (H)
 //   state.range(1): size of each kernel dimension (kH)
 //   state.range(2): thread count handed to both the pool and the device
 // The destination tensor is allocated once, before the timed loop.
 static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
   const auto side = static_cast<int>(state.range(0));
   const auto k_side = static_cast<int>(state.range(1));
   const auto num_threads = static_cast<int>(state.range(2));
   const int out_side = side - k_side + 1;

   Tensor<Scalar, 2> image(side, side);
   Tensor<Scalar, 2> filter(k_side, k_side);
   Tensor<Scalar, 2> out(out_side, out_side);
   image.setRandom();
   filter.setRandom();

   ThreadPool pool(num_threads);
   ThreadPoolDevice device(&pool, num_threads);

   const Eigen::array<int, 2> conv_axes = {0, 1};

   for (auto _ : state) {
     out.device(device) = image.convolve(filter, conv_axes);
     // Keep the compiler from discarding the computed result.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }

   // Work estimate per iteration: two floating-point operations per kernel
   // element for each of the out_side x out_side outputs.
   const double flops_per_iter = 2.0 * out_side * out_side * k_side * k_side;
   const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
   state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
   state.counters["threads"] = num_threads;
 }

 // Argument sets appended to the BENCHMARK(...) registrations below. Each macro
 // expands to an ->ArgsProduct(...) call, i.e. a pure Cartesian product of the
 // listed values.
 // CONV1D_SIZES: {input size, kernel size}.
 #define CONV1D_SIZES ->ArgsProduct({{128, 512, 2048}, {3, 5, 11}})
 // CONV2D_CHANNEL_SIZES: {channels, input height/width, kernel height/width}.
 #define CONV2D_CHANNEL_SIZES ->ArgsProduct({{3, 64, 128}, {16, 32, 56}, {3, 5}})
 // CONV2D_THREADPOOL_SIZES: {input height/width, kernel height/width, threads}.
 #define CONV2D_THREADPOOL_SIZES ->ArgsProduct({{64, 128, 224}, {3, 5}, {2, 4, 8}})

 // Argument sets for BM_Convolve2D, ordered {H, W, kH, kW}. They are listed
 // explicitly instead of using ArgsProduct because the height/width values and
 // the two kernel extents are repeated (only square inputs and square kernels
 // appear), which a Cartesian product would not express.
 // clang-format off
 #define CONV2D_SIZES \
   ->Args({32, 32, 3, 3})->Args({32, 32, 5, 5})->Args({32, 32, 7, 7}) \
   ->Args({64, 64, 3, 3})->Args({64, 64, 5, 5})->Args({64, 64, 7, 7}) \
   ->Args({128, 128, 3, 3})->Args({128, 128, 5, 5})->Args({128, 128, 7, 7}) \
   ->Args({224, 224, 3, 3})->Args({224, 224, 5, 5})->Args({224, 224, 7, 7})
 // clang-format on

 // Register every benchmark together with its argument-set macro; the macro
 // supplies the trailing ->Args(...) / ->ArgsProduct(...) chain.
 BENCHMARK(BM_Convolve1D) CONV1D_SIZES;
 BENCHMARK(BM_Convolve2D) CONV2D_SIZES;
 BENCHMARK(BM_Convolve2D_Channels) CONV2D_CHANNEL_SIZES;
 BENCHMARK(BM_Convolve2D_ThreadPool) CONV2D_THREADPOOL_SIZES;
	// Benchmarks for Eigen Tensor convolution (1D and 2D).
	// SPDX-FileCopyrightText: The Eigen Authors
	// SPDX-License-Identifier: MPL-2.0

	#define EIGEN_USE_THREADS

	#include <benchmark/benchmark.h>
	#include <unsupported/Eigen/Tensor>
	#include <unsupported/Eigen/ThreadPool>

	using namespace Eigen;

	// Element type of every tensor created by the benchmarks in this file.
	using Scalar = float;

	// --- 1D convolution ---
	// Times a rank-1 tensor convolution along dimension 0.
	//   state.range(0): input length
	//   state.range(1): kernel length
	// A fresh result tensor is constructed on every iteration, so its
	// construction is part of the measured time.
	static void BM_Convolve1D(benchmark::State& state) {
	  const auto signal_len = static_cast<int>(state.range(0));
	  const auto taps = static_cast<int>(state.range(1));

	  Tensor<Scalar, 1> signal(signal_len);
	  Tensor<Scalar, 1> filter(taps);
	  signal.setRandom();
	  filter.setRandom();

	  const Eigen::array<int, 1> conv_axes = {0};

	  for (auto _ : state) {
	    Tensor<Scalar, 1> out = signal.convolve(filter, conv_axes);
	    // Keep the compiler from discarding the computed result.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }

	  // Work estimate per iteration: (signal_len - taps + 1) output elements,
	  // two floating-point operations per kernel tap for each of them.
	  const int out_len = signal_len - taps + 1;
	  const double flops_per_iter = 2.0 * out_len * taps;
	  const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
	  state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
	}

	// --- 2D convolution ---
	// Times a rank-2 tensor convolution over both dimensions (0 and 1).
	//   state.range(0), state.range(1): input extents (H, W)
	//   state.range(2), state.range(3): kernel extents (kH, kW)
	// A fresh result tensor is constructed on every iteration, so its
	// construction is part of the measured time.
	static void BM_Convolve2D(benchmark::State& state) {
	  const auto rows = static_cast<int>(state.range(0));
	  const auto cols = static_cast<int>(state.range(1));
	  const auto k_rows = static_cast<int>(state.range(2));
	  const auto k_cols = static_cast<int>(state.range(3));

	  Tensor<Scalar, 2> image(rows, cols);
	  Tensor<Scalar, 2> filter(k_rows, k_cols);
	  image.setRandom();
	  filter.setRandom();

	  const Eigen::array<int, 2> conv_axes = {0, 1};

	  for (auto _ : state) {
	    Tensor<Scalar, 2> out = image.convolve(filter, conv_axes);
	    // Keep the compiler from discarding the computed result.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }

	  // Work estimate per iteration: two floating-point operations per kernel
	  // element for each of the (rows - k_rows + 1) x (cols - k_cols + 1) outputs.
	  const int out_rows = rows - k_rows + 1;
	  const int out_cols = cols - k_cols + 1;
	  const double flops_per_iter = 2.0 * out_rows * out_cols * k_rows * k_cols;
	  const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
	  state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
	}

	// --- 2D convolution with channels (rank-3: C x H x W, convolve on H,W) ---
	// Times a convolution of a rank-3 (C x H x H) tensor with a rank-2 (kH x kH)
	// kernel over dimensions 1 and 2; dimension 0 is not convolved.
	//   state.range(0): size of dimension 0 (C)
	//   state.range(1): size of each convolved input dimension (H)
	//   state.range(2): size of each kernel dimension (kH)
	// A fresh result tensor is constructed on every iteration, so its
	// construction is part of the measured time.
	static void BM_Convolve2D_Channels(benchmark::State& state) {
	  const auto channels = static_cast<int>(state.range(0));
	  const auto side = static_cast<int>(state.range(1));
	  const auto k_side = static_cast<int>(state.range(2));

	  Tensor<Scalar, 3> volume(channels, side, side);
	  Tensor<Scalar, 2> filter(k_side, k_side);
	  volume.setRandom();
	  filter.setRandom();

	  const Eigen::array<int, 2> conv_axes = {1, 2};

	  for (auto _ : state) {
	    Tensor<Scalar, 3> out = volume.convolve(filter, conv_axes);
	    // Keep the compiler from discarding the computed result.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }

	  // Work estimate per iteration: two floating-point operations per kernel
	  // element for each of the channels x out_side x out_side outputs.
	  const int out_side = side - k_side + 1;
	  const double flops_per_iter = 2.0 * channels * out_side * out_side * k_side * k_side;
	  const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
	  state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
	}

	// --- 2D convolution with ThreadPool ---
	// Times a square rank-2 convolution evaluated through a ThreadPoolDevice.
	//   state.range(0): size of each input dimension (H)
	//   state.range(1): size of each kernel dimension (kH)
	//   state.range(2): thread count handed to both the pool and the device
	// The destination tensor is allocated once, before the timed loop.
	static void BM_Convolve2D_ThreadPool(benchmark::State& state) {
	  const auto side = static_cast<int>(state.range(0));
	  const auto k_side = static_cast<int>(state.range(1));
	  const auto num_threads = static_cast<int>(state.range(2));
	  const int out_side = side - k_side + 1;

	  Tensor<Scalar, 2> image(side, side);
	  Tensor<Scalar, 2> filter(k_side, k_side);
	  Tensor<Scalar, 2> out(out_side, out_side);
	  image.setRandom();
	  filter.setRandom();

	  ThreadPool pool(num_threads);
	  ThreadPoolDevice device(&pool, num_threads);

	  const Eigen::array<int, 2> conv_axes = {0, 1};

	  for (auto _ : state) {
	    out.device(device) = image.convolve(filter, conv_axes);
	    // Keep the compiler from discarding the computed result.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }

	  // Work estimate per iteration: two floating-point operations per kernel
	  // element for each of the out_side x out_side outputs.
	  const double flops_per_iter = 2.0 * out_side * out_side * k_side * k_side;
	  const auto rate_flag = benchmark::Counter::kIsIterationInvariantRate;
	  state.counters["GFLOPS"] = benchmark::Counter(flops_per_iter, rate_flag, benchmark::Counter::kIs1000);
	  state.counters["threads"] = num_threads;
	}

	// Argument sets appended to the BENCHMARK(...) registrations below. Each macro
	// expands to an ->ArgsProduct(...) call, i.e. a pure Cartesian product of the
	// listed values.
	// CONV1D_SIZES: {input size, kernel size}.
	#define CONV1D_SIZES ->ArgsProduct({{128, 512, 2048}, {3, 5, 11}})
	// CONV2D_CHANNEL_SIZES: {channels, input height/width, kernel height/width}.
	#define CONV2D_CHANNEL_SIZES ->ArgsProduct({{3, 64, 128}, {16, 32, 56}, {3, 5}})
	// CONV2D_THREADPOOL_SIZES: {input height/width, kernel height/width, threads}.
	#define CONV2D_THREADPOOL_SIZES ->ArgsProduct({{64, 128, 224}, {3, 5}, {2, 4, 8}})

	// Argument sets for BM_Convolve2D, ordered {H, W, kH, kW}. They are listed
	// explicitly instead of using ArgsProduct because the height/width values and
	// the two kernel extents are repeated (only square inputs and square kernels
	// appear), which a Cartesian product would not express.
	// clang-format off
	#define CONV2D_SIZES \
	->Args({32, 32, 3, 3})->Args({32, 32, 5, 5})->Args({32, 32, 7, 7}) \
	->Args({64, 64, 3, 3})->Args({64, 64, 5, 5})->Args({64, 64, 7, 7}) \
	->Args({128, 128, 3, 3})->Args({128, 128, 5, 5})->Args({128, 128, 7, 7}) \
	->Args({224, 224, 3, 3})->Args({224, 224, 5, 5})->Args({224, 224, 7, 7})
	// clang-format on

	// Register every benchmark together with its argument-set macro; the macro
	// supplies the trailing ->Args(...) / ->ArgsProduct(...) chain.
	BENCHMARK(BM_Convolve1D) CONV1D_SIZES;
	BENCHMARK(BM_Convolve2D) CONV2D_SIZES;
	BENCHMARK(BM_Convolve2D_Channels) CONV2D_CHANNEL_SIZES;
	BENCHMARK(BM_Convolve2D_ThreadPool) CONV2D_THREADPOOL_SIZES;