// Benchmarks for Eigen TensorImagePatch extraction.
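//
// Conventions (Eigen's default ColMajor layout):
//   input  shape: (channels, rows, cols, batch)
//   result shape: (channels, patch_rows, patch_cols, out_rows * out_cols, batch)
// The bytes-processed counters measure the patch output written per
// iteration, not the input bytes read.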

// EIGEN_USE_THREADS must be defined before the Tensor include so the
// ThreadPoolDevice evaluation path is compiled in.
#define EIGEN_USE_THREADS

#include <cstdint>  // int64_t for SetBytesProcessed.

#include <benchmark/benchmark.h>
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/ThreadPool>

using namespace Eigen;

using Scalar = float;

// --- Basic image patch extraction with PADDING_VALID ---
static void BM_ImagePatch_Valid(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int W = state.range(2);
  const int kH = state.range(3);
  const int kW = state.range(4);

  Tensor<Scalar, 4> input(C, H, W, 1);
  input.setRandom();
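  // PADDING_VALID at unit stride: out = in - kernel + 1.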
  const int outH = H - kH + 1;
  const int outW = W - kW + 1;
  Tensor<Scalar, 5> result(C, kH, kW, outH * outW, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kW, 1, 1, 1, 1, PADDING_VALID);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * outH * outW * kH * kW * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- Basic image patch extraction with PADDING_SAME ---
static void BM_ImagePatch_Same(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int W = state.range(2);
  const int kH = state.range(3);
  const int kW = state.range(4);

  Tensor<Scalar, 4> input(C, H, W, 1);
  input.setRandom();
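  // PADDING_SAME at unit stride preserves the spatial dimensions.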
  const int outH = H;
  const int outW = W;
  Tensor<Scalar, 5> result(C, kH, kW, outH * outW, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kW, 1, 1, 1, 1, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * H * W * kH * kW * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- Image patch with strides (simulates strided convolution) ---
static void BM_ImagePatch_Strided(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);
  const int stride = state.range(3);

  Tensor<Scalar, 4> input(C, H, H, 1);
  input.setRandom();
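  // PADDING_SAME output size is ceil(in / stride).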
  const int outH = (H + stride - 1) / stride;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outH, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kH, stride, stride, 1, 1, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * outH * outH * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- Image patch with dilation (atrous/dilated convolution) ---
static void BM_ImagePatch_Dilated(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);
  const int dilation = state.range(3);

  Tensor<Scalar, 4> input(C, H, H, 1);
  input.setRandom();
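  // The in_row/in_col stride arguments carry the dilation rate; with
  // PADDING_SAME and unit stride the output keeps the input's spatial size.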
  const int outH = H;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outH, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kH, 1, 1, dilation, dilation, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * H * H * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- Image patch with explicit padding ---
static void BM_ImagePatch_ExplicitPadding(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int W = state.range(2);
  const int kH = state.range(3);

  const int pad = kH / 2;
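  // With pad = kH / 2 and odd kH, outH below reduces to H ("same"-style output).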

  Tensor<Scalar, 4> input(C, H, W, 1);
  input.setRandom();
  const int outH = H + 2 * pad - kH + 1;
  const int outW = W + 2 * pad - kH + 1;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outW, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kH, 1, 1, 1, 1, 1, 1, pad, pad, pad, pad, Scalar(0));
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * outH * outW * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- Batched image patch (multiple images) ---
static void BM_ImagePatch_Batched(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);
  const int batch = state.range(3);

  Tensor<Scalar, 4> input(C, H, H, batch);
  input.setRandom();
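  // Batch is the trailing, slowest-varying dimension in the ColMajor layout.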
  const int outH = H;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outH, batch);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kH, 1, 1, 1, 1, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(batch) * C * H * H * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- ImageNet-style configurations (realistic CNN layer sizes) ---
static void BM_ImagePatch_ImageNet(benchmark::State& state) {
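  // Same body as BM_ImagePatch_Strided; the argument sets below model real layers.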
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);
  const int stride = state.range(3);

  Tensor<Scalar, 4> input(C, H, H, 1);
  input.setRandom();
  const int outH = (H + stride - 1) / stride;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outH, 1);

  for (auto _ : state) {
    result = input.extract_image_patches(kH, kH, stride, stride, 1, 1, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * outH * outH * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
}

// --- ThreadPool variant ---
static void BM_ImagePatch_ThreadPool(benchmark::State& state) {
  const int C = state.range(0);
  const int H = state.range(1);
  const int kH = state.range(2);
  const int threads = state.range(3);

  Tensor<Scalar, 4> input(C, H, H, 1);
  input.setRandom();

  ThreadPool tp(threads);
  ThreadPoolDevice dev(&tp, threads);
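  // device(dev) evaluates the patch expression in parallel across the pool.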

  const int outH = H;
  Tensor<Scalar, 5> result(C, kH, kH, outH * outH, 1);

  for (auto _ : state) {
    result.device(dev) = input.extract_image_patches(kH, kH, 1, 1, 1, 1, PADDING_SAME);
    benchmark::DoNotOptimize(result.data());
    benchmark::ClobberMemory();
  }

  const double bytes = static_cast<double>(C) * H * H * kH * kH * sizeof(Scalar);
  state.SetBytesProcessed(static_cast<int64_t>(state.iterations() * bytes));
  state.counters["threads"] = threads;
}

// --- Size generators ---

static void PatchSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, W, kH, kW
  for (int c : {3, 32, 64}) {
    for (int hw : {32, 64, 128}) {
      for (int k : {3, 5, 7}) {
        b->Args({c, hw, hw, k, k});
      }
    }
  }
}

static void StridedSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, kH, stride
  for (int c : {3, 64}) {
    for (int hw : {56, 112, 224}) {
      for (int k : {3, 5}) {
        for (int s : {1, 2}) {
          b->Args({c, hw, k, s});
        }
      }
    }
  }
}

static void DilatedSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, kH, dilation
  for (int c : {3, 64}) {
    for (int hw : {32, 64}) {
      for (int k : {3, 5}) {
        for (int d : {2, 4}) {
          b->Args({c, hw, k, d});
        }
      }
    }
  }
}

static void ExplicitPaddingSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, W, kH
  for (int c : {3, 64}) {
    for (int hw : {32, 64, 128}) {
      for (int k : {3, 5}) {
        b->Args({c, hw, hw, k});
      }
    }
  }
}

static void BatchedSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, kH, batch
  for (int c : {3, 64}) {
    for (int hw : {32, 56}) {
      for (int k : {3, 5}) {
        for (int batch : {4, 16, 32}) {
          b->Args({c, hw, k, batch});
        }
      }
    }
  }
}

static void ImageNetSizes(::benchmark::internal::Benchmark* b) {
  // Realistic CNN layer configurations: channels, spatial_size, kernel, stride
  // AlexNet conv1: 3x227x227, 11x11, stride 4
  b->Args({3, 227, 11, 4});
  // VGG-style: 64x224x224, 3x3, stride 1
  b->Args({64, 224, 3, 1});
  // VGG deeper: 128x112x112, 3x3, stride 1
  b->Args({128, 112, 3, 1});
  // VGG deeper: 256x56x56, 3x3, stride 1
  b->Args({256, 56, 3, 1});
  // ResNet: 64x56x56, 3x3, stride 1
  b->Args({64, 56, 3, 1});
  // ResNet downsample: 128x56x56, 3x3, stride 2
  b->Args({128, 56, 3, 2});
  // ResNet: 256x28x28, 3x3, stride 1
  b->Args({256, 28, 3, 1});
  // ResNet: 512x14x14, 3x3, stride 1
  b->Args({512, 14, 3, 1});
  // MobileNet depthwise: 32x112x112, 3x3, stride 1
  b->Args({32, 112, 3, 1});
  // Inception 1x1 (degenerate patch): 192x28x28, 1x1, stride 1
  b->Args({192, 28, 1, 1});
}

static void ThreadPoolSizes(::benchmark::internal::Benchmark* b) {
  // channels, H, kH, threads
  for (int c : {64, 128}) {
    for (int hw : {56, 112}) {
      for (int k : {3, 5}) {
        for (int threads : {2, 4, 8}) {
          b->Args({c, hw, k, threads});
        }
      }
    }
  }
}

BENCHMARK(BM_ImagePatch_Valid)->Apply(PatchSizes);
BENCHMARK(BM_ImagePatch_Same)->Apply(PatchSizes);
BENCHMARK(BM_ImagePatch_Strided)->Apply(StridedSizes);
BENCHMARK(BM_ImagePatch_Dilated)->Apply(DilatedSizes);
BENCHMARK(BM_ImagePatch_ExplicitPadding)->Apply(ExplicitPaddingSizes);
BENCHMARK(BM_ImagePatch_Batched)->Apply(BatchedSizes);
BENCHMARK(BM_ImagePatch_ImageNet)->Apply(ImageNetSizes);
BENCHMARK(BM_ImagePatch_ThreadPool)->Apply(ThreadPoolSizes);
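
// Provides main(); drop this if the target links against benchmark_main instead.
BENCHMARK_MAIN();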