// Source: unsupported/benchmarks/Tensor/bench_concatenation.cpp (mirror, Git at Google)

 // Benchmarks for Eigen Tensor concatenation: standalone assigns and
 // compound expressions that read from a concatenation. Tests both the
 // outer-axis path (each operand slab is a single contiguous run, so
 // TensorBlockIO collapses to one memcpy per slab) and the inner-axis
 // path (the slab walk per row is short).
 //
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 #define EIGEN_USE_THREADS

 #include <benchmark/benchmark.h>
 #include <unsupported/Eigen/Tensor>
 #include <unsupported/Eigen/ThreadPool>

 using namespace Eigen;

 // Element type used by every tensor in these benchmarks.
 using Scalar = float;

 // --- Standalone concat along axis 0 (innermost for ColMajor) ---
 // Joins two (M x N) tensors into a (2M x N) destination.
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Axis0(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> out(2 * rows, cols);
   lhs.setRandom();
   rhs.setRandom();

   // Output size in bytes, times 2 (presumably one read pass + one write pass).
   const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 2;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 0);
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Standalone concat along axis 1 (outermost for ColMajor 2D) ---
 // Joins two (M x N) tensors into an (M x 2N) destination.
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Axis1(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> out(rows, 2 * cols);
   lhs.setRandom();
   rhs.setRandom();

   // Output size in bytes, times 2 (presumably one read pass + one write pass).
   const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 2;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 1);
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Standalone concat 3D along axis 2 (outermost for ColMajor) ---
 // Joins two (D0 x D1 x D2) tensors into a (D0 x D1 x 2*D2) destination.
 // Benchmark args: range(0) = D0, range(1) = D1, range(2) = D2.
 static void BM_Concat3D_OuterAxis(benchmark::State& state) {
   const int dim0 = static_cast<int>(state.range(0));
   const int dim1 = static_cast<int>(state.range(1));
   const int dim2 = static_cast<int>(state.range(2));

   Tensor<Scalar, 3> lhs(dim0, dim1, dim2);
   Tensor<Scalar, 3> rhs(dim0, dim1, dim2);
   Tensor<Scalar, 3> out(dim0, dim1, 2 * dim2);
   lhs.setRandom();
   rhs.setRandom();

   // Output size in bytes, times 2 (presumably one read pass + one write pass).
   const auto bytes_per_iter = sizeof(Scalar) * dim0 * dim1 * 2 * dim2 * 2;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 2);
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Compound: (concat(A, B) + C) — concat feeds cwise binary ---
 // The axis-0 concat of two (M x N) tensors is added to a (2M x N) tensor.
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Plus(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(2 * rows, cols);
   Tensor<Scalar, 2> out(2 * rows, cols);
   lhs.setRandom();
   rhs.setRandom();
   addend.setRandom();

   // Output size in bytes, times 3 (presumably concat read, addend read,
   // and the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 3;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 0) + addend;
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Compound: ((concat(A, B) + C) * D).sqrt() — concat in longer chain ---
 // Axis-0 concat of two (M x N) tensors feeding an add, a multiply and a sqrt,
 // all against (2M x N) operands.
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Chain(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(2 * rows, cols);
   Tensor<Scalar, 2> scale(2 * rows, cols);
   Tensor<Scalar, 2> out(2 * rows, cols);
   lhs.setRandom();
   rhs.setRandom();
   // The added and multiplied operands are replaced by their absolute values
   // (presumably to keep the sqrt() argument non-negative).
   addend.setRandom();
   addend = addend.abs();
   scale.setRandom();
   scale = scale.abs();

   // Output size in bytes, times 4 (presumably three operand reads plus
   // the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 4;

   for (auto _ : state) {
     out = ((lhs.concatenate(rhs, 0) + addend) * scale).sqrt();
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Compound: (concat(A, B) along axis 1 + C) — outer-axis concat feeding
 // a cwise binary. Each operand slab is a contiguous run, so block evaluation
 // returns it as a zero-copy view and the chain stays fully streaming
 // (no scratch round-trip), unlike the inner-axis BM_Concat2D_Plus. ---
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Axis1_Plus(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(rows, 2 * cols);
   Tensor<Scalar, 2> out(rows, 2 * cols);
   lhs.setRandom();
   rhs.setRandom();
   addend.setRandom();

   // Output size in bytes, times 3 (presumably concat read, addend read,
   // and the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 3;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 1) + addend;
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Compound: ((concat(A, B) along axis 1 + C) * D).sqrt() — outer-axis
 // analogue of BM_Concat2D_Chain. The zero-copy view path keeps this case off
 // the materialize-then-reread trade-off that BM_Concat2D_Chain pays. ---
 // Benchmark args: range(0) = M, range(1) = N.
 static void BM_Concat2D_Axis1_Chain(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(rows, 2 * cols);
   Tensor<Scalar, 2> scale(rows, 2 * cols);
   Tensor<Scalar, 2> out(rows, 2 * cols);
   lhs.setRandom();
   rhs.setRandom();
   // The added and multiplied operands are replaced by their absolute values
   // (presumably to keep the sqrt() argument non-negative).
   addend.setRandom();
   addend = addend.abs();
   scale.setRandom();
   scale = scale.abs();

   // Output size in bytes, times 4 (presumably three operand reads plus
   // the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 4;

   for (auto _ : state) {
     out = ((lhs.concatenate(rhs, 1) + addend) * scale).sqrt();
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Compound: outer-axis 3D concat wrapped in a coefficient-wise square ---
 // .square() adds one more cwise op on top of the concat expression
 // (it is an element-wise op, not a reduction).
 // Benchmark args: range(0) = D0, range(1) = D1, range(2) = D2.
 static void BM_Concat3D_OuterAxis_Square(benchmark::State& state) {
   const int dim0 = static_cast<int>(state.range(0));
   const int dim1 = static_cast<int>(state.range(1));
   const int dim2 = static_cast<int>(state.range(2));

   Tensor<Scalar, 3> lhs(dim0, dim1, dim2);
   Tensor<Scalar, 3> rhs(dim0, dim1, dim2);
   Tensor<Scalar, 3> out(dim0, dim1, 2 * dim2);
   lhs.setRandom();
   rhs.setRandom();

   // Output size in bytes, times 2 (presumably one read pass + one write pass).
   const auto bytes_per_iter = sizeof(Scalar) * dim0 * dim1 * 2 * dim2 * 2;

   for (auto _ : state) {
     out = lhs.concatenate(rhs, 2).square();
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
 }

 // --- Threaded standalone concat ---
 // Axis-1 concat of two (M x N) tensors, assigned through a ThreadPoolDevice.
 // Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
 static void BM_Concat2D_Axis1_ThreadPool(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));
   const int num_threads = static_cast<int>(state.range(2));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> out(rows, 2 * cols);
   lhs.setRandom();
   rhs.setRandom();

   // Pool and device are built once, outside the timed loop.
   ThreadPool pool(num_threads);
   ThreadPoolDevice device(&pool, num_threads);

   // Output size in bytes, times 2 (presumably one read pass + one write pass).
   const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 2;

   for (auto _ : state) {
     out.device(device) = lhs.concatenate(rhs, 1);
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
   // Report the thread count alongside the timing results.
   state.counters["threads"] = num_threads;
 }

 // --- Threaded compound: (concat(A, B) + C) * D ---
 // Axis-0 concat feeding an add and a multiply, assigned through a
 // ThreadPoolDevice.
 // Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
 static void BM_Concat2D_Chain_ThreadPool(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));
   const int num_threads = static_cast<int>(state.range(2));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(2 * rows, cols);
   Tensor<Scalar, 2> scale(2 * rows, cols);
   Tensor<Scalar, 2> out(2 * rows, cols);
   lhs.setRandom();
   rhs.setRandom();
   addend.setRandom();
   scale.setRandom();

   // Pool and device are built once, outside the timed loop.
   ThreadPool pool(num_threads);
   ThreadPoolDevice device(&pool, num_threads);

   // Output size in bytes, times 4 (presumably three operand reads plus
   // the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 4;

   for (auto _ : state) {
     out.device(device) = (lhs.concatenate(rhs, 0) + addend) * scale;
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
   // Report the thread count alongside the timing results.
   state.counters["threads"] = num_threads;
 }

 // --- Threaded outer-axis compound: (concat(A, B) along axis 1 + C) * D ---
 // Axis-1 concat feeding an add and a multiply, assigned through a
 // ThreadPoolDevice.
 // Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
 static void BM_Concat2D_Axis1_Chain_ThreadPool(benchmark::State& state) {
   const int rows = static_cast<int>(state.range(0));
   const int cols = static_cast<int>(state.range(1));
   const int num_threads = static_cast<int>(state.range(2));

   Tensor<Scalar, 2> lhs(rows, cols);
   Tensor<Scalar, 2> rhs(rows, cols);
   Tensor<Scalar, 2> addend(rows, 2 * cols);
   Tensor<Scalar, 2> scale(rows, 2 * cols);
   Tensor<Scalar, 2> out(rows, 2 * cols);
   lhs.setRandom();
   rhs.setRandom();
   addend.setRandom();
   scale.setRandom();

   // Pool and device are built once, outside the timed loop.
   ThreadPool pool(num_threads);
   ThreadPoolDevice device(&pool, num_threads);

   // Output size in bytes, times 4 (presumably three operand reads plus
   // the write of the result).
   const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 4;

   for (auto _ : state) {
     out.device(device) = (lhs.concatenate(rhs, 1) + addend) * scale;
     // Keep the result observable so the assignment cannot be optimized away.
     benchmark::DoNotOptimize(out.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * bytes_per_iter);
   // Report the thread count alongside the timing results.
   state.counters["threads"] = num_threads;
 }

 // Argument sets appended to the BENCHMARK(...) registrations below. Each macro
 // expands to a chain of ->Args({...}) calls:
 //   CONCAT2D_SIZES        : {M, N}
 //   CONCAT3D_SIZES        : {D0, D1, D2}
 //   CONCAT2D_THREAD_SIZES : {M, N, threads}
 // clang-format off
 #define CONCAT2D_SIZES \
   ->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096})
 #define CONCAT3D_SIZES \
   ->Args({64, 64, 64})->Args({128, 128, 128})->Args({256, 256, 64})
 #define CONCAT2D_THREAD_SIZES \
   ->Args({1024, 1024, 1})->Args({1024, 1024, 4})->Args({1024, 1024, 8}) \
   ->Args({4096, 4096, 1})->Args({4096, 4096, 4})->Args({4096, 4096, 8})
 // clang-format on

 // Register every benchmark with its argument set. UseRealTime() selects
 // wall-clock time for the reported measurement.
 BENCHMARK(BM_Concat2D_Axis0) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Axis1) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat3D_OuterAxis) CONCAT3D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Plus) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Chain) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Axis1_Plus) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Axis1_Chain) CONCAT2D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat3D_OuterAxis_Square) CONCAT3D_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Axis1_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Chain_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();
 BENCHMARK(BM_Concat2D_Axis1_Chain_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();

 // Supplies the program entry point that runs the registered benchmarks.
 BENCHMARK_MAIN();
	// Benchmarks for Eigen Tensor concatenation: standalone assigns and
	// compound expressions that read from a concatenation. Tests both the
	// outer-axis path (each operand slab is a single contiguous run, so
	// TensorBlockIO collapses to one memcpy per slab) and the inner-axis
	// path (the slab walk per row is short).
	//
	// SPDX-FileCopyrightText: The Eigen Authors
	// SPDX-License-Identifier: MPL-2.0

	#define EIGEN_USE_THREADS

	#include <benchmark/benchmark.h>
	#include <unsupported/Eigen/Tensor>
	#include <unsupported/Eigen/ThreadPool>

	using namespace Eigen;

	// Element type used by every tensor in these benchmarks.
	using Scalar = float;

	// --- Standalone concat along axis 0 (innermost for ColMajor) ---
	// Joins two (M x N) tensors into a (2M x N) destination.
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Axis0(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> out(2 * rows, cols);
	  lhs.setRandom();
	  rhs.setRandom();

	  // Output size in bytes, times 2 (presumably one read pass + one write pass).
	  const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 2;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 0);
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Standalone concat along axis 1 (outermost for ColMajor 2D) ---
	// Joins two (M x N) tensors into an (M x 2N) destination.
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Axis1(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> out(rows, 2 * cols);
	  lhs.setRandom();
	  rhs.setRandom();

	  // Output size in bytes, times 2 (presumably one read pass + one write pass).
	  const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 2;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 1);
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Standalone concat 3D along axis 2 (outermost for ColMajor) ---
	// Joins two (D0 x D1 x D2) tensors into a (D0 x D1 x 2*D2) destination.
	// Benchmark args: range(0) = D0, range(1) = D1, range(2) = D2.
	static void BM_Concat3D_OuterAxis(benchmark::State& state) {
	  const int dim0 = static_cast<int>(state.range(0));
	  const int dim1 = static_cast<int>(state.range(1));
	  const int dim2 = static_cast<int>(state.range(2));

	  Tensor<Scalar, 3> lhs(dim0, dim1, dim2);
	  Tensor<Scalar, 3> rhs(dim0, dim1, dim2);
	  Tensor<Scalar, 3> out(dim0, dim1, 2 * dim2);
	  lhs.setRandom();
	  rhs.setRandom();

	  // Output size in bytes, times 2 (presumably one read pass + one write pass).
	  const auto bytes_per_iter = sizeof(Scalar) * dim0 * dim1 * 2 * dim2 * 2;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 2);
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Compound: (concat(A, B) + C) — concat feeds cwise binary ---
	// The axis-0 concat of two (M x N) tensors is added to a (2M x N) tensor.
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Plus(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(2 * rows, cols);
	  Tensor<Scalar, 2> out(2 * rows, cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  addend.setRandom();

	  // Output size in bytes, times 3 (presumably concat read, addend read,
	  // and the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 3;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 0) + addend;
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Compound: ((concat(A, B) + C) * D).sqrt() — concat in longer chain ---
	// Axis-0 concat of two (M x N) tensors feeding an add, a multiply and a sqrt,
	// all against (2M x N) operands.
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Chain(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(2 * rows, cols);
	  Tensor<Scalar, 2> scale(2 * rows, cols);
	  Tensor<Scalar, 2> out(2 * rows, cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  // The added and multiplied operands are replaced by their absolute values
	  // (presumably to keep the sqrt() argument non-negative).
	  addend.setRandom();
	  addend = addend.abs();
	  scale.setRandom();
	  scale = scale.abs();

	  // Output size in bytes, times 4 (presumably three operand reads plus
	  // the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 4;

	  for (auto _ : state) {
	    out = ((lhs.concatenate(rhs, 0) + addend) * scale).sqrt();
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Compound: (concat(A, B) along axis 1 + C) — outer-axis concat feeding
	// a cwise binary. Each operand slab is a contiguous run, so block evaluation
	// returns it as a zero-copy view and the chain stays fully streaming
	// (no scratch round-trip), unlike the inner-axis BM_Concat2D_Plus. ---
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Axis1_Plus(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(rows, 2 * cols);
	  Tensor<Scalar, 2> out(rows, 2 * cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  addend.setRandom();

	  // Output size in bytes, times 3 (presumably concat read, addend read,
	  // and the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 3;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 1) + addend;
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Compound: ((concat(A, B) along axis 1 + C) * D).sqrt() — outer-axis
	// analogue of BM_Concat2D_Chain. The zero-copy view path keeps this case off
	// the materialize-then-reread trade-off that BM_Concat2D_Chain pays. ---
	// Benchmark args: range(0) = M, range(1) = N.
	static void BM_Concat2D_Axis1_Chain(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(rows, 2 * cols);
	  Tensor<Scalar, 2> scale(rows, 2 * cols);
	  Tensor<Scalar, 2> out(rows, 2 * cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  // The added and multiplied operands are replaced by their absolute values
	  // (presumably to keep the sqrt() argument non-negative).
	  addend.setRandom();
	  addend = addend.abs();
	  scale.setRandom();
	  scale = scale.abs();

	  // Output size in bytes, times 4 (presumably three operand reads plus
	  // the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 4;

	  for (auto _ : state) {
	    out = ((lhs.concatenate(rhs, 1) + addend) * scale).sqrt();
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Compound: outer-axis 3D concat wrapped in a coefficient-wise square ---
	// .square() adds one more cwise op on top of the concat expression
	// (it is an element-wise op, not a reduction).
	// Benchmark args: range(0) = D0, range(1) = D1, range(2) = D2.
	static void BM_Concat3D_OuterAxis_Square(benchmark::State& state) {
	  const int dim0 = static_cast<int>(state.range(0));
	  const int dim1 = static_cast<int>(state.range(1));
	  const int dim2 = static_cast<int>(state.range(2));

	  Tensor<Scalar, 3> lhs(dim0, dim1, dim2);
	  Tensor<Scalar, 3> rhs(dim0, dim1, dim2);
	  Tensor<Scalar, 3> out(dim0, dim1, 2 * dim2);
	  lhs.setRandom();
	  rhs.setRandom();

	  // Output size in bytes, times 2 (presumably one read pass + one write pass).
	  const auto bytes_per_iter = sizeof(Scalar) * dim0 * dim1 * 2 * dim2 * 2;

	  for (auto _ : state) {
	    out = lhs.concatenate(rhs, 2).square();
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	}

	// --- Threaded standalone concat ---
	// Axis-1 concat of two (M x N) tensors, assigned through a ThreadPoolDevice.
	// Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
	static void BM_Concat2D_Axis1_ThreadPool(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));
	  const int num_threads = static_cast<int>(state.range(2));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> out(rows, 2 * cols);
	  lhs.setRandom();
	  rhs.setRandom();

	  // Pool and device are built once, outside the timed loop.
	  ThreadPool pool(num_threads);
	  ThreadPoolDevice device(&pool, num_threads);

	  // Output size in bytes, times 2 (presumably one read pass + one write pass).
	  const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 2;

	  for (auto _ : state) {
	    out.device(device) = lhs.concatenate(rhs, 1);
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	  // Report the thread count alongside the timing results.
	  state.counters["threads"] = num_threads;
	}

	// --- Threaded compound: (concat(A, B) + C) * D ---
	// Axis-0 concat feeding an add and a multiply, assigned through a
	// ThreadPoolDevice.
	// Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
	static void BM_Concat2D_Chain_ThreadPool(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));
	  const int num_threads = static_cast<int>(state.range(2));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(2 * rows, cols);
	  Tensor<Scalar, 2> scale(2 * rows, cols);
	  Tensor<Scalar, 2> out(2 * rows, cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  addend.setRandom();
	  scale.setRandom();

	  // Pool and device are built once, outside the timed loop.
	  ThreadPool pool(num_threads);
	  ThreadPoolDevice device(&pool, num_threads);

	  // Output size in bytes, times 4 (presumably three operand reads plus
	  // the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * 2 * rows * cols * 4;

	  for (auto _ : state) {
	    out.device(device) = (lhs.concatenate(rhs, 0) + addend) * scale;
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	  // Report the thread count alongside the timing results.
	  state.counters["threads"] = num_threads;
	}

	// --- Threaded outer-axis compound: (concat(A, B) along axis 1 + C) * D ---
	// Axis-1 concat feeding an add and a multiply, assigned through a
	// ThreadPoolDevice.
	// Benchmark args: range(0) = M, range(1) = N, range(2) = thread count.
	static void BM_Concat2D_Axis1_Chain_ThreadPool(benchmark::State& state) {
	  const int rows = static_cast<int>(state.range(0));
	  const int cols = static_cast<int>(state.range(1));
	  const int num_threads = static_cast<int>(state.range(2));

	  Tensor<Scalar, 2> lhs(rows, cols);
	  Tensor<Scalar, 2> rhs(rows, cols);
	  Tensor<Scalar, 2> addend(rows, 2 * cols);
	  Tensor<Scalar, 2> scale(rows, 2 * cols);
	  Tensor<Scalar, 2> out(rows, 2 * cols);
	  lhs.setRandom();
	  rhs.setRandom();
	  addend.setRandom();
	  scale.setRandom();

	  // Pool and device are built once, outside the timed loop.
	  ThreadPool pool(num_threads);
	  ThreadPoolDevice device(&pool, num_threads);

	  // Output size in bytes, times 4 (presumably three operand reads plus
	  // the write of the result).
	  const auto bytes_per_iter = sizeof(Scalar) * rows * 2 * cols * 4;

	  for (auto _ : state) {
	    out.device(device) = (lhs.concatenate(rhs, 1) + addend) * scale;
	    // Keep the result observable so the assignment cannot be optimized away.
	    benchmark::DoNotOptimize(out.data());
	    benchmark::ClobberMemory();
	  }
	  state.SetBytesProcessed(state.iterations() * bytes_per_iter);
	  // Report the thread count alongside the timing results.
	  state.counters["threads"] = num_threads;
	}

	// Argument sets appended to the BENCHMARK(...) registrations below. Each macro
	// expands to a chain of ->Args({...}) calls:
	//   CONCAT2D_SIZES        : {M, N}
	//   CONCAT3D_SIZES        : {D0, D1, D2}
	//   CONCAT2D_THREAD_SIZES : {M, N, threads}
	// clang-format off
	#define CONCAT2D_SIZES \
	->Args({256, 256})->Args({1024, 1024})->Args({4096, 4096})
	#define CONCAT3D_SIZES \
	->Args({64, 64, 64})->Args({128, 128, 128})->Args({256, 256, 64})
	#define CONCAT2D_THREAD_SIZES \
	->Args({1024, 1024, 1})->Args({1024, 1024, 4})->Args({1024, 1024, 8}) \
	->Args({4096, 4096, 1})->Args({4096, 4096, 4})->Args({4096, 4096, 8})
	// clang-format on

	// Register every benchmark with its argument set. UseRealTime() selects
	// wall-clock time for the reported measurement.
	BENCHMARK(BM_Concat2D_Axis0) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Axis1) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat3D_OuterAxis) CONCAT3D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Plus) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Chain) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Axis1_Plus) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Axis1_Chain) CONCAT2D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat3D_OuterAxis_Square) CONCAT3D_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Axis1_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Chain_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();
	BENCHMARK(BM_Concat2D_Axis1_Chain_ThreadPool) CONCAT2D_THREAD_SIZES->UseRealTime();

	// Supplies the program entry point that runs the registered benchmarks.
	BENCHMARK_MAIN();