unsupported/benchmarks/GPU/bench_fft.cpp - mirror - Git at Google

 // GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
 //
 // Measures forward and inverse FFT performance across a range of sizes,
 // including plan-amortized (reuse) and cold-start (new plan) scenarios.
 //
 // Usage:
 //   cmake --build build-bench-gpu --target bench_fft bench_fft_double
 //   ./build-bench-gpu/bench_fft
 //   ./build-bench-gpu/bench_fft_double
 //
 // Profiling:
 //   nsys profile --trace=cuda ./build-bench-gpu/bench_fft
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 #include <benchmark/benchmark.h>

 #include <unsupported/Eigen/GPU>

 using namespace Eigen;

 #ifndef SCALAR
 #define SCALAR float
 #endif

 using Scalar = SCALAR;
 using Complex = std::complex<Scalar>;
 using CVec = Matrix<Complex, Dynamic, 1>;
 using RVec = Matrix<Scalar, Dynamic, 1>;
 using CMat = Matrix<Complex, Dynamic, Dynamic>;

 // CUDA warm-up: ensure the GPU is initialized before timing.
 static void cuda_warmup() {
   static bool done = false;
   if (!done) {
     void* p;
     EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
     EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
     done = true;
   }
 }

 // --------------------------------------------------------------------------
 // 1D C2C Forward
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   CVec x = CVec::Random(n);
   gpu::FFT<Scalar> fft;

   // Warm up plan.
   CVec tmp = fft.fwd(x);

   for (auto _ : state) {
     benchmark::DoNotOptimize(fft.fwd(x));
   }
   state.SetItemsProcessed(state.iterations() * n);
   state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);  // read + write
 }

 BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

 // --------------------------------------------------------------------------
 // 1D C2C Inverse
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   CVec x = CVec::Random(n);
   gpu::FFT<Scalar> fft;
   CVec X = fft.fwd(x);

   for (auto _ : state) {
     benchmark::DoNotOptimize(fft.inv(X));
   }
   state.SetItemsProcessed(state.iterations() * n);
   state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
 }

 BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

 // --------------------------------------------------------------------------
 // 1D R2C Forward
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   RVec r = RVec::Random(n);
   gpu::FFT<Scalar> fft;

   // Warm up plan.
   CVec tmp = fft.fwd(r);

   for (auto _ : state) {
     benchmark::DoNotOptimize(fft.fwd(r));
   }
   state.SetItemsProcessed(state.iterations() * n);
   state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
 }

 BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

 // --------------------------------------------------------------------------
 // 1D C2R Inverse
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   RVec r = RVec::Random(n);
   gpu::FFT<Scalar> fft;
   CVec R = fft.fwd(r);

   for (auto _ : state) {
     benchmark::DoNotOptimize(fft.invReal(R, n));
   }
   state.SetItemsProcessed(state.iterations() * n);
   state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
 }

 BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

 // --------------------------------------------------------------------------
 // 2D C2C Forward
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);  // square n x n
   CMat A = CMat::Random(n, n);
   gpu::FFT<Scalar> fft;

   // Warm up plan.
   CMat tmp = fft.fwd2(A);

   for (auto _ : state) {
     benchmark::DoNotOptimize(fft.fwd2(A));
   }
   state.SetItemsProcessed(state.iterations() * n * n);
   state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
 }

 BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);

 // --------------------------------------------------------------------------
 // 2D C2C Roundtrip (fwd + inv)
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   CMat A = CMat::Random(n, n);
   gpu::FFT<Scalar> fft;

   // Warm up plans.
   CMat tmp = fft.inv2(fft.fwd2(A));

   for (auto _ : state) {
     CMat B = fft.fwd2(A);
     benchmark::DoNotOptimize(fft.inv2(B));
   }
   state.SetItemsProcessed(state.iterations() * n * n * 2);  // fwd + inv
   state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
 }

 BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);

 // --------------------------------------------------------------------------
 // 1D Cold start (includes plan creation)
 // --------------------------------------------------------------------------

 static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);
   CVec x = CVec::Random(n);

   for (auto _ : state) {
     gpu::FFT<Scalar> fft;  // new object = new plans
     benchmark::DoNotOptimize(fft.fwd(x));
   }
   state.SetItemsProcessed(state.iterations() * n);
 }

 BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
	// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
	//
	// Measures forward and inverse FFT performance across a range of sizes,
	// including plan-amortized (reuse) and cold-start (new plan) scenarios.
	//
	// Usage:
	// cmake --build build-bench-gpu --target bench_fft bench_fft_double
	// ./build-bench-gpu/bench_fft
	// ./build-bench-gpu/bench_fft_double
	//
	// Profiling:
	// nsys profile --trace=cuda ./build-bench-gpu/bench_fft
	// SPDX-FileCopyrightText: The Eigen Authors
	// SPDX-License-Identifier: MPL-2.0

	#include <benchmark/benchmark.h>

	#include <unsupported/Eigen/GPU>

	using namespace Eigen;

	#ifndef SCALAR
	#define SCALAR float
	#endif

	using Scalar = SCALAR;
	using Complex = std::complex<Scalar>;
	using CVec = Matrix<Complex, Dynamic, 1>;
	using RVec = Matrix<Scalar, Dynamic, 1>;
	using CMat = Matrix<Complex, Dynamic, Dynamic>;

	// CUDA warm-up: ensure the GPU is initialized before timing.
	static void cuda_warmup() {
	static bool done = false;
	if (!done) {
	void* p;
	EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
	EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
	done = true;
	}
	}

	// --------------------------------------------------------------------------
	// 1D C2C Forward
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	CVec x = CVec::Random(n);
	gpu::FFT<Scalar> fft;

	// Warm up plan.
	CVec tmp = fft.fwd(x);

	for (auto _ : state) {
	benchmark::DoNotOptimize(fft.fwd(x));
	}
	state.SetItemsProcessed(state.iterations() * n);
	state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write
	}

	BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

	// --------------------------------------------------------------------------
	// 1D C2C Inverse
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	CVec x = CVec::Random(n);
	gpu::FFT<Scalar> fft;
	CVec X = fft.fwd(x);

	for (auto _ : state) {
	benchmark::DoNotOptimize(fft.inv(X));
	}
	state.SetItemsProcessed(state.iterations() * n);
	state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
	}

	BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

	// --------------------------------------------------------------------------
	// 1D R2C Forward
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	RVec r = RVec::Random(n);
	gpu::FFT<Scalar> fft;

	// Warm up plan.
	CVec tmp = fft.fwd(r);

	for (auto _ : state) {
	benchmark::DoNotOptimize(fft.fwd(r));
	}
	state.SetItemsProcessed(state.iterations() * n);
	state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
	}

	BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

	// --------------------------------------------------------------------------
	// 1D C2R Inverse
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	RVec r = RVec::Random(n);
	gpu::FFT<Scalar> fft;
	CVec R = fft.fwd(r);

	for (auto _ : state) {
	benchmark::DoNotOptimize(fft.invReal(R, n));
	}
	state.SetItemsProcessed(state.iterations() * n);
	state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
	}

	BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);

	// --------------------------------------------------------------------------
	// 2D C2C Forward
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0); // square n x n
	CMat A = CMat::Random(n, n);
	gpu::FFT<Scalar> fft;

	// Warm up plan.
	CMat tmp = fft.fwd2(A);

	for (auto _ : state) {
	benchmark::DoNotOptimize(fft.fwd2(A));
	}
	state.SetItemsProcessed(state.iterations() * n * n);
	state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
	}

	BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);

	// --------------------------------------------------------------------------
	// 2D C2C Roundtrip (fwd + inv)
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	CMat A = CMat::Random(n, n);
	gpu::FFT<Scalar> fft;

	// Warm up plans.
	CMat tmp = fft.inv2(fft.fwd2(A));

	for (auto _ : state) {
	CMat B = fft.fwd2(A);
	benchmark::DoNotOptimize(fft.inv2(B));
	}
	state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv
	state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
	}

	BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);

	// --------------------------------------------------------------------------
	// 1D Cold start (includes plan creation)
	// --------------------------------------------------------------------------

	static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);
	CVec x = CVec::Random(n);

	for (auto _ : state) {
	gpu::FFT<Scalar> fft; // new object = new plans
	benchmark::DoNotOptimize(fft.fwd(x));
	}
	state.SetItemsProcessed(state.iterations() * n);
	}

	BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);