unsupported/benchmarks/GPU/bench_gemm.cpp - mirror - Git at Google

 // GEMM dispatch benchmark: measure DeviceMatrix GEMM throughput across sizes.
 //
 // Compares cublasLtMatmul with plan cache against a raw cublasGemmEx call
 // (no descriptor overhead) to verify no regression.
 //
 // Usage:
 //   cmake --build build-bench-gpu --target bench_gemm
 //   ./build-bench-gpu/bench_gemm
 //
 // Profiling:
 //   nsys profile --trace=cuda ./build-bench-gpu/bench_gemm
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 #include <benchmark/benchmark.h>

 // EIGEN_USE_GPU is set by the CMake target (eigen_add_gpu_benchmark).
 #include <unsupported/Eigen/GPU>

 using namespace Eigen;

 #ifndef SCALAR
 #define SCALAR double
 #endif

 using Scalar = SCALAR;
 using Mat = Matrix<Scalar, Dynamic, Dynamic>;

 static void cuda_warmup() {
   static bool done = false;
   if (!done) {
     void* p;
     cudaMalloc(&p, 256);
     cudaFree(p);
     // Force context creation and JIT.
     gpu::Context ctx;
     Mat A = Mat::Random(64, 64);
     Mat B = Mat::Random(64, 64);
     auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A, ctx.stream());
     auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B, ctx.stream());
     gpu::DeviceMatrix<Scalar> d_C;
     d_C.device(ctx) = d_A * d_B;
     if (cudaDeviceSynchronize() != cudaSuccess) abort();
     done = true;
   }
 }

 // --------------------------------------------------------------------------
 // DeviceMatrix GEMM (uses cublasLtMatmul with plan cache)
 // --------------------------------------------------------------------------

 static void BM_DeviceMatrix_Gemm(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);

   gpu::Context ctx;
   Mat hostA = Mat::Random(n, n);
   Mat hostB = Mat::Random(n, n);
   auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
   auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
   gpu::DeviceMatrix<Scalar> d_C;

   // Warmup: run a few GEMMs to stabilize clocks and populate plan cache.
   for (int i = 0; i < 5; ++i) {
     d_C.device(ctx) = d_A * d_B;
   }
   if (cudaDeviceSynchronize() != cudaSuccess) abort();

   for (auto _ : state) {
     d_C.device(ctx) = d_A * d_B;
     if (cudaDeviceSynchronize() != cudaSuccess) abort();
   }

   double flops = 2.0 * n * n * n;
   state.counters["GFLOPS"] =
       benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
   state.counters["n"] = n;
 }

 // --------------------------------------------------------------------------
 // Raw cublasGemmEx (direct call, no descriptor overhead)
 // --------------------------------------------------------------------------

 static void BM_Raw_CublasGemmEx(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);

   gpu::Context ctx;
   Mat hostA = Mat::Random(n, n);
   Mat hostB = Mat::Random(n, n);
   auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
   auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
   gpu::DeviceMatrix<Scalar> d_C(n, n);

   constexpr cudaDataType_t dtype = gpu::internal::cuda_data_type<Scalar>::value;
   constexpr cublasComputeType_t compute = gpu::internal::cuda_compute_type<Scalar>::value;
   Scalar alpha = Scalar(1);
   Scalar beta = Scalar(0);
   const int ni = static_cast<int>(n);

   constexpr cublasGemmAlgo_t algo = gpu::internal::cuda_gemm_algo();

   // Warmup.
   for (int i = 0; i < 5; ++i) {
     cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
                                     ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
     if (s != CUBLAS_STATUS_SUCCESS) {
       state.SkipWithError("cublasGemmEx failed");
       return;
     }
   }
   if (cudaDeviceSynchronize() != cudaSuccess) abort();

   for (auto _ : state) {
     cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
                                     ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
     if (s != CUBLAS_STATUS_SUCCESS) {
       state.SkipWithError("cublasGemmEx failed");
       return;
     }
     if (cudaDeviceSynchronize() != cudaSuccess) abort();
   }

   double flops = 2.0 * n * n * n;
   state.counters["GFLOPS"] =
       benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
   state.counters["n"] = n;
 }

 // --------------------------------------------------------------------------
 // DeviceMatrix GEMM with transpose: C = A^T * B
 // --------------------------------------------------------------------------

 static void BM_DeviceMatrix_Gemm_TransA(benchmark::State& state) {
   cuda_warmup();
   const Index n = state.range(0);

   gpu::Context ctx;
   Mat hostA = Mat::Random(n, n);
   Mat hostB = Mat::Random(n, n);
   auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
   auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
   gpu::DeviceMatrix<Scalar> d_C;

   for (int i = 0; i < 5; ++i) {
     d_C.device(ctx) = d_A.transpose() * d_B;
   }
   if (cudaDeviceSynchronize() != cudaSuccess) abort();

   for (auto _ : state) {
     d_C.device(ctx) = d_A.transpose() * d_B;
     if (cudaDeviceSynchronize() != cudaSuccess) abort();
   }

   double flops = 2.0 * n * n * n;
   state.counters["GFLOPS"] =
       benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
   state.counters["n"] = n;
 }

 // Square GEMM: range of sizes from small (where descriptor overhead matters)
 // to large (where compute dominates).
 BENCHMARK(BM_DeviceMatrix_Gemm)
     ->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
     ->Unit(benchmark::kMicrosecond);

 BENCHMARK(BM_Raw_CublasGemmEx)
     ->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
     ->Unit(benchmark::kMicrosecond);

 BENCHMARK(BM_DeviceMatrix_Gemm_TransA)
     ->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
     ->Unit(benchmark::kMicrosecond);
	// GEMM dispatch benchmark: measure DeviceMatrix GEMM throughput across sizes.
	//
	// Compares cublasLtMatmul with plan cache against a raw cublasGemmEx call
	// (no descriptor overhead) to verify no regression.
	//
	// Usage:
	// cmake --build build-bench-gpu --target bench_gemm
	// ./build-bench-gpu/bench_gemm
	//
	// Profiling:
	// nsys profile --trace=cuda ./build-bench-gpu/bench_gemm
	// SPDX-FileCopyrightText: The Eigen Authors
	// SPDX-License-Identifier: MPL-2.0

	#include <benchmark/benchmark.h>

	// EIGEN_USE_GPU is set by the CMake target (eigen_add_gpu_benchmark).
	#include <unsupported/Eigen/GPU>

	using namespace Eigen;

	#ifndef SCALAR
	#define SCALAR double
	#endif

	using Scalar = SCALAR;
	using Mat = Matrix<Scalar, Dynamic, Dynamic>;

	static void cuda_warmup() {
	static bool done = false;
	if (!done) {
	void* p;
	cudaMalloc(&p, 256);
	cudaFree(p);
	// Force context creation and JIT.
	gpu::Context ctx;
	Mat A = Mat::Random(64, 64);
	Mat B = Mat::Random(64, 64);
	auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A, ctx.stream());
	auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_C;
	d_C.device(ctx) = d_A * d_B;
	if (cudaDeviceSynchronize() != cudaSuccess) abort();
	done = true;
	}
	}

	// --------------------------------------------------------------------------
	// DeviceMatrix GEMM (uses cublasLtMatmul with plan cache)
	// --------------------------------------------------------------------------

	static void BM_DeviceMatrix_Gemm(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);

	gpu::Context ctx;
	Mat hostA = Mat::Random(n, n);
	Mat hostB = Mat::Random(n, n);
	auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
	auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_C;

	// Warmup: run a few GEMMs to stabilize clocks and populate plan cache.
	for (int i = 0; i < 5; ++i) {
	d_C.device(ctx) = d_A * d_B;
	}
	if (cudaDeviceSynchronize() != cudaSuccess) abort();

	for (auto _ : state) {
	d_C.device(ctx) = d_A * d_B;
	if (cudaDeviceSynchronize() != cudaSuccess) abort();
	}

	double flops = 2.0 * n * n * n;
	state.counters["GFLOPS"] =
	benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
	state.counters["n"] = n;
	}

	// --------------------------------------------------------------------------
	// Raw cublasGemmEx (direct call, no descriptor overhead)
	// --------------------------------------------------------------------------

	static void BM_Raw_CublasGemmEx(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);

	gpu::Context ctx;
	Mat hostA = Mat::Random(n, n);
	Mat hostB = Mat::Random(n, n);
	auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
	auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_C(n, n);

	constexpr cudaDataType_t dtype = gpu::internal::cuda_data_type<Scalar>::value;
	constexpr cublasComputeType_t compute = gpu::internal::cuda_compute_type<Scalar>::value;
	Scalar alpha = Scalar(1);
	Scalar beta = Scalar(0);
	const int ni = static_cast<int>(n);

	constexpr cublasGemmAlgo_t algo = gpu::internal::cuda_gemm_algo();

	// Warmup.
	for (int i = 0; i < 5; ++i) {
	cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
	ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
	if (s != CUBLAS_STATUS_SUCCESS) {
	state.SkipWithError("cublasGemmEx failed");
	return;
	}
	}
	if (cudaDeviceSynchronize() != cudaSuccess) abort();

	for (auto _ : state) {
	cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
	ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
	if (s != CUBLAS_STATUS_SUCCESS) {
	state.SkipWithError("cublasGemmEx failed");
	return;
	}
	if (cudaDeviceSynchronize() != cudaSuccess) abort();
	}

	double flops = 2.0 * n * n * n;
	state.counters["GFLOPS"] =
	benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
	state.counters["n"] = n;
	}

	// --------------------------------------------------------------------------
	// DeviceMatrix GEMM with transpose: C = A^T * B
	// --------------------------------------------------------------------------

	static void BM_DeviceMatrix_Gemm_TransA(benchmark::State& state) {
	cuda_warmup();
	const Index n = state.range(0);

	gpu::Context ctx;
	Mat hostA = Mat::Random(n, n);
	Mat hostB = Mat::Random(n, n);
	auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
	auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_C;

	for (int i = 0; i < 5; ++i) {
	d_C.device(ctx) = d_A.transpose() * d_B;
	}
	if (cudaDeviceSynchronize() != cudaSuccess) abort();

	for (auto _ : state) {
	d_C.device(ctx) = d_A.transpose() * d_B;
	if (cudaDeviceSynchronize() != cudaSuccess) abort();
	}

	double flops = 2.0 * n * n * n;
	state.counters["GFLOPS"] =
	benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
	state.counters["n"] = n;
	}

	// Square GEMM: range of sizes from small (where descriptor overhead matters)
	// to large (where compute dominates).
	BENCHMARK(BM_DeviceMatrix_Gemm)
	->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
	->Unit(benchmark::kMicrosecond);

	BENCHMARK(BM_Raw_CublasGemmEx)
	->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
	->Unit(benchmark::kMicrosecond);

	BENCHMARK(BM_DeviceMatrix_Gemm_TransA)
	->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
	->Unit(benchmark::kMicrosecond);