blob: ce83df206881e1ac36996b81c467b85be6e347c1 [file] [edit]
// GEMM dispatch benchmark: measure DeviceMatrix GEMM throughput across sizes.
//
// Compares cublasLtMatmul with plan cache against a raw cublasGemmEx call
// (no descriptor overhead) to verify no regression.
//
// Usage:
// cmake --build build-bench-gpu --target bench_gemm
// ./build-bench-gpu/bench_gemm
//
// Profiling:
// nsys profile --trace=cuda ./build-bench-gpu/bench_gemm
// SPDX-FileCopyrightText: The Eigen Authors
// SPDX-License-Identifier: MPL-2.0
#include <benchmark/benchmark.h>
// EIGEN_USE_GPU is set by the CMake target (eigen_add_gpu_benchmark).
#include <unsupported/Eigen/GPU>
using namespace Eigen;
#ifndef SCALAR
#define SCALAR double
#endif
using Scalar = SCALAR;
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
static void cuda_warmup() {
static bool done = false;
if (!done) {
void* p;
cudaMalloc(&p, 256);
cudaFree(p);
// Force context creation and JIT.
gpu::Context ctx;
Mat A = Mat::Random(64, 64);
Mat B = Mat::Random(64, 64);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A, ctx.stream());
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B, ctx.stream());
gpu::DeviceMatrix<Scalar> d_C;
d_C.device(ctx) = d_A * d_B;
if (cudaDeviceSynchronize() != cudaSuccess) abort();
done = true;
}
}
// --------------------------------------------------------------------------
// DeviceMatrix GEMM (uses cublasLtMatmul with plan cache)
// --------------------------------------------------------------------------
static void BM_DeviceMatrix_Gemm(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
gpu::Context ctx;
Mat hostA = Mat::Random(n, n);
Mat hostB = Mat::Random(n, n);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
gpu::DeviceMatrix<Scalar> d_C;
// Warmup: run a few GEMMs to stabilize clocks and populate plan cache.
for (int i = 0; i < 5; ++i) {
d_C.device(ctx) = d_A * d_B;
}
if (cudaDeviceSynchronize() != cudaSuccess) abort();
for (auto _ : state) {
d_C.device(ctx) = d_A * d_B;
if (cudaDeviceSynchronize() != cudaSuccess) abort();
}
double flops = 2.0 * n * n * n;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// --------------------------------------------------------------------------
// Raw cublasGemmEx (direct call, no descriptor overhead)
// --------------------------------------------------------------------------
static void BM_Raw_CublasGemmEx(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
gpu::Context ctx;
Mat hostA = Mat::Random(n, n);
Mat hostB = Mat::Random(n, n);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
gpu::DeviceMatrix<Scalar> d_C(n, n);
constexpr cudaDataType_t dtype = gpu::internal::cuda_data_type<Scalar>::value;
constexpr cublasComputeType_t compute = gpu::internal::cuda_compute_type<Scalar>::value;
Scalar alpha = Scalar(1);
Scalar beta = Scalar(0);
const int ni = static_cast<int>(n);
constexpr cublasGemmAlgo_t algo = gpu::internal::cuda_gemm_algo();
// Warmup.
for (int i = 0; i < 5; ++i) {
cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
if (s != CUBLAS_STATUS_SUCCESS) {
state.SkipWithError("cublasGemmEx failed");
return;
}
}
if (cudaDeviceSynchronize() != cudaSuccess) abort();
for (auto _ : state) {
cublasStatus_t s = cublasGemmEx(ctx.cublasHandle(), CUBLAS_OP_N, CUBLAS_OP_N, ni, ni, ni, &alpha, d_A.data(), dtype,
ni, d_B.data(), dtype, ni, &beta, d_C.data(), dtype, ni, compute, algo);
if (s != CUBLAS_STATUS_SUCCESS) {
state.SkipWithError("cublasGemmEx failed");
return;
}
if (cudaDeviceSynchronize() != cudaSuccess) abort();
}
double flops = 2.0 * n * n * n;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// --------------------------------------------------------------------------
// DeviceMatrix GEMM with transpose: C = A^T * B
// --------------------------------------------------------------------------
static void BM_DeviceMatrix_Gemm_TransA(benchmark::State& state) {
cuda_warmup();
const Index n = state.range(0);
gpu::Context ctx;
Mat hostA = Mat::Random(n, n);
Mat hostB = Mat::Random(n, n);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(hostA, ctx.stream());
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(hostB, ctx.stream());
gpu::DeviceMatrix<Scalar> d_C;
for (int i = 0; i < 5; ++i) {
d_C.device(ctx) = d_A.transpose() * d_B;
}
if (cudaDeviceSynchronize() != cudaSuccess) abort();
for (auto _ : state) {
d_C.device(ctx) = d_A.transpose() * d_B;
if (cudaDeviceSynchronize() != cudaSuccess) abort();
}
double flops = 2.0 * n * n * n;
state.counters["GFLOPS"] =
benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
state.counters["n"] = n;
}
// Square GEMM: range of sizes from small (where descriptor overhead matters)
// to large (where compute dominates).
BENCHMARK(BM_DeviceMatrix_Gemm)
->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_Raw_CublasGemmEx)
->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_DeviceMatrix_Gemm_TransA)
->ArgsProduct({{16, 32, 64, 128, 256, 512, 1024, 2048, 4096}})
->Unit(benchmark::kMicrosecond);