| # GPU benchmarks require the CUDA runtime, cuSOLVER, and cuBLAS. |
| # They are built separately from the main benchmark tree because they need the CUDA toolchain. |
| # |
| # Usage: |
| # cmake -G Ninja -B build-bench-gpu -S unsupported/benchmarks/GPU \ |
| # -DCMAKE_CUDA_ARCHITECTURES=89 |
| # cmake --build build-bench-gpu |
| # |
| # Profiling: |
| # nsys profile --trace=cuda ./build-bench-gpu/bench_solvers |
| # ncu --set full -o profile ./build-bench-gpu/bench_solvers --benchmark_filter=BM_GpuLLT_Compute/4096 |
| |
| # CMake 3.17 is the first release that ships the FindCUDAToolkit module used below. |
| cmake_minimum_required(VERSION 3.17) |
| project(EigenGpuBenchmarks LANGUAGES CXX) |
| |
| # External dependencies: Google Benchmark and the CUDA toolkit libraries. |
| # Both are mandatory; configuration stops if either one is missing. |
| find_package(benchmark REQUIRED) |
| find_package(CUDAToolkit REQUIRED) |
| |
| # Eigen checkout root, three directory levels above this file's directory. |
| # It is added to every benchmark's include path by eigen_add_gpu_benchmark(). |
| set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..") |
| |
| # eigen_add_gpu_benchmark(<name> <source> |
| #                         [LIBRARIES <lib>...] |
| #                         [DEFINITIONS <def>...]) |
| # |
| # Adds one benchmark executable target <name> built from <source>. |
| # |
| #   name        - name of the executable target to create. |
| #   source      - source file; a relative path is resolved against |
| #                 CMAKE_CURRENT_SOURCE_DIR. |
| #   LIBRARIES   - extra libraries to link (PRIVATE) in addition to the defaults. |
| #   DEFINITIONS - extra preprocessor definitions (PRIVATE), e.g. SCALAR=float. |
| # |
| # Every target requires at least C++14, gets EIGEN_SOURCE_DIR and the CUDA |
| # toolkit headers on its include path, is compiled with optimization on and |
| # with EIGEN_USE_GPU and NDEBUG defined, and links benchmark::benchmark, |
| # benchmark::benchmark_main, CUDA::cudart, CUDA::cusolver and CUDA::cublas. |
| # |
| # Any argument that is not covered by the keywords above is a fatal error. |
| # |
| # Example: |
| #   eigen_add_gpu_benchmark(bench_foo_float bench_foo.cpp DEFINITIONS SCALAR=float) |
| function(eigen_add_gpu_benchmark name source) |
|   cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN}) |
|   # Reject stray arguments: a misspelled keyword (e.g. DEFINITION) would |
|   # otherwise be dropped silently and produce a mis-configured benchmark. |
|   if(BENCH_UNPARSED_ARGUMENTS) |
|     message(FATAL_ERROR |
|       "eigen_add_gpu_benchmark(${name}): unexpected arguments: ${BENCH_UNPARSED_ARGUMENTS}") |
|   endif() |
| |
|   if(NOT IS_ABSOLUTE "${source}") |
|     set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}") |
|   endif() |
| |
|   add_executable(${name} "${source}") |
|   target_compile_features(${name} PRIVATE cxx_std_14) |
|   target_include_directories(${name} PRIVATE |
|     ${EIGEN_SOURCE_DIR} |
|     ${CUDAToolkit_INCLUDE_DIRS}) |
| |
|   target_link_libraries(${name} PRIVATE |
|     benchmark::benchmark benchmark::benchmark_main |
|     CUDA::cudart CUDA::cusolver CUDA::cublas) |
|   if(BENCH_LIBRARIES) |
|     target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES}) |
|   endif() |
| |
|   # Benchmarks are always optimized, independent of CMAKE_BUILD_TYPE. |
|   # -O3 is the GCC/Clang spelling; MSVC gets its equivalent /O2 instead. |
|   target_compile_options(${name} PRIVATE |
|     "$<IF:$<CXX_COMPILER_ID:MSVC>,/O2,-O3>") |
| |
|   # NDEBUG is passed as a definition (not a raw -D option) so CMake emits it |
|   # in the form the active compiler expects. |
|   target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU NDEBUG) |
|   if(BENCH_DEFINITIONS) |
|     target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS}) |
|   endif() |
| endfunction() |
| |
| # Benchmark suites. Each suite's source file bench_<suite>.cpp is built twice: |
| # once without extra definitions (target bench_<suite>) and once with |
| # SCALAR=float defined (target bench_<suite>_float). |
| # |
| #   solvers  - LLT/LU compute + solve, host vs device paths, CPU baselines. |
| #   chaining - async pipeline efficiency, host-roundtrip vs device chain. |
| #   batching - multi-stream concurrency for many small systems. |
| foreach(suite IN ITEMS solvers chaining batching) |
|   eigen_add_gpu_benchmark(bench_${suite} bench_${suite}.cpp) |
|   eigen_add_gpu_benchmark(bench_${suite}_float bench_${suite}.cpp DEFINITIONS SCALAR=float) |
| endforeach() |