| /* |
| * benchmark_aocl.cpp - AOCL Performance Benchmark Suite for Eigen |
| * |
| * Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. |
| * |
| * This Source Code Form is subject to the terms of the Mozilla Public |
| * License, v. 2.0. If a copy of the MPL was not distributed with this |
| * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| * |
| * Description: |
| * ------------ |
| * This benchmark suite evaluates the performance of Eigen mathematical |
| * operations when integrated with AMD Optimizing CPU Libraries (AOCL). It |
| * tests: |
| * |
| * 1. Vector Math Operations: Transcendental functions (exp, sin, cos, sqrt, |
| * log, etc.) using AOCL Vector Math Library (VML) for optimized |
| * double-precision operations |
| * |
| * 2. Matrix Operations: BLAS Level-3 operations (DGEMM) using AOCL BLAS library |
| * with support for both single-threaded and multithreaded execution |
| * |
| * 3. Linear Algebra: LAPACK operations (eigenvalue decomposition) using |
| * libflame |
| * |
| * 4. Real-world Scenarios: Financial risk computation simulating covariance |
| * matrix calculations and eigenvalue analysis for portfolio optimization |
| * |
| * The benchmark automatically detects AOCL configuration and adjusts test |
| * execution accordingly, providing performance comparisons between standard |
| * Eigen operations and AOCL-accelerated implementations. |
| * |
| * Compilation: |
| * ------------ |
| * # Using AOCC compiler (recommended for best AOCL compatibility): |
| * clang++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \ |
| * -I${AOCL_ROOT}/include \ |
| * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ |
| * -lamdlibm -lm -lblis -lflame -lpthread -lrt -pthread \ |
| * -o build/eigen_aocl_benchmark |
| * |
| * # Alternative: Using GCC with proper library paths: |
| * g++ -O3 -g -DEIGEN_USE_AOCL_ALL -I<PATH_TO_EIGEN_INCLUDE> \ |
| * -I${AOCL_ROOT}/include \ |
| * -Wno-parentheses src/benchmark_aocl.cpp -L${AOCL_ROOT}/lib \ |
| * -lamdlibm -lm -lblis -lflame -lpthread -lrt \ |
| * -o build/eigen_aocl_benchmark |
| * |
| * # For multithreaded BLIS support: |
| * clang++ -O3 -g -fopenmp -DEIGEN_USE_AOCL_MT -I<PATH_TO_EIGEN_INCLUDE> \ |
| * -I${AOCL_ROOT}/include -Wno-parentheses src/benchmark_aocl.cpp \ |
| * -L${AOCL_ROOT}/lib -lamdlibm -lm -lblis-mt -lflame -lpthread -lrt \ |
| * -o build/eigen_aocl_benchmark_mt |
| * |
| * Usage: |
| * ------ |
| * export AOCL_ROOT=/path/to/aocl/installation |
| * export LD_LIBRARY_PATH=$AOCL_ROOT/lib:$LD_LIBRARY_PATH |
| * ./build/eigen_aocl_benchmark |
| * |
| * Developer: |
| * ---------- |
| * Name: Sharad Saurabh Bhaskar |
| * Email: shbhaska@amd.com |
| * Organization: Advanced Micro Devices, Inc. |
| */ |
| |
| #include <chrono> |
| #include <cstdlib> |
| #include <iostream> |
| #include <thread> |
| #include <vector> |
| |
| // Simple - just include Eigen headers |
| #include <Eigen/Core> |
| #include <Eigen/Dense> |
| #include <Eigen/Eigenvalues> |
| |
| // Only include CBLAS if AOCL BLIS is available |
| #ifdef EIGEN_USE_AOCL_ALL |
| #include <cblas.h> |
| #endif |
| |
| using namespace std; |
| using namespace std::chrono; |
| using namespace Eigen; |
| |
| void benchmarkVectorMath(int size) { |
| VectorXd v = VectorXd::LinSpaced(size, 0.1, 10.0); |
| VectorXd result(size); |
| double elapsed_ms = 0; |
| |
| cout << "\n--- Vector Math Benchmark (size = " << size << ") ---" << endl; |
| |
| auto start = high_resolution_clock::now(); |
| result = v.array().exp(); |
| auto end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "exp() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().sin(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "sin() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().cos(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "cos() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().sqrt(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "sqrt() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().cbrt(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "cbrt() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().abs(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "abs() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().log(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "log() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().log10(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "log10() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().exp2(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "exp2() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().asin(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "asin() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().sinh(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "sinh() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().acos(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "acos() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().cosh(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "cosh() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().tan(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "tan() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().atan(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "atan() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().tanh(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "tanh() time: " << elapsed_ms << " ms" << endl; |
| |
| VectorXd v2 = VectorXd::Random(size); |
| start = high_resolution_clock::now(); |
| result = v.array() + v2.array(); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "add() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().pow(2.0); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "pow() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().max(v2.array()); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "max() time: " << elapsed_ms << " ms" << endl; |
| |
| start = high_resolution_clock::now(); |
| result = v.array().min(v2.array()); |
| end = high_resolution_clock::now(); |
| elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "min() time: " << elapsed_ms << " ms" << endl; |
| } |
| |
| // Function to benchmark BLAS operation: Matrix multiplication. |
| void benchmarkMatrixMultiplication(int matSize) { |
| cout << "\n--- BLIS-st DGEMM Benchmark (" << matSize << " x " << matSize |
| << ") ---" << endl; |
| |
| MatrixXd A = MatrixXd::Random(matSize, matSize); |
| MatrixXd B = MatrixXd::Random(matSize, matSize); |
| MatrixXd C(matSize, matSize); |
| |
| auto start = high_resolution_clock::now(); |
| C = A * B; |
| auto end = high_resolution_clock::now(); |
| double elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| cout << "Matrix multiplication time: " << elapsed_ms << " ms" << endl; |
| } |
| |
// Benchmarks DGEMM by calling BLIS directly through its CBLAS interface.
//
// matSize:    dimension of the square row-major operands.
// numThreads: requested BLIS thread count; values below 1 are treated as 1
//             (std::thread::hardware_concurrency() may legitimately return 0).
//
// The benchmark body is only compiled when EIGEN_AOCL_USE_BLIS_MT is defined;
// otherwise the function just prints that multithreaded support is disabled.
void benchmarkBlisMultithreaded(int matSize, int numThreads) {
#if defined(EIGEN_AOCL_USE_BLIS_MT)
  const int threadCount = numThreads > 0 ? numThreads : 1;
  std::cout << "\n--- BLIS-mt DGEMM Benchmark (" << matSize << " x " << matSize
            << ", threads=" << threadCount << ") ---" << std::endl;
  if (matSize <= 0) {
    std::cout << "Invalid matrix size; skipping BLIS benchmark." << std::endl;
    return;
  }

  // Compute the element count in size_t: matSize * matSize evaluated in int
  // overflows for large dimensions.
  const std::size_t elementCount =
      static_cast<std::size_t>(matSize) * static_cast<std::size_t>(matSize);
  std::vector<double> A(elementCount);
  std::vector<double> B(elementCount);
  std::vector<double> C(elementCount);
  for (double& value : A) {
    value = static_cast<double>(std::rand()) / RAND_MAX;
  }
  for (double& value : B) {
    value = static_cast<double>(std::rand()) / RAND_MAX;
  }

  const double alpha = 1.0;
  const double beta = 0.0;

  // NOTE(review): BLIS may read BLIS_NUM_THREADS only while the library
  // initializes; if an earlier BLAS call already initialized it, this setting
  // might be ignored. Consider the BLIS runtime threading API - confirm.
  const std::string threadSetting = std::to_string(threadCount);
  setenv("BLIS_NUM_THREADS", threadSetting.c_str(), 1);

  const auto begin = std::chrono::steady_clock::now();
  cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, matSize, matSize,
              matSize, alpha, A.data(), matSize, B.data(), matSize, beta,
              C.data(), matSize);
  const auto finish = std::chrono::steady_clock::now();

  // Floating-point milliseconds keep sub-millisecond resolution.
  const double elapsed_ms =
      std::chrono::duration<double, std::milli>(finish - begin).count();
  std::cout << "BLIS dgemm time: " << elapsed_ms << " ms" << std::endl;
#else
  // Parameters are unused in this configuration; silence compiler warnings.
  (void)matSize;
  (void)numThreads;
  std::cout << "\nBLIS multithreaded support not enabled." << std::endl;
#endif
}
| |
| // Function to benchmark LAPACK operation: Eigenvalue decomposition. |
| void benchmarkEigenDecomposition(int matSize) { |
| cout << "\n--- Eigenvalue Decomposition Benchmark (Matrix Size: " << matSize |
| << " x " << matSize << ") ---" << endl; |
| MatrixXd M = MatrixXd::Random(matSize, matSize); |
| // Make matrix symmetric (necessary for eigenvalue decomposition of |
| // self-adjoint matrices) |
| M = (M + M.transpose()) * 0.5; |
| |
| SelfAdjointEigenSolver<MatrixXd> eigensolver; |
| auto start = high_resolution_clock::now(); |
| eigensolver.compute(M); |
| auto end = high_resolution_clock::now(); |
| double elapsed_ms = duration_cast<milliseconds>(end - start).count(); |
| if (eigensolver.info() == Success) { |
| cout << "Eigenvalue decomposition time: " << elapsed_ms << " ms" << endl; |
| } else { |
| cout << "Eigenvalue decomposition failed." << endl; |
| } |
| } |
| |
| // Function simulating a real-world FSI risk computation scenario. |
| // Example: Compute covariance matrix from simulated asset returns, then perform |
| // eigenvalue decomposition. |
| void benchmarkFSIRiskComputation(int numPeriods, int numAssets) { |
| cout << "\n--- FSI Risk Computation Benchmark ---" << endl; |
| cout << "Simulating " << numPeriods << " periods for " << numAssets |
| << " assets." << endl; |
| |
| // Simulate asset returns: each column represents an asset's returns. |
| MatrixXd returns = MatrixXd::Random(numPeriods, numAssets); |
| |
| // Compute covariance matrix: cov = (returns^T * returns) / (numPeriods - 1) |
| auto start = high_resolution_clock::now(); |
| MatrixXd cov = (returns.transpose() * returns) / (numPeriods - 1); |
| auto end = high_resolution_clock::now(); |
| double cov_time = duration_cast<milliseconds>(end - start).count(); |
| cout << "Covariance matrix computation time: " << cov_time << " ms" << endl; |
| |
| // Eigenvalue decomposition on covariance matrix. |
| SelfAdjointEigenSolver<MatrixXd> eigensolver; |
| start = high_resolution_clock::now(); |
| eigensolver.compute(cov); |
| end = high_resolution_clock::now(); |
| double eig_time = duration_cast<milliseconds>(end - start).count(); |
| if (eigensolver.info() == Success) { |
| cout << "Eigenvalue decomposition (covariance) time: " << eig_time << " ms" |
| << endl; |
| cout << "Top 3 Eigenvalues: " |
| << eigensolver.eigenvalues().tail(3).transpose() << endl; |
| } else { |
| cout << "Eigenvalue decomposition failed." << endl; |
| } |
| } |
| |
| int main() { |
| cout << "=== AOCL Benchmark for Eigen on AMD Platforms ===" << endl; |
| cout << "Developer: Sharad Saurabh Bhaskar (shbhaska@amd.com)" << endl; |
| cout << "Organization: Advanced Micro Devices, Inc." << endl; |
| cout << "License: Mozilla Public License 2.0" << endl << endl; |
| |
| // Print AOCL configuration |
| #ifdef EIGEN_USE_AOCL_MT |
| cout << "AOCL Mode: MULTITHREADED (MT)" << endl; |
| cout << "Features: Multithreaded BLIS, AOCL VML, LAPACK" << endl; |
| #elif defined(EIGEN_USE_AOCL_ALL) |
| cout << "AOCL Mode: SINGLE-THREADED (ALL)" << endl; |
| cout << "Features: Single-threaded BLIS, AOCL VML, LAPACK" << endl; |
| #else |
| cout << "AOCL Mode: DISABLED" << endl; |
| cout << "Using standard Eigen implementation" << endl; |
| #endif |
| cout << "Hardware threads available: " << thread::hardware_concurrency() << endl << endl; |
| |
| // Benchmark vector math functions with varying vector sizes. |
| vector<int> vectorSizes = {5000000, 10000000, 50000000}; |
| for (int size : vectorSizes) { |
| benchmarkVectorMath(size); |
| } |
| |
| // Benchmark matrix multiplication for varying sizes. |
| vector<int> matrixSizes = {1024}; |
| for (int msize : matrixSizes) { |
| benchmarkMatrixMultiplication(msize); |
| #if defined(EIGEN_AOCL_USE_BLIS_MT) |
| benchmarkBlisMultithreaded(msize, thread::hardware_concurrency()); |
| #endif |
| } |
| |
| // Benchmark LAPACK: Eigenvalue Decomposition. |
| for (int msize : matrixSizes) { |
| benchmarkEigenDecomposition(msize); |
| } |
| |
| // Benchmark a complex FSI risk computation scenario. |
| // For example, simulate 10,000 time periods (days) for 500 assets. |
| benchmarkFSIRiskComputation(10000, 500); |
| |
| cout << "\n=== Benchmark Complete ===" << endl; |
| return 0; |
| } |