Eigen/GPU [2/5]: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER) libeigen/eigen!2412 Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt index 601e45f..b38be67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.10.0) +cmake_minimum_required(VERSION 3.17) #============================================================================== # CMake Policy issues. @@ -9,7 +9,7 @@ endif (POLICY CMP0077) # NOTE Remove setting the policy once the minimum required CMake version is -# increased to at least 3.15. Retain enabling the export to package registry. +# increased to at least 3.21. Retain enabling the export to package registry. if (POLICY CMP0090) # The export command does not populate package registry by default cmake_policy (SET CMP0090 NEW)
diff --git a/ci/build.linux.gitlab-ci.yml b/ci/build.linux.gitlab-ci.yml index 7dc5ca4..1907986 100644 --- a/ci/build.linux.gitlab-ci.yml +++ b/ci/build.linux.gitlab-ci.yml
@@ -214,11 +214,15 @@ .build:linux:cuda: extends: .build:linux:cross:x86-64 + # CUDA images and compiler packages vary enough that a cached CMake build + # directory can retain stale compiler-feature detection across pipeline runs. + cache: [] variables: # Additional flags passed to the cuda compiler. EIGEN_CI_CUDA_CXX_FLAGS: "" # Compute architectures present in the GitLab CI runners. - EIGEN_CI_CUDA_COMPUTE_ARCH: "70;80" + # SaaS GPU runners have Tesla T4 (sm_75). + EIGEN_CI_CUDA_COMPUTE_ARCH: "75" EIGEN_CI_BUILD_TARGET: buildtests_gpu EIGEN_CI_TEST_CUDA_CLANG: "off" EIGEN_CI_TEST_CUDA_NVC: "off" @@ -237,22 +241,80 @@ # Build on regular linux to limit GPU cost. - saas-linux-2xlarge-amd64 -# GCC-10, CUDA-12.2 -build:linux:cuda-12.2:gcc-10: +# ---- CUDA 11.5 + oldest supported compilers --------------------------------- +# clang-14 fully supports up to CUDA 11.5. CUDA 11.5 images are Ubuntu 20.04 +# only, which lacks clang-14 and cmake >= 3.17. EIGEN_CI_BEFORE_SCRIPT adds +# the LLVM apt repo (for clang-14) and installs cmake via pip. + +build:linux:cuda-11.5:gcc-10: extends: .build:linux:cuda - image: nvidia/cuda:12.2.0-devel-ubuntu20.04 + image: nvidia/cuda:11.5.2-devel-ubuntu20.04 + before_script: + # Ubuntu 20.04 has cmake 3.16; Eigen requires 3.17+. Pull a newer + # cmake from Kitware's official apt repo for focal. + - apt-get update -y > /dev/null + - apt-get install -y --no-install-recommends wget gnupg ca-certificates > /dev/null + - wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc + | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg + - echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' + > /etc/apt/sources.list.d/kitware.list + - apt-get update -y > /dev/null + - . ci/scripts/common.linux.before_script.sh variables: + EIGEN_CI_INSTALL: gcc-10 g++-10 EIGEN_CI_C_COMPILER: gcc-10 EIGEN_CI_CXX_COMPILER: g++-10 -# Clang-12, CUDA-12.2 -build:linux:cuda-12.2:clang-12: - extends: build:linux:cuda-12.2:gcc-10 +build:linux:cuda-11.5:clang-14: + extends: build:linux:cuda-11.5:gcc-10 + before_script: + # Ubuntu 20.04 lacks clang-14 and has cmake 3.16. Add both the LLVM + # apt repo (for clang-14) and Kitware's apt repo (for cmake >= 3.17) + # before running common.linux.before_script.sh, which apt-installs + # cmake + the requested compiler. + - apt-get update -y > /dev/null + - apt-get install -y --no-install-recommends wget gnupg ca-certificates > /dev/null + - wget -qO - https://apt.llvm.org/llvm-snapshot.gpg.key + | gpg --dearmor -o /usr/share/keyrings/llvm-archive-keyring.gpg + - echo 'deb [signed-by=/usr/share/keyrings/llvm-archive-keyring.gpg] http://apt.llvm.org/focal/ llvm-toolchain-focal-14 main' + > /etc/apt/sources.list.d/llvm-14.list + - wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc + | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg + - echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' + > /etc/apt/sources.list.d/kitware.list + - apt-get update -y > /dev/null + - . ci/scripts/common.linux.before_script.sh variables: - EIGEN_CI_C_COMPILER: clang-12 - EIGEN_CI_CXX_COMPILER: clang++-12 + EIGEN_CI_INSTALL: clang-14 + EIGEN_CI_C_COMPILER: clang-14 + EIGEN_CI_CXX_COMPILER: clang++-14 EIGEN_CI_TEST_CUDA_CLANG: "on" +# ---- New: latest CUDA + newest compilers, sm_75;sm_90 ---------------------- +# sm_75 (T4) coverage comes from every job; sm_90 only from newer CUDA. 
+ +# GCC-13, CUDA-12.6 (nvcc) +# CUDA 12.6 nvcc only supports gcc up to 13. +build:linux:cuda-12.6:gcc-13: + extends: .build:linux:cuda + image: nvidia/cuda:12.6.3-devel-ubuntu24.04 + variables: + EIGEN_CI_C_COMPILER: gcc-13 + EIGEN_CI_CXX_COMPILER: g++-13 + EIGEN_CI_CUDA_COMPUTE_ARCH: "75;90" + +# Clang-19, CUDA-12.6 (clang as CUDA compiler) +# Clang CUDA support lags behind nvcc; clang-19 supports up to CUDA 12.x. +build:linux:cuda-12.6:clang-19: + extends: .build:linux:cuda + image: nvidia/cuda:12.6.3-devel-ubuntu24.04 + variables: + EIGEN_CI_INSTALL: clang-19 + EIGEN_CI_C_COMPILER: clang-19 + EIGEN_CI_CXX_COMPILER: clang++-19 + EIGEN_CI_TEST_CUDA_CLANG: "on" + EIGEN_CI_CUDA_COMPUTE_ARCH: "75;90" + ######### HIP ################################################################## # Note: these are currently build-only, until we get an AMD-supported runner. @@ -402,4 +464,3 @@ - if: $CI_PIPELINE_SOURCE == "merge_request_event" tags: - saas-linux-medium-arm64 -
diff --git a/ci/test.linux.gitlab-ci.yml b/ci/test.linux.gitlab-ci.yml index b4ca0b3..f19a5bf 100644 --- a/ci/test.linux.gitlab-ci.yml +++ b/ci/test.linux.gitlab-ci.yml
@@ -271,23 +271,59 @@ tags: - saas-linux-medium-amd64-gpu-standard -# GCC-10, CUDA-12.2 -test:linux:cuda-12.2:gcc-10: - extends: .test:linux:cuda - image: nvidia/cuda:12.2.0-devel-ubuntu20.04 - needs: [ build:linux:cuda-12.2:gcc-10 ] - variables: - EIGEN_CI_CXX_COMPILER: g++-10 - EIGEN_CI_CC_COMPILER: gcc-10 +# ---- CUDA 11.5 + oldest supported compilers --------------------------------- -# Clang-12, CUDA-12.2 -test:linux:cuda-12.2:clang-12: +# GCC-10, CUDA-11.5 +test:linux:cuda-11.5:gcc-10: extends: .test:linux:cuda - image: nvidia/cuda:12.2.0-devel-ubuntu20.04 - needs: [ build:linux:cuda-12.2:clang-12 ] + image: nvidia/cuda:11.5.2-devel-ubuntu20.04 + needs: [ build:linux:cuda-11.5:gcc-10 ] variables: - EIGEN_CI_CXX_COMPILER: clang++-12 - EIGEN_CI_CC_COMPILER: clang-12 + EIGEN_CI_INSTALL: gcc-10 g++-10 + EIGEN_CI_C_COMPILER: gcc-10 + EIGEN_CI_CXX_COMPILER: g++-10 + +# Clang-14, CUDA-11.5 +test:linux:cuda-11.5:clang-14: + extends: .test:linux:cuda + image: nvidia/cuda:11.5.2-devel-ubuntu20.04 + needs: [ build:linux:cuda-11.5:clang-14 ] + before_script: + # Ubuntu 20.04 lacks clang-14. Add the LLVM apt repo before + # the common before_script installs EIGEN_CI_INSTALL packages. + - apt-get update -y > /dev/null + - apt-get install -y --no-install-recommends wget gnupg > /dev/null + - wget -qO - https://apt.llvm.org/llvm-snapshot.gpg.key + | gpg --dearmor -o /usr/share/keyrings/llvm-archive-keyring.gpg + - echo 'deb [signed-by=/usr/share/keyrings/llvm-archive-keyring.gpg] http://apt.llvm.org/focal/ llvm-toolchain-focal-14 main' + > /etc/apt/sources.list.d/llvm-14.list + - apt-get update -y > /dev/null + - . ci/scripts/common.linux.before_script.sh + variables: + EIGEN_CI_INSTALL: clang-14 + EIGEN_CI_CXX_COMPILER: clang++-14 + EIGEN_CI_C_COMPILER: clang-14 + +# ---- New: latest CUDA ------------------------------------------------------- + +# GCC-13, CUDA-12.6 +test:linux:cuda-12.6:gcc-13: + extends: .test:linux:cuda + image: nvidia/cuda:12.6.3-devel-ubuntu24.04 + needs: [ build:linux:cuda-12.6:gcc-13 ] + variables: + EIGEN_CI_CXX_COMPILER: g++-13 + EIGEN_CI_C_COMPILER: gcc-13 + +# Clang-19, CUDA-12.6 +test:linux:cuda-12.6:clang-19: + extends: .test:linux:cuda + image: nvidia/cuda:12.6.3-devel-ubuntu24.04 + needs: [ build:linux:cuda-12.6:clang-19 ] + variables: + EIGEN_CI_INSTALL: clang-19 + EIGEN_CI_CXX_COMPILER: clang++-19 + EIGEN_CI_C_COMPILER: clang-19 ##### arm ###################################################################### @@ -485,4 +521,3 @@ - if: $CI_PIPELINE_SOURCE == "merge_request_event" tags: - saas-linux-medium-arm64 -
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt index 6d3c57e..d15bfac 100644 --- a/unsupported/Eigen/CMakeLists.txt +++ b/unsupported/Eigen/CMakeLists.txt
@@ -6,6 +6,7 @@ BVH EulerAngles FFT + GPU IterativeSolvers KroneckerProduct LevenbergMarquardt
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU new file mode 100644 index 0000000..ca32cd6 --- /dev/null +++ b/unsupported/Eigen/GPU
@@ -0,0 +1,55 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_GPU_MODULE_H +#define EIGEN_GPU_MODULE_H + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +/** \defgroup GPU_Module GPU module + * + * GPU-accelerated dense linear algebra using NVIDIA CUDA libraries + * (cuSOLVER and cuBLAS). + * + * This module provides explicit GPU solver classes that coexist with Eigen's + * CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU + * implementation globally), GPU classes are separate types the user + * instantiates by choice: + * + * \code + * #define EIGEN_USE_GPU + * #include <unsupported/Eigen/GPU> + * + * // CPU path (unchanged) + * Eigen::LLT<Eigen::MatrixXd> llt_cpu(A); + * + * // GPU path (explicit) + * Eigen::gpu::LLT<double> llt_gpu(A); // L stays on device + * auto X = llt_gpu.solve(B); // only B transferred per solve + * \endcode + * + * Requires CUDA 11.4+. + */ + +#ifdef EIGEN_USE_GPU +// IWYU pragma: begin_exports +#include "src/GPU/DeviceMatrix.h" +#include "src/GPU/GpuContext.h" +#include "src/GPU/DeviceExpr.h" +#include "src/GPU/DeviceBlasExpr.h" +#include "src/GPU/DeviceSolverExpr.h" +#include "src/GPU/DeviceDispatch.h" +#include "src/GPU/GpuLLT.h" +#include "src/GPU/GpuLU.h" +// IWYU pragma: end_exports +#endif + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_GPU_MODULE_H
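For orientation, here is the device-resident flow the module is built around, mirroring the usage notes in the DeviceMatrix.h header comment further down in this MR (a sketch only: gpu::LLT itself lives in GpuLLT.h, which this change merely wires into the module header, and the hypothetical factor_and_solve helper is not part of the patch):

#define EIGEN_USE_GPU
#include <unsupported/Eigen/GPU>

void factor_and_solve(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B) {
  auto d_A = Eigen::gpu::DeviceMatrix<double>::fromHost(A);  // H2D upload (synchronous)
  auto d_B = Eigen::gpu::DeviceMatrix<double>::fromHost(B);
  Eigen::gpu::LLT<double> llt;
  llt.compute(d_A);                  // factor on device (see GpuLLT.h)
  auto d_X = llt.solve(d_B);         // enqueued asynchronously, no host round-trip
  Eigen::MatrixXd X = d_X.toHost();  // download; blocks until the solve has finished
}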
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h new file mode 100644 index 0000000..6201e39 --- /dev/null +++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -0,0 +1,192 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// cuBLAS-specific support types: +// - Error-checking macro +// - Operation enum and mapping to cublasOperation_t +// +// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h. + +#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H +#define EIGEN_GPU_CUBLAS_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include <cublas_v2.h> +#include <cstring> + +namespace Eigen { +namespace gpu { + +// ---- Operation enum --------------------------------------------------------- +// Public flag for transpose/adjoint in BLAS- and solver-style calls. + +enum class GpuOp { NoTrans, Trans, ConjTrans }; + +namespace internal { + +// ---- Error-checking macro --------------------------------------------------- + +#define EIGEN_CUBLAS_CHECK(expr) \ + do { \ + cublasStatus_t _s = (expr); \ + eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \ + } while (0) + +constexpr cublasOperation_t to_cublas_op(GpuOp op) { + switch (op) { + case GpuOp::Trans: + return CUBLAS_OP_T; + case GpuOp::ConjTrans: + return CUBLAS_OP_C; + default: + return CUBLAS_OP_N; + } +} + +// ---- Type-specific cuBLAS wrappers ------------------------------------------ +// cuBLAS uses separate functions per type (Sgemm, Dgemm, etc.). +// These overloaded wrappers allow calling cublasXgemm/cublasXtrsm/etc. +// with any supported scalar type. + +// GEMM wrappers +inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + int k, const float* alpha, const float* A, int lda, const float* B, int ldb, + const float* beta, float* C, int ldc) { + return cublasSgemm(h, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} +inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + int k, const double* alpha, const double* A, int lda, const double* B, int ldb, + const double* beta, double* C, int ldc) { + return cublasDgemm(h, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} +static_assert(sizeof(cuComplex) == sizeof(std::complex<float>), "cuComplex and std::complex<float> layout mismatch"); +static_assert(sizeof(cuDoubleComplex) == sizeof(std::complex<double>), + "cuDoubleComplex and std::complex<double> layout mismatch"); + +// Complex scalar args (alpha, beta) are type-punned from std::complex<T>* +// to cuComplex*/cuDoubleComplex*. A reinterpret_cast violates strict +// aliasing: when inlined, clang/MSVC can elide the caller's store (the +// compiler no longer sees a read through the original type), causing +// segfaults. We use memcpy — the standard-blessed type-pun — for scalars. +// Device array pointers (A, B, C) are opaque to the host compiler, so +// reinterpret_cast is safe there. 
+inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + int k, const std::complex<float>* alpha, const std::complex<float>* A, int lda, + const std::complex<float>* B, int ldb, const std::complex<float>* beta, + std::complex<float>* C, int ldc) { + cuComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasCgemm(h, transA, transB, m, n, k, &a, reinterpret_cast<const cuComplex*>(A), lda, + reinterpret_cast<const cuComplex*>(B), ldb, &b, reinterpret_cast<cuComplex*>(C), ldc); +} +inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + int k, const std::complex<double>* alpha, const std::complex<double>* A, int lda, + const std::complex<double>* B, int ldb, const std::complex<double>* beta, + std::complex<double>* C, int ldc) { + cuDoubleComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasZgemm(h, transA, transB, m, n, k, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc); +} + +// TRSM wrappers +inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha, + const float* A, int lda, float* B, int ldb) { + return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} +inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha, + const double* A, int lda, double* B, int ldb) { + return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} +inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const std::complex<float>* alpha, const std::complex<float>* A, int lda, + std::complex<float>* B, int ldb) { + cuComplex a; + std::memcpy(&a, alpha, sizeof(a)); + return cublasCtrsm(h, side, uplo, trans, diag, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda, + reinterpret_cast<cuComplex*>(B), ldb); +} +inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const std::complex<double>* alpha, const std::complex<double>* A, int lda, + std::complex<double>* B, int ldb) { + cuDoubleComplex a; + std::memcpy(&a, alpha, sizeof(a)); + return cublasZtrsm(h, side, uplo, trans, diag, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<cuDoubleComplex*>(B), ldb); +} + +// SYMM wrappers (real → symm, complex → hemm) +inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, + const float* alpha, const float* A, int lda, const float* B, int ldb, + const float* beta, float* C, int ldc) { + return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} +inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, + const double* alpha, const double* A, int lda, const double* B, int ldb, + const double* beta, double* C, int ldc) { + return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} +inline cublasStatus_t 
cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, + const std::complex<float>* alpha, const std::complex<float>* A, int lda, + const std::complex<float>* B, int ldb, const std::complex<float>* beta, + std::complex<float>* C, int ldc) { + cuComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasChemm(h, side, uplo, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda, + reinterpret_cast<const cuComplex*>(B), ldb, &b, reinterpret_cast<cuComplex*>(C), ldc); +} +inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n, + const std::complex<double>* alpha, const std::complex<double>* A, int lda, + const std::complex<double>* B, int ldb, const std::complex<double>* beta, + std::complex<double>* C, int ldc) { + cuDoubleComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasZhemm(h, side, uplo, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc); +} + +// SYRK wrappers (real → syrk, complex → herk) +inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, + const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) { + return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} +inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, + const double* alpha, const double* A, int lda, const double* beta, double* C, + int ldc) { + return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} +inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, + const float* alpha, const std::complex<float>* A, int lda, const float* beta, + std::complex<float>* C, int ldc) { + return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta, + reinterpret_cast<cuComplex*>(C), ldc); +} +inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, + const double* alpha, const std::complex<double>* A, int lda, const double* beta, + std::complex<double>* C, int ldc) { + return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta, + reinterpret_cast<cuDoubleComplex*>(C), ldc); +} + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CUBLAS_SUPPORT_H
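The point of the overload set is that generic code picks the right cuBLAS entry point at compile time. A minimal sketch (hypothetical gemm_nn helper; assumes CuBlasSupport.h is included, the pointers refer to device memory, and the handle is valid):

template <typename Scalar>
void gemm_nn(cublasHandle_t handle, int m, int n, int k, const Scalar* dA, int lda,
             const Scalar* dB, int ldb, Scalar* dC, int ldc) {
  const Scalar alpha(1), beta(0);
  // Resolves to cublasSgemm/Dgemm/Cgemm/Zgemm; complex alpha/beta are
  // memcpy-punned inside the wrapper, not at every call site.
  EIGEN_CUBLAS_CHECK(Eigen::gpu::internal::cublasXgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, dA, lda,
                                                       dB, ldb, &beta, dC, ldc));
}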
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h new file mode 100644 index 0000000..2de889d --- /dev/null +++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -0,0 +1,130 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// cuSOLVER-specific support types: +// - cuSOLVER error-checking macro +// - RAII wrapper for cusolverDnParams +// - Scalar → cudaDataType_t mapping +// - (UpLo, StorageOrder) → cublasFillMode_t mapping +// +// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK) +// are in GpuSupport.h. + +#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H +#define EIGEN_GPU_CUSOLVER_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include <cusolverDn.h> +#include <cstdio> + +namespace Eigen { +namespace gpu { +namespace internal { + +// ---- Error-checking macros -------------------------------------------------- + +// cuSOLVER does not ship a cusolverGetErrorString() in the public API, so we +// stringify the codes ourselves. Keeps failed asserts actionable. +inline const char* cusolver_status_name(cusolverStatus_t s) { + switch (s) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED"; + default: + return "CUSOLVER_STATUS_UNKNOWN"; + } +} + +inline bool report_cusolver_failure(cusolverStatus_t s, const char* expr, const char* file, int line) { + std::fprintf(stderr, + "cuSOLVER call failed\n" + " expr: %s\n" + " status: %s (%d)\n" + " at: %s:%d\n", + expr, cusolver_status_name(s), static_cast<int>(s), file, line); + return false; +} + +#define EIGEN_CUSOLVER_CHECK(expr) \ + do { \ + cusolverStatus_t _s = (expr); \ + eigen_assert(_s == CUSOLVER_STATUS_SUCCESS || \ + ::Eigen::gpu::internal::report_cusolver_failure(_s, #expr, __FILE__, __LINE__)); \ + } while (0) + +// ---- RAII: cusolverDnParams ------------------------------------------------- + +struct CusolverParams { + cusolverDnParams_t p = nullptr; + + CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); } + + ~CusolverParams() { + if (p) (void)cusolverDnDestroyParams(p); // destructor: can't propagate + } + + // Move-only. + CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; } + CusolverParams& operator=(CusolverParams&& o) noexcept { + if (this != &o) { + if (p) (void)cusolverDnDestroyParams(p); + p = o.p; + o.p = nullptr; + } + return *this; + } + + CusolverParams(const CusolverParams&) = delete; + CusolverParams& operator=(const CusolverParams&) = delete; +}; + +// ---- Scalar → cudaDataType_t ------------------------------------------------ +// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h. 
+template <typename Scalar> +using cusolver_data_type = cuda_data_type<Scalar>; + +// ---- UpLo → cublasFillMode_t ------------------------------------------------ +// cuSOLVER always interprets the matrix as column-major. Callers pass the +// triangle that holds the data in column-major layout. + +template <int UpLo> +struct cusolver_fill_mode; + +template <> +struct cusolver_fill_mode<Lower> { + static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER; +}; +template <> +struct cusolver_fill_mode<Upper> { + static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER; +}; + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CUSOLVER_SUPPORT_H
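Because the assert-or-report pattern in EIGEN_CUSOLVER_CHECK is easy to misread, here is roughly what one invocation expands to (illustrative sketch with a hypothetical make_handle helper; the actual macro additionally wraps the statements in a do/while):

cusolverDnHandle_t make_handle() {
  cusolverDnHandle_t handle = nullptr;
  // EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle)) expands to approximately:
  cusolverStatus_t s = cusolverDnCreate(&handle);
  // On failure the right-hand side runs first (short-circuit), prints the
  // expression, status name, and location, returns false, and the assert fires.
  eigen_assert(s == CUSOLVER_STATUS_SUCCESS ||
               ::Eigen::gpu::internal::report_cusolver_failure(s, "cusolverDnCreate(&handle)", __FILE__, __LINE__));
  return handle;
}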
diff --git a/unsupported/Eigen/src/GPU/DeviceBlasExpr.h b/unsupported/Eigen/src/GPU/DeviceBlasExpr.h new file mode 100644 index 0000000..7597320 --- /dev/null +++ b/unsupported/Eigen/src/GPU/DeviceBlasExpr.h
@@ -0,0 +1,152 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// BLAS Level 3 expression types for gpu::DeviceMatrix (beyond GEMM): +// TrsmExpr -> cublasXtrsm (triangular solve) +// SymmExpr -> cublasXsymm (symmetric multiply, real) +// -> cublasXhemm (Hermitian multiply, complex) +// SyrkExpr -> cublasXsyrk (symmetric rank-k update, real) +// -> cublasXherk (Hermitian rank-k update, complex) + +#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H +#define EIGEN_GPU_DEVICE_BLAS_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include <functional> + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class DeviceMatrix; +template <typename Scalar_, int UpLo_> +class TrsmExpr; + +// ---- TriangularView -------------------------------------------------------- +// d_A.triangularView<Lower>() -> view with .solve(d_B) + +template <typename Scalar_, int UpLo_> +class TriangularView { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + explicit TriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + const DeviceMatrix<Scalar>& matrix() const { return mat_; } + + /** Build a TRSM solve expression. */ + TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> mat_; +}; + +// ---- TrsmExpr: triangularView<UpLo>().solve(B) -> cublasXtrsm -------------- + +template <typename Scalar_, int UpLo_> +class TrsmExpr { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {} + const DeviceMatrix<Scalar>& matrix() const { return A_; } + const DeviceMatrix<Scalar>& rhs() const { return B_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> A_; + std::reference_wrapper<const DeviceMatrix<Scalar>> B_; +}; + +// ---- SelfAdjointView ------------------------------------------------------- +// d_A.selfadjointView<Lower>() -> view that can multiply: view * d_B + +template <typename Scalar_, int UpLo_> +class SelfAdjointView { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + static constexpr int UpLo = UpLo_; + + explicit SelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {} + const DeviceMatrix<Scalar>& matrix() const { return mat_; } + DeviceMatrix<Scalar>& matrix() { return mat_; } + + /** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha) + * computes C = alpha * A * A^H + C (lower triangle only). + * Maps to cublasXsyrk (real) or cublasXherk (complex). */ + void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1)); + + private: + std::reference_wrapper<DeviceMatrix<Scalar>> mat_; +}; + +// Const variant for multiplication only (no rankUpdate). 
+template <typename Scalar_, int UpLo_> +class ConstSelfAdjointView { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + explicit ConstSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + const DeviceMatrix<Scalar>& matrix() const { return mat_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> mat_; +}; + +// ---- SymmExpr: selfadjointView<UpLo>() * B -> cublasXsymm/Xhemm ----------- + +template <typename Scalar_, int UpLo_> +class SymmExpr { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {} + const DeviceMatrix<Scalar>& matrix() const { return A_; } + const DeviceMatrix<Scalar>& rhs() const { return B_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> A_; + std::reference_wrapper<const DeviceMatrix<Scalar>> B_; +}; + +// operator*: SelfAdjointView * Matrix -> SymmExpr (mutable and const variants) +template <typename S, int UpLo> +SymmExpr<S, UpLo> operator*(const SelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) { + return {a.matrix(), b}; +} +template <typename S, int UpLo> +SymmExpr<S, UpLo> operator*(const ConstSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) { + return {a.matrix(), b}; +} + +// ---- SyrkExpr: rankUpdate(A) -> cublasXsyrk/Xherk -------------------------- +// C.rankUpdate(A) computes C += A * A^H (or A^H * A depending on convention). + +template <typename Scalar_, int UpLo_> +class SyrkExpr { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {} + const DeviceMatrix<Scalar>& matrix() const { return A_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> A_; +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_BLAS_EXPR_H
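Taken together, the views above give the familiar Eigen spelling for BLAS-3 operations on device data. A usage sketch (assumes the DeviceMatrix triangularView()/selfadjointView() members that these views document, with assignment triggering the dispatch added in DeviceDispatch.h):

void blas3_examples(Eigen::gpu::DeviceMatrix<double>& d_A,   // general matrix
                    Eigen::gpu::DeviceMatrix<double>& d_L,   // lower-triangular factor
                    Eigen::gpu::DeviceMatrix<double>& d_B) {
  Eigen::gpu::DeviceMatrix<double> d_X, d_C, d_G;
  d_X = d_L.triangularView<Eigen::Lower>().solve(d_B);   // cublasDtrsm (left, lower, non-unit diag)
  d_C = d_A.selfadjointView<Eigen::Lower>() * d_B;       // cublasDsymm (left, lower)
  d_G.selfadjointView<Eigen::Lower>().rankUpdate(d_A);   // cublasDsyrk: G = A * A^T (lower triangle written)
}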
diff --git a/unsupported/Eigen/src/GPU/DeviceDispatch.h b/unsupported/Eigen/src/GPU/DeviceDispatch.h new file mode 100644 index 0000000..26ffc8f --- /dev/null +++ b/unsupported/Eigen/src/GPU/DeviceDispatch.h
@@ -0,0 +1,489 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls. +// +// dispatch_gemm() — GemmExpr → cublasXgemm +// +// Each function documents the exact library call and parameters. + +#ifndef EIGEN_GPU_DEVICE_DISPATCH_H +#define EIGEN_GPU_DEVICE_DISPATCH_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include <climits> + +#include "./DeviceExpr.h" +#include "./DeviceBlasExpr.h" +#include "./DeviceSolverExpr.h" +#include "./GpuContext.h" +#include "./CuSolverSupport.h" + +namespace Eigen { +namespace gpu { +namespace internal { + +template <typename Scalar> +bool aliases_device_memory(const DeviceMatrix<Scalar>& a, const DeviceMatrix<Scalar>& b) { + return a.data() != nullptr && a.data() == b.data(); +} + +// ---- GEMM dispatch ---------------------------------------------------------- +// GemmExpr<Lhs, Rhs> → cublasXgemm (type-specific Sgemm/Dgemm/Cgemm/Zgemm). + +template <typename Lhs, typename Rhs> +void dispatch_gemm( + Context& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr, + typename device_expr_traits<Lhs>::scalar_type beta_val, + typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) { + using Scalar = typename device_expr_traits<Lhs>::scalar_type; + using traits_lhs = device_expr_traits<Lhs>; + using traits_rhs = device_expr_traits<Rhs>; + + const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs()); + const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs()); + + constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op); + constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op); + + // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n) + const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols(); + const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows(); + const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows(); + const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? 
B.rows() : B.cols(); + + eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch"); + + const int64_t lda = A.outerStride(); + const int64_t ldb = B.outerStride(); + + eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix GEMM destination aliases lhs operand"); + eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix GEMM destination aliases rhs operand"); + + if (!dst.empty()) { + dst.waitReady(ctx.stream()); + } + + const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n; + if (resized) { + dst.resize(m, n); + } + const int64_t ldc = dst.outerStride(); + + Scalar alpha_local = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()); + + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + + if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream())); + } + + eigen_assert(m <= INT_MAX && n <= INT_MAX && k <= INT_MAX && lda <= INT_MAX && ldb <= INT_MAX && ldc <= INT_MAX && + "cublasXgemm dimensions exceed int range"); + + Scalar scalars[2] = {alpha_local, beta_val}; + EIGEN_CUBLAS_CHECK(cublasXgemm(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n), + static_cast<int>(k), &scalars[0], A.data(), static_cast<int>(lda), B.data(), + static_cast<int>(ldb), &scalars[1], dst.data(), static_cast<int>(ldc))); + + dst.recordReady(ctx.stream()); +} + +// ---- LLT solve dispatch ----------------------------------------------------- + +template <typename Scalar, int UpLo> +void dispatch_llt_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) { + const DeviceMatrix<Scalar>& A = expr.matrix(); + const DeviceMatrix<Scalar>& B = expr.rhs(); + + eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix"); + eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size"); + + const int64_t n = static_cast<int64_t>(A.rows()); + const int64_t nrhs = static_cast<int64_t>(B.cols()); + + if (n == 0 || nrhs == 0) { + if (!dst.empty()) dst.waitReady(ctx.stream()); + dst.resize(n, B.cols()); + return; + } + + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + if (!dst.empty()) dst.waitReady(ctx.stream()); + + constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; + constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo>::value; + const int64_t lda = static_cast<int64_t>(A.outerStride()); + const int64_t ldb = static_cast<int64_t>(B.outerStride()); + const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); + const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); + + DeviceBuffer d_factor(mat_bytes); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_factor.get(), A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); + + // Two info slots (potrf, potrs) so we can queue both kernels back-to-back + // and host-sync once at the end. If potrf fails, potrs runs on garbage but + // the assert fires after the single sync — saving a round trip. 
+ PinnedHostBuffer h_info(2 * sizeof(int)); + int* info_words = static_cast<int*>(h_info.get()); + + CusolverParams params; + DeviceBuffer d_info(2 * sizeof(int)); + int* d_info_potrf = static_cast<int*>(d_info.get()); + int* d_info_potrs = d_info_potrf + 1; + size_t dev_ws = 0, host_ws = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, n, dtype, d_factor.get(), lda, + dtype, &dev_ws, &host_ws)); + + DeviceBuffer d_workspace(dev_ws); + std::vector<char> h_workspace(host_ws); + + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(ctx.cusolverHandle(), params.p, uplo, n, dtype, d_factor.get(), lda, dtype, + d_workspace.get(), dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws, + d_info_potrf)); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_words[0], d_info_potrf, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + + dst.resize(n, B.cols()); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); + + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, n, nrhs, dtype, d_factor.get(), lda, + dtype, dst.data(), static_cast<int64_t>(dst.outerStride()), d_info_potrs)); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_words[1], d_info_potrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream())); + eigen_assert(info_words[0] == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)"); + eigen_assert(info_words[1] == 0 && "cuSOLVER LLT solve failed"); + + dst.recordReady(ctx.stream()); +} + +// ---- LU solve dispatch ------------------------------------------------------ + +template <typename Scalar> +void dispatch_lu_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) { + const DeviceMatrix<Scalar>& A = expr.matrix(); + const DeviceMatrix<Scalar>& B = expr.rhs(); + + eigen_assert(A.rows() == A.cols() && "LU requires a square matrix"); + eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size"); + + const int64_t n = static_cast<int64_t>(A.rows()); + const int64_t nrhs = static_cast<int64_t>(B.cols()); + + if (n == 0 || nrhs == 0) { + if (!dst.empty()) dst.waitReady(ctx.stream()); + dst.resize(n, B.cols()); + return; + } + + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + if (!dst.empty()) dst.waitReady(ctx.stream()); + + constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; + const int64_t lda = static_cast<int64_t>(A.outerStride()); + const int64_t ldb = static_cast<int64_t>(B.outerStride()); + const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); + const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); + const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t); + + DeviceBuffer d_lu(mat_bytes); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.get(), A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); + + DeviceBuffer d_ipiv(ipiv_bytes); + + // See dispatch_llt_solve: two info slots + single end-of-chain sync. 
+ PinnedHostBuffer h_info(2 * sizeof(int)); + int* info_words = static_cast<int*>(h_info.get()); + + CusolverParams params; + DeviceBuffer d_info(2 * sizeof(int)); + int* d_info_getrf = static_cast<int*>(d_info.get()); + int* d_info_getrs = d_info_getrf + 1; + size_t dev_ws = 0, host_ws = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, n, n, dtype, d_lu.get(), lda, dtype, + &dev_ws, &host_ws)); + + DeviceBuffer d_workspace(dev_ws); + std::vector<char> h_workspace(host_ws); + + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(ctx.cusolverHandle(), params.p, n, n, dtype, d_lu.get(), lda, + static_cast<int64_t*>(d_ipiv.get()), dtype, d_workspace.get(), dev_ws, + host_ws > 0 ? h_workspace.data() : nullptr, host_ws, d_info_getrf)); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_words[0], d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + + dst.resize(n, B.cols()); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); + + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, n, nrhs, dtype, d_lu.get(), lda, + static_cast<const int64_t*>(d_ipiv.get()), dtype, dst.data(), + static_cast<int64_t>(dst.outerStride()), d_info_getrs)); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_words[1], d_info_getrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream())); + eigen_assert(info_words[0] == 0 && "cuSOLVER LU factorization failed (singular matrix)"); + eigen_assert(info_words[1] == 0 && "cuSOLVER LU solve failed"); + + dst.recordReady(ctx.stream()); +} + +// ---- TRSM dispatch ---------------------------------------------------------- + +template <typename Scalar, int UpLo> +void dispatch_trsm(Context& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) { + const DeviceMatrix<Scalar>& A = expr.matrix(); + const DeviceMatrix<Scalar>& B = expr.rhs(); + + eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix"); + eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size"); + + eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && + "cublasXtrsm dimensions exceed int range"); + + const int n = static_cast<int>(A.rows()); + const int nrhs = static_cast<int>(B.cols()); + + if (n == 0 || nrhs == 0) { + if (!dst.empty()) dst.waitReady(ctx.stream()); + dst.resize(n, B.cols()); + return; + } + + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix TRSM destination aliases triangular operand"); + eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix TRSM destination aliases RHS operand"); + if (!dst.empty()) dst.waitReady(ctx.stream()); + + dst.resize(n, B.cols()); + const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); + + constexpr cublasFillMode_t uplo = (UpLo == Lower) ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + Scalar alpha(1); + + EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs, + &alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(), + static_cast<int>(dst.outerStride()))); + + dst.recordReady(ctx.stream()); +} + +// ---- SYMM/HEMM dispatch ----------------------------------------------------- + +template <typename Scalar, int UpLo> +void dispatch_symm(Context& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) { + const DeviceMatrix<Scalar>& A = expr.matrix(); + const DeviceMatrix<Scalar>& B = expr.rhs(); + + eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix"); + eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size"); + eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && B.outerStride() <= INT_MAX && + "cublasXsymm dimensions exceed int range"); + + const int m = static_cast<int>(A.rows()); + const int n = static_cast<int>(B.cols()); + + if (m == 0 || n == 0) { + if (!dst.empty()) dst.waitReady(ctx.stream()); + dst.resize(m == 0 ? 0 : m, B.cols()); + return; + } + + A.waitReady(ctx.stream()); + B.waitReady(ctx.stream()); + eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix SYMM destination aliases self-adjoint operand"); + eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix SYMM destination aliases RHS operand"); + if (!dst.empty()) dst.waitReady(ctx.stream()); + + dst.resize(m, n); + + constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + Scalar scalars[2] = {Scalar(1), Scalar(0)}; + + EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &scalars[0], A.data(), + static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), + &scalars[1], dst.data(), static_cast<int>(dst.outerStride()))); + + dst.recordReady(ctx.stream()); +} + +// ---- SYRK/HERK dispatch ----------------------------------------------------- + +template <typename Scalar, int UpLo> +void dispatch_syrk(Context& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr, + typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) { + using RealScalar = typename NumTraits<Scalar>::Real; + const DeviceMatrix<Scalar>& A = expr.matrix(); + + eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && A.outerStride() <= INT_MAX && + "cublasXsyrk dimensions exceed int range"); + + const int n = static_cast<int>(A.rows()); + const int k = static_cast<int>(A.cols()); + + if (n == 0) { + if (!dst.empty()) dst.waitReady(ctx.stream()); + dst.resize(0, 0); + return; + } + + A.waitReady(ctx.stream()); + eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix SYRK destination aliases input operand"); + if (!dst.empty()) dst.waitReady(ctx.stream()); + + if (dst.empty() || dst.rows() != n || dst.cols() != n) { + dst.resize(n, n); + if (beta_val != RealScalar(0)) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream())); + } + } + + constexpr cublasFillMode_t uplo = (UpLo == Lower) ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + + EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(), + static_cast<int>(A.outerStride()), &beta_val, dst.data(), + static_cast<int>(dst.outerStride()))); + + dst.recordReady(ctx.stream()); +} + +} // namespace internal + +// ---- Assignment: d_C.device(ctx) = expr ------------------------------------ + +template <typename Scalar_> +class Assignment { + public: + using Scalar = Scalar_; + + Assignment(DeviceMatrix<Scalar>& dst, Context& ctx) : dst_(dst), ctx_(ctx) {} + + template <typename Lhs, typename Rhs> + DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) { + internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0)); + return dst_; + } + + template <typename Lhs, typename Rhs> + DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) { + internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1)); + return dst_; + } + + template <typename Lhs, typename Rhs> + DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) { + internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1)); + return dst_; + } + + template <int UpLo> + DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) { + internal::dispatch_llt_solve(ctx_, dst_, expr); + return dst_; + } + + DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) { + internal::dispatch_lu_solve(ctx_, dst_, expr); + return dst_; + } + + template <int UpLo> + DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) { + internal::dispatch_trsm(ctx_, dst_, expr); + return dst_; + } + + template <int UpLo> + DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) { + internal::dispatch_symm(ctx_, dst_, expr); + return dst_; + } + + template <typename Expr> + DeviceMatrix<Scalar>& operator=(const Expr&) { + static_assert(sizeof(Expr) == 0, + "DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. " + "Supported: GEMM (A*B), TRSM (.triangularView().solve()), " + "SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve())."); + return dst_; + } + + private: + DeviceMatrix<Scalar>& dst_; + Context& ctx_; +}; + +// ---- Out-of-line Matrix expression operator= definitions ------------------ +// Declared in DeviceMatrix.h, defined here because they need Context::threadLocal(). 
+ +template <typename Scalar_> +template <typename Lhs, typename Rhs> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) { + device(Context::threadLocal()) = expr; + return *this; +} + +template <typename Scalar_> +template <typename Lhs, typename Rhs> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) { + device(Context::threadLocal()) += expr; + return *this; +} + +template <typename Scalar_> +template <int UpLo> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) { + device(Context::threadLocal()) = expr; + return *this; +} + +template <typename Scalar_> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) { + device(Context::threadLocal()) = expr; + return *this; +} + +template <typename Scalar_> +template <int UpLo> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) { + device(Context::threadLocal()) = expr; + return *this; +} + +template <typename Scalar_> +template <int UpLo> +DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) { + device(Context::threadLocal()) = expr; + return *this; +} + +// SelfAdjointView::rankUpdate — defined here because it needs Context. +template <typename Scalar_, int UpLo_> +void SelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) { + SyrkExpr<Scalar_, UpLo_> expr(A); + RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1); + internal::dispatch_syrk(Context::threadLocal(), matrix(), expr, alpha, beta); +} + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_DISPATCH_H
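The GEMM entry points above back three assignment forms, either through the device() accessor used in the out-of-line definitions or through the thread-local default context. A short sketch (hypothetical gemm_examples helper, conformable dimensions assumed):

void gemm_examples(Eigen::gpu::Context& ctx,
                   Eigen::gpu::DeviceMatrix<float>& d_A,
                   Eigen::gpu::DeviceMatrix<float>& d_B) {
  Eigen::gpu::DeviceMatrix<float> d_C;
  d_C.device(ctx) = d_A * d_B;    // cublasSgemm, beta = 0
  d_C.device(ctx) += d_A * d_B;   // same product accumulated, beta = 1
  d_C.device(ctx) -= d_A * d_B;   // alpha = -1, beta = 1
  d_C = 2.0f * d_A * d_B;         // scalar folded into alpha; uses Context::threadLocal()
}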
diff --git a/unsupported/Eigen/src/GPU/DeviceExpr.h b/unsupported/Eigen/src/GPU/DeviceExpr.h new file mode 100644 index 0000000..bcf842a --- /dev/null +++ b/unsupported/Eigen/src/GPU/DeviceExpr.h
@@ -0,0 +1,190 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Lightweight expression types for DeviceMatrix operations. +// +// These are NOT Eigen expression templates. Each type maps 1:1 to a single +// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level +// evaluation, no lazy fusion, no packet operations. +// +// Expression types: +// AdjointView<S> — d_A.adjoint() → marks ConjTrans for GEMM +// TransposeView<S> — d_A.transpose() → marks Trans for GEMM +// Scaled<Expr> — alpha * expr → carries scalar factor +// gpu::GemmExpr<Lhs, Rhs> — lhs * rhs → dispatches to cublasXgemm + +#ifndef EIGEN_GPU_DEVICE_EXPR_H +#define EIGEN_GPU_DEVICE_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuBlasSupport.h" + +namespace Eigen { +namespace gpu { + +namespace internal { +// Forward declaration — specializations follow below, after the class definitions. +template <typename Expr> +struct device_expr_traits; +} // namespace internal + +// Forward declaration. +template <typename Scalar_> +class DeviceMatrix; + +// ---- AdjointView: marks ConjTrans ------------------------------------------- +// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm transA/B = C. + +template <typename Scalar_> +class AdjointView { + public: + using Scalar = Scalar_; + explicit AdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + const DeviceMatrix<Scalar>& matrix() const { return mat_; } + + private: + const DeviceMatrix<Scalar>& mat_; +}; + +// ---- TransposeView: marks Trans --------------------------------------------- +// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm transA/B = T. + +template <typename Scalar_> +class TransposeView { + public: + using Scalar = Scalar_; + explicit TransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + const DeviceMatrix<Scalar>& matrix() const { return mat_; } + + private: + const DeviceMatrix<Scalar>& mat_; +}; + +// ---- Scaled: alpha * expr --------------------------------------------------- +// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor. + +template <typename Inner> +class Scaled { + public: + using Scalar = typename internal::device_expr_traits<Inner>::scalar_type; + Scaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {} + Scalar scalar() const { return alpha_; } + const Inner& inner() const { return inner_; } + + private: + Scalar alpha_; + const Inner& inner_; +}; + +// ---- GemmExpr: lhs * rhs -> cublasXgemm ------------------------------------ +// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM. 
+ +template <typename Lhs, typename Rhs> +class GemmExpr { + public: + using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type; + static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value, + "DeviceMatrix GEMM: LHS and RHS must have the same scalar type"); + + GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {} + const Lhs& lhs() const { return lhs_; } + const Rhs& rhs() const { return rhs_; } + + private: + // Stored by reference — like Eigen's CPU expression templates, these must + // not be captured with auto (the references will dangle). Use .eval() or + // assign to a DeviceMatrix immediately. + const Lhs& lhs_; + const Rhs& rhs_; +}; + +// ---- Free operator* overloads that produce GemmExpr ------------------------- +// Defined after device_expr_traits so it can accept any supported view pair. + +// ---- Scalar * Matrix / View -> Scaled --------------------------------------- + +template <typename S> +Scaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) { + return {alpha, m}; +} + +template <typename S> +Scaled<AdjointView<S>> operator*(S alpha, const AdjointView<S>& m) { + return {alpha, m}; +} + +template <typename S> +Scaled<TransposeView<S>> operator*(S alpha, const TransposeView<S>& m) { + return {alpha, m}; +} + +namespace internal { + +// ---- Traits: extract operation info from expression types ------------------- + +// Default: a DeviceMatrix is NoTrans. +template <typename T> +struct device_expr_traits { + static constexpr bool is_device_expr = false; +}; + +template <typename Scalar> +struct device_expr_traits<DeviceMatrix<Scalar>> { + using scalar_type = Scalar; + static constexpr GpuOp op = GpuOp::NoTrans; + static constexpr bool is_device_expr = true; + static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; } + static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); } +}; + +template <typename Scalar> +struct device_expr_traits<AdjointView<Scalar>> { + using scalar_type = Scalar; + static constexpr GpuOp op = GpuOp::ConjTrans; + static constexpr bool is_device_expr = true; + static const DeviceMatrix<Scalar>& matrix(const AdjointView<Scalar>& x) { return x.matrix(); } + static Scalar alpha(const AdjointView<Scalar>&) { return Scalar(1); } +}; + +template <typename Scalar> +struct device_expr_traits<TransposeView<Scalar>> { + using scalar_type = Scalar; + static constexpr GpuOp op = GpuOp::Trans; + static constexpr bool is_device_expr = true; + static const DeviceMatrix<Scalar>& matrix(const TransposeView<Scalar>& x) { return x.matrix(); } + static Scalar alpha(const TransposeView<Scalar>&) { return Scalar(1); } +}; + +template <typename Inner> +struct device_expr_traits<Scaled<Inner>> { + using scalar_type = typename device_expr_traits<Inner>::scalar_type; + static constexpr GpuOp op = device_expr_traits<Inner>::op; + static constexpr bool is_device_expr = true; + static const DeviceMatrix<scalar_type>& matrix(const Scaled<Inner>& x) { + return device_expr_traits<Inner>::matrix(x.inner()); + } + static scalar_type alpha(const Scaled<Inner>& x) { return x.scalar() * device_expr_traits<Inner>::alpha(x.inner()); } +}; + +} // namespace internal + +template <typename Lhs, typename Rhs, + typename std::enable_if<internal::device_expr_traits<Lhs>::is_device_expr && + internal::device_expr_traits<Rhs>::is_device_expr, + int>::type = 0> +GemmExpr<Lhs, Rhs> operator*(const Lhs& a, const Rhs& b) { + return {a, b}; +} + +} // namespace gpu +} // 
namespace Eigen + +#endif // EIGEN_GPU_DEVICE_EXPR_H
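A compile-time illustration of how the traits fold a composed expression into GEMM parameters (sketch; the static_asserts only restate what the specializations above compute):

#include <complex>
#include <type_traits>

// d_C = std::complex<double>(2, 0) * d_A.adjoint() * d_B lowers to a single
// cublasZgemm with transA = CUBLAS_OP_C and alpha = 2, without materializing
// the scaled or adjointed operand.
using Lhs    = Eigen::gpu::Scaled<Eigen::gpu::AdjointView<std::complex<double>>>;
using Traits = Eigen::gpu::internal::device_expr_traits<Lhs>;
static_assert(Traits::op == Eigen::gpu::GpuOp::ConjTrans, "the adjoint flag survives scaling");
static_assert(std::is_same<Traits::scalar_type, std::complex<double>>::value, "scalar type propagates");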
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h new file mode 100644 index 0000000..2e15c41 --- /dev/null +++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -0,0 +1,538 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Typed RAII wrapper for a dense matrix in GPU device memory. +// +// gpu::DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked +// dimensions and leading dimension. It can be passed to GPU solvers +// (gpu::LLT, gpu::LU, cuBLAS expressions) without host round-trips. +// +// Cross-stream safety is automatic: an internal CUDA event tracks when the +// last write completed. Consumers on a different stream wait on that event +// before reading. +// +// Usage: +// auto d_A = gpu::DeviceMatrix<double>::fromHost(A); // upload (sync) +// gpu::LLT<double> llt; +// llt.compute(d_A); // factor on device +// auto d_X = llt.solve(d_B); // async, no sync +// MatrixXd X = d_X.toHost(); // download + block +// +// Async variants: +// auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream); +// auto transfer = d_X.toHostAsync(stream); // enqueue D2H +// // ... overlap with other work ... +// MatrixXd X = transfer.get(); // block + retrieve + +#ifndef EIGEN_GPU_DEVICE_MATRIX_H +#define EIGEN_GPU_DEVICE_MATRIX_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include <cstring> + +#include "./GpuSupport.h" + +namespace Eigen { +namespace gpu { + +// Forward declarations. +template <typename, int> +class LLT; +template <typename> +class LU; +template <typename> +class AdjointView; +template <typename> +class TransposeView; +template <typename> +class Assignment; +template <typename, typename> +class GemmExpr; +template <typename, int> +class LltSolveExpr; +template <typename> +class LuSolveExpr; +template <typename, int> +class LLTView; +template <typename> +class LUView; +template <typename, int> +class TriangularView; +template <typename, int> +class SelfAdjointView; +template <typename, int> +class ConstSelfAdjointView; +template <typename, int> +class TrsmExpr; +template <typename, int> +class SymmExpr; +template <typename, int> +class SyrkExpr; +class Context; + +// -------------------------------------------------------------------------- +// HostTransfer — future-like wrapper for an async device-to-host transfer. +// -------------------------------------------------------------------------- + +/** \ingroup GPU_Module + * \class HostTransfer + * \brief Future for an asynchronous device-to-host matrix transfer. + * + * Returned by gpu::DeviceMatrix::toHostAsync(). The transfer runs asynchronously + * on the given CUDA stream. Call get() to block until complete and retrieve + * the host matrix, or ready() to poll without blocking. + */ +template <typename Scalar_> +class HostTransfer { + public: + using Scalar = Scalar_; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + /** Block until the transfer completes and return the host matrix. + * Idempotent: subsequent calls return the same matrix without re-syncing. + * On first call, copies from pinned staging buffer into a regular matrix. */ + PlainMatrix& get() { + if (!synced_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_)); + // Copy from pinned staging buffer into the regular (pageable) host matrix. 
+ if (pinned_buf_ && host_buf_.size() > 0) { + std::memcpy(host_buf_.data(), pinned_buf_.get(), static_cast<size_t>(host_buf_.size()) * sizeof(Scalar)); + } + pinned_buf_ = internal::PinnedHostBuffer(); // free pinned memory early + synced_ = true; + } + return host_buf_; + } + + /** Non-blocking check: has the transfer completed? */ + bool ready() const { + if (synced_) return true; + cudaError_t err = cudaEventQuery(event_); + if (err == cudaSuccess) return true; + eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed"); + return false; + } + + ~HostTransfer() { + if (event_) (void)cudaEventDestroy(event_); + } + + HostTransfer(HostTransfer&& o) noexcept + : host_buf_(std::move(o.host_buf_)), pinned_buf_(std::move(o.pinned_buf_)), event_(o.event_), synced_(o.synced_) { + o.event_ = nullptr; + o.synced_ = true; + } + + HostTransfer& operator=(HostTransfer&& o) noexcept { + if (this != &o) { + if (event_) (void)cudaEventDestroy(event_); + host_buf_ = std::move(o.host_buf_); + pinned_buf_ = std::move(o.pinned_buf_); + event_ = o.event_; + synced_ = o.synced_; + o.event_ = nullptr; + o.synced_ = true; + } + return *this; + } + + HostTransfer(const HostTransfer&) = delete; + HostTransfer& operator=(const HostTransfer&) = delete; + + private: + template <typename> + friend class DeviceMatrix; + + HostTransfer(PlainMatrix&& buf, internal::PinnedHostBuffer&& pinned, cudaEvent_t event) + : host_buf_(std::move(buf)), pinned_buf_(std::move(pinned)), event_(event), synced_(false) {} + + PlainMatrix host_buf_; // final destination (pageable) + internal::PinnedHostBuffer pinned_buf_; // staging buffer for async DMA + cudaEvent_t event_ = nullptr; + bool synced_ = false; +}; + +// -------------------------------------------------------------------------- +// Matrix — typed RAII wrapper for a dense matrix in device memory. +// -------------------------------------------------------------------------- + +/** \ingroup GPU_Module + * \class DeviceMatrix + * \brief RAII wrapper for a dense column-major matrix in GPU device memory. + * + * \tparam Scalar_ Element type: float, double, complex<float>, complex<double> + * + * Owns a device allocation with tracked dimensions and leading dimension. + * An internal CUDA event records when the data was last written, enabling + * safe cross-stream consumption without user-visible synchronization. + * + * Each method has a synchronous and an asynchronous variant: + * - fromHost() / fromHostAsync(): upload from host + * - toHost() / toHostAsync(): download to host + */ +template <typename Scalar_> +class DeviceMatrix { + public: + using Scalar = Scalar_; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + // ---- Construction / destruction ------------------------------------------ + + /** Default: empty (0x0, no allocation). */ + DeviceMatrix() = default; + + /** Allocate uninitialized device memory for a rows x cols matrix. */ + DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) { + eigen_assert(rows >= 0 && cols >= 0); + size_t bytes = sizeInBytes(); + if (bytes > 0) { + void* p = nullptr; + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); + data_.reset(static_cast<Scalar*>(p)); + } + } + + ~DeviceMatrix() { + // cudaEventDestroy on a pending event is non-blocking: the runtime defers + // teardown until the event completes. The trailing cudaFree() (via + // data_.reset()) is itself synchronous, so the buffer outlives any + // in-flight kernel that may still be touching it. 
+ if (ready_event_) (void)cudaEventDestroy(ready_event_); + } + + // ---- Move-only ----------------------------------------------------------- + + DeviceMatrix(DeviceMatrix&& o) noexcept + : data_(std::move(o.data_)), + rows_(o.rows_), + cols_(o.cols_), + outerStride_(o.outerStride_), + ready_event_(o.ready_event_), + ready_stream_(o.ready_stream_), + retained_buffer_(std::move(o.retained_buffer_)) { + o.rows_ = 0; + o.cols_ = 0; + o.outerStride_ = 0; + o.ready_event_ = nullptr; + o.ready_stream_ = nullptr; + } + + DeviceMatrix& operator=(DeviceMatrix&& o) noexcept { + if (this != &o) { + if (ready_event_) (void)cudaEventDestroy(ready_event_); + data_ = std::move(o.data_); + rows_ = o.rows_; + cols_ = o.cols_; + outerStride_ = o.outerStride_; + ready_event_ = o.ready_event_; + ready_stream_ = o.ready_stream_; + retained_buffer_ = std::move(o.retained_buffer_); + o.rows_ = 0; + o.cols_ = 0; + o.outerStride_ = 0; + o.ready_event_ = nullptr; + o.ready_stream_ = nullptr; + } + return *this; + } + + DeviceMatrix(const DeviceMatrix&) = delete; + DeviceMatrix& operator=(const DeviceMatrix&) = delete; + + // ---- Upload from host ---------------------------------------------------- + + /** Upload a host Eigen matrix to device memory (synchronous). + * + * Evaluates the expression into a contiguous ColMajor temporary, copies to + * device via cudaMemcpyAsync on \p stream, and synchronizes before returning. + * + * \param host Any Eigen matrix expression. + * \param stream CUDA stream for the transfer (default: stream 0). + */ + template <typename Derived> + static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) { + const PlainMatrix mat(host.derived()); + DeviceMatrix dm(mat.rows(), mat.cols()); + if (dm.sizeInBytes() > 0) { + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(dm.data_.get(), mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream)); + } + return dm; + } + + /** Upload from a raw host pointer to device memory (asynchronous). + * + * Enqueues an async H2D copy on \p stream and records an internal event. + * The caller must keep \p host_data alive until the transfer completes + * (check via the internal event or synchronize the stream). + * + * \param host_data Pointer to contiguous column-major host data. + * \param rows Number of rows. + * \param cols Number of columns. + * \param outerStride Leading dimension (>= rows). Use rows for dense. + * \param stream CUDA stream for the transfer. + */ + static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride, + cudaStream_t stream) { + eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows); + eigen_assert(host_data != nullptr || (rows == 0 || cols == 0)); + DeviceMatrix dm(rows, cols); + if (dm.sizeInBytes() > 0) { + // If outerStride == rows (dense), single contiguous copy. + // Otherwise, copy column by column (strided layout). 
+ if (outerStride == rows) { + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream)); + } else { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_.get(), static_cast<size_t>(rows) * sizeof(Scalar), + host_data, static_cast<size_t>(outerStride) * sizeof(Scalar), + static_cast<size_t>(rows) * sizeof(Scalar), + static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream)); + } + dm.recordReady(stream); + } + return dm; + } + + // ---- Download to host ---------------------------------------------------- + + /** Download device matrix to host memory (synchronous). + * + * Waits on the internal ready event, enqueues a D2H copy on \p stream, + * synchronizes, and returns the host matrix directly. + * + * \param stream CUDA stream for the transfer (default: stream 0). + */ + PlainMatrix toHost(cudaStream_t stream = nullptr) const { + PlainMatrix host_buf(rows_, cols_); + if (sizeInBytes() > 0) { + waitReady(stream); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(host_buf.data(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream)); + } + return host_buf; + } + + /** Enqueue an async device-to-host transfer and return a future. + * + * Waits on the internal ready event (if any) to ensure the device data is + * valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer + * future; call .get() to block and retrieve the host matrix. + * + * \param stream CUDA stream for the transfer (default: stream 0). + */ + HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const { + PlainMatrix host_buf(rows_, cols_); + internal::PinnedHostBuffer pinned_buf(sizeInBytes()); + if (sizeInBytes() > 0) { + waitReady(stream); + // DMA into pinned staging buffer for truly async transfer. + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(pinned_buf.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream)); + } + // Record a transfer-complete event. + cudaEvent_t transfer_event; + EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming)); + EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream)); + return HostTransfer<Scalar>(std::move(host_buf), std::move(pinned_buf), transfer_event); + } + + // ---- Device-to-device copy ----------------------------------------------- + + /** Deep copy on device. Fully async — records event on the result, no sync. + * + * \param stream CUDA stream for the D2D copy (default: stream 0). + */ + DeviceMatrix clone(cudaStream_t stream = nullptr) const { + DeviceMatrix result(rows_, cols_); + if (sizeInBytes() > 0) { + waitReady(stream); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(result.data_.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToDevice, stream)); + result.recordReady(stream); + } + return result; + } + + // ---- Resize (destructive) ------------------------------------------------ + + /** Discard contents and reallocate to (rows x cols). Clears the ready event. 
*/ + void resize(Index rows, Index cols) { + eigen_assert(rows >= 0 && cols >= 0); + if (rows == rows_ && cols == cols_) return; + data_.reset(); + if (ready_event_) { + (void)cudaEventDestroy(ready_event_); + ready_event_ = nullptr; + } + ready_stream_ = nullptr; + retained_buffer_ = internal::DeviceBuffer(); + rows_ = rows; + cols_ = cols; + outerStride_ = rows; + size_t bytes = sizeInBytes(); + if (bytes > 0) { + void* p = nullptr; + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); + data_.reset(static_cast<Scalar*>(p)); + } + } + + // ---- Accessors ----------------------------------------------------------- + + Scalar* data() { return data_.get(); } + const Scalar* data() const { return data_.get(); } + Index rows() const { return rows_; } + Index cols() const { return cols_; } + Index outerStride() const { return outerStride_; } + bool empty() const { return rows_ == 0 || cols_ == 0; } + + /** Size of the device allocation in bytes. */ + size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); } + + // ---- Event synchronization (public for library dispatch interop) --------- + + /** Record that device data is ready after work on \p stream. */ + void recordReady(cudaStream_t stream) { + ensureEvent(); + EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream)); + ready_stream_ = stream; + } + + /** Make \p stream wait until the device data is ready. + * No-op if no event recorded, or if the consumer stream is the same as the + * producer stream (CUDA guarantees in-order execution within a stream). */ + void waitReady(cudaStream_t stream) const { + if (ready_event_ && stream != ready_stream_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0)); + } + } + + // ---- Expression methods (dispatch to cuBLAS/cuSOLVER) -------------------- + + /** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */ + AdjointView<Scalar> adjoint() const { return AdjointView<Scalar>(*this); } + + /** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */ + TransposeView<Scalar> transpose() const { return TransposeView<Scalar>(*this); } + + /** Bind this matrix to a Context for expression assignment. + * Returns an Assignment proxy: `d_C.device(ctx) = d_A * d_B;` */ + Assignment<Scalar> device(Context& ctx) { return Assignment<Scalar>(*this, ctx); } + + /** Assign from a GEMM expression using the thread-local default Context. + * Defined out-of-line after Context is fully declared (see DeviceDispatch.h). */ + template <typename Lhs, typename Rhs> + DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr); + + /** Accumulate from a GEMM expression using the thread-local default Context. */ + template <typename Lhs, typename Rhs> + DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr); + + /** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */ + LLTView<Scalar, Lower> llt() const { return LLTView<Scalar, Lower>(*this); } + + /** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */ + template <int UpLo> + LLTView<Scalar, UpLo> llt() const { + return LLTView<Scalar, UpLo>(*this); + } + + /** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */ + LUView<Scalar> lu() const { return LUView<Scalar>(*this); } + + /** Assign from an LLT solve expression (thread-local default context). */ + template <int UpLo> + DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr); + + /** Assign from an LU solve expression (thread-local default context). 
*/ + DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr); + + /** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */ + template <int UpLo> + TriangularView<Scalar, UpLo> triangularView() const { + return TriangularView<Scalar, UpLo>(*this); + } + + /** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */ + template <int UpLo> + SelfAdjointView<Scalar, UpLo> selfadjointView() { + return SelfAdjointView<Scalar, UpLo>(*this); + } + + /** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */ + template <int UpLo> + ConstSelfAdjointView<Scalar, UpLo> selfadjointView() const { + return ConstSelfAdjointView<Scalar, UpLo>(*this); + } + + /** Assign from a TRSM expression (thread-local default context). */ + template <int UpLo> + DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr); + + /** Assign from a SYMM expression (thread-local default context). */ + template <int UpLo> + DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr); + + // ---- Ownership transfer --------------------------------------------------- + + /** Adopt an existing device pointer. Caller relinquishes ownership. */ + static DeviceMatrix adopt(Scalar* device_ptr, Index rows, Index cols) { + DeviceMatrix dm; + dm.data_.reset(device_ptr); + dm.rows_ = rows; + dm.cols_ = cols; + dm.outerStride_ = rows; + return dm; + } + + /** Transfer ownership of the device pointer out. Zeros internal state. */ + Scalar* release() { + Scalar* p = data_.release(); + rows_ = 0; + cols_ = 0; + outerStride_ = 0; + if (ready_event_) { + (void)cudaEventDestroy(ready_event_); + ready_event_ = nullptr; + } + ready_stream_ = nullptr; + return p; + } + + private: + // ---- Private helpers ------------------------------------------------------- + + void ensureEvent() { + if (!ready_event_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming)); + } + } + + void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); } + + // ---- Data members -------------------------------------------------------- + + std::unique_ptr<Scalar, internal::CudaFreeDeleter> data_; + Index rows_ = 0; + Index cols_ = 0; + Index outerStride_ = 0; + cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion + cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip) + internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_MATRIX_H
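The recordReady()/waitReady() hooks above are the intended interop point for custom device code. A minimal sketch of that pattern follows; `scale_kernel` and its launch configuration are illustrative assumptions, not part of this patch, and only the DeviceMatrix event API comes from the code above.

```cpp
// Illustrative user kernel (not part of this MR).
__global__ void scale_kernel(float* x, int n, float a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= a;
}

// Wait-before-read, record-after-write on a caller-owned stream.
void scale_on_stream(Eigen::gpu::DeviceMatrix<float>& d_A, float alpha, cudaStream_t s) {
  d_A.waitReady(s);    // wait for whichever stream last wrote d_A
  const int n = static_cast<int>(d_A.rows() * d_A.cols());
  scale_kernel<<<(n + 255) / 256, 256, 0, s>>>(d_A.data(), n, alpha);
  d_A.recordReady(s);  // later consumers on other streams will wait on this write
}
```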
diff --git a/unsupported/Eigen/src/GPU/DeviceSolverExpr.h b/unsupported/Eigen/src/GPU/DeviceSolverExpr.h new file mode 100644 index 0000000..a71a998 --- /dev/null +++ b/unsupported/Eigen/src/GPU/DeviceSolverExpr.h
@@ -0,0 +1,119 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Solver expression types for gpu::DeviceMatrix. +// +// Each expression maps 1:1 to cuSOLVER library calls: +// LltSolveExpr -> cusolverDnXpotrf + cusolverDnXpotrs +// LuSolveExpr -> cusolverDnXgetrf + cusolverDnXgetrs +// +// Usage: +// d_X = d_A.llt().solve(d_B); // Cholesky solve +// d_X.device(ctx) = d_A.lu().solve(d_B); // LU solve on explicit stream + +#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H +#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include <functional> + +namespace Eigen { +namespace gpu { + +// Forward declarations. +template <typename Scalar_> +class DeviceMatrix; +class Context; + +// ---- LLT solve expression --------------------------------------------------- +// d_A.llt().solve(d_B) -> LltSolveExpr -> cusolverDnXpotrf + cusolverDnXpotrs + +template <typename Scalar_, int UpLo_ = Lower> +class LltSolveExpr { + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {} + const DeviceMatrix<Scalar>& matrix() const { return A_; } + const DeviceMatrix<Scalar>& rhs() const { return B_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> A_; + std::reference_wrapper<const DeviceMatrix<Scalar>> B_; +}; + +// ---- LU solve expression ---------------------------------------------------- +// d_A.lu().solve(d_B) -> LuSolveExpr -> cusolverDnXgetrf + cusolverDnXgetrs + +template <typename Scalar_> +class LuSolveExpr { + public: + using Scalar = Scalar_; + + LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {} + const DeviceMatrix<Scalar>& matrix() const { return A_; } + const DeviceMatrix<Scalar>& rhs() const { return B_; } + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> A_; + std::reference_wrapper<const DeviceMatrix<Scalar>> B_; +}; + +// ---- LLTView: d_A.llt() -> view with .solve() and .device() ---------------- + +template <typename Scalar_, int UpLo_ = Lower> +class LLTView { + public: + using Scalar = Scalar_; + + explicit LLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + + /** Build a solve expression: d_A.llt().solve(d_B). + * The expression is evaluated when assigned to a gpu::DeviceMatrix. */ + LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; } + + // For cached factorizations, use the explicit gpu::LLT API directly: + // gpu::LLT<double> llt; + // llt.compute(d_A); + // auto d_X1 = llt.solve(d_B1); + // auto d_X2 = llt.solve(d_B2); + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> mat_; +}; + +// ---- LUView: d_A.lu() -> view with .solve() and .device() ------------------ + +template <typename Scalar_> +class LUView { + public: + using Scalar = Scalar_; + + explicit LUView(const DeviceMatrix<Scalar>& m) : mat_(m) {} + + /** Build a solve expression: d_A.lu().solve(d_B). 
*/ + LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; } + + // For cached factorizations, use the explicit gpu::LU API directly: + // gpu::LU<double> lu; + // lu.compute(d_A); + // auto d_X1 = lu.solve(d_B1); + // auto d_X2 = lu.solve(d_B2); + + private: + std::reference_wrapper<const DeviceMatrix<Scalar>> mat_; +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
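As a rough illustration of the deferred-evaluation contract above (the views and solve expressions only capture references; the cuSOLVER calls run when the expression is assigned), assuming host matrices `A` and `B` and the `operator=` dispatch defined elsewhere in this MR:

```cpp
auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
auto d_B = gpu::DeviceMatrix<double>::fromHost(B);

auto expr = d_A.llt().solve(d_B);  // LltSolveExpr<double, Lower>; no GPU work yet
gpu::DeviceMatrix<double> d_X;
d_X = expr;                        // potrf + potrs are enqueued here; d_A and d_B
                                   // must still be alive at this point
```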
diff --git a/unsupported/Eigen/src/GPU/GpuContext.h b/unsupported/Eigen/src/GPU/GpuContext.h new file mode 100644 index 0000000..bff4053 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuContext.h
@@ -0,0 +1,95 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Unified GPU execution context. +// +// gpu::Context owns a CUDA stream and NVIDIA library handles (cuBLAS, +// cuSOLVER). It is the entry point for all GPU +// operations on gpu::DeviceMatrix. +// +// Usage: +// gpu::Context ctx; // explicit context +// d_C.device(ctx) = d_A * d_B; // GEMM on ctx's stream +// +// d_C = d_A * d_B; // thread-local default context +// gpu::Context& ctx = gpu::Context::threadLocal(); + +#ifndef EIGEN_GPU_CONTEXT_H +#define EIGEN_GPU_CONTEXT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuBlasSupport.h" +#include "./CuSolverSupport.h" + +namespace Eigen { +namespace gpu { + +/** \ingroup GPU_Module + * \class Context + * \brief Unified GPU execution context owning a CUDA stream and library handles. + * + * Each Context instance creates a dedicated CUDA stream, a cuBLAS handle, + * and a cuSOLVER handle, all bound to that stream. Multiple contexts enable + * concurrent execution on independent streams. + * + * A lazily-created thread-local default is available via threadLocal() for + * simple single-stream usage. + */ +class Context { + public: + Context() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); + EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); + EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_)); + EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_)); + } + + ~Context() { + if (cusolver_) (void)cusolverDnDestroy(cusolver_); + if (cublas_) (void)cublasDestroy(cublas_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + // Non-copyable, non-movable (owns library handles). + Context(const Context&) = delete; + Context& operator=(const Context&) = delete; + Context(Context&&) = delete; + Context& operator=(Context&&) = delete; + + /** Lazily-created thread-local default context. + * + * \note The thread-local instance is destroyed when the thread exits (or at + * static destruction time for the main thread). On some CUDA driver + * configurations this may print "CUDA_ERROR_DEINITIALIZED" to stderr if the + * CUDA context has already been torn down. These errors are harmless and are + * suppressed in the destructor, but they can produce noise in test output. + * To avoid this, call cudaDeviceReset() only after all Context instances + * (including thread-local ones) have been destroyed. */ + static Context& threadLocal() { + thread_local Context ctx; + return ctx; + } + + cudaStream_t stream() const { return stream_; } + cublasHandle_t cublasHandle() const { return cublas_; } + cusolverDnHandle_t cusolverHandle() const { return cusolver_; } + + private: + cudaStream_t stream_ = nullptr; + cublasHandle_t cublas_ = nullptr; + cusolverDnHandle_t cusolver_ = nullptr; +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CONTEXT_H
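A small sketch of the multi-context pattern this class is built for, assuming pre-populated `DeviceMatrix<double>` operands and the GEMM dispatch (`operator*`, `device()`) added elsewhere in this MR:

```cpp
gpu::Context ctx1, ctx2;

d_C1.device(ctx1) = d_A1 * d_B1;  // enqueued on ctx1's stream
d_C2.device(ctx2) = d_A2 * d_B2;  // enqueued on ctx2's stream, may run concurrently

// Consuming d_C1 on ctx2's stream is still safe: its ready event (recorded on
// ctx1's stream) makes ctx2 wait via cudaStreamWaitEvent before this GEMM runs.
d_C3.device(ctx2) = d_C1 * d_C2;
```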
diff --git a/unsupported/Eigen/src/GPU/GpuLLT.h b/unsupported/Eigen/src/GPU/GpuLLT.h new file mode 100644 index 0000000..8ca3dbd --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuLLT.h
@@ -0,0 +1,402 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// GPU Cholesky (LLT) decomposition using cuSOLVER. +// +// Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky +// factor in device memory for the lifetime of the object. Multiple solves +// against the same factor therefore only transfer the RHS and solution +// vectors, not the factor itself. +// +// Requires CUDA 11.4+ (cusolverDnXpotrf / cusolverDnXpotrs generic API). +// +// Usage: +// gpu::LLT<double> llt(A); // upload A, potrf, L stays on device +// if (llt.info() != Success) { ... } +// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred +// MatrixXd x2 = llt.solve(b2); // L already on device + +#ifndef EIGEN_GPU_LLT_H +#define EIGEN_GPU_LLT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuSolverSupport.h" +#include <vector> + +namespace Eigen { +namespace gpu { + +/** \ingroup GPU_Module + * \class LLT + * \brief GPU Cholesky (LL^T) decomposition via cuSOLVER + * + * \tparam Scalar_ Element type: float, double, complex<float>, complex<double> + * \tparam UpLo_ Triangle used: Lower (default) or Upper + * + * Factorizes a symmetric/Hermitian positive-definite matrix A = LL^H on the GPU and + * caches the factor L in device memory. Each subsequent solve(B) uploads only + * B, calls cusolverDnXpotrs, and downloads the result — the factor is not + * re-transferred. + * + * Each LLT object owns a dedicated CUDA stream and cuSOLVER handle, + * enabling concurrent factorizations from multiple objects on the same host + * thread. + */ +template <typename Scalar_, int UpLo_ = Lower> +class LLT { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + static constexpr int UpLo = UpLo_; + + // ---- Construction / destruction ------------------------------------------ + + /** Default constructor. Does not factorize; call compute() before solve(). */ + LLT() { init_context(); } + + /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */ + template <typename InputType> + explicit LLT(const EigenBase<InputType>& A) : LLT() { + compute(A); + } + + ~LLT() { + // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK / + // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a + // noexcept dtor terminates the program. The trailing cudaFree() of the + // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync. + if (handle_) (void)cusolverDnDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + // Non-copyable (owns device memory and library handles). + LLT(const LLT&) = delete; + LLT& operator=(const LLT&) = delete; + + // Movable.
+ LLT(LLT&& o) noexcept + : stream_(o.stream_), + handle_(o.handle_), + params_(std::move(o.params_)), + d_factor_(std::move(o.d_factor_)), + factor_alloc_size_(o.factor_alloc_size_), + d_scratch_(std::move(o.d_scratch_)), + scratch_size_(o.scratch_size_), + h_workspace_(std::move(o.h_workspace_)), + n_(o.n_), + lda_(o.lda_), + info_(o.info_), + pinned_info_(std::move(o.pinned_info_)), + info_synced_(o.info_synced_) { + o.stream_ = nullptr; + o.handle_ = nullptr; + o.factor_alloc_size_ = 0; + o.scratch_size_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + + LLT& operator=(LLT&& o) noexcept { + if (this != &o) { + // Drain pending work on the old stream before tearing it down so the + // upcoming move of d_factor_/d_scratch_ doesn't free buffers that an + // in-flight kernel is still touching. The asymmetric move-ctor doesn't + // need this — it constructs onto uninitialized storage. + if (stream_) (void)cudaStreamSynchronize(stream_); + if (handle_) (void)cusolverDnDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + stream_ = o.stream_; + handle_ = o.handle_; + params_ = std::move(o.params_); + d_factor_ = std::move(o.d_factor_); + factor_alloc_size_ = o.factor_alloc_size_; + d_scratch_ = std::move(o.d_scratch_); + scratch_size_ = o.scratch_size_; + h_workspace_ = std::move(o.h_workspace_); + n_ = o.n_; + lda_ = o.lda_; + info_ = o.info_; + pinned_info_ = std::move(o.pinned_info_); + info_synced_ = o.info_synced_; + o.stream_ = nullptr; + o.handle_ = nullptr; + o.factor_alloc_size_ = 0; + o.scratch_size_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + return *this; + } + + // ---- Factorization ------------------------------------------------------- + + /** Compute the Cholesky factorization of A (host matrix). + * + * Uploads A to device memory, calls cusolverDnXpotrf, and retains the + * factored matrix on device. Any previous factorization is overwritten. + */ + template <typename InputType> + LLT& compute(const EigenBase<InputType>& A) { + eigen_assert(A.rows() == A.cols()); + if (!begin_compute(A.rows())) return *this; + + // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions). + const PlainMatrix mat(A.derived()); + lda_ = static_cast<int64_t>(mat.outerStride()); + allocate_factor_storage(); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_)); + + factorize(); + return *this; + } + + /** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */ + LLT& compute(const DeviceMatrix<Scalar>& d_A) { + eigen_assert(d_A.rows() == d_A.cols()); + if (!begin_compute(d_A.rows())) return *this; + + lda_ = static_cast<int64_t>(d_A.outerStride()); + d_A.waitReady(stream_); + allocate_factor_storage(); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_)); + + factorize(); + return *this; + } + + /** Compute the Cholesky factorization from a device matrix (move, no copy). 
*/ + LLT& compute(DeviceMatrix<Scalar>&& d_A) { + eigen_assert(d_A.rows() == d_A.cols()); + if (!begin_compute(d_A.rows())) return *this; + + lda_ = static_cast<int64_t>(d_A.outerStride()); + d_A.waitReady(stream_); + d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); + factor_alloc_size_ = factorBytes(); + + factorize(); + return *this; + } + + // ---- Solve --------------------------------------------------------------- + + /** Solve A * X = B using the cached Cholesky factor (host → host). + * + * Uploads B to device memory, calls cusolverDnXpotrs using the factor + * retained from compute(), and returns the solution X on the host. + * The factor is not re-transferred; only B goes up and X comes down. + * + * \pre compute() must have been called and info() == Success. + * \returns X such that A * X ≈ B + */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B) const { + const_cast<LLT*>(this)->sync_info(); + eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization"); + eigen_assert(B.rows() == n_); + + const PlainMatrix rhs(B); + const int64_t nrhs = static_cast<int64_t>(rhs.cols()); + const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); + internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); + DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x)); + + PlainMatrix X(n_, B.cols()); + int solve_info = 0; + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + + eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error"); + return X; + } + + /** Solve A * X = B with device-resident RHS. Fully async. + * + * All work is enqueued on this solver's stream. Returns a Matrix + * with a recorded ready event — no host synchronization occurs. + * The caller should check info() after compute() to verify the + * factorization succeeded; this method does not check. + */ + DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const { + eigen_assert(d_B.rows() == n_); + d_B.waitReady(stream_); + const int64_t nrhs = static_cast<int64_t>(d_B.cols()); + const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); + internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); + return solve_impl(nrhs, ldb, std::move(d_x)); + } + + // ---- Accessors ----------------------------------------------------------- + + /** Returns Success if the last compute() succeeded, NumericalIssue otherwise. + * Lazily synchronizes the stream on first call after compute(). */ + ComputationInfo info() const { + const_cast<LLT*>(this)->sync_info(); + return info_; + } + + Index rows() const { return n_; } + Index cols() const { return n_; } + + /** Returns the CUDA stream owned by this object. + * Advanced users may submit additional GPU work on this stream + * to overlap with or chain after LLT operations. 
*/ + cudaStream_t stream() const { return stream_; } + + private: + cudaStream_t stream_ = nullptr; + cusolverDnHandle_t handle_ = nullptr; + internal::CusolverParams params_; // cuSOLVER params (created once, reused) + internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks) + size_t factor_alloc_size_ = 0; // current d_factor_ allocation size + internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks) + size_t scratch_size_ = 0; // current scratch allocation size + std::vector<char> h_workspace_; // host workspace (kept alive until next compute) + int64_t n_ = 0; + int64_t lda_ = 0; + ComputationInfo info_ = InvalidInput; + internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H + bool info_synced_ = true; // has the stream been synced for info? + + int& info_word() { return *static_cast<int*>(pinned_info_.get()); } + int info_word() const { return *static_cast<const int*>(pinned_info_.get()); } + + bool begin_compute(Index rows) { + n_ = rows; + if (n_ == 0) { + info_ = Success; + info_synced_ = true; + return false; + } + info_ = InvalidInput; + return true; + } + + size_t factorBytes() const { return rhsBytes(n_, lda_); } + + static size_t rhsBytes(int64_t cols, int64_t outer_stride) { + return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar); + } + + void allocate_factor_storage() { + size_t needed = factorBytes(); + if (needed > factor_alloc_size_) { + d_factor_ = internal::DeviceBuffer(needed); + factor_alloc_size_ = needed; + } + } + + // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ]. + // Workspace size is rounded up to 16 bytes so the info word lands aligned. + static constexpr size_t kInfoBytes = sizeof(int); + static constexpr size_t kScratchAlign = 16; + + static size_t scratchBytesFor(size_t workspace_bytes) { + workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1); + return workspace_bytes + kInfoBytes; + } + + // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks. + // Syncs the stream before reallocating to avoid freeing memory that + // async kernels may still be using. + void ensure_scratch(size_t bytes) { + if (bytes > scratch_size_) { + if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + d_scratch_ = internal::DeviceBuffer(bytes); + scratch_size_ = bytes; + } + } + + void* scratch_workspace() const { return d_scratch_.get(); } + int* scratch_info() const { + eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes); + return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes); + } + + // Solve in place on `d_x` (which already holds B), then re-wrap as a + // typed DeviceMatrix carrying shape and a ready event. The release/adopt + // hop just hands ownership of the raw cudaMalloc pointer from the untyped + // DeviceBuffer to the typed DeviceMatrix without copying. 
+ DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value; + + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype, + d_x.get(), ldb, scratch_info())); + + DeviceMatrix<Scalar> result = + DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs)); + result.recordReady(stream_); + return result; + } + + void init_context() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_)); + EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); + ensure_scratch(kInfoBytes); + } + + // Synchronize stream and interpret the info word. No-op if already synced. + void sync_info() { + if (!info_synced_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + info_ = (info_word() == 0) ? Success : NumericalIssue; + info_synced_ = true; + } + } + + // Run cusolverDnXpotrf on d_factor_ (already on device). + // Enqueues factorization + async info download. Does NOT sync. + // Workspaces are stored as members to ensure they outlive the async kernels. + void factorize() { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value; + + info_synced_ = false; + info_ = InvalidInput; + + size_t dev_ws_bytes = 0, host_ws_bytes = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, + &dev_ws_bytes, &host_ws_bytes)); + + ensure_scratch(scratchBytesFor(dev_ws_bytes)); + h_workspace_.resize(host_ws_bytes); + + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf( + handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes, + host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info())); + + // Enqueue async download of info word — sync deferred to info() or solve(). + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_LLT_H
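A sketch of the factor-once / solve-many pipeline this class targets, assuming host matrices `A`, `B1`, `B2` (`MatrixXd`); every call below is from this file or DeviceMatrix.h:

```cpp
auto d_A  = gpu::DeviceMatrix<double>::fromHost(A);
auto d_B1 = gpu::DeviceMatrix<double>::fromHost(B1);
auto d_B2 = gpu::DeviceMatrix<double>::fromHost(B2);

gpu::LLT<double> llt;
llt.compute(std::move(d_A));        // adopts d_A's buffer: no extra device copy
if (llt.info() != Success) {        // first info() call syncs llt's stream
  /* handle a matrix that is not positive definite */
}

auto d_X1 = llt.solve(d_B1);        // async potrs on llt's stream
auto d_X2 = llt.solve(d_B2);        // reuses the cached factor
auto t1 = d_X1.toHostAsync(llt.stream());
auto t2 = d_X2.toHostAsync(llt.stream());
MatrixXd X1 = t1.get();             // block only when each result is needed
MatrixXd X2 = t2.get();
```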
diff --git a/unsupported/Eigen/src/GPU/GpuLU.h b/unsupported/Eigen/src/GPU/GpuLU.h new file mode 100644 index 0000000..e4392bc --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuLU.h
@@ -0,0 +1,375 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// GPU partial-pivoting LU decomposition using cuSOLVER. +// +// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve). +// The factored LU matrix and pivot array are kept in device memory for the +// lifetime of the object, so repeated solves only transfer the RHS/solution. +// +// Requires CUDA 11.0+ (cusolverDnX generic API). +// +// Usage: +// gpu::LU<double> lu(A); // upload A, getrf, LU+ipiv on device +// if (lu.info() != Success) { ... } +// MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred +// MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans); // A^T x = b + +#ifndef EIGEN_GPU_LU_H +#define EIGEN_GPU_LU_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuBlasSupport.h" +#include "./CuSolverSupport.h" +#include <vector> + +namespace Eigen { +namespace gpu { + +/** \ingroup GPU_Module + * \class LU + * \brief GPU LU decomposition with partial pivoting via cuSOLVER + * + * \tparam Scalar_ Element type: float, double, complex<float>, complex<double> + * + * Decomposes a square matrix A = P L U on the GPU and retains the factored + * matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or + * A^H*X=B by passing the appropriate gpu::GpuOp. + * + * Each LU object owns a dedicated CUDA stream and cuSOLVER handle. + */ +template <typename Scalar_> +class LU { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + // ---- Construction / destruction ------------------------------------------ + + LU() { init_context(); } + + template <typename InputType> + explicit LU(const EigenBase<InputType>& A) : LU() { + compute(A); + } + + ~LU() { + // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK / + // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a + // noexcept dtor terminates the program. The trailing cudaFree() of the + // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync. + if (handle_) (void)cusolverDnDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + LU(const LU&) = delete; + LU& operator=(const LU&) = delete; + + LU(LU&& o) noexcept + : stream_(o.stream_), + handle_(o.handle_), + params_(std::move(o.params_)), + d_lu_(std::move(o.d_lu_)), + lu_alloc_size_(o.lu_alloc_size_), + d_ipiv_(std::move(o.d_ipiv_)), + d_scratch_(std::move(o.d_scratch_)), + scratch_size_(o.scratch_size_), + h_workspace_(std::move(o.h_workspace_)), + n_(o.n_), + lda_(o.lda_), + info_(o.info_), + pinned_info_(std::move(o.pinned_info_)), + info_synced_(o.info_synced_) { + o.stream_ = nullptr; + o.handle_ = nullptr; + o.lu_alloc_size_ = 0; + o.scratch_size_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + + LU& operator=(LU&& o) noexcept { + if (this != &o) { + // Drain pending work on the old stream before tearing it down so the + // upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an + // in-flight kernel is still touching. The asymmetric move-ctor doesn't + // need this — it constructs onto uninitialized storage. 
+ if (stream_) (void)cudaStreamSynchronize(stream_); + if (handle_) (void)cusolverDnDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + stream_ = o.stream_; + handle_ = o.handle_; + params_ = std::move(o.params_); + d_lu_ = std::move(o.d_lu_); + lu_alloc_size_ = o.lu_alloc_size_; + d_ipiv_ = std::move(o.d_ipiv_); + d_scratch_ = std::move(o.d_scratch_); + scratch_size_ = o.scratch_size_; + h_workspace_ = std::move(o.h_workspace_); + n_ = o.n_; + lda_ = o.lda_; + info_ = o.info_; + pinned_info_ = std::move(o.pinned_info_); + info_synced_ = o.info_synced_; + o.stream_ = nullptr; + o.handle_ = nullptr; + o.lu_alloc_size_ = 0; + o.scratch_size_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + return *this; + } + + // ---- Factorization ------------------------------------------------------- + + /** Compute the LU factorization of A (host matrix, must be square). */ + template <typename InputType> + LU& compute(const EigenBase<InputType>& A) { + eigen_assert(A.rows() == A.cols() && "LU requires a square matrix"); + if (!begin_compute(A.rows())) return *this; + + const PlainMatrix mat(A.derived()); + lda_ = static_cast<int64_t>(mat.outerStride()); + allocate_lu_storage(); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_)); + + factorize(); + return *this; + } + + /** Compute the LU factorization from a device-resident matrix (D2D copy). */ + LU& compute(const DeviceMatrix<Scalar>& d_A) { + eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix"); + if (!begin_compute(d_A.rows())) return *this; + + lda_ = static_cast<int64_t>(d_A.outerStride()); + d_A.waitReady(stream_); + allocate_lu_storage(); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_)); + + factorize(); + return *this; + } + + /** Compute the LU factorization from a device matrix (move, no copy). */ + LU& compute(DeviceMatrix<Scalar>&& d_A) { + eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix"); + if (!begin_compute(d_A.rows())) return *this; + + lda_ = static_cast<int64_t>(d_A.outerStride()); + d_A.waitReady(stream_); + d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); + lu_alloc_size_ = matrixBytes(); + + factorize(); + return *this; + } + + // ---- Solve --------------------------------------------------------------- + + /** Solve op(A) * X = B using the cached LU factorization (host → host). + * + * \param B Right-hand side (n x nrhs host matrix). + * \param op gpu::GpuOp::NoTrans (default), Trans, or ConjTrans. 
+ */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const { + const_cast<LU*>(this)->sync_info(); + eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization"); + eigen_assert(B.rows() == n_); + + const PlainMatrix rhs(B); + const int64_t nrhs = static_cast<int64_t>(rhs.cols()); + const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); + internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); + DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x)); + + PlainMatrix X(n_, B.cols()); + int solve_info = 0; + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + + eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error"); + return X; + } + + /** Solve op(A) * X = B with device-resident RHS. Fully async. */ + DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const { + eigen_assert(d_B.rows() == n_); + d_B.waitReady(stream_); + const int64_t nrhs = static_cast<int64_t>(d_B.cols()); + const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); + internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); + return solve_impl(nrhs, ldb, op, std::move(d_x)); + } + + // ---- Accessors ----------------------------------------------------------- + + /** Lazily synchronizes the stream on first call after compute(). */ + ComputationInfo info() const { + const_cast<LU*>(this)->sync_info(); + return info_; + } + Index rows() const { return n_; } + Index cols() const { return n_; } + cudaStream_t stream() const { return stream_; } + + private: + cudaStream_t stream_ = nullptr; + cusolverDnHandle_t handle_ = nullptr; + internal::CusolverParams params_; // cuSOLVER params (created once, reused) + internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks) + size_t lu_alloc_size_ = 0; // current d_lu_ allocation size + internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device + internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks) + size_t scratch_size_ = 0; // current scratch allocation size + std::vector<char> h_workspace_; // host workspace (kept alive until next compute) + int64_t n_ = 0; + int64_t lda_ = 0; + ComputationInfo info_ = InvalidInput; + internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H + bool info_synced_ = true; // has the stream been synced for info? 
+ + int& info_word() { return *static_cast<int*>(pinned_info_.get()); } + int info_word() const { return *static_cast<const int*>(pinned_info_.get()); } + + bool begin_compute(Index rows) { + n_ = rows; + info_ = InvalidInput; + if (n_ == 0) { + info_ = Success; + info_synced_ = true; + return false; + } + return true; + } + + size_t matrixBytes() const { return matrixBytes(n_, lda_); } + + static size_t matrixBytes(int64_t cols, int64_t outer_stride) { + return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar); + } + + void allocate_lu_storage() { + size_t needed = matrixBytes(); + if (needed > lu_alloc_size_) { + d_lu_ = internal::DeviceBuffer(needed); + lu_alloc_size_ = needed; + } + } + + // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ]. + // Workspace size is rounded up to 16 bytes so the info word lands aligned. + static constexpr size_t kInfoBytes = sizeof(int); + static constexpr size_t kScratchAlign = 16; + + static size_t scratchBytesFor(size_t workspace_bytes) { + workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1); + return workspace_bytes + kInfoBytes; + } + + // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks. + // Syncs the stream before reallocating to avoid freeing memory that + // async kernels may still be using. + void ensure_scratch(size_t bytes) { + if (bytes > scratch_size_) { + if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + d_scratch_ = internal::DeviceBuffer(bytes); + scratch_size_ = bytes; + } + } + + void* scratch_workspace() const { return d_scratch_.get(); } + int* scratch_info() const { + eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes); + return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes); + } + + // Solve in place on `d_x` (which already holds B), then re-wrap as a + // typed DeviceMatrix carrying shape and a ready event. The release/adopt + // hop just hands ownership of the raw cudaMalloc pointer from the untyped + // DeviceBuffer to the typed DeviceMatrix without copying. + DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + const cublasOperation_t trans = internal::to_cublas_op(op); + + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_, + static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb, + scratch_info())); + + DeviceMatrix<Scalar> result = + DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs)); + result.recordReady(stream_); + return result; + } + + void init_context() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_)); + EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); + ensure_scratch(kInfoBytes); + } + + void sync_info() { + if (!info_synced_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + info_ = (info_word() == 0) ? Success : NumericalIssue; + info_synced_ = true; + } + } + + // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_. + // Enqueues factorization + async info download. Does NOT sync. + // Workspaces are stored as members to ensure they outlive the async kernels. 
+ void factorize() { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t); + + info_synced_ = false; + info_ = InvalidInput; + + d_ipiv_ = internal::DeviceBuffer(ipiv_bytes); + + size_t dev_ws_bytes = 0, host_ws_bytes = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype, + &dev_ws_bytes, &host_ws_bytes)); + + ensure_scratch(scratchBytesFor(dev_ws_bytes)); + h_workspace_.resize(host_ws_bytes); + + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, + static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(), + dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, + host_ws_bytes, scratch_info())); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_LU_H
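A sketch of the device-resident LU path with a transpose solve, assuming host matrices `A` and `B` (`MatrixXd`); `GpuOp` and the overloads used here are the ones declared in this file:

```cpp
auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
auto d_B = gpu::DeviceMatrix<double>::fromHost(B);

gpu::LU<double> lu;
lu.compute(std::move(d_A));                    // adopt the buffer, getrf in place
if (lu.info() != Success) {
  /* matrix is singular to working precision */
}

auto d_X  = lu.solve(d_B);                     // A   * X  = B (async)
auto d_Xt = lu.solve(d_B, gpu::GpuOp::Trans);  // A^T * Xt = B, same cached factor
MatrixXd Xt = d_Xt.toHost(lu.stream());        // synchronizes only for the download
```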
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h new file mode 100644 index 0000000..7d840bb --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Generic CUDA runtime support shared across all GPU library integrations +// (cuSOLVER and cuBLAS): +// - Error-checking macros +// - RAII device buffer +// +// Only depends on <cuda_runtime.h>. No NVIDIA library headers. + +#ifndef EIGEN_GPU_SUPPORT_H +#define EIGEN_GPU_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include <cuda_runtime.h> + +#include <memory> + +namespace Eigen { +namespace gpu { +namespace internal { + +// ---- Error-checking macros -------------------------------------------------- +// These abort (via eigen_assert) on failure. Not for use in destructors. + +#define EIGEN_CUDA_RUNTIME_CHECK(expr) \ + do { \ + cudaError_t _e = (expr); \ + eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \ + } while (0) + +// ---- Custom deleters for CUDA-allocated memory ------------------------------ +// Used with std::unique_ptr to give CUDA allocations RAII semantics with no +// hand-rolled move/dtor boilerplate. + +struct CudaFreeDeleter { + void operator()(void* p) const noexcept { + if (p) (void)cudaFree(p); + } +}; + +struct CudaFreeHostDeleter { + void operator()(void* p) const noexcept { + if (p) (void)cudaFreeHost(p); + } +}; + +// ---- RAII: device buffer ---------------------------------------------------- + +class DeviceBuffer { + public: + DeviceBuffer() = default; + + explicit DeviceBuffer(size_t bytes) { + if (bytes > 0) { + void* p = nullptr; + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); + ptr_.reset(p); + } + } + + void* get() const noexcept { return ptr_.get(); } + void* release() noexcept { return ptr_.release(); } + explicit operator bool() const noexcept { return static_cast<bool>(ptr_); } + + // Adopt an existing device pointer. Caller relinquishes ownership. + static DeviceBuffer adopt(void* p) noexcept { + DeviceBuffer b; + b.ptr_.reset(p); + return b; + } + + private: + std::unique_ptr<void, CudaFreeDeleter> ptr_; +}; + +// ---- RAII: pinned host buffer ----------------------------------------------- +// For async D2H copies (cudaMemcpyAsync requires pinned host memory for true +// asynchrony and to avoid compute-sanitizer warnings). + +class PinnedHostBuffer { + public: + PinnedHostBuffer() = default; + + explicit PinnedHostBuffer(size_t bytes) { + if (bytes > 0) { + void* p = nullptr; + EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes)); + ptr_.reset(p); + } + } + + void* get() const noexcept { return ptr_.get(); } + explicit operator bool() const noexcept { return static_cast<bool>(ptr_); } + + private: + std::unique_ptr<void, CudaFreeHostDeleter> ptr_; +}; + +// ---- Scalar → cudaDataType_t ------------------------------------------------ +// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h +// which is included transitively by cuda_runtime.h. 
+ +template <typename Scalar> +struct cuda_data_type; + +template <> +struct cuda_data_type<float> { + static constexpr cudaDataType_t value = CUDA_R_32F; +}; +template <> +struct cuda_data_type<double> { + static constexpr cudaDataType_t value = CUDA_R_64F; +}; +template <> +struct cuda_data_type<std::complex<float>> { + static constexpr cudaDataType_t value = CUDA_C_32F; +}; +template <> +struct cuda_data_type<std::complex<double>> { + static constexpr cudaDataType_t value = CUDA_C_64F; +}; + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SUPPORT_H
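A rough sketch of how these internal helpers compose into an async host-device round trip; `roundtrip()` is a hypothetical function, not part of the patch, and `<cstring>` is assumed for `std::memcpy`:

```cpp
#include <cstring>

void roundtrip(const float* host_src, float* host_dst, size_t n, cudaStream_t s) {
  using namespace Eigen::gpu::internal;
  const size_t bytes = n * sizeof(float);
  DeviceBuffer d(bytes);            // cudaMalloc, freed by CudaFreeDeleter
  PinnedHostBuffer staging(bytes);  // cudaMallocHost: pinned, so the D2H copy is truly async
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d.get(), host_src, bytes, cudaMemcpyHostToDevice, s));
  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(staging.get(), d.get(), bytes, cudaMemcpyDeviceToHost, s));
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(s));
  std::memcpy(host_dst, staging.get(), bytes);
}
```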
diff --git a/unsupported/Eigen/src/GPU/InternalHeaderCheck.h b/unsupported/Eigen/src/GPU/InternalHeaderCheck.h new file mode 100644 index 0000000..cb2163c --- /dev/null +++ b/unsupported/Eigen/src/GPU/InternalHeaderCheck.h
@@ -0,0 +1,3 @@ +#ifndef EIGEN_GPU_MODULE_H +#error "Please include unsupported/Eigen/GPU instead of including headers inside the src/GPU directory directly." +#endif
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md new file mode 100644 index 0000000..2928a7c --- /dev/null +++ b/unsupported/Eigen/src/GPU/README.md
@@ -0,0 +1,344 @@ +# Eigen GPU Module (`unsupported/Eigen/GPU`) + +GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA +CUDA Math Libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link +against CUDA runtime, cuBLAS, and cuSOLVER). + +## Why this module + +Eigen is the linear algebra foundation for a large ecosystem of C++ projects +in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP, +Open3D), scientific computing (Ceres, Stan), and beyond. Many of these +projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations +without dropping down to raw CUDA library APIs. Third-party projects like +[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and +[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill +this gap, and downstream projects like +[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and +[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for +GPU-accelerated solvers through Eigen. + +The `unsupported/Eigen/GPU` module aims to close this gap: Existing Eigen users should be +able to move performance-critical dense linear algebra to the GPU with minimal +code changes and without learning CUDA library APIs directly. + +## Design philosophy + +**CPU and GPU coexist.** There is no global compile-time switch that replaces +CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers +explicitly -- `gpu::LLT<double>` vs `Eigen::LLT<MatrixXd>` -- and both coexist in +the same binary. This also lets users keep the factored matrix on device across +multiple solves, something impossible with compile-time replacement. + +**Familiar syntax.** GPU operations use the same expression patterns as CPU +Eigen. Here is a side-by-side comparison: + +```cpp +// ---- CPU (Eigen) ---- // ---- GPU (unsupported/Eigen/GPU) ---- +#include <Eigen/Dense> #define EIGEN_USE_GPU + #include <unsupported/Eigen/GPU> + +MatrixXd A = ...; auto d_A = gpu::DeviceMatrix<double>::fromHost(A); +MatrixXd B = ...; auto d_B = gpu::DeviceMatrix<double>::fromHost(B); + +MatrixXd C = A * B; gpu::DeviceMatrix<double> d_C = d_A * d_B; +MatrixXd X = A.llt().solve(B); gpu::DeviceMatrix<double> d_X = d_A.llt().solve(d_B); + + MatrixXd X = d_X.toHost(); +``` + +The GPU version reads like CPU Eigen with explicit upload/download. +`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to +cuSOLVER potrf + potrs. Unsupported expressions are compile errors. + +**Standalone module.** `unsupported/Eigen/GPU` does not modify or depend on Eigen's Core +expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.). +`DeviceMatrix` is not an Eigen expression type and does not inherit from +`MatrixBase`. The expression layer is a thin compile-time dispatch where every +supported expression maps to a single NVIDIA library call. There is no +coefficient-level evaluation, lazy fusion, or packet operations. + +**Explicit over implicit.** Host-device transfers, stream management, and +library handle lifetimes are visible in the API. There are no hidden +allocations or synchronizations except where documented (e.g., `toHost()` must +synchronize to deliver data to the host). + +## Key concepts + +### `DeviceMatrix<Scalar>` + +A typed RAII wrapper for a dense column-major matrix in GPU device memory. +This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply +a `DeviceMatrix` with one column. 
+ +```cpp +// Upload from host +auto d_A = gpu::DeviceMatrix<double>::fromHost(A); + +// Allocate uninitialized +gpu::DeviceMatrix<double> d_C(m, n); + +// Download to host +MatrixXd C = d_C.toHost(); + +// Async download (returns a future) +auto transfer = d_C.toHostAsync(); +// ... do other work ... +MatrixXd C = transfer.get(); +``` + +`DeviceMatrix` supports expression methods that mirror Eigen's API: +`adjoint()`, `transpose()`, `triangularView<UpLo>()`, +`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight +expression objects that are evaluated when assigned. + +### `gpu::Context` + +Every GPU operation needs a CUDA stream and library handles (cuBLAS, +cuSOLVER). `gpu::Context` bundles these together. + +For simple usage, you don't need to create one -- a per-thread default context +is created lazily on first use: + +```cpp +// These use the thread-local default context automatically +d_C = d_A * d_B; +d_X = d_A.llt().solve(d_B); +``` + +For concurrent multi-stream execution, create explicit contexts: + +```cpp +gpu::Context ctx1, ctx2; +d_C1.device(ctx1) = d_A1 * d_B1; // runs on stream 1 +d_C2.device(ctx2) = d_A2 * d_B2; // runs on stream 2 (concurrently) +``` + +## Usage + +### Matrix operations (cuBLAS) + +```cpp +auto d_A = gpu::DeviceMatrix<double>::fromHost(A); +auto d_B = gpu::DeviceMatrix<double>::fromHost(B); + +// GEMM: C = A * B, C = A^H * B, C = A * B^T, ... +gpu::DeviceMatrix<double> d_C = d_A * d_B; +d_C = d_A.adjoint() * d_B; +d_C = d_A * d_B.transpose(); + +// Scaled and accumulated +d_C += 2.0 * d_A * d_B; // alpha=2, beta=1 +d_C.device(ctx) -= d_A * d_B; // alpha=-1, beta=1 (requires explicit context) + +// Triangular solve (TRSM) +d_X = d_A.triangularView<Lower>().solve(d_B); + +// Symmetric/Hermitian multiply (SYMM/HEMM) +d_C = d_A.selfadjointView<Lower>() * d_B; + +// Rank-k update (SYRK/HERK) +d_C.selfadjointView<Lower>().rankUpdate(d_A); // C += A * A^H +``` + +### Dense solvers (cuSOLVER) + +**One-shot expression syntax** -- Convenient, re-factorizes each time: + +```cpp +// Cholesky solve (potrf + potrs) +d_X = d_A.llt().solve(d_B); + +// LU solve (getrf + getrs) +d_Y = d_A.lu().solve(d_B); +``` + +**Cached factorization** -- Factor once, solve many times: + +```cpp +gpu::LLT<double> llt; +llt.compute(d_A); // factorize (async) +if (llt.info() != Success) { ... } // lazy sync on first info() call +auto d_X1 = llt.solve(d_B1); // reuses factor (async) +auto d_X2 = llt.solve(d_B2); // reuses factor (async) +MatrixXd X2 = d_X2.toHost(); + +// LU with transpose solve +gpu::LU<double> lu; +lu.compute(d_A); +auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans); // A^T Y = B +``` + +The cached API keeps the factored matrix on device, avoiding redundant +host-device transfers and re-factorizations. + +### Stream control and async execution + +Operations are asynchronous by default. The compute-solve chain runs without +host synchronization until you need a result on the host: + +```text +fromHost(A) --sync--> compute() --async--> solve() --async--> toHost() + H2D potrf potrs D2H + sync +``` + +Mandatory sync points: +- `fromHost()` -- Synchronizes to complete the upload before returning +- `toHost()` / `HostTransfer::get()` -- Must deliver data to host +- `info()` -- Must read the factorization status + +**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion +via CUDA events. When a matrix written on stream A is read on stream B, the +module automatically inserts `cudaStreamWaitEvent`. 
Same-stream operations +skip the wait (CUDA guarantees in-order execution within a stream). + +**Lifetime of cached factorizations.** A `gpu::LLT` / `gpu::LU` object owns +its CUDA stream, library handle, and the cached factor on device. Destroying +the factorization object while a `solve()` it issued is still in flight is +*correct* but not actually async: `cudaStreamDestroy` returns immediately, +but the destructor of the cached factor calls `cudaFree`, which is fully +device-synchronous and stalls until the in-flight `potrs`/`getrs` retires. +For genuine async pipelining keep the factorization object alive until you +have drained its results (e.g. via `toHost()` or by binding consumption to +an explicit `gpu::Context` that outlives both producer and consumer): + +```cpp +gpu::LLT<double> llt(d_A); // factor stays on device +auto d_X = llt.solve(d_B); +auto h_x = d_X.toHostAsync(llt.stream()); +h_x.get(); // sync: factor + result complete +// llt may now be destroyed without stalling the device +``` + +## Reference + +### Supported scalar types + +`float`, `double`, `std::complex<float>`, `std::complex<double>`. + +### Expression -> library call mapping + +| DeviceMatrix expression | Library call | Parameters | +|---|---|---| +| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 | +| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N | +| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N | +| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C | +| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T | +| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS | +| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS | +| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 | +| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 | +| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs | +| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper | +| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs | +| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit | +| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo | +| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N | + +### `DeviceMatrix<Scalar>` API + +| Method | Sync? 
| Description | +|--------|-------|-------------| +| `DeviceMatrix()` | -- | Empty (0x0) | +| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized | +| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix | +| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) | +| `toHost(stream)` | yes | Synchronous download | +| `toHostAsync(stream)` | no | Returns `HostTransfer` future | +| `clone(stream)` | no | Device-to-device deep copy | +| `resize(rows, cols)` | -- | Discard contents, reallocate | +| `data()` | -- | Raw device pointer | +| `rows()`, `cols()` | -- | Dimensions | +| `sizeInBytes()` | -- | Total device allocation size in bytes | +| `empty()` | -- | True if 0x0 | +| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) | +| `transpose()` | -- | Transpose view (GEMM Trans) | +| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder | +| `lu()` | -- | LU expression builder | +| `triangularView<UpLo>()` | -- | Triangular view (TRSM) | +| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) | +| `device(ctx)` | -- | Assignment proxy bound to context | + +### `gpu::Context` + +Unified GPU execution context owning a CUDA stream and library handles. + +```cpp +gpu::Context() // Creates dedicated stream + handles +static gpu::Context& threadLocal() // Per-thread default (lazy-created) + +cudaStream_t stream() +cublasHandle_t cublasHandle() +cusolverDnHandle_t cusolverHandle() +``` + +Non-copyable, non-movable (owns library handles). + +### `gpu::LLT<Scalar, UpLo>` API + +GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device. + +| Method | Sync? | Description | +|--------|-------|-------------| +| `gpu::LLT(A)` | deferred | Construct and factorize from host matrix | +| `compute(host_matrix)` | deferred | Upload and factorize | +| `compute(DeviceMatrix)` | deferred | D2D copy and factorize | +| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) | +| `solve(host_matrix)` | yes | Solve, return host matrix | +| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) | +| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` | + +### `gpu::LU<Scalar>` API + +GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a +`gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`). + +### `HostTransfer<Scalar>` API + +Future for async device-to-host transfer. + +| Method | Description | +|--------|-------------| +| `get()` | Block until transfer completes, return host matrix reference. Idempotent. | +| `ready()` | Non-blocking poll | + +### Aliasing + +Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a +temporary, DeviceMatrix dispatches directly to NVIDIA library calls which have +no built-in aliasing protection. All operations are implicitly noalias. +The caller must ensure operands don't alias the destination for GEMM, TRSM, +SYMM/HEMM, and SYRK/HERK. Debug builds assert on these violations before +dispatching to cuBLAS. 
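+
+For example (a sketch of the caveat; the matrices are illustrative, not part
+of the API):
+
+```cpp
+// CPU Eigen evaluates A = A * B through a temporary unless .noalias() is used.
+// The GPU path passes the destination pointer straight to cuBLAS, so reusing
+// an operand as the destination is undefined (and asserts in debug builds):
+// d_A = d_A * d_B;                         // WRONG: output aliases an input
+
+// Write into a distinct DeviceMatrix instead:
+gpu::DeviceMatrix<double> d_C = d_A * d_B;  // OK: d_C aliases neither operand
+```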
+ +## File layout + +| File | Depends on | Contents | +|------|-----------|----------| +| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `PinnedHostBuffer`, `cuda_data_type<>` | +| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` | +| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers | +| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers | +| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) | +| `DeviceDispatch.h` | all above | All dispatch functions + `Assignment` | +| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `gpu::Context` | +| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps | +| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping | +| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization | +| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization | + +## Building and testing + +```bash +cmake -G Ninja -B build -S . \ + -DEIGEN_TEST_CUDA=ON \ + -DEIGEN_CUDA_COMPUTE_ARCH="70" + +cmake --build build --target cublas cusolver_llt cusolver_lu device_matrix +ctest --test-dir build -L gpu --output-on-failure +``` + +`EIGEN_TEST_CUBLAS` and `EIGEN_TEST_CUSOLVER` default to ON when CUDA is enabled +(cuBLAS and cuSOLVER are part of the CUDA toolkit).
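+
+A downstream consumer needs only the Eigen headers plus the CUDA math
+libraries. A minimal sketch of a consumer build target (project and target
+names are placeholders, not shipped with this module):
+
+```cmake
+find_package(Eigen3 REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+# main.cpp defines EIGEN_USE_GPU and includes <unsupported/Eigen/GPU>.
+add_executable(my_app main.cpp)
+target_compile_definitions(my_app PRIVATE EIGEN_USE_GPU)
+target_link_libraries(my_app PRIVATE
+  Eigen3::Eigen CUDA::cudart CUDA::cublas CUDA::cusolver)
+```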
diff --git a/unsupported/benchmarks/CMakeLists.txt b/unsupported/benchmarks/CMakeLists.txt index ed8c88e..d8b9fcd 100644 --- a/unsupported/benchmarks/CMakeLists.txt +++ b/unsupported/benchmarks/CMakeLists.txt
@@ -33,3 +33,10 @@ add_subdirectory(Splines) add_subdirectory(IterativeSolvers) add_subdirectory(KroneckerProduct) + +# GPU benchmarks have their own CMake project (needs CUDAToolkit). +# They can also be built standalone: cmake -B build -S unsupported/benchmarks/GPU +find_package(CUDAToolkit QUIET) +if(CUDAToolkit_FOUND) + add_subdirectory(GPU) +endif()
diff --git a/unsupported/benchmarks/GPU/CMakeLists.txt b/unsupported/benchmarks/GPU/CMakeLists.txt new file mode 100644 index 0000000..2c3511f --- /dev/null +++ b/unsupported/benchmarks/GPU/CMakeLists.txt
@@ -0,0 +1,54 @@ +# GPU benchmarks require CUDA runtime + cuSOLVER. +# Build separately from the main benchmark tree since they need CUDA toolchain. +# +# Usage: +# cmake -G Ninja -B build-bench-gpu -S unsupported/benchmarks/GPU \ +# -DCMAKE_CUDA_ARCHITECTURES=89 +# cmake --build build-bench-gpu +# +# Profiling: +# nsys profile --trace=cuda ./build-bench-gpu/bench_solvers +# ncu --set full -o profile ./build-bench-gpu/bench_solvers --benchmark_filter=BM_GpuLLT_Compute/4096 + +cmake_minimum_required(VERSION 3.17) +project(EigenGpuBenchmarks CXX) + +find_package(benchmark REQUIRED) +find_package(CUDAToolkit REQUIRED) + +set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..") + +function(eigen_add_gpu_benchmark name source) + cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN}) + if(NOT IS_ABSOLUTE "${source}") + set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}") + endif() + add_executable(${name} ${source}) + target_compile_features(${name} PRIVATE cxx_std_14) + target_include_directories(${name} PRIVATE + ${EIGEN_SOURCE_DIR} + ${CUDAToolkit_INCLUDE_DIRS}) + target_link_libraries(${name} PRIVATE + benchmark::benchmark benchmark::benchmark_main + CUDA::cudart CUDA::cusolver CUDA::cublas) + if(BENCH_LIBRARIES) + target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES}) + endif() + target_compile_options(${name} PRIVATE -O3 -DNDEBUG) + target_compile_definitions(${name} PRIVATE EIGEN_USE_GPU) + if(BENCH_DEFINITIONS) + target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS}) + endif() +endfunction() + +# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines. +eigen_add_gpu_benchmark(bench_solvers bench_solvers.cpp) +eigen_add_gpu_benchmark(bench_solvers_float bench_solvers.cpp DEFINITIONS SCALAR=float) + +# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain. +eigen_add_gpu_benchmark(bench_chaining bench_chaining.cpp) +eigen_add_gpu_benchmark(bench_chaining_float bench_chaining.cpp DEFINITIONS SCALAR=float) + +# Batching benchmarks: multi-stream concurrency for many small systems. +eigen_add_gpu_benchmark(bench_batching bench_batching.cpp) +eigen_add_gpu_benchmark(bench_batching_float bench_batching.cpp DEFINITIONS SCALAR=float)
diff --git a/unsupported/benchmarks/GPU/bench_batching.cpp b/unsupported/benchmarks/GPU/bench_batching.cpp new file mode 100644 index 0000000..f9cabc5 --- /dev/null +++ b/unsupported/benchmarks/GPU/bench_batching.cpp
@@ -0,0 +1,268 @@ +// GPU batching benchmarks: multi-stream concurrency for many small solves. +// +// Each gpu::LLT/gpu::LU owns its own CUDA stream. This benchmark measures how +// well multiple solver instances overlap on the GPU, which is critical for +// workloads like robotics (many small systems) and SLAM (batched poses). +// +// Compares: +// 1. Sequential: one solver handles all systems one by one +// 2. Batched: N solvers on N streams, all launched before any sync +// 3. CPU baseline: Eigen LLT on host +// +// For Nsight Systems: batched mode should show overlapping kernels on +// different streams in the timeline view. +// +// nsys profile --trace=cuda ./bench_batching + +#include <benchmark/benchmark.h> + +#include <Eigen/Cholesky> +#include <unsupported/Eigen/GPU> + +#include <memory> +#include <vector> + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR double +#endif + +using Scalar = SCALAR; +using Mat = Matrix<Scalar, Dynamic, Dynamic>; + +static Mat make_spd(Index n) { + Mat M = Mat::Random(n, n); + return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n); +} + +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +// -------------------------------------------------------------------------- +// Sequential: one solver, N systems solved one after another +// -------------------------------------------------------------------------- + +static void BM_Batch_Sequential(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int batch_size = static_cast<int>(state.range(1)); + + // Pre-generate all SPD matrices and RHS vectors. + std::vector<Mat> As(batch_size); + std::vector<Mat> Bs(batch_size); + for (int i = 0; i < batch_size; ++i) { + As[i] = make_spd(n); + Bs[i] = Mat::Random(n, 1); + } + + gpu::LLT<Scalar> llt; + + for (auto _ : state) { + std::vector<Mat> results(batch_size); + for (int i = 0; i < batch_size; ++i) { + llt.compute(As[i]); + results[i] = llt.solve(Bs[i]); + } + benchmark::DoNotOptimize(results.back().data()); + } + + state.counters["n"] = n; + state.counters["batch"] = batch_size; + state.counters["total_solves"] = batch_size; +} + +// -------------------------------------------------------------------------- +// Sequential with DeviceMatrix (avoid re-upload of A each iteration) +// -------------------------------------------------------------------------- + +static void BM_Batch_Sequential_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int batch_size = static_cast<int>(state.range(1)); + + std::vector<Mat> As(batch_size); + std::vector<Mat> Bs(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size); + for (int i = 0; i < batch_size; ++i) { + As[i] = make_spd(n); + Bs[i] = Mat::Random(n, 1); + d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]); + d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]); + } + + gpu::LLT<Scalar> llt; + + for (auto _ : state) { + std::vector<Mat> results(batch_size); + for (int i = 0; i < batch_size; ++i) { + llt.compute(d_As[i]); + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]); + results[i] = d_X.toHost(); + } + benchmark::DoNotOptimize(results.back().data()); + } + + state.counters["n"] = n; + state.counters["batch"] = batch_size; + state.counters["total_solves"] = batch_size; +} + +// -------------------------------------------------------------------------- +// 
Batched: N solvers on N streams, overlapping execution +// -------------------------------------------------------------------------- + +static void BM_Batch_MultiStream(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int batch_size = static_cast<int>(state.range(1)); + + std::vector<Mat> As(batch_size); + std::vector<Mat> Bs(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size); + for (int i = 0; i < batch_size; ++i) { + As[i] = make_spd(n); + Bs[i] = Mat::Random(n, 1); + d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]); + d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]); + } + + // N solvers = N independent CUDA streams. + std::vector<std::unique_ptr<gpu::LLT<Scalar>>> solvers(batch_size); + for (int i = 0; i < batch_size; ++i) { + solvers[i] = std::make_unique<gpu::LLT<Scalar>>(); + } + + for (auto _ : state) { + // Phase 1: launch all factorizations (async, different streams). + for (int i = 0; i < batch_size; ++i) { + solvers[i]->compute(d_As[i]); + } + + // Phase 2: launch all solves (async, different streams). + std::vector<gpu::DeviceMatrix<Scalar>> d_Xs(batch_size); + for (int i = 0; i < batch_size; ++i) { + d_Xs[i] = solvers[i]->solve(d_Bs[i]); + } + + // Phase 3: download all results. + std::vector<Mat> results(batch_size); + for (int i = 0; i < batch_size; ++i) { + results[i] = d_Xs[i].toHost(); + } + benchmark::DoNotOptimize(results.back().data()); + } + + state.counters["n"] = n; + state.counters["batch"] = batch_size; + state.counters["streams"] = batch_size; + state.counters["total_solves"] = batch_size; +} + +// -------------------------------------------------------------------------- +// Batched with async download (overlap D2H with computation) +// -------------------------------------------------------------------------- + +static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int batch_size = static_cast<int>(state.range(1)); + + std::vector<Mat> As(batch_size); + std::vector<Mat> Bs(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size); + std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size); + for (int i = 0; i < batch_size; ++i) { + As[i] = make_spd(n); + Bs[i] = Mat::Random(n, 1); + d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]); + d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]); + } + + std::vector<std::unique_ptr<gpu::LLT<Scalar>>> solvers(batch_size); + for (int i = 0; i < batch_size; ++i) { + solvers[i] = std::make_unique<gpu::LLT<Scalar>>(); + } + + for (auto _ : state) { + // Launch all compute + solve. + std::vector<gpu::DeviceMatrix<Scalar>> d_Xs(batch_size); + for (int i = 0; i < batch_size; ++i) { + solvers[i]->compute(d_As[i]); + d_Xs[i] = solvers[i]->solve(d_Bs[i]); + } + + // Enqueue all async downloads. + std::vector<gpu::HostTransfer<Scalar>> transfers; + transfers.reserve(batch_size); + for (int i = 0; i < batch_size; ++i) { + transfers.push_back(d_Xs[i].toHostAsync()); + } + + // Collect all results. 
+ for (int i = 0; i < batch_size; ++i) { + benchmark::DoNotOptimize(transfers[i].get().data()); + } + } + + state.counters["n"] = n; + state.counters["batch"] = batch_size; + state.counters["streams"] = batch_size; + state.counters["total_solves"] = batch_size; +} + +// -------------------------------------------------------------------------- +// CPU baseline: Eigen LLT on host, sequential +// -------------------------------------------------------------------------- + +static void BM_Batch_CPU(benchmark::State& state) { + const Index n = state.range(0); + const int batch_size = static_cast<int>(state.range(1)); + + std::vector<Mat> As(batch_size); + std::vector<Mat> Bs(batch_size); + for (int i = 0; i < batch_size; ++i) { + As[i] = make_spd(n); + Bs[i] = Mat::Random(n, 1); + } + + for (auto _ : state) { + std::vector<Mat> results(batch_size); + for (int i = 0; i < batch_size; ++i) { + LLT<Mat> llt(As[i]); + results[i] = llt.solve(Bs[i]); + } + benchmark::DoNotOptimize(results.back().data()); + } + + state.counters["n"] = n; + state.counters["batch"] = batch_size; + state.counters["total_solves"] = batch_size; +} + +// -------------------------------------------------------------------------- +// Registration +// -------------------------------------------------------------------------- + +// clang-format off +// Args: {matrix_size, batch_size} +// Small matrices with large batches are the interesting case for multi-stream. +BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond); + +// Also run larger sizes with moderate batching. +BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond); +// clang-format on
diff --git a/unsupported/benchmarks/GPU/bench_chaining.cpp b/unsupported/benchmarks/GPU/bench_chaining.cpp new file mode 100644 index 0000000..323d8b2 --- /dev/null +++ b/unsupported/benchmarks/GPU/bench_chaining.cpp
@@ -0,0 +1,232 @@ +// GPU chaining benchmarks: measure async pipeline efficiency. +// +// Compares: +// 1. Host round-trip per solve (baseline) +// 2. DeviceMatrix chaining (no host round-trip between solves) +// 3. Varying chain lengths (1, 2, 4, 8 consecutive solves) +// +// For Nsight Systems: look for gaps between kernel launches in the timeline. +// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels. +// +// nsys profile --trace=cuda,nvtx ./bench_chaining + +#include <benchmark/benchmark.h> + +#include <Eigen/Cholesky> +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR double +#endif + +using Scalar = SCALAR; +using Mat = Matrix<Scalar, Dynamic, Dynamic>; + +static Mat make_spd(Index n) { + Mat M = Mat::Random(n, n); + return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n); +} + +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +// -------------------------------------------------------------------------- +// Baseline: host round-trip between every solve +// -------------------------------------------------------------------------- + +static void BM_Chain_HostRoundtrip(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + gpu::LLT<Scalar> llt(A); + + for (auto _ : state) { + Mat X = B; + for (int i = 0; i < chain_len; ++i) { + X = llt.solve(X); // host → device → host each time + } + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["chain"] = chain_len; + state.counters["solves/iter"] = chain_len; +} + +// -------------------------------------------------------------------------- +// DeviceMatrix chaining: no host round-trip between solves +// -------------------------------------------------------------------------- + +static void BM_Chain_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + gpu::LLT<Scalar> llt(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + for (auto _ : state) { + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + for (int i = 1; i < chain_len; ++i) { + d_X = llt.solve(d_X); // device → device, fully async + } + Mat X = d_X.toHost(); // single sync at end + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["chain"] = chain_len; + state.counters["solves/iter"] = chain_len; +} + +// -------------------------------------------------------------------------- +// DeviceMatrix chaining with async download (overlap D2H with next iteration) +// +// Double-buffered: each loop body issues iteration N+1's chain *before* +// draining iteration N's D2H, so the host wait overlaps with on-device +// compute instead of stalling the pipeline. 
+// -------------------------------------------------------------------------- + +static void BM_Chain_DeviceAsync(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + gpu::LLT<Scalar> llt(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + auto run_chain = [&]() { + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + for (int i = 1; i < chain_len; ++i) { + d_X = llt.solve(d_X); + } + return d_X.toHostAsync(); + }; + + // Prime the pipeline so each timed iteration overlaps a fresh chain + // with the previous iteration's D2H. + auto prev = run_chain(); + + for (auto _ : state) { + auto next = run_chain(); // kick off N+1 while D2H of N is in flight + Mat X = prev.get(); // drain N (overlaps with `next` compute) + benchmark::DoNotOptimize(X.data()); + prev = std::move(next); + } + + // Drain the trailing transfer outside the timed region. + prev.get(); + + state.counters["n"] = n; + state.counters["chain"] = chain_len; + state.counters["solves/iter"] = chain_len; +} + +// -------------------------------------------------------------------------- +// Pure GPU chain (no download — measures kernel-only throughput) +// -------------------------------------------------------------------------- + +static void BM_Chain_DeviceNoDownload(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + gpu::LLT<Scalar> llt(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + for (auto _ : state) { + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + for (int i = 1; i < chain_len; ++i) { + d_X = llt.solve(d_X); + } + cudaStreamSynchronize(llt.stream()); + benchmark::DoNotOptimize(d_X.data()); + } + + state.counters["n"] = n; + state.counters["chain"] = chain_len; + state.counters["solves/iter"] = chain_len; +} + +// -------------------------------------------------------------------------- +// Compute + solve chain (full pipeline: factorize, then chain solves) +// -------------------------------------------------------------------------- + +static void BM_FullPipeline_Host(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + + for (auto _ : state) { + gpu::LLT<Scalar> llt(A); + Mat X = B; + for (int i = 0; i < chain_len; ++i) { + X = llt.solve(X); + } + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["chain"] = chain_len; +} + +static void BM_FullPipeline_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const int chain_len = static_cast<int>(state.range(1)); + + Mat A = make_spd(n); + Mat B = Mat::Random(n, 1); + + for (auto _ : state) { + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + gpu::LLT<Scalar> llt; + llt.compute(d_A); + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + for (int i = 1; i < chain_len; ++i) { + d_X = llt.solve(d_X); + } + Mat X = d_X.toHost(); + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["chain"] = chain_len; +} + +// -------------------------------------------------------------------------- +// Registration +// -------------------------------------------------------------------------- + +// 
clang-format off +// Args: {matrix_size, chain_length} +BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond); +// clang-format on
diff --git a/unsupported/benchmarks/GPU/bench_solvers.cpp b/unsupported/benchmarks/GPU/bench_solvers.cpp new file mode 100644 index 0000000..61d86ce --- /dev/null +++ b/unsupported/benchmarks/GPU/bench_solvers.cpp
@@ -0,0 +1,296 @@ +// GPU solver benchmarks: gpu::LLT and gpu::LU compute + solve throughput. +// +// Measures factorization and solve performance for the host-matrix and +// DeviceMatrix code paths across a range of matrix sizes. +// +// For Nsight Systems profiling: +// nsys profile --trace=cuda,nvtx ./bench_solvers +// +// For Nsight Compute kernel analysis: +// ncu --set full -o profile ./bench_solvers --benchmark_filter=BM_GpuLLT_Compute/4096 + +#include <benchmark/benchmark.h> + +#include <Eigen/Cholesky> +#include <unsupported/Eigen/GPU> +#include <Eigen/LU> + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR double +#endif + +using Scalar = SCALAR; +using Mat = Matrix<Scalar, Dynamic, Dynamic>; + +// -------------------------------------------------------------------------- +// Helpers +// -------------------------------------------------------------------------- + +static Mat make_spd(Index n) { + Mat M = Mat::Random(n, n); + return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n); +} + +// CUDA warm-up: ensure the GPU is initialized before timing. +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + cudaMalloc(&p, 1); + cudaFree(p); + done = true; + } +} + +// -------------------------------------------------------------------------- +// GpuLLT benchmarks +// -------------------------------------------------------------------------- + +// Factorize from host matrix (includes H2D upload). +static void BM_GpuLLT_Compute_Host(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + Mat A = make_spd(n); + gpu::LLT<Scalar> llt; + + for (auto _ : state) { + llt.compute(A); + if (llt.info() != Success) state.SkipWithError("factorization failed"); + } + + double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +// Factorize from DeviceMatrix (D2D copy path). +static void BM_GpuLLT_Compute_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + Mat A = make_spd(n); + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + gpu::LLT<Scalar> llt; + + for (auto _ : state) { + llt.compute(d_A); + if (llt.info() != Success) state.SkipWithError("factorization failed"); + } + + double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +// Factorize from DeviceMatrix (move path, no copy). +static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + Mat A = make_spd(n); + gpu::LLT<Scalar> llt; + + for (auto _ : state) { + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + llt.compute(std::move(d_A)); + if (llt.info() != Success) state.SkipWithError("factorization failed"); + } + + double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +// Solve from host matrix (H2D + potrs + D2H). 
+static void BM_GpuLLT_Solve_Host(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = make_spd(n); + Mat B = Mat::Random(n, nrhs); + gpu::LLT<Scalar> llt(A); + + for (auto _ : state) { + Mat X = llt.solve(B); + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["nrhs"] = nrhs; +} + +// Solve from DeviceMatrix (D2D + potrs, async, toHost at end). +static void BM_GpuLLT_Solve_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = make_spd(n); + Mat B = Mat::Random(n, nrhs); + gpu::LLT<Scalar> llt(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + for (auto _ : state) { + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + Mat X = d_X.toHost(); + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["nrhs"] = nrhs; +} + +// Solve staying entirely on device (no toHost — measures pure GPU time). +static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = make_spd(n); + Mat B = Mat::Random(n, nrhs); + gpu::LLT<Scalar> llt(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + for (auto _ : state) { + gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); + // Force completion without D2H transfer. + cudaStreamSynchronize(llt.stream()); + benchmark::DoNotOptimize(d_X.data()); + } + + state.counters["n"] = n; + state.counters["nrhs"] = nrhs; +} + +// -------------------------------------------------------------------------- +// GpuLU benchmarks +// -------------------------------------------------------------------------- + +static void BM_GpuLU_Compute_Host(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + Mat A = Mat::Random(n, n); + gpu::LU<Scalar> lu; + + for (auto _ : state) { + lu.compute(A); + if (lu.info() != Success) state.SkipWithError("factorization failed"); + } + + double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n); + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +static void BM_GpuLU_Compute_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + Mat A = Mat::Random(n, n); + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + gpu::LU<Scalar> lu; + + for (auto _ : state) { + lu.compute(d_A); + if (lu.info() != Success) state.SkipWithError("factorization failed"); + } + + double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n); + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +static void BM_GpuLU_Solve_Host(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + gpu::LU<Scalar> lu(A); + + for (auto _ : state) { + Mat X = lu.solve(B); + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["nrhs"] = nrhs; +} + +static void BM_GpuLU_Solve_Device(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + const Index nrhs = state.range(1); + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + 
gpu::LU<Scalar> lu(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + for (auto _ : state) { + gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B); + Mat X = d_X.toHost(); + benchmark::DoNotOptimize(X.data()); + } + + state.counters["n"] = n; + state.counters["nrhs"] = nrhs; +} + +// -------------------------------------------------------------------------- +// CPU baselines for comparison +// -------------------------------------------------------------------------- + +static void BM_CpuLLT_Compute(benchmark::State& state) { + const Index n = state.range(0); + Mat A = make_spd(n); + LLT<Mat> llt; + + for (auto _ : state) { + llt.compute(A); + benchmark::DoNotOptimize(llt.matrixLLT().data()); + } + + double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0; + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +static void BM_CpuLU_Compute(benchmark::State& state) { + const Index n = state.range(0); + Mat A = Mat::Random(n, n); + PartialPivLU<Mat> lu; + + for (auto _ : state) { + lu.compute(A); + benchmark::DoNotOptimize(lu.matrixLU().data()); + } + + double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n); + state.counters["GFLOPS"] = + benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000); + state.counters["n"] = n; +} + +// -------------------------------------------------------------------------- +// Registration +// -------------------------------------------------------------------------- + +// clang-format off +BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond); +// clang-format on
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index a521b85..e7d628f 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt
@@ -287,6 +287,8 @@ ei_add_test(tensor_random_gpu) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) + + add_subdirectory(GPU) endif() # Add HIP specific tests
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt new file mode 100644 index 0000000..02d6832 --- /dev/null +++ b/unsupported/test/GPU/CMakeLists.txt
@@ -0,0 +1,73 @@ +# GPU module tests. Included from unsupported/test/CMakeLists.txt inside the +# if(CUDA_FOUND AND EIGEN_TEST_CUDA) block. + +find_package(CUDAToolkit) + +# ei_add_gpu_test(<name> [EXTRA_LIBS <lib>...] [EXTRA_INCLUDES <dir>...] [EXTRA_DEFINES <def>...]) +# +# Thin wrapper around ei_add_test for GPU module tests. Delegates to +# ei_add_test so CALL_SUBTEST_N splitting and EIGEN_TEST_MAX_SIZE handling +# work the same way as the rest of the test suite, then layers on the +# GPU-specific properties (EXTRA_{LIBS,INCLUDES,DEFINES}, the +# "Official;gpu" labels, SKIP_RETURN_CODE=77, buildtests_gpu dependency). +# +# GPU module tests are plain .cpp (host-compiled) by design: they +# orchestrate NVIDIA library calls and never launch kernels themselves, +# which avoids NVCC instantiating Eigen CPU packet ops for CUDA vector +# types. That keeps them outside ei_add_test's "is_gpu_test" path (which +# triggers on EIGEN_ADD_TEST_FILENAME_EXTENSION=cu), so we set the +# SKIP_RETURN_CODE and labels ourselves. +function(ei_add_gpu_test test_name) + cmake_parse_arguments(ARG "" "" "EXTRA_LIBS;EXTRA_INCLUDES;EXTRA_DEFINES" ${ARGN}) + + # Delegate to ei_add_test. The third arg is a semicolon-separated library + # list; CUDA::cudart_static is always needed for any host test that + # touches a device-allocated buffer. + ei_add_test(${test_name} "" "CUDA::cudart_static;${ARG_EXTRA_LIBS}") + + # Discover targets ei_add_test created. The scan mirrors ei_add_test's own + # scan so we see the same set of suffixes: one target per distinct + # CALL_SUBTEST_N on EIGEN_SPLIT_LARGE_TESTS, else a single target. + file(READ "${test_name}.cpp" _src) + string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+" _matches "${_src}") + string(REGEX REPLACE "CALL_SUBTEST_" "" _suffixes "${_matches}") + list(REMOVE_DUPLICATES _suffixes) + set(_targets) + if(EIGEN_SPLIT_LARGE_TESTS AND _suffixes) + foreach(s ${_suffixes}) + list(APPEND _targets "${test_name}_${s}") + endforeach() + else() + list(APPEND _targets "${test_name}") + endif() + + foreach(t ${_targets}) + target_include_directories(${t} PRIVATE + "${CUDA_TOOLKIT_ROOT_DIR}/include" + "${CMAKE_CURRENT_BINARY_DIR}" + ${ARG_EXTRA_INCLUDES}) + if(ARG_EXTRA_DEFINES) + target_compile_definitions(${t} PRIVATE ${ARG_EXTRA_DEFINES}) + endif() + add_dependencies(buildtests_gpu ${t}) + set_property(TEST ${t} APPEND PROPERTY LABELS "Official;gpu") + set_property(TEST ${t} PROPERTY SKIP_RETURN_CODE 77) + endforeach() +endfunction() + +# DeviceMatrix core: CUDA runtime only. +ei_add_gpu_test(device_matrix) + +# cuBLAS integration (BLAS-3, triangular solves, rank-k updates, ...). +option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" ON) +if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas) + ei_add_gpu_test(cublas EXTRA_LIBS CUDA::cublas CUDA::cusolver) +endif() + +# cuSOLVER dense factorizations (LLT, LU). +option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" ON) +if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver) + foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu) + ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas) + endforeach() +endif()
diff --git a/unsupported/test/GPU/cublas.cpp b/unsupported/test/GPU/cublas.cpp new file mode 100644 index 0000000..7c04eb2 --- /dev/null +++ b/unsupported/test/GPU/cublas.cpp
@@ -0,0 +1,739 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Tests for cuBLAS GEMM dispatch via DeviceMatrix expression syntax. +// Covers: d_C = d_A * d_B, adjoint, transpose, scaled, +=, .device(ctx). + +#define EIGEN_USE_GPU +#include "main.h" +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- Basic GEMM: C = A * B ------------------------------------------------- + +template <typename Scalar> +void test_gemm_basic(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + // Expression: d_C = d_A * d_B + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM with adjoint: C = A^H * B ---------------------------------------- + +template <typename Scalar> +void test_gemm_adjoint_lhs(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(k, m); // A is k×m, A^H is m×k + Mat B = Mat::Random(k, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A.adjoint() * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = A.adjoint() * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM with transpose: C = A * B^T -------------------------------------- + +template <typename Scalar> +void test_gemm_transpose_rhs(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(n, k); // B is n×k, B^T is k×n + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A * d_B.transpose(); + + Mat C = d_C.toHost(); + Mat C_ref = A * B.transpose(); + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM with view/view operands: C = alpha * A^T * B^H ------------------- + +template <typename Scalar> +void test_gemm_view_pairs(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(k, m); // A^T is m x k + Mat B = Mat::Random(n, k); // B^H is k x n + Scalar alpha = Scalar(2); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = (alpha * d_A.transpose()) * d_B.adjoint(); + + Mat C = d_C.toHost(); + Mat C_ref = (alpha * A.transpose()) * B.adjoint(); + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < 
tol); +} + +// ---- GEMM with scaled: C = alpha * A * B ------------------------------------ + +template <typename Scalar> +void test_gemm_scaled(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + Scalar alpha = Scalar(2.5); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = alpha * d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = alpha * A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM accumulate: C += A * B (beta=1) ----------------------------------- + +template <typename Scalar> +void test_gemm_accumulate(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + Mat C_init = Mat::Random(m, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + auto d_C = gpu::DeviceMatrix<Scalar>::fromHost(C_init); + + d_C += d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = C_init + A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM accumulate into empty destination --------------------------------- + +template <typename Scalar> +void test_gemm_accumulate_empty(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + gpu::DeviceMatrix<Scalar> d_C; + + d_C += d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) -------------------------- + +template <typename Scalar> +void test_gemm_subtract(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + Mat C_init = Mat::Random(m, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + auto d_C = gpu::DeviceMatrix<Scalar>::fromHost(C_init); + + gpu::Context ctx; + d_C.device(ctx) -= d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = C_init - A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM subtract from empty destination ----------------------------------- + +template <typename Scalar> +void test_gemm_subtract_empty(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_C; + d_C.device(ctx) -= d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = -(A * B); + + RealScalar tol = RealScalar(k) * 
NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM with scaled RHS: C = A * (alpha * B) ----------------------------- + +template <typename Scalar> +void test_gemm_scaled_rhs(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + Scalar alpha = Scalar(3.0); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A * (alpha * d_B); + + Mat C = d_C.toHost(); + Mat C_ref = A * (alpha * B); + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM dimension mismatch must assert ------------------------------------ +// Note: we do NOT use VERIFY_RAISES_ASSERT here because it relies on +// setjmp/longjmp which skips RAII destructors for DeviceMatrix (GPU memory) +// and cuBLAS/cuSOLVER handles, corrupting state for subsequent tests. + +// ---- GEMM with explicit gpu::Context ------------------------------------------ + +template <typename Scalar> +void test_gemm_explicit_context(Index m, Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, k); + Mat B = Mat::Random(k, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_C; + d_C.device(ctx) = d_A * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = A * B; + + RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM cross-context reuse of the same destination ----------------------- + +template <typename Scalar> +void test_gemm_cross_context_reuse(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, n); + Mat D = Mat::Random(n, n); + Mat E = Mat::Random(n, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D); + auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E); + + gpu::Context ctx1; + gpu::Context ctx2; + gpu::DeviceMatrix<Scalar> d_C; + d_C.device(ctx1) = d_A * d_B; + d_C.device(ctx2) += d_D * d_E; + + Mat C = d_C.toHost(); + Mat C_ref = A * B + D * E; + + RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM cross-context resize of the destination --------------------------- + +template <typename Scalar> +void test_gemm_cross_context_resize() { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(64, 64); + Mat B = Mat::Random(64, 64); + Mat D = Mat::Random(32, 16); + Mat E = Mat::Random(16, 8); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D); + auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E); + + gpu::Context ctx1; + gpu::Context ctx2; + gpu::DeviceMatrix<Scalar> d_C; + d_C.device(ctx1) = d_A * d_B; + d_C.device(ctx2) = d_D * d_E; + + Mat C = d_C.toHost(); + Mat C_ref = D * E; + + 
RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- GEMM chaining: C = (A * B) then D = C * E ----------------------------- + +template <typename Scalar> +void test_gemm_chain(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, n); + Mat E = Mat::Random(n, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A * d_B; + gpu::DeviceMatrix<Scalar> d_D; + d_D = d_C * d_E; + + Mat D = d_D.toHost(); + Mat D_ref = (A * B) * E; + + RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm(); + VERIFY((D - D_ref).norm() < tol); +} + +// ---- Square identity check: A * I = A --------------------------------------- + +template <typename Scalar> +void test_gemm_identity(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + + Mat A = Mat::Random(n, n); + Mat eye = Mat::Identity(n, n); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_I = gpu::DeviceMatrix<Scalar>::fromHost(eye); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A * d_I; + + Mat C = d_C.toHost(); + VERIFY_IS_APPROX(C, A); +} + +// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ---------------------- + +template <typename MatrixType> +MatrixType make_spd(Index n) { + using Scalar = typename MatrixType::Scalar; + MatrixType M = MatrixType::Random(n, n); + return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n); +} + +template <typename Scalar> +void test_llt_solve_expr(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = make_spd<Mat>(n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.llt().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar residual = (A * X - B).norm() / B.norm(); + VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- LLT solve with explicit context ---------------------------------------- + +template <typename Scalar> +void test_llt_solve_expr_context(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = make_spd<Mat>(n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_X; + d_X.device(ctx) = d_A.llt().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar residual = (A * X - B).norm() / B.norm(); + VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------ + +template <typename Scalar> +void test_lu_solve_expr(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.lu().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar 
residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------ + +template <typename Scalar> +void test_gemm_then_solve(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat D = Mat::Random(n, 1); + + // Make SPD: C = A^H * A + n*I + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A.adjoint() * d_A; + + // Add n*I on host (no element-wise ops on DeviceMatrix yet). + Mat C = d_C.toHost(); + C += Mat::Identity(n, n) * static_cast<Scalar>(n); + d_C = gpu::DeviceMatrix<Scalar>::fromHost(C); + + auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_C.llt().solve(d_D); + + Mat X = d_X.toHost(); + RealScalar residual = (C * X - D).norm() / D.norm(); + VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- LLT solve with Upper triangle ----------------------------------------- + +template <typename Scalar> +void test_llt_solve_upper(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = make_spd<Mat>(n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.template llt<Upper>().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar residual = (A * X - B).norm() / B.norm(); + VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- LU solve with explicit context ----------------------------------------- + +template <typename Scalar> +void test_lu_solve_expr_context(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_X; + d_X.device(ctx) = d_A.lu().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- Zero-nrhs solver expressions ------------------------------------------ + +template <typename Scalar> +void test_llt_solve_zero_nrhs(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + + Mat A = make_spd<Mat>(n); + Mat B = Mat::Random(n, 0); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.llt().solve(d_B); + + VERIFY_IS_EQUAL(d_X.rows(), n); + VERIFY_IS_EQUAL(d_X.cols(), 0); +} + +template <typename Scalar> +void test_lu_solve_zero_nrhs(Index n) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, 0); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.lu().solve(d_B); + + VERIFY_IS_EQUAL(d_X.rows(), n); + VERIFY_IS_EQUAL(d_X.cols(), 0); +} + +// ---- TRSM: triangularView<UpLo>().solve(B) ---------------------------------- + +template <typename Scalar, int UpLo> +void 
test_trsm(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + // Build a well-conditioned triangular matrix. + Mat A = Mat::Random(n, n); + A.diagonal().array() += static_cast<Scalar>(n); // ensure non-singular + if (UpLo == Lower) + A = A.template triangularView<Lower>(); + else + A = A.template triangularView<Upper>(); + + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_X; + d_X = d_A.template triangularView<UpLo>().solve(d_B); + + Mat X = d_X.toHost(); + RealScalar residual = (A * X - B).norm() / B.norm(); + VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- SYMM/HEMM: selfadjointView<UpLo>() * B -------------------------------- + +template <typename Scalar, int UpLo> +void test_symm(Index n, Index nrhs) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = make_spd<Mat>(n); // SPD is also self-adjoint + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::DeviceMatrix<Scalar> d_C; + d_C = d_A.template selfadjointView<UpLo>() * d_B; + + Mat C = d_C.toHost(); + Mat C_ref = A * B; // A is symmetric, so full multiply == symm + + RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm(); + VERIFY((C - C_ref).norm() < tol); +} + +// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H -------------------------------- + +template <typename Scalar> +void test_syrk(Index n, Index k) { + using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, k); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + + gpu::DeviceMatrix<Scalar> d_C; + d_C.template selfadjointView<Lower>().rankUpdate(d_A); + + Mat C = d_C.toHost(); + // Only lower triangle is meaningful for SYRK. Compare lower triangle. + Mat C_ref = A * A.adjoint(); + + // Extract lower triangle for comparison. 
+  Mat C_lower = C.template triangularView<Lower>();
+  Mat C_ref_lower = C_ref.template triangularView<Lower>();
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C_lower - C_ref_lower).norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
+
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
+
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
+  CALL_SUBTEST(test_gemm_view_pairs<Scalar>(64, 32, 48));
+
+  CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
+  CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
+  CALL_SUBTEST(test_gemm_chain<Scalar>(64));
+  CALL_SUBTEST(test_gemm_identity<Scalar>(64));
+
+  // Solver expressions. Zero-nrhs edge cases get dedicated size checks below
+  // rather than residual-based verification.
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
+
+  // TRSM
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
+
+  // SYMM/HEMM
+  CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
+
+  // SYRK/HERK
+  CALL_SUBTEST(test_syrk<Scalar>(64, 64));
+  CALL_SUBTEST(test_syrk<Scalar>(64, 32));
+  CALL_SUBTEST(test_syrk<Scalar>(128, 64));
+}
+
+// ---- Solver failure mode tests (not templated on Scalar) --------------------
+// Use the cached GpuLLT/GpuLU API, which reports failure via info(), rather than
+// the expression API, which asserts inside dispatch (incompatible with
+// VERIFY_RAISES_ASSERT due to longjmp skipping RAII destructors).
+
+void test_llt_not_spd() {
+  // Negative definite matrix — LLT factorization must fail.
+  MatrixXd A = -MatrixXd::Identity(8, 8);
+  gpu::LLT<double> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
+}
+
+void test_lu_singular() {
+  // Zero matrix — LU factorization must detect singularity.
+ MatrixXd A = MatrixXd::Zero(8, 8); + gpu::LU<double> lu(A); + VERIFY_IS_EQUAL(lu.info(), NumericalIssue); +} + +EIGEN_DECLARE_TEST(gpu_cublas) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_llt_not_spd()); + CALL_SUBTEST_5(test_lu_singular()); +}
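The caveat about VERIFY_RAISES_ASSERT recurs throughout the test above; the hazard is ordinary setjmp/longjmp semantics rather than anything GPU-specific. A minimal standalone sketch (plain C++, no Eigen or CUDA required; Guard and failing_call are purely illustrative names) shows why: longjmp transfers control without running the destructors of automatic objects in the abandoned frames, which is exactly what would leak DeviceMatrix allocations and cuBLAS/cuSOLVER handles.

#include <csetjmp>
#include <cstdio>

static std::jmp_buf assert_jump;

struct Guard {
  // Stands in for any RAII wrapper (device buffer, cuBLAS handle).
  ~Guard() { std::puts("Guard destroyed"); }  // never printed on the longjmp path
};

static void failing_call() {
  Guard g;                       // would normally be destroyed on scope exit
  std::longjmp(assert_jump, 1);  // stands in for the assertion handler's jump
  // Skipping a non-trivial destructor via longjmp is formally undefined
  // behavior in C++; in practice the destructor simply never runs.
}

int main() {
  if (setjmp(assert_jump) == 0) {
    failing_call();
  }
  std::puts("recovered from 'assert'");  // reached, but ~Guard() never ran
  return 0;
}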
diff --git a/unsupported/test/GPU/cusolver_llt.cpp b/unsupported/test/GPU/cusolver_llt.cpp new file mode 100644 index 0000000..011c02a --- /dev/null +++ b/unsupported/test/GPU/cusolver_llt.cpp
@@ -0,0 +1,220 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER. +// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve) +// for float, double, complex<float>, complex<double>, Lower and Upper. + +#define EIGEN_USE_GPU +#include "main.h" +#include <Eigen/Cholesky> +#include <unsupported/Eigen/GPU> + +// Identifier convention throughout this file: +// h_ prefix for host-resident Eigen::Matrix values +// d_ prefix for device-resident Eigen::gpu::DeviceMatrix values +// We also keep namespaces explicit (Eigen::, Eigen::gpu::) so the CPU vs GPU +// path is obvious at every call site. + +// Build a random symmetric positive-definite matrix: A = M^H*M + n*I. +template <typename MatrixType> +MatrixType make_spd(Eigen::Index n) { + using Scalar = typename MatrixType::Scalar; + MatrixType M = MatrixType::Random(n, n); + return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n); +} + +// Test factorization: L*L^H must reconstruct A to within floating-point tolerance. +template <typename Scalar, int UpLo> +void test_potrf(Eigen::Index n) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + + // GPU factorization under test: factor stays on device. + Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + // CPU LLT is the oracle. GpuLLT does not expose the device-resident factor, + // so we validate correctness via the CPU-reconstructed matrix. + Eigen::LLT<MatrixType, UpLo> cpu_llt(h_A); + VERIFY_IS_EQUAL(cpu_llt.info(), Eigen::Success); + MatrixType h_A_reconstructed = cpu_llt.reconstructedMatrix(); + + RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_A.norm(); + VERIFY((h_A_reconstructed - h_A).norm() < tol); + + // Cross-check: GPU and CPU solves must agree on the same RHS. `solve(Matrix)` + // uploads/downloads internally, so both outputs end up on the host. + MatrixType h_b = MatrixType::Random(n, 1); + MatrixType h_x_gpu = gpu_llt.solve(h_b); + MatrixType h_x_cpu = cpu_llt.solve(h_b); + VERIFY((h_x_gpu - h_x_cpu).norm() < tol); +} + +// Test solve: residual ||A*X - B|| / ||B|| must be small. +template <typename Scalar, int UpLo> +void test_potrs(Eigen::Index n, Eigen::Index nrhs) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + MatrixType h_B = MatrixType::Random(n, nrhs); + + Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + MatrixType h_X = gpu_llt.solve(h_B); + + RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm(); + RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon(); + VERIFY(residual < tol); +} + +// Test that multiple solves against the same factor all produce correct results. +// This exercises the key design property: L stays on device across calls. 
+template <typename Scalar> +void test_multiple_solves(Eigen::Index n) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt(h_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon(); + for (int k = 0; k < 5; ++k) { + MatrixType h_B = MatrixType::Random(n, 3); + MatrixType h_X = gpu_llt.solve(h_B); + RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm(); + VERIFY(residual < tol); + } +} + +// Test that GpuLLT correctly detects a non-SPD matrix. +void test_not_spd() { + Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8); // negative definite + Eigen::gpu::LLT<double> gpu_llt(h_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue); +} + +// ---- DeviceMatrix-native API -------------------------------------------- +// These tests exercise the device-resident path: compute(DeviceMatrix) + +// solve(DeviceMatrix) -> DeviceMatrix, with the user explicitly managing +// upload/download. The tests above use the host-Matrix overloads which do +// the transfers internally; this section covers the no-implicit-transfer +// surface that keeps data on device across a chain of calls. + +// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost +template <typename Scalar, int UpLo> +void test_device_matrix_solve(Eigen::Index n, Eigen::Index nrhs) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + MatrixType h_B = MatrixType::Random(n, nrhs); + + auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A); + auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B); + + Eigen::gpu::LLT<Scalar, UpLo> gpu_llt; + gpu_llt.compute(d_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B); + MatrixType h_X = d_X.toHost(); + + RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm(); + VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon()); +} + +// compute(DeviceMatrix&&) — move path +template <typename Scalar> +void test_device_matrix_move_compute(Eigen::Index n) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + MatrixType h_B = MatrixType::Random(n, 1); + + auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A); + Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt; + gpu_llt.compute(std::move(d_A)); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + // d_A should be empty after move. 
+ VERIFY(d_A.empty()); + + MatrixType h_X = gpu_llt.solve(h_B); + RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm(); + VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon()); +} + +// Full async chain: compute → solve → solve again with result as RHS → toHost +template <typename Scalar> +void test_chaining(Eigen::Index n) { + using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>; + using RealScalar = typename Eigen::NumTraits<Scalar>::Real; + + MatrixType h_A = make_spd<MatrixType>(n); + MatrixType h_B = MatrixType::Random(n, 3); + + auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A); + auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B); + + Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt; + gpu_llt.compute(d_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success); + + // Chain: solve → use result as RHS for another solve. Everything stays on + // device until the final toHost() below; that's the only sync point. + Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B); + Eigen::gpu::DeviceMatrix<Scalar> d_Y = gpu_llt.solve(d_X); + + MatrixType h_Y = d_Y.toHost(); + + // Verify: Y = A^{-2} * B using CPU oracle. + MatrixType h_X_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_B); + MatrixType h_Y_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_X_ref); + + RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_Y_ref.norm(); + VERIFY((h_Y - h_Y_ref).norm() < tol); +} + +template <typename Scalar> +void test_scalar() { + CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(1))); + CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(64))); + CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(256))); + CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(64))); + CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(256))); + + CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 1))); + CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 4))); + CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(256, 8))); + CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(64, 1))); + CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(256, 4))); + + CALL_SUBTEST(test_multiple_solves<Scalar>(128)); + + CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Lower>(64, 4))); + CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Upper>(128, 1))); + CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64)); + CALL_SUBTEST(test_chaining<Scalar>(64)); +} + +EIGEN_DECLARE_TEST(gpu_cusolver_llt) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_not_spd()); +}
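Read together, the DeviceMatrix-native tests sketch the workflow this API is built for: upload once, factor once, solve repeatedly on the device, and download only the final results. A condensed usage sketch follows, assuming nothing beyond the calls exercised above (fromHost, compute, info, solve, toHost); the solve_two_rhs wrapper is hypothetical and not part of the patch.

#define EIGEN_USE_GPU
#include <Eigen/Dense>
#include <unsupported/Eigen/GPU>

// Solve A*X1 = B1 and A*X2 = B2 with a single upload of the SPD matrix A.
Eigen::MatrixXd solve_two_rhs(const Eigen::MatrixXd& A,
                              const Eigen::MatrixXd& B1,
                              const Eigen::MatrixXd& B2) {
  auto d_A = Eigen::gpu::DeviceMatrix<double>::fromHost(A);

  Eigen::gpu::LLT<double, Eigen::Lower> llt;
  llt.compute(d_A);  // Cholesky factor stays device-resident
  if (llt.info() != Eigen::Success) return Eigen::MatrixXd();  // A was not SPD

  // Both solves reuse the same device-resident factor; no intermediate downloads.
  auto d_B1 = Eigen::gpu::DeviceMatrix<double>::fromHost(B1);
  auto d_B2 = Eigen::gpu::DeviceMatrix<double>::fromHost(B2);
  Eigen::gpu::DeviceMatrix<double> d_X1 = llt.solve(d_B1);
  Eigen::gpu::DeviceMatrix<double> d_X2 = llt.solve(d_B2);

  // The downloads are the only host synchronization points.
  return d_X1.toHost() + d_X2.toHost();
}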
diff --git a/unsupported/test/GPU/cusolver_lu.cpp b/unsupported/test/GPU/cusolver_lu.cpp new file mode 100644 index 0000000..08bfde8 --- /dev/null +++ b/unsupported/test/GPU/cusolver_lu.cpp
@@ -0,0 +1,204 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER. +// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve) +// for float, double, complex<float>, complex<double>. +// +#define EIGEN_USE_GPU +#include "main.h" +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| ------- + +template <typename Scalar> +void test_getrf(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 4); + + gpu::LU<Scalar> lu(A); + VERIFY_IS_EQUAL(lu.info(), Success); + + MatrixType X = lu.solve(B); + // Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||. + // Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic. + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------ + +template <typename Scalar> +void test_getrs_trans(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 3); + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + + gpu::LU<Scalar> lu(A); + VERIFY_IS_EQUAL(lu.info(), Success); + + MatrixType Xt = lu.solve(B, gpu::GpuOp::Trans); + VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol); + + MatrixType Xc = lu.solve(B, gpu::GpuOp::ConjTrans); + VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol); +} + +// ---- Test multiple solves reuse the device-resident LU ---------------------- + +template <typename Scalar> +void test_multiple_solves(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + gpu::LU<Scalar> lu(A); + VERIFY_IS_EQUAL(lu.info(), Success); + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + for (int k = 0; k < 5; ++k) { + MatrixType B = MatrixType::Random(n, 3); + MatrixType X = lu.solve(B); + VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol); + } +} + +// ---- Residual check for host solve ------------------------------------------ + +template <typename Scalar> +void test_vs_cpu(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 5); + + gpu::LU<Scalar> gpu_lu(A); + VERIFY_IS_EQUAL(gpu_lu.info(), Success); + + MatrixType X_gpu = gpu_lu.solve(B); + + RealScalar residual = (A * X_gpu - B).norm() / (A.norm() * X_gpu.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- Singular matrix detection ---------------------------------------------- + +void test_singular() { + 
MatrixXd A = MatrixXd::Zero(8, 8); + gpu::LU<double> lu(A); + VERIFY_IS_EQUAL(lu.info(), NumericalIssue); +} + +// ---- DeviceMatrix integration tests ----------------------------------------- + +template <typename Scalar> +void test_device_matrix_solve(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 4); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::LU<Scalar> lu; + lu.compute(d_A); + VERIFY_IS_EQUAL(lu.info(), Success); + + gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B); + MatrixType X = d_X.toHost(); + + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +template <typename Scalar> +void test_device_matrix_move_compute(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 1); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + gpu::LU<Scalar> lu; + lu.compute(std::move(d_A)); + VERIFY_IS_EQUAL(lu.info(), Success); + VERIFY(d_A.empty()); + + MatrixType X = lu.solve(B); + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +template <typename Scalar> +void test_chaining(Index n) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + MatrixType A = MatrixType::Random(n, n); + MatrixType B = MatrixType::Random(n, 3); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::LU<Scalar> lu; + lu.compute(d_A); + VERIFY_IS_EQUAL(lu.info(), Success); + + // Chain: solve → use result as RHS + gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B); + gpu::DeviceMatrix<Scalar> d_Y = lu.solve(d_X); + MatrixType X = d_X.toHost(); + MatrixType Y = d_Y.toHost(); + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol); + VERIFY((A * Y - X).norm() / (A.norm() * Y.norm()) < tol); +} + +// ---- Per-scalar driver ------------------------------------------------------- + +template <typename Scalar> +void test_scalar() { + CALL_SUBTEST(test_getrf<Scalar>(1)); + CALL_SUBTEST(test_getrf<Scalar>(64)); + CALL_SUBTEST(test_getrf<Scalar>(256)); + + CALL_SUBTEST(test_getrs_trans<Scalar>(64)); + CALL_SUBTEST(test_getrs_trans<Scalar>(128)); + + CALL_SUBTEST(test_multiple_solves<Scalar>(128)); + + CALL_SUBTEST(test_vs_cpu<Scalar>(64)); + CALL_SUBTEST(test_vs_cpu<Scalar>(256)); + + CALL_SUBTEST(test_device_matrix_solve<Scalar>(64)); + CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64)); + CALL_SUBTEST(test_chaining<Scalar>(64)); +} + +EIGEN_DECLARE_TEST(gpu_cusolver_lu) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_singular()); +}
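The normalization noted in test_getrf is the acceptance criterion used throughout this file. A hypothetical free-standing helper (plain Eigen, not part of the patch; backward_error is an illustrative name) makes the quantity explicit:

#include <Eigen/Dense>

// Normwise backward error of a candidate solution X for A*X = B:
//   ||A*X - B|| / (||A|| * ||X||)
// For a backward-stable solver this stays at O(n * eps) independently of
// cond(A), which is why the LU tests above normalize by ||A||*||X|| rather
// than by ||B||.
template <typename MatrixType>
typename Eigen::NumTraits<typename MatrixType::Scalar>::Real
backward_error(const MatrixType& A, const MatrixType& X, const MatrixType& B) {
  return (A * X - B).norm() / (A.norm() * X.norm());
}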
diff --git a/unsupported/test/GPU/device_matrix.cpp b/unsupported/test/GPU/device_matrix.cpp new file mode 100644 index 0000000..d09e641 --- /dev/null +++ b/unsupported/test/GPU/device_matrix.cpp
@@ -0,0 +1,260 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper. +// No cuSOLVER dependency — only CUDA runtime. + +#define EIGEN_USE_GPU +#include "main.h" +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- Default construction --------------------------------------------------- + +void test_default_construct() { + gpu::DeviceMatrix<double> dm; + VERIFY(dm.empty()); + VERIFY_IS_EQUAL(dm.rows(), 0); + VERIFY_IS_EQUAL(dm.cols(), 0); + VERIFY(dm.data() == nullptr); + VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0)); +} + +// ---- Allocate uninitialized ------------------------------------------------- + +template <typename Scalar> +void test_allocate(Index rows, Index cols) { + gpu::DeviceMatrix<Scalar> dm(rows, cols); + VERIFY(!dm.empty()); + VERIFY_IS_EQUAL(dm.rows(), rows); + VERIFY_IS_EQUAL(dm.cols(), cols); + VERIFY_IS_EQUAL(dm.outerStride(), rows); + VERIFY(dm.data() != nullptr); + VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar)); +} + +// ---- fromHost / toHost roundtrip (synchronous) ------------------------------ + +template <typename Scalar> +void test_roundtrip(Index rows, Index cols) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(rows, cols); + + auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host); + VERIFY_IS_EQUAL(dm.rows(), rows); + VERIFY_IS_EQUAL(dm.cols(), cols); + VERIFY(!dm.empty()); + + MatrixType result = dm.toHost(); + VERIFY_IS_EQUAL(result.rows(), rows); + VERIFY_IS_EQUAL(result.cols(), cols); + VERIFY_IS_APPROX(result, host); +} + +// ---- fromHostAsync / toHostAsync roundtrip ----------------------------------- + +template <typename Scalar> +void test_roundtrip_async(Index rows, Index cols) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(rows, cols); + + cudaStream_t stream; + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream)); + + // Async upload from raw pointer. + auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream); + VERIFY_IS_EQUAL(dm.rows(), rows); + VERIFY_IS_EQUAL(dm.cols(), cols); + + // Async download via HostTransfer future. + auto transfer = dm.toHostAsync(stream); + + // get() blocks and returns the matrix. 
+ MatrixType result = transfer.get(); + VERIFY_IS_APPROX(result, host); + + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream)); + + cudaStream_t producer_stream; + cudaStream_t consumer_stream; + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&producer_stream)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&consumer_stream)); + + auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, producer_stream); + auto cross_stream_transfer = cross_stream_dm.toHostAsync(consumer_stream); + MatrixType cross_stream_result = cross_stream_transfer.get(); + VERIFY_IS_APPROX(cross_stream_result, host); + + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(consumer_stream)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(producer_stream)); +} + +// ---- HostTransfer::ready() and idempotent get() ----------------------------- + +void test_host_transfer_ready() { + using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(100, 100); + + auto dm = gpu::DeviceMatrix<double>::fromHost(host); + auto transfer = dm.toHostAsync(); + + // After get(), ready() must return true. + MatrixType result = transfer.get(); + VERIFY(transfer.ready()); + VERIFY_IS_APPROX(result, host); + + // get() is idempotent. + MatrixType& result2 = transfer.get(); + VERIFY_IS_APPROX(result2, host); +} + +// ---- HostTransfer move ------------------------------------------------------ + +void test_host_transfer_move() { + using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(50, 50); + + auto dm = gpu::DeviceMatrix<double>::fromHost(host); + auto transfer = dm.toHostAsync(); + + gpu::HostTransfer<double> moved(std::move(transfer)); + MatrixType result = moved.get(); + VERIFY_IS_APPROX(result, host); +} + +// ---- clone() produces independent copy -------------------------------------- + +template <typename Scalar> +void test_clone(Index rows, Index cols) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(rows, cols); + + auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host); + auto cloned = dm.clone(); + + // Overwrite original with different data. + MatrixType other = MatrixType::Random(rows, cols); + dm = gpu::DeviceMatrix<Scalar>::fromHost(other); + + // Clone still holds the original data. + MatrixType clone_result = cloned.toHost(); + VERIFY_IS_APPROX(clone_result, host); + + // Original holds the new data. 
+ MatrixType dm_result = dm.toHost(); + VERIFY_IS_APPROX(dm_result, other); +} + +// ---- Move construct --------------------------------------------------------- + +template <typename Scalar> +void test_move_construct(Index rows, Index cols) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(rows, cols); + + auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host); + gpu::DeviceMatrix<Scalar> moved(std::move(dm)); + + VERIFY(dm.empty()); + VERIFY(dm.data() == nullptr); + + VERIFY_IS_EQUAL(moved.rows(), rows); + VERIFY_IS_EQUAL(moved.cols(), cols); + MatrixType result = moved.toHost(); + VERIFY_IS_APPROX(result, host); +} + +// ---- Move assign ------------------------------------------------------------ + +template <typename Scalar> +void test_move_assign(Index rows, Index cols) { + using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>; + MatrixType host = MatrixType::Random(rows, cols); + + auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host); + gpu::DeviceMatrix<Scalar> dest; + dest = std::move(dm); + + VERIFY(dm.empty()); + VERIFY_IS_EQUAL(dest.rows(), rows); + MatrixType result = dest.toHost(); + VERIFY_IS_APPROX(result, host); +} + +// ---- resize() --------------------------------------------------------------- + +void test_resize() { + gpu::DeviceMatrix<double> dm(10, 20); + VERIFY_IS_EQUAL(dm.rows(), 10); + VERIFY_IS_EQUAL(dm.cols(), 20); + + dm.resize(50, 30); + VERIFY_IS_EQUAL(dm.rows(), 50); + VERIFY_IS_EQUAL(dm.cols(), 30); + VERIFY_IS_EQUAL(dm.outerStride(), 50); + VERIFY(dm.data() != nullptr); + + // Resize to same dimensions is a no-op. + double* ptr_before = dm.data(); + dm.resize(50, 30); + VERIFY(dm.data() == ptr_before); +} + +// ---- Empty / 0x0 matrix ----------------------------------------------------- + +void test_empty() { + using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>; + MatrixType empty_mat(0, 0); + + auto dm = gpu::DeviceMatrix<double>::fromHost(empty_mat); + VERIFY(dm.empty()); + VERIFY_IS_EQUAL(dm.rows(), 0); + VERIFY_IS_EQUAL(dm.cols(), 0); + + MatrixType result = dm.toHost(); + VERIFY_IS_EQUAL(result.rows(), 0); + VERIFY_IS_EQUAL(result.cols(), 0); +} + +// ---- Per-scalar driver ------------------------------------------------------ + +template <typename Scalar> +void test_scalar() { + // Square. + CALL_SUBTEST(test_roundtrip<Scalar>(1, 1)); + CALL_SUBTEST(test_roundtrip<Scalar>(64, 64)); + CALL_SUBTEST(test_roundtrip<Scalar>(256, 256)); + + // Rectangular. + CALL_SUBTEST(test_roundtrip<Scalar>(100, 7)); + CALL_SUBTEST(test_roundtrip<Scalar>(7, 100)); + + // Async roundtrip. + CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64)); + CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7)); + + CALL_SUBTEST(test_clone<Scalar>(64, 64)); + CALL_SUBTEST(test_move_construct<Scalar>(64, 64)); + CALL_SUBTEST(test_move_assign<Scalar>(64, 64)); +} + +EIGEN_DECLARE_TEST(gpu_device_matrix) { + CALL_SUBTEST(test_default_construct()); + CALL_SUBTEST(test_empty()); + CALL_SUBTEST(test_resize()); + CALL_SUBTEST(test_host_transfer_ready()); + CALL_SUBTEST(test_host_transfer_move()); + CALL_SUBTEST((test_allocate<float>(100, 50))); + CALL_SUBTEST((test_allocate<double>(100, 50))); + CALL_SUBTEST(test_scalar<float>()); + CALL_SUBTEST(test_scalar<double>()); + CALL_SUBTEST(test_scalar<std::complex<float>>()); + CALL_SUBTEST(test_scalar<std::complex<double>>()); +}
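The async roundtrip and HostTransfer tests above imply a copy/compute overlap pattern: enqueue the upload and download on a stream, keep the host busy, and synchronize only when the result is needed. The sketch below assumes fromHostAsync, toHostAsync, and EIGEN_CUDA_RUNTIME_CHECK are usable outside the test harness exactly as they appear in the tests; roundtrip_without_blocking is an illustrative name, not part of the patch.

#define EIGEN_USE_GPU
#include <Eigen/Dense>
#include <unsupported/Eigen/GPU>

void roundtrip_without_blocking(Eigen::MatrixXf host_in) {
  cudaStream_t stream;
  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));

  // Both copies are enqueued on the stream; neither call blocks the host.
  auto d_m = Eigen::gpu::DeviceMatrix<float>::fromHostAsync(
      host_in.data(), host_in.rows(), host_in.cols(), host_in.rows(), stream);
  auto transfer = d_m.toHostAsync(stream);

  // ... unrelated CPU work can overlap with the transfers here ...

  // get() blocks until the download has completed, then hands back the matrix.
  Eigen::MatrixXf round_tripped = transfer.get();
  eigen_assert(round_tripped.isApprox(host_in));

  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
}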