Eigen/GPU [4/5]: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE) libeigen/eigen!2414 Closes #3067 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com> Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU index 4c15d7d..7247074 100644 --- a/unsupported/Eigen/GPU +++ b/unsupported/Eigen/GPU
@@ -11,6 +11,7 @@ #define EIGEN_GPU_MODULE_H #include "../../Eigen/Core" +#include "../../Eigen/SparseCore" #include "../../Eigen/src/Core/util/DisableStupidWarnings.h" @@ -53,6 +54,17 @@ #include "src/GPU/GpuQR.h" #include "src/GPU/GpuSVD.h" #include "src/GPU/GpuEigenSolver.h" +#include "src/GPU/CuFftSupport.h" +#include "src/GPU/GpuFFT.h" +#include "src/GPU/CuSparseSupport.h" +#include "src/GPU/GpuSparseContext.h" +#ifdef EIGEN_CUDSS +#include "src/GPU/CuDssSupport.h" +#include "src/GPU/GpuSparseSolverBase.h" +#include "src/GPU/GpuSparseLLT.h" +#include "src/GPU/GpuSparseLDLT.h" +#include "src/GPU/GpuSparseLU.h" +#endif // IWYU pragma: end_exports #endif
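Reviewer note (not part of the patch): the cuDSS-backed solver headers above are compiled only when EIGEN_CUDSS is defined, since libcudss ships separately from the CUDA Toolkit. A minimal sketch of how client code can mirror the module's guard; SpdSolver is an illustrative alias, not from this patch:

#include <unsupported/Eigen/GPU>

#ifdef EIGEN_CUDSS
// Available only when the user opts in to cuDSS at compile time.
using SpdSolver = Eigen::gpu::SparseLLT<double>;
#endif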
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h index 994d95a..0e56b69 100644 --- a/unsupported/Eigen/src/GPU/CuBlasSupport.h +++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -26,12 +26,6 @@ namespace Eigen { namespace gpu { - -// ---- Operation enum --------------------------------------------------------- -// Public flag for transpose/adjoint in BLAS- and solver-style calls. - -enum class GpuOp { NoTrans, Trans, ConjTrans }; - namespace internal { // ---- Error-checking macro --------------------------------------------------- @@ -218,6 +212,22 @@ reinterpret_cast<cuDoubleComplex*>(C), ldc); } +// SCAL wrappers: x = alpha * x. +// For complex x, alpha is real-valued (Csscal/Zdscal) — this matches the +// 1/n inverse-FFT scaling pattern, where the scale is intrinsically real. +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) { + return cublasSscal(h, n, alpha, x, incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) { + return cublasDscal(h, n, alpha, x, incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, std::complex<float>* x, int incx) { + return cublasCsscal(h, n, alpha, reinterpret_cast<cuComplex*>(x), incx); +} +inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, std::complex<double>* x, int incx) { + return cublasZdscal(h, n, alpha, reinterpret_cast<cuDoubleComplex*>(x), incx); +} + // DGMM wrappers: C = A * diag(x) (side=RIGHT) or C = diag(x) * A (side=LEFT). // Useful for applying a diagonal scaling without materialising diag(x) as a // dense matrix. cuBLAS docs guarantee in-place is safe when C == A.
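A minimal sketch of what the new SCAL overloads buy: complex device data scaled by a real-valued alpha with no manual casts at the call site. The handle, device pointer, and size are illustrative; assume they were created elsewhere.

void scale_by_inv_n(cublasHandle_t h, std::complex<float>* d_x, int n) {
  const float inv_n = 1.0f / static_cast<float>(n);
  // Overload resolution picks cublasCsscal: complex data, real-valued alpha,
  // exactly the inverse-FFT 1/n scaling pattern named above.
  EIGEN_CUBLAS_CHECK(Eigen::gpu::internal::cublasXscal(h, n, &inv_n, d_x, 1));
}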
diff --git a/unsupported/Eigen/src/GPU/CuDssSupport.h b/unsupported/Eigen/src/GPU/CuDssSupport.h new file mode 100644 index 0000000..7bb4cb0 --- /dev/null +++ b/unsupported/Eigen/src/GPU/CuDssSupport.h
@@ -0,0 +1,137 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// cuDSS support utilities: error checking macro, type mapping. +// +// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T), +// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is +// distributed separately from the CUDA Toolkit. + +#ifndef EIGEN_GPU_CUDSS_SUPPORT_H +#define EIGEN_GPU_CUDSS_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include <cudss.h> + +namespace Eigen { +namespace gpu { +namespace internal { + +// ---- Error checking --------------------------------------------------------- + +#define EIGEN_CUDSS_CHECK(x) \ + do { \ + cudssStatus_t _s = (x); \ + eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \ + EIGEN_UNUSED_VARIABLE(_s); \ + } while (0) + +// ---- Scalar → cudssMatrixType_t for SPD/HPD --------------------------------- + +template <typename Scalar> +struct cudss_spd_type; + +template <> +struct cudss_spd_type<float> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD; +}; +template <> +struct cudss_spd_type<double> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD; +}; +template <> +struct cudss_spd_type<std::complex<float>> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD; +}; +template <> +struct cudss_spd_type<std::complex<double>> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD; +}; + +// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian --------------------- + +template <typename Scalar> +struct cudss_symmetric_type; + +template <> +struct cudss_symmetric_type<float> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC; +}; +template <> +struct cudss_symmetric_type<double> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC; +}; +template <> +struct cudss_symmetric_type<std::complex<float>> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN; +}; +template <> +struct cudss_symmetric_type<std::complex<double>> { + static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN; +}; + +// ---- StorageIndex → cudaDataType_t ------------------------------------------ + +template <typename StorageIndex> +struct cudss_index_type; + +template <> +struct cudss_index_type<int> { + static constexpr cudaDataType_t value = CUDA_R_32I; +}; +template <> +struct cudss_index_type<int64_t> { + static constexpr cudaDataType_t value = CUDA_R_64I; +}; + +// ---- UpLo → cudssMatrixViewType_t ------------------------------------------- +// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T. +// Since A = A^T, the data is the same, but the triangle view must be swapped. + +template <int UpLo, int StorageOrder> +struct cudss_view_type; + +// ColMajor (CSC) passed as CSR: lower ↔ upper swap. +template <> +struct cudss_view_type<Lower, ColMajor> { + static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER; +}; +template <> +struct cudss_view_type<Upper, ColMajor> { + static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER; +}; + +// RowMajor (CSR) passed directly: no swap needed. 
+template <> +struct cudss_view_type<Lower, RowMajor> { + static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER; +}; +template <> +struct cudss_view_type<Upper, RowMajor> { + static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER; +}; + +} // namespace internal + +// ---- Ordering enum ---------------------------------------------------------- + +enum class GpuSparseOrdering { + AMD, // Default fill-reducing ordering + METIS, // METIS nested dissection + RCM // Reverse Cuthill-McKee +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CUDSS_SUPPORT_H
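A worked example of the triangle swap, with illustrative values. For symmetric A = [[4,1,0],[1,3,2],[0,2,5]] with the lower triangle stored in CSC, the three arrays read verbatim as CSR describe the upper triangle, which is exactly what the specializations above encode:

// Lower triangle of A in CSC:  outer = {0,2,4,5}, inner = {0,1,1,2,2}, values = {4,1,3,2,5}.
// The same arrays read as CSR: row 0 -> cols {0,1}, row 1 -> cols {1,2}, row 2 -> col {2},
// i.e. the upper triangle of A. Compile-time check of the mapping above:
static_assert(Eigen::gpu::internal::cudss_view_type<Eigen::Lower, Eigen::ColMajor>::value == CUDSS_MVIEW_UPPER,
              "CSC lower triangle is seen by cuDSS as CSR upper");
static_assert(Eigen::gpu::internal::cudss_view_type<Eigen::Upper, Eigen::RowMajor>::value == CUDSS_MVIEW_UPPER,
              "native CSR needs no swap");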
diff --git a/unsupported/Eigen/src/GPU/CuFftSupport.h b/unsupported/Eigen/src/GPU/CuFftSupport.h new file mode 100644 index 0000000..a9ed788 --- /dev/null +++ b/unsupported/Eigen/src/GPU/CuFftSupport.h
@@ -0,0 +1,106 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// cuFFT support utilities: error checking macro, type mapping. + +#ifndef EIGEN_GPU_CUFFT_SUPPORT_H +#define EIGEN_GPU_CUFFT_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include <cufft.h> + +namespace Eigen { +namespace gpu { +namespace internal { + +// ---- Error checking --------------------------------------------------------- + +#define EIGEN_CUFFT_CHECK(x) \ + do { \ + cufftResult _r = (x); \ + eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \ + EIGEN_UNUSED_VARIABLE(_r); \ + } while (0) + +// ---- Scalar → cufftType traits ---------------------------------------------- + +template <typename Scalar> +struct cufft_c2c_type; + +template <> +struct cufft_c2c_type<float> { + static constexpr cufftType value = CUFFT_C2C; +}; +template <> +struct cufft_c2c_type<double> { + static constexpr cufftType value = CUFFT_Z2Z; +}; + +template <typename Scalar> +struct cufft_r2c_type; + +template <> +struct cufft_r2c_type<float> { + static constexpr cufftType value = CUFFT_R2C; +}; +template <> +struct cufft_r2c_type<double> { + static constexpr cufftType value = CUFFT_D2Z; +}; + +template <typename Scalar> +struct cufft_c2r_type; + +template <> +struct cufft_c2r_type<float> { + static constexpr cufftType value = CUFFT_C2R; +}; +template <> +struct cufft_c2r_type<double> { + static constexpr cufftType value = CUFFT_Z2D; +}; + +// ---- Type-dispatched cuFFT execution ---------------------------------------- + +// C2C +inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out, + int direction) { + return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction); +} +inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out, + int direction) { + return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out), + direction); +} + +// R2C +inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) { + return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out)); +} +inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) { + return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out)); +} + +// C2R +inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) { + return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out); +} +inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) { + return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out); +} + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CUFFT_SUPPORT_H
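A small sketch of how the traits and dispatch overloads are meant to be consumed by generic code; the plan and device pointers are assumed to be created elsewhere:

static_assert(Eigen::gpu::internal::cufft_c2c_type<double>::value == CUFFT_Z2Z, "double C2C maps to Z2Z");

void exec_forward(cufftHandle plan, std::complex<double>* d_in, std::complex<double>* d_out) {
  // Overload resolution selects cufftExecZ2Z; no branching on precision needed.
  EIGEN_CUFFT_CHECK(Eigen::gpu::internal::cufftExecC2C_dispatch(plan, d_in, d_out, CUFFT_FORWARD));
}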
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h index 41f7b31..8b88c5c 100644 --- a/unsupported/Eigen/src/GPU/CuSolverSupport.h +++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -92,7 +92,7 @@ CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; } CusolverParams& operator=(CusolverParams&& o) noexcept { if (this != &o) { - if (p) (void)cusolverDnDestroyParams(p); + if (p) EIGEN_CUSOLVER_CHECK(cusolverDnDestroyParams(p)); p = o.p; o.p = nullptr; }
diff --git a/unsupported/Eigen/src/GPU/CuSparseSupport.h b/unsupported/Eigen/src/GPU/CuSparseSupport.h new file mode 100644 index 0000000..1d60ecd --- /dev/null +++ b/unsupported/Eigen/src/GPU/CuSparseSupport.h
@@ -0,0 +1,56 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// cuSPARSE support utilities: error checking macro. + +#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H +#define EIGEN_GPU_CUSPARSE_SUPPORT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSupport.h" +#include <cusparse.h> + +namespace Eigen { +namespace gpu { +namespace internal { + +#define EIGEN_CUSPARSE_CHECK(x) \ + do { \ + cusparseStatus_t _s = (x); \ + eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \ + EIGEN_UNUSED_VARIABLE(_s); \ + } while (0) + +constexpr cusparseOperation_t to_cusparse_op(GpuOp op) { + switch (op) { + case GpuOp::Trans: + return CUSPARSE_OPERATION_TRANSPOSE; + case GpuOp::ConjTrans: + return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default: + return CUSPARSE_OPERATION_NON_TRANSPOSE; + } +} + +// cuSPARSE rejects CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE for real scalar +// types; for real Scalar, ConjTrans is mathematically equivalent to Trans, +// so silently demote it. Complex Scalar passes through unchanged. +template <typename Scalar> +constexpr cusparseOperation_t to_cusparse_op_for_scalar(GpuOp op) { + return to_cusparse_op((op == GpuOp::ConjTrans && !NumTraits<Scalar>::IsComplex) ? GpuOp::Trans : op); +} + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_CUSPARSE_SUPPORT_H
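Since both helpers are constexpr, the demotion rule can be checked at compile time. A reviewer sketch, valid anywhere this header is visible:

static_assert(Eigen::gpu::internal::to_cusparse_op_for_scalar<double>(Eigen::gpu::GpuOp::ConjTrans) ==
                  CUSPARSE_OPERATION_TRANSPOSE,
              "real Scalar: ConjTrans demotes to Trans");
static_assert(Eigen::gpu::internal::to_cusparse_op_for_scalar<std::complex<double>>(Eigen::gpu::GpuOp::ConjTrans) ==
                  CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE,
              "complex Scalar: ConjTrans passes through");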
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h index 869a5ac..9d30a6f 100644 --- a/unsupported/Eigen/src/GPU/DeviceMatrix.h +++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -134,7 +134,7 @@ HostTransfer& operator=(HostTransfer&& o) noexcept { if (this != &o) { - if (event_) (void)cudaEventDestroy(event_); + if (event_) EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(event_)); host_buf_ = std::move(o.host_buf_); pinned_buf_ = std::move(o.pinned_buf_); event_ = o.event_; @@ -226,7 +226,7 @@ DeviceMatrix& operator=(DeviceMatrix&& o) noexcept { if (this != &o) { - if (ready_event_) (void)cudaEventDestroy(ready_event_); + if (ready_event_) EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_)); data_ = std::move(o.data_); rows_ = o.rows_; cols_ = o.cols_; @@ -358,7 +358,7 @@ if (rows == rows_ && cols == cols_) return; data_.reset(); if (ready_event_) { - (void)cudaEventDestroy(ready_event_); + EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_)); ready_event_ = nullptr; } ready_stream_ = nullptr; @@ -502,7 +502,7 @@ rows_ = 0; cols_ = 0; if (ready_event_) { - (void)cudaEventDestroy(ready_event_); + EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_)); ready_event_ = nullptr; } ready_stream_ = nullptr;
diff --git a/unsupported/Eigen/src/GPU/GpuFFT.h b/unsupported/Eigen/src/GPU/GpuFFT.h new file mode 100644 index 0000000..34e770e --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuFFT.h
@@ -0,0 +1,315 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU FFT via cuFFT. +// +// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms: +// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real). +// +// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that +// inv(fwd(x)) == x, matching Eigen's FFT convention. +// +// cuFFT plans are cached by (size, type) and reused across calls. +// +// Thread safety: not thread-safe. Concurrent fwd/inv calls on a single FFT +// instance race on the cached plans, the cuBLAS handle, and the bound +// stream. Use one FFT instance per thread. +// +// Usage: +// FFT<float> fft; +// VectorXcf X = fft.fwd(x); // 1D C2C or R2C +// VectorXcf y = fft.inv(X); // 1D C2C inverse +// VectorXf r = fft.invReal(X, n); // 1D C2R inverse +// MatrixXcf B = fft.fwd2(A); // 2D C2C forward +// MatrixXcf C = fft.inv2(B); // 2D C2C inverse + +#ifndef EIGEN_GPU_FFT_H +#define EIGEN_GPU_FFT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuFftSupport.h" +#include "./CuBlasSupport.h" +#include <map> + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class FFT { + public: + using Scalar = Scalar_; + using Complex = std::complex<Scalar>; + using ComplexVector = Matrix<Complex, Dynamic, 1>; + using RealVector = Matrix<Scalar, Dynamic, 1>; + using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>; + + FFT() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); + EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); + } + + ~FFT() { + for (auto& kv : plans_) (void)cufftDestroy(kv.second); + if (cublas_) (void)cublasDestroy(cublas_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + FFT(const FFT&) = delete; + FFT& operator=(const FFT&) = delete; + + // ---- 1D Complex-to-Complex ------------------------------------------------ + + /** Forward 1D C2C FFT. */ + template <typename Derived> + ComplexVector fwd(const MatrixBase<Derived>& x, + typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) { + const ComplexVector input(x.derived()); + const int n = static_cast<int>(input.size()); + if (n == 0) return ComplexVector(0); + + ensure_buffers(n * sizeof(Complex), n * sizeof(Complex)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_)); + + cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value); + EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()), + static_cast<Complex*>(d_out_.get()), CUFFT_FORWARD)); + + ComplexVector result(n); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + return result; + } + + /** Inverse 1D C2C FFT. Scaled by 1/n. 
*/
+  template <typename Derived>
+  ComplexVector inv(const MatrixBase<Derived>& X) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()),
+                                                      static_cast<Complex*>(d_out_.get()), CUFFT_INVERSE));
+
+    // Scale by 1/n.
+    scale_device(static_cast<Complex*>(d_out_.get()), n, Scalar(1) / Scalar(n));
+
+    ComplexVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Real-to-Complex ---------------------------------------------------
+
+  /** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
+  template <typename Derived>
+  ComplexVector fwd(const MatrixBase<Derived>& x,
+                    typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
+    const RealVector input(x.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    const int n_complex = n / 2 + 1;
+    ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.get()), static_cast<Complex*>(d_out_.get())));
+
+    ComplexVector result(n_complex);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Complex-to-Real ---------------------------------------------------
+
+  /** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
+   * Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
+  template <typename Derived>
+  RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(nfft);
+    // Handle the empty transform before validating the half-spectrum size:
+    // for n == 0 the natural input is empty, not n/2 + 1 = 1 coefficients.
+    if (n == 0) return RealVector(0);
+    const int n_complex = n / 2 + 1;
+    eigen_assert(input.size() == n_complex);
+
+    ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
+    // cuFFT C2R may overwrite the input, so we copy to d_in_.
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.get()), static_cast<Scalar*>(d_out_.get())));
+
+    // Scale by 1/n.
+ scale_device_real(static_cast<Scalar*>(d_out_.get()), n, Scalar(1) / Scalar(n)); + + RealVector result(n); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + return result; + } + + // ---- 2D Complex-to-Complex ------------------------------------------------ + + /** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */ + template <typename Derived> + ComplexMatrix fwd2(const MatrixBase<Derived>& A) { + static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2() requires complex input"); + const ComplexMatrix input(A.derived()); + const int rows = static_cast<int>(input.rows()); + const int cols = static_cast<int>(input.cols()); + if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols); + + const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex); + ensure_buffers(total, total); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.get(), input.data(), total, cudaMemcpyHostToDevice, stream_)); + + cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value); + EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()), + static_cast<Complex*>(d_out_.get()), CUFFT_FORWARD)); + + ComplexMatrix result(rows, cols); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.get(), total, cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + return result; + } + + /** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */ + template <typename Derived> + ComplexMatrix inv2(const MatrixBase<Derived>& A) { + static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2() requires complex input"); + const ComplexMatrix input(A.derived()); + const int rows = static_cast<int>(input.rows()); + const int cols = static_cast<int>(input.cols()); + if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols); + + const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex); + ensure_buffers(total, total); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.get(), input.data(), total, cudaMemcpyHostToDevice, stream_)); + + cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value); + EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()), + static_cast<Complex*>(d_out_.get()), CUFFT_INVERSE)); + + // Scale by 1/(rows*cols). 
+ const int total_elems = rows * cols; + scale_device(static_cast<Complex*>(d_out_.get()), total_elems, Scalar(1) / Scalar(total_elems)); + + ComplexMatrix result(rows, cols); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.get(), total, cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + return result; + } + + // ---- Accessors ------------------------------------------------------------ + + cudaStream_t stream() const { return stream_; } + + private: + cudaStream_t stream_ = nullptr; + cublasHandle_t cublas_ = nullptr; + std::map<int64_t, cufftHandle> plans_; + internal::DeviceBuffer d_in_; + internal::DeviceBuffer d_out_; + size_t d_in_size_ = 0; + size_t d_out_size_ = 0; + + void ensure_buffers(size_t in_bytes, size_t out_bytes) { + if (in_bytes > d_in_size_) { + if (d_in_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + d_in_ = internal::DeviceBuffer(in_bytes); + d_in_size_ = in_bytes; + } + if (out_bytes > d_out_size_) { + if (d_out_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + d_out_ = internal::DeviceBuffer(out_bytes); + d_out_size_ = out_bytes; + } + } + + // Plan key encoding: rank (1 bit) | type (4 bits) | dims. + // cufftType uses 7 bits; the top 3 (precision discriminator) are redundant + // since Scalar fixes precision per FFT instance, so mask to 4 bits — without + // it, e.g. plan_key_1d(5, C2C) and plan_key_1d(7, C2C) collide. + static constexpr int64_t kTypeMask = 0xF; + static constexpr int kCols2DBits = 30; // bits 5..34 + static constexpr int kRows2DBits = 29; // bits 35..63 + static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type & kTypeMask) << 1) | 0; } + + static int64_t plan_key_2d(int rows, int cols, cufftType type) { + eigen_assert(rows >= 0 && int64_t(rows) < (int64_t(1) << kRows2DBits) && + "FFT plan rows exceed plan-key bit budget"); + eigen_assert(cols >= 0 && int64_t(cols) < (int64_t(1) << kCols2DBits) && + "FFT plan cols exceed plan-key bit budget"); + return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type & kTypeMask) << 1) | 1; + } + + cufftHandle get_plan_1d(int n, cufftType type) { + int64_t key = plan_key_1d(n, type); + auto it = plans_.find(key); + if (it != plans_.end()) return it->second; + + cufftHandle plan; + EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1)); + EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_)); + plans_[key] = plan; + return plan; + } + + cufftHandle get_plan_2d(int rows, int cols, cufftType type) { + int64_t key = plan_key_2d(rows, cols, type); + auto it = plans_.find(key); + if (it != plans_.end()) return it->second; + + // cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols. + // Eigen matrices are column-major, so we pass (cols, rows) to cuFFT + // to get the correct 2D transform. + cufftHandle plan; + EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type)); + EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_)); + plans_[key] = plan; + return plan; + } + + // Scale complex array on device using cuBLAS scal. + void scale_device(Complex* d_ptr, int n, Scalar alpha) { + EIGEN_CUBLAS_CHECK(internal::cublasXscal(cublas_, n, &alpha, d_ptr, 1)); + } + + // Scale real array on device using cuBLAS scal. + void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) { + EIGEN_CUBLAS_CHECK(internal::cublasXscal(cublas_, n, &alpha, d_ptr, 1)); + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_FFT_H
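An end-to-end round-trip sketch for the class above (not part of the patch; assumes a CUDA-capable device, and the tolerances are illustrative):

#include <unsupported/Eigen/GPU>
using namespace Eigen;

void fft_roundtrip_demo() {
  gpu::FFT<float> fft;

  // C2C round trip: inv() applies the 1/n scale, so y recovers x.
  VectorXcf x = VectorXcf::Random(256);
  VectorXcf y = fft.inv(fft.fwd(x));
  eigen_assert((y - x).norm() <= 1e-4f * x.norm());

  // R2C / C2R round trip: fwd() returns 256/2 + 1 = 129 coefficients;
  // invReal() needs the original length to disambiguate 256 vs 257.
  VectorXf r = VectorXf::Random(256);
  VectorXcf R = fft.fwd(r);
  VectorXf r2 = fft.invReal(R, r.size());
  eigen_assert((r2 - r).norm() <= 1e-4f * r.norm());
}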
diff --git a/unsupported/Eigen/src/GPU/GpuSolverContext.h b/unsupported/Eigen/src/GPU/GpuSolverContext.h index 87c4641..382a076 100644 --- a/unsupported/Eigen/src/GPU/GpuSolverContext.h +++ b/unsupported/Eigen/src/GPU/GpuSolverContext.h
@@ -87,10 +87,10 @@ // upcoming move of d_scratch_ doesn't free buffers that an in-flight // kernel is still touching. The asymmetric move-ctor doesn't need this — // it constructs onto uninitialized storage. - if (stream_) (void)cudaStreamSynchronize(stream_); - if (cublas_) (void)cublasDestroy(cublas_); - if (cusolver_) (void)cusolverDnDestroy(cusolver_); - if (stream_) (void)cudaStreamDestroy(stream_); + if (stream_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + if (cublas_) EIGEN_CUBLAS_CHECK(cublasDestroy(cublas_)); + if (cusolver_) EIGEN_CUSOLVER_CHECK(cusolverDnDestroy(cusolver_)); + if (stream_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream_)); stream_ = o.stream_; cusolver_ = o.cusolver_; cublas_ = o.cublas_;
diff --git a/unsupported/Eigen/src/GPU/GpuSparseContext.h b/unsupported/Eigen/src/GPU/GpuSparseContext.h new file mode 100644 index 0000000..45926f8 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSparseContext.h
@@ -0,0 +1,404 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix +// multiply (SpMM) via cuSPARSE. +// +// SparseContext manages a cuSPARSE handle and device buffers. It accepts +// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the +// GPU. RowMajor input is implicitly converted to ColMajor. +// +// Thread safety: not thread-safe. Concurrent multiply* calls on a single +// SparseContext race on the cuSPARSE handle, the bound stream, and the +// cached device buffers. Use one SparseContext per thread. +// +// Usage: +// SparseContext<double> ctx; +// VectorXd y = ctx.multiply(A, x); // y = A * x +// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y +// ctx.multiply(A, x, y, 1.0, 0.0, gpu::GpuOp::ConjTrans); // y = A^H * x +// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x +// VectorXcd w = ctx.multiplyAdjoint(A, x); // w = A^H * x (complex) +// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS) + +#ifndef EIGEN_GPU_SPARSE_CONTEXT_H +#define EIGEN_GPU_SPARSE_CONTEXT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuSparseSupport.h" + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class SparseContext { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using StorageIndex = int; + using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>; + using DenseVector = Matrix<Scalar, Dynamic, 1>; + using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + SparseContext() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_)); + EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_)); + } + + ~SparseContext() { + destroy_descriptors_unchecked(); + if (handle_) (void)cusparseDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + SparseContext(const SparseContext&) = delete; + SparseContext& operator=(const SparseContext&) = delete; + + // ---- SpMV: y = A * x ----------------------------------------------------- + + /** Compute y = A * x. Returns y as a new dense vector. */ + template <typename InputType, typename Rhs> + DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat mat(input); + DenseVector y(mat.rows()); + y.setZero(); + multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE); + return y; + } + + /** Compute y = alpha * op(A) * x + beta * y (in-place). 
*/ + template <typename InputType, typename Rhs, typename Dest> + void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y, + Scalar alpha = Scalar(1), Scalar beta = Scalar(0), GpuOp op = GpuOp::NoTrans) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat mat(input); + multiply_impl(mat, x.derived(), y.derived(), alpha, beta, internal::to_cusparse_op_for_scalar<Scalar>(op)); + } + + // ---- SpMV transpose: y = A^T * x ----------------------------------------- + + /** Compute y = A^T * x. Returns y as a new dense vector. */ + template <typename InputType, typename Rhs> + DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat mat(input); + DenseVector y(mat.cols()); + y.setZero(); + multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE); + return y; + } + + // ---- SpMV adjoint: y = A^H * x ------------------------------------------- + + /** Compute y = A^H * x (conjugate transpose). For real Scalar this is equivalent to multiplyT. */ + template <typename InputType, typename Rhs> + DenseVector multiplyAdjoint(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat mat(input); + DenseVector y(mat.cols()); + y.setZero(); + multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), + internal::to_cusparse_op_for_scalar<Scalar>(GpuOp::ConjTrans)); + return y; + } + + // ---- SpMM: Y = op(A) * X (multiple RHS) ---------------------------------- + + /** Compute Y = op(A) * X where X is a dense matrix (multiple RHS). Returns Y. */ + template <typename InputType, typename Rhs> + DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X, GpuOp op = GpuOp::NoTrans) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat mat(input); + const DenseMatrix rhs(X.derived()); + + const cusparseOperation_t cu_op = internal::to_cusparse_op_for_scalar<Scalar>(op); + const Index m = (op == GpuOp::NoTrans) ? mat.rows() : mat.cols(); + const Index k = (op == GpuOp::NoTrans) ? mat.cols() : mat.rows(); + eigen_assert(k == rhs.rows()); + + const Index n = rhs.cols(); + if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n); + + DenseMatrix Y = DenseMatrix::Zero(m, n); + spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), cu_op); + return Y; + } + + // ---- Accessors ------------------------------------------------------------ + + cudaStream_t stream() const { return stream_; } + + private: + cudaStream_t stream_ = nullptr; + cusparseHandle_t handle_ = nullptr; + + // Cached device buffers (grow-only). + internal::DeviceBuffer d_outerPtr_; + internal::DeviceBuffer d_innerIdx_; + internal::DeviceBuffer d_values_; + internal::DeviceBuffer d_x_; + internal::DeviceBuffer d_y_; + internal::DeviceBuffer d_workspace_; + size_t d_outerPtr_size_ = 0; + size_t d_innerIdx_size_ = 0; + size_t d_values_size_ = 0; + size_t d_x_size_ = 0; + size_t d_y_size_ = 0; + size_t d_workspace_size_ = 0; + + // Cached cuSPARSE descriptors. 
+ cusparseSpMatDescr_t spmat_desc_ = nullptr; + Index cached_rows_ = -1; + Index cached_cols_ = -1; + Index cached_nnz_ = -1; + + // ---- SpMV implementation -------------------------------------------------- + + template <typename RhsDerived, typename DestDerived> + void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta, + cusparseOperation_t op) { + eigen_assert(A.isCompressed()); + + const Index m = A.rows(); + const Index n = A.cols(); + const Index nnz = A.nonZeros(); + const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m; + const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n; + + eigen_assert(x.size() == x_size); + eigen_assert(y.size() == y_size); + + if (m == 0 || n == 0 || nnz == 0) { + if (beta == Scalar(0)) + y.setZero(); + else + y *= beta; + return; + } + + // Upload sparse matrix to device. + upload_sparse(A); + + // Upload x to device. + ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar)); + const DenseVector x_tmp(x); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_x_.get(), x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); + + // Upload y to device (for beta != 0). + ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar)); + if (beta != Scalar(0)) { + const DenseVector y_tmp(y); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_y_.get(), y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_)); + } + + // Create dense vector descriptors. + constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value; + cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr; + EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.get(), dtype)); + EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.get(), dtype)); + + // Query workspace size. + size_t ws_size = 0; + EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, + CUSPARSE_SPMV_ALG_DEFAULT, &ws_size)); + ensure_buffer(d_workspace_, d_workspace_size_, ws_size); + + // Execute SpMV. + EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype, + CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.get())); + + // Download result. + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(y.data(), d_y_.get(), y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + + EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(x_desc)); + EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(y_desc)); + } + + // ---- SpMM implementation -------------------------------------------------- + + void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta, + cusparseOperation_t op) { + eigen_assert(A.isCompressed()); + + // For op != NON_TRANSPOSE, Y = op(A) * X. The dense X / Y descriptors must + // describe the *post-op* shapes: X has k_op rows (= input dim of op(A)), + // Y has m_op rows (= output dim of op(A)). + const bool transposed = (op != CUSPARSE_OPERATION_NON_TRANSPOSE); + const Index m_op = transposed ? A.cols() : A.rows(); + const Index k_op = transposed ? A.rows() : A.cols(); + const Index n = X.cols(); + const Index nnz = A.nonZeros(); + + if (m_op == 0 || n == 0 || k_op == 0 || nnz == 0) { + if (beta == Scalar(0)) + Y.setZero(); + else + Y *= beta; + return; + } + + upload_sparse(A); + + // Upload X to device. X is k_op x n, Y is m_op x n (column-major). 
+ const size_t x_bytes = static_cast<size_t>(k_op) * static_cast<size_t>(n) * sizeof(Scalar); + const size_t y_bytes = static_cast<size_t>(m_op) * static_cast<size_t>(n) * sizeof(Scalar); + ensure_buffer(d_x_, d_x_size_, x_bytes); + ensure_buffer(d_y_, d_y_size_, y_bytes); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.get(), X.data(), x_bytes, cudaMemcpyHostToDevice, stream_)); + if (beta != Scalar(0)) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.get(), Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_)); + } + + // Create dense matrix descriptors. + constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value; + cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr; + // Eigen is column-major, so ld = rows. + EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k_op, n, k_op, d_x_.get(), dtype, CUSPARSE_ORDER_COL)); + EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m_op, n, m_op, d_y_.get(), dtype, CUSPARSE_ORDER_COL)); + + // Query workspace. + size_t ws_size = 0; + EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, + x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size)); + ensure_buffer(d_workspace_, d_workspace_size_, ws_size); + + // Execute SpMM. + EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta, + y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.get())); + + // Download result. + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.get(), y_bytes, cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + + EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(x_desc)); + EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(y_desc)); + } + + // ---- Helpers -------------------------------------------------------------- + + static void check_storage_index_bounds(Index rows, Index cols, Index nnz) { + const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)()); + eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index && + "gpu::SparseContext currently uses int StorageIndex; matrix dimensions or nonzeros exceed int range"); + EIGEN_UNUSED_VARIABLE(rows); + EIGEN_UNUSED_VARIABLE(cols); + EIGEN_UNUSED_VARIABLE(nnz); + EIGEN_UNUSED_VARIABLE(max_storage_index); + } + + void upload_sparse(const SpMat& A) { + // cuSPARSE 12.0+ accepts CSC directly for both SpMV and SpMM. cuSPARSE + // 11.x SpMM rejects CSC ("unsupported matrix format for matA (CSC)") and + // SpMV with CONJUGATE_TRANSPOSE on CSC+complex silently demotes to + // TRANSPOSE. CSR works on every version, so on 11.x we transpose-copy + // the user's ColMajor input into a RowMajor (CSR) representation. +#if CUSPARSE_VERSION >= 12000 + upload_compressed_arrays(A.rows(), A.cols(), A.nonZeros(), + /*outer_count=*/A.cols() + 1, A.outerIndexPtr(), A.innerIndexPtr(), A.valuePtr(), + /*is_csr=*/false); +#else + using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>; + const CsrMat csr(A); + upload_compressed_arrays(csr.rows(), csr.cols(), csr.nonZeros(), + /*outer_count=*/csr.rows() + 1, csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr(), + /*is_csr=*/true); + // cudaMemcpyAsync from pageable host memory blocks the host until the + // source is consumed, so the CsrMat temporary's lifetime is sufficient. 
+#endif + } + + void upload_compressed_arrays(Index m, Index n, Index nnz, Index outer_count, const StorageIndex* host_outer, + const StorageIndex* host_inner, const Scalar* host_values, bool is_csr) { + const size_t outer_bytes = static_cast<size_t>(outer_count) * sizeof(StorageIndex); + const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex); + const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar); + + ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes); + ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes); + ensure_buffer(d_values_, d_values_size_, val_bytes); + + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_outerPtr_.get(), host_outer, outer_bytes, cudaMemcpyHostToDevice, stream_)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_innerIdx_.get(), host_inner, inner_bytes, cudaMemcpyHostToDevice, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), host_values, val_bytes, cudaMemcpyHostToDevice, stream_)); + + if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) { + destroy_descriptors_checked(); + + constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I; + constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value; + + if (is_csr) { + EIGEN_CUSPARSE_CHECK(cusparseCreateCsr(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(), + d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, + val_type)); + } else { + EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(), + d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO, + val_type)); + } + cached_rows_ = m; + cached_cols_ = n; + cached_nnz_ = nnz; + } else if (is_csr) { + EIGEN_CUSPARSE_CHECK(cusparseCsrSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get())); + } else { + EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get())); + } + } + + // Destructor-only cleanup: there is no useful recovery path for failures. + void destroy_descriptors_unchecked() { + if (spmat_desc_) { + (void)cusparseDestroySpMat(spmat_desc_); + spmat_desc_ = nullptr; + } + cached_rows_ = -1; + cached_cols_ = -1; + cached_nnz_ = -1; + } + + void destroy_descriptors_checked() { + if (spmat_desc_) { + EIGEN_CUSPARSE_CHECK(cusparseDestroySpMat(spmat_desc_)); + spmat_desc_ = nullptr; + } + cached_rows_ = -1; + cached_cols_ = -1; + cached_nnz_ = -1; + } + + void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) { + if (needed > current_size) { + if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + buf = internal::DeviceBuffer(needed); + current_size = needed; + } + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SPARSE_CONTEXT_H
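A usage sketch for SparseContext (not part of the patch; assumes a CUDA device). The 1D Laplacian is only an example matrix:

#include <unsupported/Eigen/GPU>
#include <vector>
using namespace Eigen;

void spmv_demo() {
  const int n = 1000;
  std::vector<Triplet<double>> trips;
  for (int i = 0; i < n; ++i) {
    trips.emplace_back(i, i, 2.0);
    if (i + 1 < n) {
      trips.emplace_back(i, i + 1, -1.0);
      trips.emplace_back(i + 1, i, -1.0);
    }
  }
  SparseMatrix<double> A(n, n);  // ColMajor (CSC), int StorageIndex
  A.setFromTriplets(trips.begin(), trips.end());

  gpu::SparseContext<double> ctx;
  VectorXd x = VectorXd::Random(n);
  VectorXd y = ctx.multiply(A, x);  // y = A * x on the GPU
  eigen_assert((y - A * x).norm() <= 1e-12 * (1.0 + x.norm()));

  ctx.multiply(A, x, y, 2.0, 1.0);  // y = 2*A*x + y, reusing cached device buffers
}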
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLDLT.h b/unsupported/Eigen/src/GPU/GpuSparseLDLT.h new file mode 100644 index 0000000..7a6e7b7 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSparseLDLT.h
@@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU sparse LDL^T / LDL^H factorization via cuDSS. +// +// For symmetric indefinite (or Hermitian indefinite) sparse matrices. +// Same three-phase workflow as SparseLLT. +// +// Usage: +// SparseLDLT<double> ldlt(A); // analyze + factorize +// VectorXd x = ldlt.solve(b); // solve + +#ifndef EIGEN_GPU_SPARSE_LDLT_H +#define EIGEN_GPU_SPARSE_LDLT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSparseSolverBase.h" + +namespace Eigen { +namespace gpu { + +/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite). + * + * Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex). + * Uses pivoting for numerical stability. + * + * \tparam Scalar_ float, double, complex<float>, or complex<double> + * \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored + */ +template <typename Scalar_, int UpLo_ = Lower> +class SparseLDLT : public internal::SparseSolverBase<Scalar_, SparseLDLT<Scalar_, UpLo_>> { + using Base = internal::SparseSolverBase<Scalar_, SparseLDLT>; + friend Base; + + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + SparseLDLT() = default; + + template <typename InputType> + explicit SparseLDLT(const SparseMatrixBase<InputType>& A) { + this->compute(A); + } + + static constexpr bool needs_csr_conversion() { return false; } + static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; } + static constexpr cudssMatrixViewType_t cudss_matrix_view() { + return internal::cudss_view_type<UpLo, ColMajor>::value; + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SPARSE_LDLT_H
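A small sketch of the indefinite case LDLT exists for (assumes EIGEN_CUDSS and a CUDA device; the matrix is illustrative):

void ldlt_demo() {
  using namespace Eigen;
  // A = [1 2; 2 1] is symmetric but indefinite (eigenvalues 3 and -1),
  // so SPD-only Cholesky would fail; LDLT with pivoting handles it.
  SparseMatrix<double> A(2, 2);
  A.insert(0, 0) = 1.0;  // store the lower triangle only (UpLo = Lower)
  A.insert(1, 0) = 2.0;
  A.insert(1, 1) = 1.0;
  A.makeCompressed();

  gpu::SparseLDLT<double> ldlt(A);  // analyze + factorize
  VectorXd x = ldlt.solve(Vector2d(1.0, 0.0));
  eigen_assert(ldlt.info() == Success);
}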
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLLT.h b/unsupported/Eigen/src/GPU/GpuSparseLLT.h new file mode 100644 index 0000000..9d3f98b --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSparseLLT.h
@@ -0,0 +1,65 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU sparse Cholesky (LL^T / LL^H) via cuDSS. +// +// Usage: +// SparseLLT<double> llt(A); // analyze + factorize +// VectorXd x = llt.solve(b); // solve +// llt.analyzePattern(A); // or separate phases +// llt.factorize(A_new); // reuse symbolic analysis + +#ifndef EIGEN_GPU_SPARSE_LLT_H +#define EIGEN_GPU_SPARSE_LLT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSparseSolverBase.h" + +namespace Eigen { +namespace gpu { + +/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex). + * + * Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex). + * Accepts ColMajor SparseMatrix (CSC), reinterpreted as CSR with swapped + * triangle view for zero-copy upload. + * + * \tparam Scalar_ float, double, complex<float>, or complex<double> + * \tparam UpLo_ Lower (default) or Upper — which triangle of A is stored + */ +template <typename Scalar_, int UpLo_ = Lower> +class SparseLLT : public internal::SparseSolverBase<Scalar_, SparseLLT<Scalar_, UpLo_>> { + using Base = internal::SparseSolverBase<Scalar_, SparseLLT>; + friend Base; + + public: + using Scalar = Scalar_; + static constexpr int UpLo = UpLo_; + + SparseLLT() = default; + + template <typename InputType> + explicit SparseLLT(const SparseMatrixBase<InputType>& A) { + this->compute(A); + } + + static constexpr bool needs_csr_conversion() { return false; } + static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; } + static constexpr cudssMatrixViewType_t cudss_matrix_view() { + return internal::cudss_view_type<UpLo, ColMajor>::value; + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SPARSE_LLT_H
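A sketch of the symbolic-analysis reuse the header comment describes (assumes EIGEN_CUDSS, a CUDA device, and that every diagonal entry is present in A's pattern, since factorize() requires the pattern to be unchanged):

using namespace Eigen;

void llt_refactorize_demo(const SparseMatrix<double>& A, const VectorXd& b) {
  gpu::SparseLLT<double> llt;
  llt.setOrdering(gpu::GpuSparseOrdering::METIS);  // must precede analyzePattern
  llt.analyzePattern(A);                           // symbolic phase, once

  SparseMatrix<double> As = A;
  for (double shift : {0.1, 1.0, 10.0}) {
    // Shift the diagonal in place: values change, the pattern does not.
    for (Index i = 0; i < As.rows(); ++i) As.coeffRef(i, i) = A.coeff(i, i) + shift;
    llt.factorize(As);                             // numeric phase only
    if (llt.info() == Success) {
      VectorXd x = llt.solve(b);
      // ... use x ...
    }
  }
}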
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLU.h b/unsupported/Eigen/src/GPU/GpuSparseLU.h new file mode 100644 index 0000000..9964ee9 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSparseLU.h
@@ -0,0 +1,62 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU sparse LU factorization via cuDSS. +// +// For general (non-symmetric) sparse matrices. Uses pivoting. +// Same three-phase workflow as SparseLLT. +// +// Usage: +// SparseLU<double> lu(A); // analyze + factorize +// VectorXd x = lu.solve(b); // solve + +#ifndef EIGEN_GPU_SPARSE_LU_H +#define EIGEN_GPU_SPARSE_LU_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSparseSolverBase.h" + +namespace Eigen { +namespace gpu { + +/** GPU sparse LU factorization (general matrices). + * + * Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL. + * Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor + * CSR since cuDSS requires CSR input. + * + * \tparam Scalar_ float, double, complex<float>, or complex<double> + */ +template <typename Scalar_> +class SparseLU : public internal::SparseSolverBase<Scalar_, SparseLU<Scalar_>> { + using Base = internal::SparseSolverBase<Scalar_, SparseLU>; + friend Base; + + public: + using Scalar = Scalar_; + + SparseLU() = default; + + template <typename InputType> + explicit SparseLU(const SparseMatrixBase<InputType>& A) { + this->compute(A); + } + + static constexpr bool needs_csr_conversion() { return true; } + static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; } + static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SPARSE_LU_H
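And the general nonsymmetric case with multiple right-hand sides in one device round trip (a sketch under the same assumptions):

void lu_demo(const Eigen::SparseMatrix<double>& A, const Eigen::MatrixXd& B) {
  Eigen::gpu::SparseLU<double> lu(A);        // analyze + factorize; CSC is copied to CSR internally
  if (lu.info() != Eigen::Success) return;   // singular or numerically deficient
  Eigen::MatrixXd X = lu.solve(B);           // all columns of B solved in one call
}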
diff --git a/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h b/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h new file mode 100644 index 0000000..6813741 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h
@@ -0,0 +1,389 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS. +// +// All three solver types share the same three-phase workflow +// (analyzePattern → factorize → solve) and differ only in the +// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS. +// This CRTP base implements the entire workflow; derived classes +// provide the matrix type/view via static constexpr members. +// +// Thread safety: not thread-safe. Concurrent calls (including concurrent +// solve() calls on the same instance, even though solve() is const) race +// on the cuDSS handle, the bound stream, and the cached scratch buffers. +// Use one solver instance per thread, matching Eigen::SimplicialLLT. + +#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H +#define EIGEN_GPU_SPARSE_SOLVER_BASE_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuDssSupport.h" + +namespace Eigen { +namespace gpu { +namespace internal { + +/** CRTP base for GPU sparse direct solvers. + * + * \tparam Scalar_ Element type (passed explicitly to avoid incomplete-type issues with CRTP). + * \tparam Derived The concrete solver class (SparseLLT, SparseLDLT, SparseLU). + * Must provide: + * - `static constexpr cudssMatrixType_t cudss_matrix_type()` + * - `static constexpr cudssMatrixViewType_t cudss_matrix_view()` + */ +template <typename Scalar_, typename Derived> +class SparseSolverBase { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using StorageIndex = int; + using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>; + using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>; + using DenseVector = Matrix<Scalar, Dynamic, 1>; + using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + SparseSolverBase() { init_context(); } + + ~SparseSolverBase() { + destroy_cudss_objects(); + if (handle_) (void)cudssDestroy(handle_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + SparseSolverBase(const SparseSolverBase&) = delete; + SparseSolverBase& operator=(const SparseSolverBase&) = delete; + + // ---- Configuration -------------------------------------------------------- + + /** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */ + void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; } + + // ---- Factorization -------------------------------------------------------- + + /** Symbolic analysis + numeric factorization. */ + template <typename InputType> + Derived& compute(const SparseMatrixBase<InputType>& A) { + analyzePattern(A); + if (info_ == Success) { + factorize(A); + } + return derived(); + } + + /** Symbolic analysis only. Uploads sparsity structure to device. + * This phase is synchronous (blocks until complete). 
*/ + template <typename InputType> + Derived& analyzePattern(const SparseMatrixBase<InputType>& A) { + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat csc(input); + eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix"); + eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix"); + + n_ = csc.rows(); + info_ = InvalidInput; + analysis_done_ = false; + + if (n_ == 0) { + nnz_ = 0; + info_ = Success; + analysis_done_ = true; + return derived(); + } + + // For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with + // swapped triangle view (zero copy). For general solvers, we must convert + // to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T. + if (Derived::needs_csr_conversion()) { + const CsrMat csr(csc); + nnz_ = csr.nonZeros(); + upload_csr(csr); + } else { + nnz_ = csc.nonZeros(); + upload_csr_from_csc(csc); + } + create_cudss_matrix(); + apply_ordering_config(); + + if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_)); + EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_)); + + create_placeholder_dense(); + + EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_)); + + analysis_done_ = true; + info_ = Success; + return derived(); + } + + /** Numeric factorization using the symbolic analysis from analyzePattern. + * + * \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be + * identical to the one passed to analyzePattern(). Only the numerical + * values may change. Passing a different pattern is undefined behavior. + * This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API. + * + * This phase is asynchronous — info() lazily synchronizes. */ + template <typename InputType> + Derived& factorize(const SparseMatrixBase<InputType>& A) { + eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first"); + + if (n_ == 0) { + info_ = Success; + return derived(); + } + + // Convert to the same format used in analyzePattern. + // Both temporaries must outlive the async memcpy (pageable H2D is actually + // synchronous w.r.t. the host, but keep them alive for clarity). + const InputType& input = A.derived(); + check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros()); + const SpMat csc(input); + eigen_assert(csc.rows() == n_ && csc.cols() == n_); + + const Scalar* value_ptr; + Index value_nnz; + CsrMat csr_tmp; + if (Derived::needs_csr_conversion()) { + csr_tmp = CsrMat(csc); + value_ptr = csr_tmp.valuePtr(); + value_nnz = csr_tmp.nonZeros(); + } else { + value_ptr = csc.valuePtr(); + value_nnz = csc.nonZeros(); + } + eigen_assert(value_nnz == nnz_); + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar), + cudaMemcpyHostToDevice, stream_)); + + EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.get())); + + info_ = InvalidInput; + info_synced_ = false; + EIGEN_CUDSS_CHECK( + cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_)); + + return derived(); + } + + // ---- Solve ---------------------------------------------------------------- + + /** Solve A * X = B. Returns X as a dense matrix. + * Supports single or multiple right-hand sides. 
*/ + template <typename Rhs> + DenseMatrix solve(const MatrixBase<Rhs>& B) const { + sync_info(); + eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization"); + eigen_assert(B.rows() == n_); + + const DenseMatrix rhs(B); + const int64_t nrhs = static_cast<int64_t>(rhs.cols()); + + if (n_ == 0) return DenseMatrix(0, rhs.cols()); + + // Reuse cached d_b/d_x scratch to avoid cudaMalloc/cudaFree per solve. + const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar); + ensure_solve_buffer(d_b_solve_, d_b_solve_size_, rhs_bytes); + ensure_solve_buffer(d_x_solve_, d_x_solve_size_, rhs_bytes); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b_solve_.get(), rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_)); + + constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; + cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr; + EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, n_, nrhs, n_, d_b_solve_.get(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, n_, nrhs, n_, d_x_solve_.get(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + + EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss)); + + DenseMatrix X(n_, rhs.cols()); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x_solve_.get(), rhs_bytes, cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + + EIGEN_CUDSS_CHECK(cudssMatrixDestroy(b_cudss)); + EIGEN_CUDSS_CHECK(cudssMatrixDestroy(x_cudss)); + + return X; + } + + // ---- Accessors ------------------------------------------------------------ + + ComputationInfo info() const { + sync_info(); + return info_; + } + Index rows() const { return n_; } + Index cols() const { return n_; } + + cudaStream_t stream() const { return stream_; } + + protected: + // ---- CUDA / cuDSS handles ------------------------------------------------- + cudaStream_t stream_ = nullptr; + cudssHandle_t handle_ = nullptr; + cudssConfig_t config_ = nullptr; + cudssData_t data_ = nullptr; + cudssMatrix_t d_A_cudss_ = nullptr; + cudssMatrix_t d_x_cudss_ = nullptr; + cudssMatrix_t d_b_cudss_ = nullptr; + + // ---- Device buffers for CSR arrays ---------------------------------------- + DeviceBuffer d_rowPtr_; + DeviceBuffer d_colIdx_; + DeviceBuffer d_values_; + + // ---- Cached scratch for solve() (mutable so const solve() can grow them) -- + mutable DeviceBuffer d_b_solve_; + mutable DeviceBuffer d_x_solve_; + mutable size_t d_b_solve_size_ = 0; + mutable size_t d_x_solve_size_ = 0; + + // ---- State ---------------------------------------------------------------- + int64_t n_ = 0; + int64_t nnz_ = 0; + mutable ComputationInfo info_ = InvalidInput; + mutable bool info_synced_ = true; + bool analysis_done_ = false; + GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD; + + private: + Derived& derived() { return static_cast<Derived&>(*this); } + const Derived& derived() const { return static_cast<const Derived&>(*this); } + + void init_context() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUDSS_CHECK(cudssCreate(&handle_)); + EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_)); + EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_)); + } + + void ensure_solve_buffer(DeviceBuffer& buf, size_t& current_size, size_t needed) const { + if (needed > current_size) { + if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + buf = DeviceBuffer(needed); + current_size = needed; + } + } + + void sync_info() 
const { + if (!info_synced_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + int cudss_info = 0; + EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr)); + info_ = (cudss_info == 0) ? Success : NumericalIssue; + info_synced_ = true; + } + } + + // Destructor-only cleanup: there is no useful recovery path for failures. + void destroy_cudss_objects() { + if (d_A_cudss_) { + (void)cudssMatrixDestroy(d_A_cudss_); + d_A_cudss_ = nullptr; + } + if (d_x_cudss_) { + (void)cudssMatrixDestroy(d_x_cudss_); + d_x_cudss_ = nullptr; + } + if (d_b_cudss_) { + (void)cudssMatrixDestroy(d_b_cudss_); + d_b_cudss_ = nullptr; + } + if (data_) { + (void)cudssDataDestroy(handle_, data_); + data_ = nullptr; + } + if (config_) { + (void)cudssConfigDestroy(config_); + config_ = nullptr; + } + } + + // Upload CSR from a RowMajor sparse matrix (native CSR). + void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); } + + // Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)). + void upload_csr_from_csc(const SpMat& csc) { + upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr()); + } + + static void check_storage_index_bounds(Index rows, Index cols, Index nnz) { + const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)()); + eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index && + "gpu sparse solvers currently use int StorageIndex; matrix dimensions or nonzeros exceed int range"); + EIGEN_UNUSED_VARIABLE(rows); + EIGEN_UNUSED_VARIABLE(cols); + EIGEN_UNUSED_VARIABLE(nnz); + EIGEN_UNUSED_VARIABLE(max_storage_index); + } + + void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) { + const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex); + const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex); + const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar); + + d_rowPtr_ = DeviceBuffer(rowptr_bytes); + d_colIdx_ = DeviceBuffer(colidx_bytes); + d_values_ = DeviceBuffer(values_bytes); + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.get(), outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.get(), inner, colidx_bytes, cudaMemcpyHostToDevice, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), values, values_bytes, cudaMemcpyHostToDevice, stream_)); + } + + void create_cudss_matrix() { + if (d_A_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_A_cudss_)); + + constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value; + constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value; + constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type(); + constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view(); + + EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(&d_A_cudss_, n_, n_, nnz_, d_rowPtr_.get(), + /*rowEnd=*/nullptr, d_colIdx_.get(), d_values_.get(), idx_type, val_type, + mtype, mview, CUDSS_BASE_ZERO)); + } + + void apply_ordering_config() { + cudssAlgType_t alg; + switch (ordering_) { + case GpuSparseOrdering::AMD: + alg = CUDSS_ALG_DEFAULT; + break; + case GpuSparseOrdering::METIS: + alg = CUDSS_ALG_2; + break; + case GpuSparseOrdering::RCM: + alg = CUDSS_ALG_3; + break; + default: + alg = CUDSS_ALG_DEFAULT; + break; + } + 
EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg))); + } + + void create_placeholder_dense() { + if (d_x_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_x_cudss_)); + if (d_b_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_b_cudss_)); + constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; + EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, n_, 1, n_, nullptr, dtype, CUDSS_LAYOUT_COL_MAJOR)); + EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, n_, 1, n_, nullptr, dtype, CUDSS_LAYOUT_COL_MAJOR)); + } +}; + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SPARSE_SOLVER_BASE_H
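For orientation, here is a minimal sketch of how a derived solver satisfies the CRTP contract above. It is illustrative only, not the actual `GpuSparseLLT.h`; it reuses the `cudss_spd_type` / `cudss_view_type` traits from `CuDssSupport.h` and omits the user-facing typedefs and documentation.

```cpp
// Hypothetical sketch of a derived solver; the real GpuSparseLLT.h may differ.
namespace Eigen {
namespace gpu {

template <typename Scalar, int UpLo = Lower>
class SparseLLT : public internal::SparseSolverBase<Scalar, SparseLLT<Scalar, UpLo>> {
 public:
  SparseLLT() = default;
  template <typename InputType>
  explicit SparseLLT(const SparseMatrixBase<InputType>& A) {
    this->compute(A);
  }

  // Contract consumed by SparseSolverBase:
  static constexpr cudssMatrixType_t cudss_matrix_type() {
    return internal::cudss_spd_type<Scalar>::value;  // SPD (real) / HPD (complex)
  }
  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
    return internal::cudss_view_type<UpLo, ColMajor>::value;  // triangle swap for CSC-as-CSR
  }
  // Symmetric input: CSC(A) = CSR(A^T) = CSR(A), so no host-side conversion.
  static constexpr bool needs_csr_conversion() { return false; }
};

}  // namespace gpu
}  // namespace Eigen
```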
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h index 2c9b0cc..4b02869 100644 --- a/unsupported/Eigen/src/GPU/GpuSupport.h +++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -28,6 +28,14 @@ namespace Eigen { namespace gpu { + +// ---- Generic operation flag ------------------------------------------------- +// Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls. +// Each library's support header maps this to its own enum (cublasOperation_t, +// cusparseOperation_t, etc.) via a small to_<lib>_op() helper. + +enum class GpuOp { NoTrans, Trans, ConjTrans }; + namespace internal { // ---- Error-checking macros --------------------------------------------------
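As a concrete illustration of the per-library mapping mentioned in the comment above, the helpers might look like the following sketch. The helper names are assumptions; the cuBLAS and cuSPARSE enum values are the libraries' real ones.

```cpp
// Hypothetical mapping helpers; the actual support headers may name these differently.
inline cublasOperation_t to_cublas_op(GpuOp op) {
  switch (op) {
    case GpuOp::Trans:
      return CUBLAS_OP_T;
    case GpuOp::ConjTrans:
      return CUBLAS_OP_C;
    default:
      return CUBLAS_OP_N;
  }
}

inline cusparseOperation_t to_cusparse_op(GpuOp op) {
  switch (op) {
    case GpuOp::Trans:
      return CUSPARSE_OPERATION_TRANSPOSE;
    case GpuOp::ConjTrans:
      return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
    default:
      return CUSPARSE_OPERATION_NON_TRANSPOSE;
  }
}
```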
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md index b7c4269..d7411bb 100644 --- a/unsupported/Eigen/src/GPU/README.md +++ b/unsupported/Eigen/src/GPU/README.md
@@ -1,8 +1,9 @@ # Eigen GPU Module (`unsupported/Eigen/GPU`) -GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA -CUDA Math Libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link -against CUDA runtime, cuBLAS, and cuSOLVER). +GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA +Math Libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA +11.4+; cuDSS features require CUDA 12.0+ and a separate cuDSS install. +Header-only. ## Why this module @@ -10,25 +11,31 @@ in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP, Open3D), scientific computing (Ceres, Stan), and beyond. Many of these projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations -without dropping down to raw CUDA library APIs. Third-party projects like -[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and -[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill -this gap, and downstream projects like -[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and -[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for -GPU-accelerated solvers through Eigen. +without dropping down to raw CUDA library APIs. -The `unsupported/Eigen/GPU` module aims to close this gap: Existing Eigen users should be -able to move performance-critical dense linear algebra to the GPU with minimal -code changes and without learning CUDA library APIs directly. +GPU sparse solvers are a particularly acute gap. Sparse factorization is the +bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization -- +exactly the workloads where GPU acceleration matters most. Downstream projects +like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and +[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for +GPU-accelerated sparse solvers, and third-party projects like +[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because +Eigen lacks them. The `unsupported/Eigen/GPU` module provides GPU sparse Cholesky, LDL^T, +and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix +products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE). + +Existing Eigen users should be able to move performance-critical dense or +sparse linear algebra to the GPU with minimal code changes and without +learning CUDA library APIs directly. ## Design philosophy **CPU and GPU coexist.** There is no global compile-time switch that replaces CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers -explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>` -- and both coexist in -the same binary. This also lets users keep the factored matrix on device across -multiple solves, something impossible with compile-time replacement. +explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>`, `gpu::SparseLLT<double>` vs +`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary. +This also lets users keep the factored matrix on device across multiple solves, +something impossible with compile-time replacement. **Familiar syntax.** GPU operations use the same expression patterns as CPU Eigen. 
Here is a side-by-side comparison: @@ -38,6 +45,7 @@ #include <Eigen/Dense> #define EIGEN_USE_GPU #include <unsupported/Eigen/GPU> +// Dense MatrixXd A = ...; auto d_A = gpu::DeviceMatrix<double>::fromHost(A); MatrixXd B = ...; auto d_B = gpu::DeviceMatrix<double>::fromHost(B); @@ -45,11 +53,15 @@ MatrixXd X = A.llt().solve(B); gpu::DeviceMatrix<double> d_X = d_A.llt().solve(d_B); MatrixXd X = d_X.toHost(); + +// Sparse (using SpMat = SparseMatrix<double>) +SimplicialLLT<SpMat> llt(A); gpu::SparseLLT<double> llt(A); +VectorXd x = llt.solve(b); VectorXd x = llt.solve(b); ``` -The GPU version reads like CPU Eigen with explicit upload/download. -`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to -cuSOLVER potrf + potrs. Unsupported expressions are compile errors. +The GPU version reads like CPU Eigen with explicit upload/download for dense +operations, and an almost identical API for sparse solvers. Unsupported +expressions are compile errors. **Standalone module.** `unsupported/Eigen/GPU` does not modify or depend on Eigen's Core expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.). @@ -169,15 +181,17 @@ auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans); // A^T Y = B // QR solve (overdetermined least squares) -gpu::QR<double> qr(A); // host matrix input -MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R +gpu::QR<double> qr; +qr.compute(d_A); // factorize on device (async) +auto d_X = qr.solve(d_B); // Q^H * B via ormqr, then trsm on R +MatrixXd X = d_X.toHost(); -// SVD -gpu::SVD<double> svd(A, ComputeThinU | ComputeThinV); -VectorXd S = svd.singularValues(); -MatrixXd U = svd.matrixU(); -MatrixXd VT = svd.matrixVT(); -MatrixXd X = svd.solve(B); // pseudoinverse solve +// SVD (results downloaded on access) +gpu::SVD<double> svd; +svd.compute(d_A, ComputeThinU | ComputeThinV); +VectorXd S = svd.singularValues(); // downloads to host +MatrixXd U = svd.matrixU(); // downloads to host +MatrixXd VT = svd.matrixVT(); // V^T (matches cuSOLVER) // SVD: device-side views (no D2H transfer; svd must outlive the views) auto d_S = svd.d_singularValues(); // DeviceMatrix view of singular values @@ -185,17 +199,89 @@ auto d_VT = svd.d_matrixVT(); // DeviceMatrix view of V^T // Self-adjoint eigenvalue decomposition -gpu::SelfAdjointEigenSolver<double> es(A); -VectorXd eigenvals = es.eigenvalues(); -MatrixXd eigenvecs = es.eigenvectors(); -auto d_W = es.d_eigenvalues(); // DeviceMatrix view of eigenvalues -auto d_V = es.d_eigenvectors(); // DeviceMatrix view of eigenvectors +gpu::SelfAdjointEigenSolver<double> es; +es.compute(d_A); +VectorXd eigenvals = es.eigenvalues(); // downloads to host +MatrixXd eigenvecs = es.eigenvectors(); // downloads to host +auto d_W = es.d_eigenvalues(); // DeviceMatrix view of eigenvalues +auto d_V = es.d_eigenvectors(); // DeviceMatrix view of eigenvectors ``` The cached API keeps the factored matrix on device, avoiding redundant -host-device transfers and re-factorizations. The `d_*` accessors return -non-owning `DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain -without round-tripping through host memory. +host-device transfers and re-factorizations. All solvers also accept host +matrices directly as a convenience (e.g., `gpu::LLT<double> llt(A)` or +`qr.solve(B)`), which handles upload/download internally. The `d_*` accessors +on `gpu::SVD` and `gpu::SelfAdjointEigenSolver` return non-owning +`DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain without +round-tripping through host memory. 
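+
+To make the chaining concrete, here is a small sketch that keeps an
+SVD-based projection entirely on device, using only the `DeviceMatrix`
+operations documented below (`d_A` and `d_B` are assumed to be prior uploads):
+
+```cpp
+// Sketch: project d_B onto the left singular subspace, no D2H round-trips.
+gpu::SVD<double> svd;
+svd.compute(d_A, ComputeThinU | ComputeThinV);
+auto d_U = svd.d_matrixU();                           // non-owning view; svd must outlive it
+gpu::DeviceMatrix<double> d_C = d_U.adjoint() * d_B;  // cuBLAS GEMM, stays on device
+MatrixXd C = d_C.toHost();                            // single download at the end
+```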
+
+### Sparse direct solvers (cuDSS)
+
+Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
+including `unsupported/Eigen/GPU` and link with `-lcudss`.
+
+```cpp
+SparseMatrix<double> A = ...;  // symmetric positive definite
+VectorXd b = ...;
+
+// Sparse Cholesky -- one-liner
+gpu::SparseLLT<double> llt(A);
+VectorXd x = llt.solve(b);
+
+// Three-phase workflow for repeated solves with the same sparsity pattern
+gpu::SparseLLT<double> llt2;
+llt2.analyzePattern(A);          // symbolic analysis (once)
+llt2.factorize(A);               // numeric factorization
+VectorXd x1 = llt2.solve(b);
+llt2.factorize(A_new_values);    // refactorize (reuses symbolic analysis)
+VectorXd x2 = llt2.solve(b);
+
+// Sparse LDL^T (symmetric indefinite)
+gpu::SparseLDLT<double> ldlt(A);
+VectorXd x3 = ldlt.solve(b);
+
+// Sparse LU (general non-symmetric)
+gpu::SparseLU<double> lu(A);
+VectorXd x4 = lu.solve(b);
+```
+
+### FFT (cuFFT)
+
+```cpp
+gpu::FFT<float> fft;
+VectorXcf x = ...;               // complex input, length n
+VectorXf r = ...;                // real input, length n
+
+// 1D complex-to-complex
+VectorXcf X = fft.fwd(x);        // forward
+VectorXcf y = fft.inv(X);        // inverse (scaled by 1/n)
+
+// 1D real-to-complex / complex-to-real
+VectorXcf R = fft.fwd(r);        // returns n/2+1 complex (half-spectrum)
+VectorXf s = fft.invReal(R, n);  // C2R inverse, caller specifies n
+
+// 2D complex-to-complex
+MatrixXcf A = ...;
+MatrixXcf B = fft.fwd2(A);       // 2D forward
+MatrixXcf C = fft.inv2(B);       // 2D inverse (scaled by 1/(rows*cols))
+
+// Plans are cached and reused across calls with the same size/type.
+```
+
+### Sparse matrix-vector multiply (cuSPARSE)
+
+```cpp
+SparseMatrix<double> A = ...;
+VectorXd x = ...;
+
+gpu::SparseContext<double> ctx;
+VectorXd y = ctx.multiply(A, x);   // y = A * x
+VectorXd z = ctx.multiplyT(A, x);  // z = A^T * x
+ctx.multiply(A, x, y, 2.0, 1.0);   // y = 2*A*x + y
+ctx.multiply(A, x, y, 1.0, 0.0,    // y = A^H * x (conjugate-transpose SpMV)
+             gpu::GpuOp::ConjTrans);
+
+// Multiple RHS (SpMM)
+MatrixXd X = ...;
+MatrixXd Y = ctx.multiplyMat(A, X);                     // Y = A * X
+MatrixXd Z = ctx.multiplyMat(A, X, gpu::GpuOp::Trans);  // Z = A^T * X
+```
 
 ### Precision control
 
@@ -238,8 +324,7 @@
 ```cpp
 gpu::LLT<double> llt(d_A);  // factor stays on device
-auto d_X = llt.solve(d_B);
-auto h_x = d_X.toHostAsync(llt.stream());
+auto h_x = llt.solve(d_B).toHostAsync(llt.stream());
 h_x.get();  // sync: factor + result complete
 // llt may now be destroyed without stalling the device
 ```
@@ -248,7 +333,8 @@
 
 ### Supported scalar types
 
-`float`, `double`, `std::complex<float>`, `std::complex<double>`.
+`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
+noted otherwise).
 
 ### Expression -> library call mapping
 
@@ -270,31 +356,43 @@
 | `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
 | `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
 
-### `DeviceMatrix<Scalar>` API
+### `DeviceMatrix<Scalar>`
 
-| Method | Sync? 
| Description | -|--------|-------|-------------| -| `DeviceMatrix()` | -- | Empty (0x0) | -| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized | -| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix | -| `fromHostAsync(ptr, rows, cols, stream)` | no | Async upload (caller manages lifetime) | -| `adopt(ptr, rows, cols)` | -- | Owning wrapper over a device pointer (caller relinquishes ownership) | -| `view(ptr, rows, cols)` | -- | Non-owning view over a device pointer (does not free on destruction) | -| `toHost(stream)` | yes | Synchronous download | -| `toHostAsync(stream)` | no | Returns `HostTransfer` future | -| `clone(stream)` | no | Device-to-device deep copy | -| `resize(rows, cols)` | -- | Discard contents, reallocate | -| `data()` | -- | Raw device pointer | -| `rows()`, `cols()` | -- | Dimensions | -| `sizeInBytes()` | -- | Total device allocation size in bytes | -| `empty()` | -- | True if 0x0 | -| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) | -| `transpose()` | -- | Transpose view (GEMM Trans) | -| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder | -| `lu()` | -- | LU expression builder | -| `triangularView<UpLo>()` | -- | Triangular view (TRSM) | -| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) | -| `device(ctx)` | -- | Assignment proxy bound to context | +Typed RAII wrapper for a dense column-major matrix in GPU device memory. +Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with +one column. + +```cpp +// Construction +DeviceMatrix<Scalar>() // Empty (0x0) +DeviceMatrix<Scalar>(rows, cols) // Allocate uninitialized + +// Upload / download / pointer adoption +static DeviceMatrix fromHost(matrix, stream=nullptr) // -> DeviceMatrix (syncs) +static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s) // -> DeviceMatrix (no sync, caller manages ptr lifetime) +static DeviceMatrix adopt(Scalar* device_ptr, rows, cols) // Owning wrapper over a raw device pointer +static DeviceMatrix view(Scalar* device_ptr, rows, cols) // Non-owning view (does not free on destruction) +PlainMatrix toHost(stream=nullptr) // -> host Matrix (syncs) +HostTransfer toHostAsync(stream=nullptr) // -> HostTransfer future (no sync) +DeviceMatrix clone(stream=nullptr) // -> DeviceMatrix (D2D copy, async) + +// Dimensions and access +Index rows() +Index cols() +size_t sizeInBytes() +bool empty() +Scalar* data() // Raw device pointer +void resize(Index rows, Index cols) // Discard contents, reallocate + +// Expression builders (return lightweight views, evaluated on assignment) +AdjointView adjoint() // GEMM with ConjTrans +TransposeView transpose() // GEMM with Trans +LltExpr llt() / llt<UpLo>() // -> .solve(d_B) -> DeviceMatrix +LuExpr lu() // -> .solve(d_B) -> DeviceMatrix +TriangularView triangularView<UpLo>() // -> .solve(d_B) -> DeviceMatrix (TRSM) +SelfAdjointView selfadjointView<UpLo>() // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK) +Assignment device(gpu::Context& ctx) // Bind assignment to explicit stream +``` ### `gpu::Context` @@ -311,103 +409,205 @@ Non-copyable, non-movable (owns library handles). -### `gpu::LLT<Scalar, UpLo>` API +### `gpu::LLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER) -GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device. +Caches the Cholesky factor on device for repeated solves. -| Method | Sync? 
| Description | -|--------|-------|-------------| -| `gpu::LLT(A)` | deferred | Construct and factorize from host matrix | -| `compute(host_matrix)` | deferred | Upload and factorize | -| `compute(DeviceMatrix)` | deferred | D2D copy and factorize | -| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) | -| `solve(host_matrix)` | yes | Solve, return host matrix | -| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) | -| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` | +```cpp +gpu::LLT() // Default construct, then call compute() +gpu::LLT(const EigenBase<D>& A) // Convenience: upload + factorize -### `gpu::LU<Scalar>` API +gpu::LLT& compute(const EigenBase<D>& A) // Upload + factorize +gpu::LLT& compute(const DeviceMatrix& d_A) // D2D copy + factorize +gpu::LLT& compute(DeviceMatrix&& d_A) // Adopt + factorize (no copy) -GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a -`gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`). +PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs) +DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async, stays on device) -### `gpu::QR<Scalar>` API +ComputationInfo info() // Lazy sync on first call: Success or NumericalIssue +Index rows() / cols() +cudaStream_t stream() +``` -GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply -Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly. +### `gpu::LU<Scalar>` -- Dense LU (cuSOLVER) -| Method | Description | -|--------|-------------| -| `gpu::QR()` | Default construct, then call `compute()` | -| `gpu::QR(A)` | Construct and factorize from host matrix | -| `compute(A)` | Upload + factorize | -| `compute(DeviceMatrix)` | D2D copy + factorize | -| `solve(host_matrix)` | Solve, return host matrix (syncs) | -| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) | -| `matrixR()` | Download upper-triangular factor R to host (m >= n only) | -| `info()` | Lazy sync | -| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | +Same pattern as `gpu::LLT`. Adds a `gpu::GpuOp` parameter on `solve()`. -### `gpu::SVD<Scalar>` API +```cpp +PlainMatrix solve(const MatrixBase<D>& B, GpuOp op = GpuOp::NoTrans) // -> host Matrix +DeviceMatrix solve(const DeviceMatrix& d_B, GpuOp op = GpuOp::NoTrans) // -> DeviceMatrix +``` -GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only -modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`, -or `0` (values only). +`gpu::GpuOp`: `NoTrans`, `Trans`, `ConjTrans`. 
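+
+For example, one factorization can serve both the original and the adjoint
+system (a short sketch grounded in the signatures above):
+
+```cpp
+MatrixXd A = ...;
+MatrixXd B = ...;
+gpu::LU<double> lu(A);                             // upload + factorize once
+MatrixXd X1 = lu.solve(B);                         // solves A   X1 = B
+MatrixXd X2 = lu.solve(B, gpu::GpuOp::ConjTrans);  // solves A^H X2 = B, same factor
+```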
-| Method | Description | -|--------|-------------| -| `gpu::SVD()` | Default construct, then call `compute()` | -| `gpu::SVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) | -| `compute(A, options)` | Compute from host matrix | -| `compute(DeviceMatrix, options)` | Compute from device matrix | -| `singularValues()` | Download singular values to host | -| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) | -| `matrixV()` | Download V to host (matches `JacobiSVD::matrixV()`) | -| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) | -| `d_singularValues()` | `DeviceMatrix` view of singular values (zero-copy) | -| `d_matrixU()` | `DeviceMatrix` view of U (zero-copy when m >= n) | -| `d_matrixVT()` | `DeviceMatrix` view of V^T (zero-copy when m >= n) | -| `solve(B)` | Pseudoinverse solve (returns host matrix) | -| `solve(B, k)` | Truncated solve (top k singular triplets) | -| `solve(B, lambda)` | Tikhonov regularized solve | -| `rank(threshold)` | Count singular values above threshold | -| `info()` | Lazy sync | -| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | +### `gpu::QR<Scalar>` -- Dense QR (cuSOLVER) -Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`; -in that case `d_matrixU()` and `d_matrixVT()` allocate an owning result via a -single `cublasXgeam` adjoint pass. `d_singularValues()` is always zero-copy. +QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM +(back-substitute on R) -- Q is never formed explicitly. -### `gpu::SelfAdjointEigenSolver<Scalar>` API +```cpp +gpu::QR() // Default construct +gpu::QR(const EigenBase<D>& A) // Convenience: upload + factorize -GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`). +gpu::QR& compute(const EigenBase<D>& A) // Upload + factorize +gpu::QR& compute(const DeviceMatrix& d_A) // D2D copy + factorize -| Method | Description | -|--------|-------------| -| `gpu::SelfAdjointEigenSolver()` | Default construct, then call `compute()` | -| `gpu::SelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) | -| `compute(A, mode)` | Compute from host matrix | -| `compute(DeviceMatrix, mode)` | Compute from device matrix | -| `eigenvalues()` | Download eigenvalues to host (ascending order) | -| `eigenvectors()` | Download eigenvectors to host (columns) | -| `d_eigenvalues()` | `DeviceMatrix` view of eigenvalues (zero-copy) | -| `d_eigenvectors()` | `DeviceMatrix` view of eigenvectors (zero-copy, requires `ComputeEigenvectors`) | -| `info()` | Lazy sync | -| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | +PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs) +DeviceMatrix solve(const DeviceMatrix& d_B) // -> DeviceMatrix (async) +PlainMatrix matrixR() // -> host Matrix (m >= n only) -`ComputeMode`: `gpu::SelfAdjointEigenSolver::EigenvaluesOnly` or -`gpu::SelfAdjointEigenSolver::ComputeEigenvectors`. +ComputationInfo info() // Lazy sync +Index rows() / cols() +cudaStream_t stream() +``` -The `d_*` accessors return non-owning views into the solver's internal device -buffers; the solver must outlive any view derived from it. +### `gpu::SVD<Scalar>` -- Dense SVD (cuSOLVER) -### `HostTransfer<Scalar>` API +SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`, +`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n) +handled by internal transpose. 
-Future for async device-to-host transfer. +```cpp +gpu::SVD() // Default construct, then call compute() +gpu::SVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) // Convenience -| Method | Description | -|--------|-------------| -| `get()` | Block until transfer completes, return host matrix reference. Idempotent. | -| `ready()` | Non-blocking poll | +gpu::SVD& compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV) +gpu::SVD& compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV) + +RealVector singularValues() // -> host vector (syncs, downloads) +PlainMatrix matrixU() // -> host Matrix (syncs, downloads) +PlainMatrix matrixVT() // -> host Matrix (syncs, downloads V^T) + +DeviceMatrix d_singularValues() // -> DeviceMatrix view (zero-copy) +DeviceMatrix d_matrixU() // -> DeviceMatrix view (zero-copy when m >= n) +DeviceMatrix d_matrixVT() // -> DeviceMatrix view (zero-copy when m >= n) + +PlainMatrix solve(const MatrixBase<D>& B) // -> host Matrix (pseudoinverse) +PlainMatrix solve(const MatrixBase<D>& B, Index k) // Truncated (top k triplets) +PlainMatrix solve(const MatrixBase<D>& B, RealScalar l) // Tikhonov regularized + +Index rank(RealScalar threshold = -1) +ComputationInfo info() // Lazy sync +Index rows() / cols() +cudaStream_t stream() +``` + +**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host +on each call. The `d_*` accessors return non-owning `DeviceMatrix` views into +the solver's internal buffers; the `gpu::SVD` object must outlive any view +derived from it. For wide matrices (m < n) the U/V^T views are owning (one +`cublasXgeam` adjoint pass). + +### `gpu::SelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER) + +Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`. +`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`. + +```cpp +gpu::SelfAdjointEigenSolver() // Default construct, then call compute() +gpu::SelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) // Convenience + +gpu::SelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors) +gpu::SelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors) + +RealVector eigenvalues() // -> host vector (syncs, downloads, ascending order) +PlainMatrix eigenvectors() // -> host Matrix (syncs, downloads, columns) + +DeviceMatrix d_eigenvalues() // -> DeviceMatrix view (zero-copy) +DeviceMatrix d_eigenvectors() // -> DeviceMatrix view (zero-copy, requires ComputeEigenvectors) + +ComputationInfo info() // Lazy sync +Index rows() / cols() +cudaStream_t stream() +``` + +**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call. +The `d_*` accessors return non-owning `DeviceMatrix` views into the solver's +internal buffers; the `gpu::SelfAdjointEigenSolver` object must outlive any +view derived from it. + +### `HostTransfer<Scalar>` + +Future for async device-to-host transfer. Returned by +`DeviceMatrix::toHostAsync()`. + +```cpp +PlainMatrix& get() // Block until complete, return host Matrix ref. Idempotent. +bool ready() // Non-blocking poll +``` + +### `gpu::SparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS) + +Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow +with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC). +Matrix dimensions and nonzero count must fit in `int` (cuDSS limitation; +debug builds assert). 
+ +```cpp +gpu::SparseLLT() // Default construct +gpu::SparseLLT(const SparseMatrixBase<D>& A) // Analyze + factorize + +gpu::SparseLLT& analyzePattern(const SparseMatrixBase<D>& A) // Symbolic analysis (reusable) +gpu::SparseLLT& factorize(const SparseMatrixBase<D>& A) // Numeric factorization +gpu::SparseLLT& compute(const SparseMatrixBase<D>& A) // analyzePattern + factorize +void setOrdering(GpuSparseOrdering ord) // AMD (default), METIS, or RCM + +DenseMatrix solve(const MatrixBase<D>& B) // -> host Matrix (syncs) + +ComputationInfo info() // Lazy sync +Index rows() / cols() +cudaStream_t stream() +``` + +### `gpu::SparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS) + +Symmetric indefinite. Same API as `gpu::SparseLLT`. + +### `gpu::SparseLU<Scalar>` -- Sparse LU (cuDSS) + +General non-symmetric. Same API as `gpu::SparseLLT` (without `UpLo`). + +### `gpu::FFT<Scalar>` -- FFT (cuFFT) + +Plans cached by (size, type) and reused. Inverse transforms scaled so +`inv(fwd(x)) == x`. Supported scalars: `float`, `double`. + +```cpp +// 1D transforms (host vectors in and out) +ComplexVector fwd(const MatrixBase<D>& x) // C2C forward (complex input) +ComplexVector fwd(const MatrixBase<D>& x) // R2C forward (real input, returns n/2+1) +ComplexVector inv(const MatrixBase<D>& X) // C2C inverse, scaled by 1/n +RealVector invReal(const MatrixBase<D>& X, Index n) // C2R inverse, scaled by 1/n + +// 2D transforms (host matrices in and out) +ComplexMatrix fwd2(const MatrixBase<D>& A) // 2D C2C forward +ComplexMatrix inv2(const MatrixBase<D>& A) // 2D C2C inverse, scaled by 1/(rows*cols) + +cudaStream_t stream() +``` + +All FFT methods accept host data and return host data. Upload/download is +handled internally. The C2C and R2C overloads of `fwd()` are distinguished by +the input scalar type (complex vs real). + +### `gpu::SparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE) + +Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and +return host data. Matrix dimensions and nonzero count must fit in `int` +(cuSPARSE limitation; debug builds assert). 
+ +```cpp +gpu::SparseContext() // Creates own stream + cuSPARSE handle + +DenseVector multiply(A, x) // y = A * x +void multiply(A, x, y, alpha=1, beta=0, // y = alpha*op(A)*x + beta*y + op=GpuOp::NoTrans) +DenseVector multiplyT(A, x) // y = A^T * x +DenseVector multiplyAdjoint(A, x) // y = A^H * x +DenseMatrix multiplyMat(A, X, op=GpuOp::NoTrans) // Y = op(A) * X (SpMM) + +cudaStream_t stream() +``` ### Aliasing @@ -434,7 +634,7 @@ | File | Depends on | Contents | |------|-----------|----------| -| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `PinnedHostBuffer`, `cuda_data_type<>` | +| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` | | `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` | | `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers | | `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers | @@ -449,18 +649,46 @@ | `GpuQR.h` | `GpuSolverContext.h` | Dense QR decomposition | | `GpuSVD.h` | `GpuSolverContext.h` | Dense SVD decomposition | | `GpuEigenSolver.h` | `GpuSolverContext.h` | Self-adjoint eigenvalue decomposition | +| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers | +| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching | +| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro | +| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE | +| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) | +| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) | +| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) | +| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) | +| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) | ## Building and testing ```bash cmake -G Ninja -B build -S . \ -DEIGEN_TEST_CUDA=ON \ - -DEIGEN_CUDA_COMPUTE_ARCH="70" + -DEIGEN_CUDA_COMPUTE_ARCH="70" \ + -DEIGEN_TEST_CUBLAS=ON \ + -DEIGEN_TEST_CUSOLVER=ON cmake --build build --target cublas cusolver_llt cusolver_lu \ - cusolver_qr cusolver_svd cusolver_eigen device_matrix + cusolver_qr cusolver_svd cusolver_eigen \ + device_matrix cufft cusparse_spmv ctest --test-dir build -L gpu --output-on-failure + +# Sparse solvers (cuDSS -- separate install required) +cmake -G Ninja -B build -S . \ + -DEIGEN_TEST_CUDA=ON \ + -DEIGEN_CUDA_COMPUTE_ARCH="70" \ + -DEIGEN_TEST_CUDSS=ON + +cmake --build build --target cudss_llt cudss_ldlt cudss_lu +ctest --test-dir build -R '^cudss_' --output-on-failure ``` -`EIGEN_TEST_CUBLAS` and `EIGEN_TEST_CUSOLVER` default to ON when CUDA is enabled -(cuBLAS and cuSOLVER are part of the CUDA toolkit). +## Future enhancements + +- **Device-resident sparse matrix-vector products.** `gpu::SparseContext` + currently operates on host vectors and matrices, uploading and downloading + on each call. The key missing piece is a `DeviceSparseView` that holds a + sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`) + with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on + device. This is essential for iterative solvers and any workflow that chains + sparse and dense operations without returning to the host.
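+
+  As a strawman, the envisioned usage could look like the sketch below.
+  Nothing in it exists yet; `DeviceSparseView` and all of its members are
+  speculative names.
+
+  ```cpp
+  // Speculative future API -- none of this exists today.
+  gpu::SparseContext<double> ctx;
+  auto d_A = gpu::DeviceSparseView<double>::fromHost(A, ctx.stream());  // upload once
+  auto d_x = gpu::DeviceMatrix<double>::fromHost(x);
+  gpu::DeviceMatrix<double> d_y = d_A * d_x;  // SpMV, result stays on device
+  d_y = d_A * d_y;                            // chain without host round-trips
+  VectorXd y = d_y.toHost();                  // single download
+  ```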
diff --git a/unsupported/benchmarks/GPU/CMakeLists.txt b/unsupported/benchmarks/GPU/CMakeLists.txt index 5447154..1f42082 100644 --- a/unsupported/benchmarks/GPU/CMakeLists.txt +++ b/unsupported/benchmarks/GPU/CMakeLists.txt
@@ -54,3 +54,7 @@ # Batching benchmarks: multi-stream concurrency for many small systems. eigen_add_gpu_benchmark(bench_batching bench_batching.cpp) eigen_add_gpu_benchmark(bench_batching_float bench_batching.cpp DEFINITIONS SCALAR=float) + +# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse. +eigen_add_gpu_benchmark(bench_fft bench_fft.cpp LIBRARIES CUDA::cufft) +eigen_add_gpu_benchmark(bench_fft_double bench_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
diff --git a/unsupported/benchmarks/GPU/bench_batching.cpp b/unsupported/benchmarks/GPU/bench_batching.cpp index 67c3e3b..db61cbc 100644 --- a/unsupported/benchmarks/GPU/bench_batching.cpp +++ b/unsupported/benchmarks/GPU/bench_batching.cpp
@@ -42,8 +42,8 @@ static bool done = false; if (!done) { void* p; - cudaMalloc(&p, 1); - cudaFree(p); + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1)); + EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p)); done = true; } }
diff --git a/unsupported/benchmarks/GPU/bench_chaining.cpp b/unsupported/benchmarks/GPU/bench_chaining.cpp index 60a4f2f..4a09d8a 100644 --- a/unsupported/benchmarks/GPU/bench_chaining.cpp +++ b/unsupported/benchmarks/GPU/bench_chaining.cpp
@@ -35,8 +35,8 @@ static bool done = false; if (!done) { void* p; - cudaMalloc(&p, 1); - cudaFree(p); + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1)); + EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p)); done = true; } } @@ -159,7 +159,7 @@ for (int i = 1; i < chain_len; ++i) { d_X = llt.solve(d_X); } - cudaStreamSynchronize(llt.stream()); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(llt.stream())); benchmark::DoNotOptimize(d_X.data()); }
diff --git a/unsupported/benchmarks/GPU/bench_fft.cpp b/unsupported/benchmarks/GPU/bench_fft.cpp new file mode 100644 index 0000000..c8d8a37 --- /dev/null +++ b/unsupported/benchmarks/GPU/bench_fft.cpp
@@ -0,0 +1,188 @@ +// GPU FFT benchmarks: GpuFFT 1D and 2D throughput. +// +// Measures forward and inverse FFT performance across a range of sizes, +// including plan-amortized (reuse) and cold-start (new plan) scenarios. +// +// Usage: +// cmake --build build-bench-gpu --target bench_fft bench_fft_double +// ./build-bench-gpu/bench_fft +// ./build-bench-gpu/bench_fft_double +// +// Profiling: +// nsys profile --trace=cuda ./build-bench-gpu/bench_fft +// SPDX-FileCopyrightText: The Eigen Authors +// SPDX-License-Identifier: MPL-2.0 + +#include <benchmark/benchmark.h> + +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +#ifndef SCALAR +#define SCALAR float +#endif + +using Scalar = SCALAR; +using Complex = std::complex<Scalar>; +using CVec = Matrix<Complex, Dynamic, 1>; +using RVec = Matrix<Scalar, Dynamic, 1>; +using CMat = Matrix<Complex, Dynamic, Dynamic>; + +// CUDA warm-up: ensure the GPU is initialized before timing. +static void cuda_warmup() { + static bool done = false; + if (!done) { + void* p; + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1)); + EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p)); + done = true; + } +} + +// -------------------------------------------------------------------------- +// 1D C2C Forward +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + CVec x = CVec::Random(n); + gpu::FFT<Scalar> fft; + + // Warm up plan. + CVec tmp = fft.fwd(x); + + for (auto _ : state) { + benchmark::DoNotOptimize(fft.fwd(x)); + } + state.SetItemsProcessed(state.iterations() * n); + state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); // read + write +} + +BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22); + +// -------------------------------------------------------------------------- +// 1D C2C Inverse +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + CVec x = CVec::Random(n); + gpu::FFT<Scalar> fft; + CVec X = fft.fwd(x); + + for (auto _ : state) { + benchmark::DoNotOptimize(fft.inv(X)); + } + state.SetItemsProcessed(state.iterations() * n); + state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2); +} + +BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22); + +// -------------------------------------------------------------------------- +// 1D R2C Forward +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + RVec r = RVec::Random(n); + gpu::FFT<Scalar> fft; + + // Warm up plan. 
+ CVec tmp = fft.fwd(r); + + for (auto _ : state) { + benchmark::DoNotOptimize(fft.fwd(r)); + } + state.SetItemsProcessed(state.iterations() * n); + state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex))); +} + +BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22); + +// -------------------------------------------------------------------------- +// 1D C2R Inverse +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + RVec r = RVec::Random(n); + gpu::FFT<Scalar> fft; + CVec R = fft.fwd(r); + + for (auto _ : state) { + benchmark::DoNotOptimize(fft.invReal(R, n)); + } + state.SetItemsProcessed(state.iterations() * n); + state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar))); +} + +BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22); + +// -------------------------------------------------------------------------- +// 2D C2C Forward +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); // square n x n + CMat A = CMat::Random(n, n); + gpu::FFT<Scalar> fft; + + // Warm up plan. + CMat tmp = fft.fwd2(A); + + for (auto _ : state) { + benchmark::DoNotOptimize(fft.fwd2(A)); + } + state.SetItemsProcessed(state.iterations() * n * n); + state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2); +} + +BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096); + +// -------------------------------------------------------------------------- +// 2D C2C Roundtrip (fwd + inv) +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + CMat A = CMat::Random(n, n); + gpu::FFT<Scalar> fft; + + // Warm up plans. + CMat tmp = fft.inv2(fft.fwd2(A)); + + for (auto _ : state) { + CMat B = fft.fwd2(A); + benchmark::DoNotOptimize(fft.inv2(B)); + } + state.SetItemsProcessed(state.iterations() * n * n * 2); // fwd + inv + state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4); +} + +BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096); + +// -------------------------------------------------------------------------- +// 1D Cold start (includes plan creation) +// -------------------------------------------------------------------------- + +static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) { + cuda_warmup(); + const Index n = state.range(0); + CVec x = CVec::Random(n); + + for (auto _ : state) { + gpu::FFT<Scalar> fft; // new object = new plans + benchmark::DoNotOptimize(fft.fwd(x)); + } + state.SetItemsProcessed(state.iterations() * n); +} + +BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
diff --git a/unsupported/benchmarks/GPU/bench_solvers.cpp b/unsupported/benchmarks/GPU/bench_solvers.cpp index 7e0952e..17490f1 100644 --- a/unsupported/benchmarks/GPU/bench_solvers.cpp +++ b/unsupported/benchmarks/GPU/bench_solvers.cpp
@@ -40,8 +40,8 @@ static bool done = false; if (!done) { void* p; - cudaMalloc(&p, 1); - cudaFree(p); + EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1)); + EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p)); done = true; } } @@ -157,7 +157,7 @@ for (auto _ : state) { gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B); // Force completion without D2H transfer. - cudaStreamSynchronize(llt.stream()); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(llt.stream())); benchmark::DoNotOptimize(d_X.data()); }
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt index 706efef..f0b07c3 100644 --- a/unsupported/test/GPU/CMakeLists.txt +++ b/unsupported/test/GPU/CMakeLists.txt
@@ -73,3 +73,34 @@ ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas) endforeach() endif() + +# cuFFT (part of the CUDA toolkit, no separate option needed). +if(TARGET CUDA::cufft) + ei_add_gpu_test(cufft EXTRA_LIBS CUDA::cufft CUDA::cublas) +endif() + +# cuSPARSE SpMV (part of the CUDA toolkit). +if(TARGET CUDA::cusparse) + ei_add_gpu_test(cusparse_spmv + EXTRA_LIBS CUDA::cusparse CUDA::cublas CUDA::cusolver) +endif() + +# cuDSS sparse direct solvers -- distributed separately from the CUDA Toolkit. +option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF) +if(EIGEN_TEST_CUDSS) + find_path(CUDSS_INCLUDE_DIR cudss.h + HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include) + find_library(CUDSS_LIBRARY cudss + HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu) + if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY) + message(STATUS "cuDSS found: ${CUDSS_LIBRARY}") + foreach(_cudss_test IN ITEMS cudss_llt cudss_ldlt cudss_lu) + ei_add_gpu_test(${_cudss_test} + EXTRA_LIBS CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY} + EXTRA_INCLUDES ${CUDSS_INCLUDE_DIR} + EXTRA_DEFINES EIGEN_CUDSS=1) + endforeach() + else() + message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.") + endif() +endif()
diff --git a/unsupported/test/GPU/cudss_ldlt.cpp b/unsupported/test/GPU/cudss_ldlt.cpp new file mode 100644 index 0000000..c979065 --- /dev/null +++ b/unsupported/test/GPU/cudss_ldlt.cpp
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // For complex Scalar, off-diagonals carry nonzero imaginary parts so the
+  // Hermitian path is genuinely exercised. The diagonal of R is kept real
+  // for clarity, although (R + R^H)(i,i) = 2*Re(R_ii) would be real anyway.
+  // The diagonal terms added below have mixed signs for indefiniteness.
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = (i == j) ? RealScalar(0) : RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  R.makeCompressed();
+
+  // A = R + R^H (Hermitian; plain symmetric for real Scalar), then add a
+  // diagonal with alternating signs to make the matrix indefinite.
+  SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
+  for (Index i = 0; i < n; ++i) {
+    Scalar diag_val = Scalar((i % 2 == 0) ? 
n : -n); + A.coeffRef(i, i) += diag_val; + } + A.makeCompressed(); + return A; +} + +// ---- Solve and check residual ----------------------------------------------- + +template <typename Scalar> +void test_solve(Index n) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Vec = Matrix<Scalar, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_symmetric_indefinite<Scalar>(n); + Vec b = Vec::Random(n); + + gpu::SparseLDLT<Scalar> ldlt(A); + VERIFY_IS_EQUAL(ldlt.info(), Success); + + Vec x = ldlt.solve(b); + VERIFY_IS_EQUAL(x.rows(), n); + + Vec r = A * x - b; + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY(r.norm() / b.norm() < tol); +} + +// ---- Multiple RHS ----------------------------------------------------------- + +template <typename Scalar> +void test_multiple_rhs(Index n, Index nrhs) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_symmetric_indefinite<Scalar>(n); + Mat B = Mat::Random(n, nrhs); + + gpu::SparseLDLT<Scalar> ldlt(A); + VERIFY_IS_EQUAL(ldlt.info(), Success); + + Mat X = ldlt.solve(B); + VERIFY_IS_EQUAL(X.rows(), n); + VERIFY_IS_EQUAL(X.cols(), nrhs); + + Mat R = A * X - B; + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY(R.norm() / B.norm() < tol); +} + +// ---- Refactorize ------------------------------------------------------------ + +template <typename Scalar> +void test_refactorize(Index n) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Vec = Matrix<Scalar, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_symmetric_indefinite<Scalar>(n); + Vec b = Vec::Random(n); + + gpu::SparseLDLT<Scalar> ldlt; + ldlt.analyzePattern(A); + VERIFY_IS_EQUAL(ldlt.info(), Success); + + ldlt.factorize(A); + VERIFY_IS_EQUAL(ldlt.info(), Success); + Vec x1 = ldlt.solve(b); + + // Modify values, keep pattern. + SpMat A2 = A; + for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2)); + + ldlt.factorize(A2); + VERIFY_IS_EQUAL(ldlt.info(), Success); + Vec x2 = ldlt.solve(b); + + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((A * x1 - b).norm() / b.norm() < tol); + VERIFY((A2 * x2 - b).norm() / b.norm() < tol); + // Diagonal scaled 2x; x1 and x2 must differ by a substantial fraction. + VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm()); +} + +// ---- Empty ------------------------------------------------------------------ + +template <typename Scalar> +void test_empty() { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + SpMat A(0, 0); + A.makeCompressed(); + gpu::SparseLDLT<Scalar> ldlt(A); + VERIFY_IS_EQUAL(ldlt.info(), Success); + VERIFY_IS_EQUAL(ldlt.rows(), 0); + VERIFY_IS_EQUAL(ldlt.cols(), 0); +} + +// ---- Per-scalar driver ------------------------------------------------------ + +template <typename Scalar> +void test_scalar() { + CALL_SUBTEST(test_solve<Scalar>(64)); + CALL_SUBTEST(test_solve<Scalar>(256)); + CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4)); + CALL_SUBTEST(test_refactorize<Scalar>(64)); +} + +EIGEN_DECLARE_TEST(gpu_cudss_ldlt) { + gpu_test::require_cudss_context(); + // Split by scalar so each part compiles in parallel. 
+ CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_empty<float>()); + CALL_SUBTEST_5(test_empty<double>()); + CALL_SUBTEST_5(test_empty<std::complex<float>>()); + CALL_SUBTEST_5(test_empty<std::complex<double>>()); +}
diff --git a/unsupported/test/GPU/cudss_llt.cpp b/unsupported/test/GPU/cudss_llt.cpp new file mode 100644 index 0000000..b01bcca --- /dev/null +++ b/unsupported/test/GPU/cudss_llt.cpp
@@ -0,0 +1,216 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS. + +#define EIGEN_USE_GPU +#include "main.h" +#include <Eigen/Sparse> +#include <unsupported/Eigen/GPU> +#include "gpu_test_helpers.h" + +using namespace Eigen; + +// ---- Helper: build a random sparse SPD matrix ------------------------------- + +template <typename Scalar> +SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using RealScalar = typename NumTraits<Scalar>::Real; + + // Off-diagonal entries carry nonzero imaginary parts for complex Scalar so + // the cuDSS HPD path (CUDSS_MTYPE_HPD) is genuinely exercised; A = R^H * R + // is Hermitian regardless of R, and the +nI shift keeps the diagonal real. + SpMat R(n, n); + R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1)); + for (Index j = 0; j < n; ++j) { + for (Index i = 0; i < n; ++i) { + if (i == j || (std::rand() / double(RAND_MAX)) < density) { + const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5); + const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5); + R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im); + } + } + } + R.makeCompressed(); + + // A = R^H * R + n * I (guaranteed SPD). + SpMat A = R.adjoint() * R; + for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n)); + A.makeCompressed(); + return A; +} + +// ---- Solve and check residual ----------------------------------------------- + +template <typename Scalar> +void test_solve(Index n) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Vec = Matrix<Scalar, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_spd<Scalar>(n); + Vec b = Vec::Random(n); + + gpu::SparseLLT<Scalar> llt(A); + VERIFY_IS_EQUAL(llt.info(), Success); + + Vec x = llt.solve(b); + VERIFY_IS_EQUAL(x.rows(), n); + + // Check residual: ||Ax - b|| / ||b||. 
+ Vec r = A * x - b; + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY(r.norm() / b.norm() < tol); +} + +// ---- Compare with CPU SimplicialLLT ----------------------------------------- + +template <typename Scalar> +void test_vs_cpu(Index n) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Vec = Matrix<Scalar, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_spd<Scalar>(n); + Vec b = Vec::Random(n); + + gpu::SparseLLT<Scalar> gpu_llt(A); + VERIFY_IS_EQUAL(gpu_llt.info(), Success); + Vec x_gpu = gpu_llt.solve(b); + + SimplicialLLT<SpMat> cpu_llt(A); + VERIFY_IS_EQUAL(cpu_llt.info(), Success); + Vec x_cpu = cpu_llt.solve(b); + + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol); +} + +// ---- Multiple RHS ----------------------------------------------------------- + +template <typename Scalar> +void test_multiple_rhs(Index n, Index nrhs) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_spd<Scalar>(n); + Mat B = Mat::Random(n, nrhs); + + gpu::SparseLLT<Scalar> llt(A); + VERIFY_IS_EQUAL(llt.info(), Success); + + Mat X = llt.solve(B); + VERIFY_IS_EQUAL(X.rows(), n); + VERIFY_IS_EQUAL(X.cols(), nrhs); + + Mat R = A * X - B; + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY(R.norm() / B.norm() < tol); +} + +// ---- Separate analyze + factorize (refactorization) ------------------------- + +template <typename Scalar> +void test_refactorize(Index n) { + using SpMat = SparseMatrix<Scalar, ColMajor, int>; + using Vec = Matrix<Scalar, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + SpMat A = make_spd<Scalar>(n); + Vec b = Vec::Random(n); + + gpu::SparseLLT<Scalar> llt; + llt.analyzePattern(A); + VERIFY_IS_EQUAL(llt.info(), Success); + + // First factorize + solve. + llt.factorize(A); + VERIFY_IS_EQUAL(llt.info(), Success); + Vec x1 = llt.solve(b); + + // Modify values (keep same pattern): scale diagonal. + SpMat A2 = A; + for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2)); + + // Refactorize with same pattern. + llt.factorize(A2); + VERIFY_IS_EQUAL(llt.info(), Success); + Vec x2 = llt.solve(b); + + // Both solutions should satisfy their respective systems. + RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((A * x1 - b).norm() / b.norm() < tol); + VERIFY((A2 * x2 - b).norm() / b.norm() < tol); + + // Solutions should differ meaningfully: diagonal was scaled 2x, so x1 vs x2 + // should differ by a substantial fraction of x1's magnitude. A near-epsilon + // bound here would pass even if refactorize silently reused the stale factor. 
+  VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm());
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  gpu::SparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  VERIFY_IS_EQUAL(llt.rows(), 0);
+  VERIFY_IS_EQUAL(llt.cols(), 0);
+}
+
+// ---- Upper triangle ---------------------------------------------------------
+
+template <typename Scalar>
+void test_upper(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLLT<Scalar, Upper> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Vec x = llt.solve(b);
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+  CALL_SUBTEST(test_upper<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_llt) {
+  gpu_test::require_cudss_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_empty<float>());
+  CALL_SUBTEST_5(test_empty<double>());
+  CALL_SUBTEST_5(test_empty<std::complex<float>>());
+  CALL_SUBTEST_5(test_empty<std::complex<double>>());
+}
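+
+// Usage sketch (illustrative, not executed by the harness): the one-shot
+// solve pattern exercised by the tests above. The function name is
+// hypothetical; the gpu::SparseLLT calls are exactly those tested.
+template <typename Scalar>
+Matrix<Scalar, Dynamic, 1> example_llt_solve(const SparseMatrix<Scalar, ColMajor, int>& A,
+                                             const Matrix<Scalar, Dynamic, 1>& b) {
+  gpu::SparseLLT<Scalar> llt(A);        // construct-and-factorize, as in test_solve
+  eigen_assert(llt.info() == Success);  // anything else means factorization failed
+  return llt.solve(b);                  // multi-column right-hand sides work too
+}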
diff --git a/unsupported/test/GPU/cudss_lu.cpp b/unsupported/test/GPU/cudss_lu.cpp
new file mode 100644
index 0000000..d88900e
--- /dev/null
+++ b/unsupported/test/GPU/cudss_lu.cpp
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse non-singular general matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  // Shift the diagonal by n: all entries lie in (-0.5, 0.5), so the row sum of
+  // off-diagonal magnitudes is below (n-1)/2 while the diagonal exceeds n-1/2.
+  // R is therefore strictly diagonally dominant, hence non-singular.
+  for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
+  R.makeCompressed();
+  return R;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Vec x = lu.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Mat X = lu.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Refactorize ------------------------------------------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLU<Scalar> lu;
+  lu.analyzePattern(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  lu.factorize(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x1 = lu.solve(b);
+
+  // Modify values, keep pattern.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  lu.factorize(A2);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x2 = lu.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+  // Diagonal scaled 2x; x1 and x2 must differ by a substantial fraction.
+  VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm());
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  VERIFY_IS_EQUAL(lu.rows(), 0);
+  VERIFY_IS_EQUAL(lu.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_lu) {
+  gpu_test::require_cudss_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_empty<float>());
+  CALL_SUBTEST_5(test_empty<double>());
+  CALL_SUBTEST_5(test_empty<std::complex<float>>());
+  CALL_SUBTEST_5(test_empty<std::complex<double>>());
+}
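+
+// Usage sketch (illustrative, not executed by the harness): the
+// analyzePattern/factorize split pays off when solving a sequence of systems
+// whose sparsity pattern is fixed but whose values change, as in
+// test_refactorize above. Names are hypothetical; the API calls are those
+// tested.
+template <typename Scalar>
+Matrix<Scalar, Dynamic, 1> example_lu_refactor(const SparseMatrix<Scalar, ColMajor, int>& A_old,
+                                               const SparseMatrix<Scalar, ColMajor, int>& A_new,
+                                               const Matrix<Scalar, Dynamic, 1>& b) {
+  gpu::SparseLU<Scalar> lu;
+  lu.analyzePattern(A_old);                    // symbolic step: depends only on the pattern
+  lu.factorize(A_old);                         // numeric step: first set of values
+  Matrix<Scalar, Dynamic, 1> x = lu.solve(b);
+  lu.factorize(A_new);                         // same pattern: symbolic work is reused
+  x = lu.solve(b);                             // solution for the updated values
+  return x;
+}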
diff --git a/unsupported/test/GPU/cufft.cpp b/unsupported/test/GPU/cufft.cpp
new file mode 100644
index 0000000..7dc87c7
--- /dev/null
+++ b/unsupported/test/GPU/cufft.cpp
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuFFT: GPU FFT via cuFFT.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
+
+template <typename Scalar>
+void test_c2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  Vec x = Vec::Random(n);
+
+  gpu::FFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), n);
+
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() / x.norm() < tol);
+}
+
+// ---- 1D C2C known signal: FFT of constant = delta --------------------------
+
+template <typename Scalar>
+void test_c2c_constant() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  const int n = 64;
+  Vec x = Vec::Constant(n, Complex(3.0, 0.0));
+
+  gpu::FFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+
+  // FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
+  VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
+  for (int k = 1; k < n; ++k) {
+    VERIFY(std::abs(X(k)) < tol);
+  }
+}
+
+// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
+
+template <typename Scalar>
+void test_r2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using CVec = Matrix<Complex, Dynamic, 1>;
+  using RVec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  RVec r = RVec::Random(n);
+
+  gpu::FFT<Scalar> fft;
+  CVec R = fft.fwd(r);
+
+  // R2C returns n/2+1 complex values.
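+  // For real input the spectrum is conjugate-symmetric, X[n-k] = conj(X[k]),
+  // so only the n/2+1 non-redundant coefficients are stored. invReal takes n
+  // explicitly because a half-spectrum of length m could come from either
+  // n = 2*(m-1) or n = 2*m-1; the length alone does not determine n's parity.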
+  VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
+
+  RVec s = fft.invReal(R, n);
+  VERIFY_IS_EQUAL(s.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((s - r).norm() / r.norm() < tol);
+}
+
+// ---- 2D C2C roundtrip: inv2(fwd2(A)) ≈ A ---------------------------------
+
+template <typename Scalar>
+void test_2d_roundtrip(Index rows, Index cols) {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  Mat A = Mat::Random(rows, cols);
+
+  gpu::FFT<Scalar> fft;
+  Mat B = fft.fwd2(A);
+  VERIFY_IS_EQUAL(B.rows(), rows);
+  VERIFY_IS_EQUAL(B.cols(), cols);
+
+  Mat C = fft.inv2(B);
+  VERIFY_IS_EQUAL(C.rows(), rows);
+  VERIFY_IS_EQUAL(C.cols(), cols);
+
+  RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
+  VERIFY((C - A).norm() / A.norm() < tol);
+}
+
+// ---- 2D C2C known signal: constant matrix -----------------------------------
+
+template <typename Scalar>
+void test_2d_constant() {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  const int rows = 16, cols = 32;
+  Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
+
+  gpu::FFT<Scalar> fft;
+  Mat B = fft.fwd2(A);
+
+  // 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
+  VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
+  for (int j = 0; j < cols; ++j) {
+    for (int i = 0; i < rows; ++i) {
+      if (i == 0 && j == 0) continue;
+      VERIFY(std::abs(B(i, j)) < tol);
+    }
+  }
+}
+
+// ---- Plan reuse: repeated calls should work ---------------------------------
+
+template <typename Scalar>
+void test_plan_reuse() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  gpu::FFT<Scalar> fft;
+  for (int trial = 0; trial < 5; ++trial) {
+    Vec x = Vec::Random(128);
+    Vec X = fft.fwd(x);
+    Vec y = fft.inv(X);
+    RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
+    VERIFY((y - x).norm() / x.norm() < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+
+  gpu::FFT<Scalar> fft;
+  Vec x(0);
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), 0);
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000));  // non-power-of-2
+  CALL_SUBTEST(test_c2c_constant<Scalar>());
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64));  // non-square
+  CALL_SUBTEST(test_2d_constant<Scalar>());
+  CALL_SUBTEST(test_plan_reuse<Scalar>());
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cufft) {
+  gpu_test::require_cufft_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+}
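+
+// Usage sketch (illustrative, not executed by the harness): a crude low-pass
+// filter built from the R2C/C2R pair tested above. The function name and
+// `keep` parameter are hypothetical; the gpu::FFT calls are exactly those
+// tested, and inv/invReal already apply the 1/n scaling (see the roundtrip
+// tests).
+template <typename Scalar>
+Matrix<Scalar, Dynamic, 1> example_lowpass(const Matrix<Scalar, Dynamic, 1>& r, Index keep) {
+  gpu::FFT<Scalar> fft;
+  Matrix<std::complex<Scalar>, Dynamic, 1> R = fft.fwd(r);  // n/2+1 half-spectrum bins
+  for (Index k = keep; k < R.size(); ++k) R(k) = std::complex<Scalar>(0, 0);
+  return fft.invReal(R, r.size());  // output length n must be passed explicitly
+}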
diff --git a/unsupported/test/GPU/cusparse_spmv.cpp b/unsupported/test/GPU/cusparse_spmv.cpp
new file mode 100644
index 0000000..122ddf9
--- /dev/null
+++ b/unsupported/test/GPU/cusparse_spmv.cpp
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for gpu::SparseContext: GPU SpMV/SpMM via cuSPARSE.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse matrix -----------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(rows, cols);
+  R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
+  for (Index j = 0; j < cols; ++j) {
+    for (Index i = 0; i < rows; ++i) {
+      if ((std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  R.makeCompressed();
+  return R;
+}
+
+// ---- SpMV: y = A * x -------------------------------------------------------
+
+template <typename Scalar>
+void test_spmv(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(cols);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiply(A, x);
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), rows);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
+
+template <typename Scalar>
+void test_spmv_alpha_beta(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+  Vec y_init = Vec::Random(n);
+
+  Scalar alpha(2);
+  Scalar beta(3);
+
+  Vec y_cpu = alpha * (A * x) + beta * y_init;
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = y_init;
+  ctx.multiply(A, x, y_gpu, alpha, beta);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Transpose: y = A^T * x ------------------------------------------------
+
+template <typename Scalar>
+void test_spmv_transpose(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(rows);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiplyT(A, x);
+  Vec y_cpu = A.transpose() * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), cols);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMV adjoint: y = A^H * x ----------------------------------------------
+
+template <typename Scalar>
+void test_spmv_adjoint(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(rows);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiplyAdjoint(A, x);
+  Vec y_cpu = A.adjoint() * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), cols);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
+
+template <typename Scalar>
+void test_spmm(Index rows, Index cols, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Mat X = Mat::Random(cols, nrhs);
+
+  gpu::SparseContext<Scalar> ctx;
+  Mat Y_gpu = ctx.multiplyMat(A, X);
+  Mat Y_cpu = A * X;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
+  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
+  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMM transpose: Y = A^T * X --------------------------------------------
+
+template <typename Scalar>
+void test_spmm_transpose(Index rows, Index cols, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Mat X = Mat::Random(rows, nrhs);
+
+  gpu::SparseContext<Scalar> ctx;
+  Mat Y_gpu = ctx.multiplyMat(A, X, gpu::GpuOp::Trans);
+  Mat Y_cpu = A.transpose() * X;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(Y_gpu.rows(), cols);
+  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
+  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Identity matrix: I * x = x --------------------------------------------
+
+template <typename Scalar>
+void test_identity(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build sparse identity.
+  SpMat eye(n, n);
+  eye.setIdentity();
+  eye.makeCompressed();
+
+  Vec x = Vec::Random(n);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(eye, x);
+
+  RealScalar tol = NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() < tol);
+}
+
+// ---- Context reuse ----------------------------------------------------------
+
+template <typename Scalar>
+void test_reuse(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  gpu::SparseContext<Scalar> ctx;
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  for (int trial = 0; trial < 3; ++trial) {
+    SpMat A = make_sparse<Scalar>(n, n);
+    Vec x = Vec::Random(n);
+    Vec y_gpu = ctx.multiply(A, x);
+    Vec y_cpu = A * x;
+    VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+
+  SpMat A(0, 0);
+  A.makeCompressed();
+  Vec x(0);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(A, x);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_spmv<Scalar>(64, 64));
+  CALL_SUBTEST(test_spmv<Scalar>(128, 64));  // non-square
+  CALL_SUBTEST(test_spmv<Scalar>(64, 128));  // wide
+  CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
+  CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
+  CALL_SUBTEST(test_spmv_adjoint<Scalar>(128, 64));
+  CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
+  CALL_SUBTEST(test_spmm_transpose<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_identity<Scalar>(64));
+  CALL_SUBTEST(test_reuse<Scalar>(64));
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
+  gpu_test::require_cusparse_context();
+
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+}
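+
+// Usage sketch (illustrative, not executed by the harness): the accumulate
+// form y = alpha*A*x + beta*y and an op-selected multi-RHS product, as
+// exercised by test_spmv_alpha_beta and test_spmm_transpose. Names are
+// hypothetical; the gpu::SparseContext calls are exactly those tested.
+template <typename Scalar>
+Matrix<Scalar, Dynamic, Dynamic> example_spmv(const SparseMatrix<Scalar, ColMajor, int>& A,
+                                              const Matrix<Scalar, Dynamic, 1>& x,
+                                              Matrix<Scalar, Dynamic, 1>& y,
+                                              const Matrix<Scalar, Dynamic, Dynamic>& X) {
+  gpu::SparseContext<Scalar> ctx;               // one context can serve many products
+  ctx.multiply(A, x, y, Scalar(2), Scalar(3));  // y = 2*A*x + 3*y, updated in place
+  return ctx.multiplyMat(A, X, gpu::GpuOp::Trans);  // Y = A^T * X
+}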
diff --git a/unsupported/test/GPU/gpu_test_helpers.h b/unsupported/test/GPU/gpu_test_helpers.h
new file mode 100644
index 0000000..1ef4f9c
--- /dev/null
+++ b/unsupported/test/GPU/gpu_test_helpers.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Helpers shared across GPU library tests:
+//  * runtime probes that exit(77) (CI skip) when a library is unavailable
+//  * a small make_test_value() that constructs a Scalar with an imaginary
+//    component for complex types so the complex code paths are genuinely
+//    exercised; without this, Scalar(real_value) would silently leave the
+//    imaginary part at zero.
+
+#ifndef EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H
+#define EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H
+
+#include <Eigen/Core>
+#include <cstdlib>
+#include <iostream>
+#include <type_traits>
+
+namespace gpu_test {
+
+template <typename Scalar, typename RealScalar>
+inline std::enable_if_t<Eigen::NumTraits<Scalar>::IsComplex, Scalar> make_test_value(RealScalar re, RealScalar im) {
+  return Scalar(re, im);
+}
+template <typename Scalar, typename RealScalar>
+inline std::enable_if_t<!Eigen::NumTraits<Scalar>::IsComplex, Scalar> make_test_value(RealScalar re,
+                                                                                      RealScalar /*im*/) {
+  return Scalar(re);
+}
+
+#ifdef CUDSS_VERSION
+inline void require_cudss_context() {
+  cudssHandle_t handle = nullptr;
+  const cudssStatus_t status = cudssCreate(&handle);
+  if (status != CUDSS_STATUS_SUCCESS) {
+    std::cout << "SKIP: cuDSS tests require an initialized cuDSS context. cudssCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUDSS_CHECK(cudssDestroy(handle));
+}
+#endif
+
+#ifdef CUSPARSE_VERSION
+inline void require_cusparse_context() {
+  cusparseHandle_t handle = nullptr;
+  const cusparseStatus_t status = cusparseCreate(&handle);
+  if (status != CUSPARSE_STATUS_SUCCESS) {
+    std::cout << "SKIP: cuSPARSE tests require an initialized cuSPARSE context. cusparseCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUSPARSE_CHECK(cusparseDestroy(handle));
+}
+#endif
+
+#ifdef CUFFT_VERSION
+inline void require_cufft_context() {
+  cufftHandle plan = 0;
+  // cufftCreate allocates a plan handle without configuring it; succeeds only
+  // when the cuFFT runtime is loadable.
+  const cufftResult status = cufftCreate(&plan);
+  if (status != CUFFT_SUCCESS) {
+    std::cout << "SKIP: cuFFT tests require a working cuFFT runtime. cufftCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUFFT_CHECK(cufftDestroy(plan));
+}
+#endif
+
+}  // namespace gpu_test
+
+#endif  // EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H
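+
+// Typical wiring in a test (illustrative; `my_gpu_test`/`my_test` are
+// hypothetical names): probe first so a missing runtime becomes a CI skip
+// (exit code 77) instead of a failure, then split subtests by scalar type,
+// mirroring the test files in this directory:
+//
+//   EIGEN_DECLARE_TEST(my_gpu_test) {
+//     gpu_test::require_cusparse_context();
+//     CALL_SUBTEST_1(my_test<float>());
+//     CALL_SUBTEST_2(my_test<double>());
+//   }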