Eigen/GPU [4/5]: Add sparse solvers, FFT, and SpMV (cuDSS, cuFFT, cuSPARSE)

libeigen/eigen!2414

Closes #3067

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
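
A minimal host-side sketch of the new user-facing APIs, distilled from the
usage notes in the headers below (illustrative only: it assumes a CUDA-capable
device, and the cuDSS-backed solvers additionally require building with
EIGEN_CUDSS defined and linking against cuDSS):

  #include <unsupported/Eigen/GPU>
  #include <vector>
  using namespace Eigen;

  int main() {
    // Small SPD tridiagonal test matrix.
    const int n = 1000;
    std::vector<Triplet<double>> triplets;
    for (int i = 0; i < n; ++i) {
      triplets.emplace_back(i, i, 2.0);
      if (i + 1 < n) {
        triplets.emplace_back(i, i + 1, -1.0);
        triplets.emplace_back(i + 1, i, -1.0);
      }
    }
    SparseMatrix<double> A(n, n);
    A.setFromTriplets(triplets.begin(), triplets.end());
    VectorXd b = VectorXd::Random(n);

    gpu::SparseLLT<double> llt(A);        // cuDSS: analyzePattern + factorize
    VectorXd x = llt.solve(b);            // x = A^{-1} * b

    gpu::SparseContext<double> ctx;       // cuSPARSE SpMV
    VectorXd r = b - ctx.multiply(A, x);  // residual, expected ~0

    gpu::FFT<double> fft;                 // cuFFT
    VectorXcd X = fft.fwd(b);             // 1D R2C forward, n/2+1 coefficients
    VectorXd y = fft.invReal(X, n);       // inverse round trip, y reproduces b
    return 0;
  }
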
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU
index 4c15d7d..7247074 100644
--- a/unsupported/Eigen/GPU
+++ b/unsupported/Eigen/GPU
@@ -11,6 +11,7 @@
 #define EIGEN_GPU_MODULE_H
 
 #include "../../Eigen/Core"
+#include "../../Eigen/SparseCore"
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
@@ -53,6 +54,17 @@
 #include "src/GPU/GpuQR.h"
 #include "src/GPU/GpuSVD.h"
 #include "src/GPU/GpuEigenSolver.h"
+#include "src/GPU/CuFftSupport.h"
+#include "src/GPU/GpuFFT.h"
+#include "src/GPU/CuSparseSupport.h"
+#include "src/GPU/GpuSparseContext.h"
+#ifdef EIGEN_CUDSS
+#include "src/GPU/CuDssSupport.h"
+#include "src/GPU/GpuSparseSolverBase.h"
+#include "src/GPU/GpuSparseLLT.h"
+#include "src/GPU/GpuSparseLDLT.h"
+#include "src/GPU/GpuSparseLU.h"
+#endif
 // IWYU pragma: end_exports
 #endif
 
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h
index 994d95a..0e56b69 100644
--- a/unsupported/Eigen/src/GPU/CuBlasSupport.h
+++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -26,12 +26,6 @@
 
 namespace Eigen {
 namespace gpu {
-
-// ---- Operation enum ---------------------------------------------------------
-// Public flag for transpose/adjoint in BLAS- and solver-style calls.
-
-enum class GpuOp { NoTrans, Trans, ConjTrans };
-
 namespace internal {
 
 // ---- Error-checking macro ---------------------------------------------------
@@ -218,6 +212,22 @@
                      reinterpret_cast<cuDoubleComplex*>(C), ldc);
 }
 
+// SCAL wrappers: x = alpha * x.
+// For complex x, alpha is real-valued (Csscal/Zdscal) — this matches the
+// 1/n inverse-FFT scaling pattern, where the scale is intrinsically real.
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, float* x, int incx) {
+  return cublasSscal(h, n, alpha, x, incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, double* x, int incx) {
+  return cublasDscal(h, n, alpha, x, incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const float* alpha, std::complex<float>* x, int incx) {
+  return cublasCsscal(h, n, alpha, reinterpret_cast<cuComplex*>(x), incx);
+}
+inline cublasStatus_t cublasXscal(cublasHandle_t h, int n, const double* alpha, std::complex<double>* x, int incx) {
+  return cublasZdscal(h, n, alpha, reinterpret_cast<cuDoubleComplex*>(x), incx);
+}
+
 // DGMM wrappers: C = A * diag(x)  (side=RIGHT) or C = diag(x) * A  (side=LEFT).
 // Useful for applying a diagonal scaling without materialising diag(x) as a
 // dense matrix. cuBLAS docs guarantee in-place is safe when C == A.
diff --git a/unsupported/Eigen/src/GPU/CuDssSupport.h b/unsupported/Eigen/src/GPU/CuDssSupport.h
new file mode 100644
index 0000000..7bb4cb0
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/CuDssSupport.h
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// cuDSS support utilities: error checking macro, type mapping.
+//
+// cuDSS is NVIDIA's sparse direct solver library, supporting Cholesky (LL^T),
+// LDL^T, and LU factorization on GPU. It requires CUDA 12.0+ and is
+// distributed separately from the CUDA Toolkit.
+
+#ifndef EIGEN_GPU_CUDSS_SUPPORT_H
+#define EIGEN_GPU_CUDSS_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cudss.h>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+// ---- Error checking ---------------------------------------------------------
+
+#define EIGEN_CUDSS_CHECK(x)                                              \
+  do {                                                                    \
+    cudssStatus_t _s = (x);                                               \
+    eigen_assert(_s == CUDSS_STATUS_SUCCESS && "cuDSS call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_s);                                            \
+  } while (0)
+
+// ---- Scalar → cudssMatrixType_t for SPD/HPD ---------------------------------
+
+template <typename Scalar>
+struct cudss_spd_type;
+
+template <>
+struct cudss_spd_type<float> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
+};
+template <>
+struct cudss_spd_type<double> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SPD;
+};
+template <>
+struct cudss_spd_type<std::complex<float>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
+};
+template <>
+struct cudss_spd_type<std::complex<double>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HPD;
+};
+
+// ---- Scalar → cudssMatrixType_t for symmetric/Hermitian ---------------------
+
+template <typename Scalar>
+struct cudss_symmetric_type;
+
+template <>
+struct cudss_symmetric_type<float> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
+};
+template <>
+struct cudss_symmetric_type<double> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_SYMMETRIC;
+};
+template <>
+struct cudss_symmetric_type<std::complex<float>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
+};
+template <>
+struct cudss_symmetric_type<std::complex<double>> {
+  static constexpr cudssMatrixType_t value = CUDSS_MTYPE_HERMITIAN;
+};
+
+// ---- StorageIndex → cudaDataType_t ------------------------------------------
+
+template <typename StorageIndex>
+struct cudss_index_type;
+
+template <>
+struct cudss_index_type<int> {
+  static constexpr cudaDataType_t value = CUDA_R_32I;
+};
+template <>
+struct cudss_index_type<int64_t> {
+  static constexpr cudaDataType_t value = CUDA_R_64I;
+};
+
+// ---- UpLo → cudssMatrixViewType_t -------------------------------------------
+// For symmetric matrices stored as CSC (ColMajor), cuDSS sees CSR of A^T.
+// Since A = A^T, the data is the same, but the triangle view must be swapped.
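+// For example, if only the Lower triangle of A is stored in CSC, reading the
+// same arrays as CSR places those entries at transposed positions, i.e. in the
+// Upper triangle of A^T = A, so cuDSS must be told CUDSS_MVIEW_UPPER.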
+
+template <int UpLo, int StorageOrder>
+struct cudss_view_type;
+
+// ColMajor (CSC) passed as CSR: lower ↔ upper swap.
+template <>
+struct cudss_view_type<Lower, ColMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
+};
+template <>
+struct cudss_view_type<Upper, ColMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
+};
+
+// RowMajor (CSR) passed directly: no swap needed.
+template <>
+struct cudss_view_type<Lower, RowMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_LOWER;
+};
+template <>
+struct cudss_view_type<Upper, RowMajor> {
+  static constexpr cudssMatrixViewType_t value = CUDSS_MVIEW_UPPER;
+};
+
+}  // namespace internal
+
+// ---- Ordering enum ----------------------------------------------------------
+
+enum class GpuSparseOrdering {
+  AMD,    // Approximate minimum degree (default fill-reducing ordering)
+  METIS,  // METIS nested dissection
+  RCM     // Reverse Cuthill-McKee
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUDSS_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/CuFftSupport.h b/unsupported/Eigen/src/GPU/CuFftSupport.h
new file mode 100644
index 0000000..a9ed788
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/CuFftSupport.h
@@ -0,0 +1,106 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// cuFFT support utilities: error checking macro, type mapping.
+
+#ifndef EIGEN_GPU_CUFFT_SUPPORT_H
+#define EIGEN_GPU_CUFFT_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cufft.h>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+// ---- Error checking ---------------------------------------------------------
+
+#define EIGEN_CUFFT_CHECK(x)                                       \
+  do {                                                             \
+    cufftResult _r = (x);                                          \
+    eigen_assert(_r == CUFFT_SUCCESS && "cuFFT call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_r);                                     \
+  } while (0)
+
+// ---- Scalar → cufftType traits ----------------------------------------------
+
+template <typename Scalar>
+struct cufft_c2c_type;
+
+template <>
+struct cufft_c2c_type<float> {
+  static constexpr cufftType value = CUFFT_C2C;
+};
+template <>
+struct cufft_c2c_type<double> {
+  static constexpr cufftType value = CUFFT_Z2Z;
+};
+
+template <typename Scalar>
+struct cufft_r2c_type;
+
+template <>
+struct cufft_r2c_type<float> {
+  static constexpr cufftType value = CUFFT_R2C;
+};
+template <>
+struct cufft_r2c_type<double> {
+  static constexpr cufftType value = CUFFT_D2Z;
+};
+
+template <typename Scalar>
+struct cufft_c2r_type;
+
+template <>
+struct cufft_c2r_type<float> {
+  static constexpr cufftType value = CUFFT_C2R;
+};
+template <>
+struct cufft_c2r_type<double> {
+  static constexpr cufftType value = CUFFT_Z2D;
+};
+
+// ---- Type-dispatched cuFFT execution ----------------------------------------
+
+// C2C
+inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<float>* in, std::complex<float>* out,
+                                         int direction) {
+  return cufftExecC2C(plan, reinterpret_cast<cufftComplex*>(in), reinterpret_cast<cufftComplex*>(out), direction);
+}
+inline cufftResult cufftExecC2C_dispatch(cufftHandle plan, std::complex<double>* in, std::complex<double>* out,
+                                         int direction) {
+  return cufftExecZ2Z(plan, reinterpret_cast<cufftDoubleComplex*>(in), reinterpret_cast<cufftDoubleComplex*>(out),
+                      direction);
+}
+
+// R2C
+inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, float* in, std::complex<float>* out) {
+  return cufftExecR2C(plan, in, reinterpret_cast<cufftComplex*>(out));
+}
+inline cufftResult cufftExecR2C_dispatch(cufftHandle plan, double* in, std::complex<double>* out) {
+  return cufftExecD2Z(plan, in, reinterpret_cast<cufftDoubleComplex*>(out));
+}
+
+// C2R
+inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<float>* in, float* out) {
+  return cufftExecC2R(plan, reinterpret_cast<cufftComplex*>(in), out);
+}
+inline cufftResult cufftExecC2R_dispatch(cufftHandle plan, std::complex<double>* in, double* out) {
+  return cufftExecZ2D(plan, reinterpret_cast<cufftDoubleComplex*>(in), out);
+}
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUFFT_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h
index 41f7b31..8b88c5c 100644
--- a/unsupported/Eigen/src/GPU/CuSolverSupport.h
+++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -92,7 +92,7 @@
   CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
   CusolverParams& operator=(CusolverParams&& o) noexcept {
     if (this != &o) {
-      if (p) (void)cusolverDnDestroyParams(p);
+      if (p) EIGEN_CUSOLVER_CHECK(cusolverDnDestroyParams(p));
       p = o.p;
       o.p = nullptr;
     }
diff --git a/unsupported/Eigen/src/GPU/CuSparseSupport.h b/unsupported/Eigen/src/GPU/CuSparseSupport.h
new file mode 100644
index 0000000..1d60ecd
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/CuSparseSupport.h
@@ -0,0 +1,56 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// cuSPARSE support utilities: error checking macro.
+
+#ifndef EIGEN_GPU_CUSPARSE_SUPPORT_H
+#define EIGEN_GPU_CUSPARSE_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cusparse.h>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+#define EIGEN_CUSPARSE_CHECK(x)                                                 \
+  do {                                                                          \
+    cusparseStatus_t _s = (x);                                                  \
+    eigen_assert(_s == CUSPARSE_STATUS_SUCCESS && "cuSPARSE call failed: " #x); \
+    EIGEN_UNUSED_VARIABLE(_s);                                                  \
+  } while (0)
+
+constexpr cusparseOperation_t to_cusparse_op(GpuOp op) {
+  switch (op) {
+    case GpuOp::Trans:
+      return CUSPARSE_OPERATION_TRANSPOSE;
+    case GpuOp::ConjTrans:
+      return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
+    default:
+      return CUSPARSE_OPERATION_NON_TRANSPOSE;
+  }
+}
+
+// cuSPARSE rejects CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE for real scalar
+// types; for real Scalar, ConjTrans is mathematically equivalent to Trans,
+// so silently demote it. Complex Scalar passes through unchanged.
+template <typename Scalar>
+constexpr cusparseOperation_t to_cusparse_op_for_scalar(GpuOp op) {
+  return to_cusparse_op((op == GpuOp::ConjTrans && !NumTraits<Scalar>::IsComplex) ? GpuOp::Trans : op);
+}
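+
+// For example, to_cusparse_op_for_scalar<double>(GpuOp::ConjTrans) yields
+// CUSPARSE_OPERATION_TRANSPOSE, while the std::complex<double> instantiation
+// passes CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE through unchanged.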
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUSPARSE_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h
index 869a5ac..9d30a6f 100644
--- a/unsupported/Eigen/src/GPU/DeviceMatrix.h
+++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -134,7 +134,7 @@
 
   HostTransfer& operator=(HostTransfer&& o) noexcept {
     if (this != &o) {
-      if (event_) (void)cudaEventDestroy(event_);
+      if (event_) EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(event_));
       host_buf_ = std::move(o.host_buf_);
       pinned_buf_ = std::move(o.pinned_buf_);
       event_ = o.event_;
@@ -226,7 +226,7 @@
 
   DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
     if (this != &o) {
-      if (ready_event_) (void)cudaEventDestroy(ready_event_);
+      if (ready_event_) EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_));
       data_ = std::move(o.data_);
       rows_ = o.rows_;
       cols_ = o.cols_;
@@ -358,7 +358,7 @@
     if (rows == rows_ && cols == cols_) return;
     data_.reset();
     if (ready_event_) {
-      (void)cudaEventDestroy(ready_event_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_));
       ready_event_ = nullptr;
     }
     ready_stream_ = nullptr;
@@ -502,7 +502,7 @@
     rows_ = 0;
     cols_ = 0;
     if (ready_event_) {
-      (void)cudaEventDestroy(ready_event_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventDestroy(ready_event_));
       ready_event_ = nullptr;
     }
     ready_stream_ = nullptr;
diff --git a/unsupported/Eigen/src/GPU/GpuFFT.h b/unsupported/Eigen/src/GPU/GpuFFT.h
new file mode 100644
index 0000000..34e770e
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuFFT.h
@@ -0,0 +1,315 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU FFT via cuFFT.
+//
+// Standalone GPU FFT class with plan caching. Supports 1D and 2D transforms:
+// C2C (complex-to-complex), R2C (real-to-complex), C2R (complex-to-real).
+//
+// Inverse transforms are scaled by 1/n (1D) or 1/(n*m) (2D) so that
+// inv(fwd(x)) == x, matching Eigen's FFT convention.
+//
+// cuFFT plans are cached by (size, type) and reused across calls.
+//
+// Thread safety: not thread-safe. Concurrent fwd/inv calls on a single FFT
+// instance race on the cached plans, the cuBLAS handle, and the bound
+// stream. Use one FFT instance per thread.
+//
+// Usage:
+//   FFT<float> fft;
+//   VectorXcf X = fft.fwd(x);         // 1D C2C or R2C
+//   VectorXcf y = fft.inv(X);         // 1D C2C inverse
+//   VectorXf  r = fft.invReal(X, n);  // 1D C2R inverse
+//   MatrixXcf B = fft.fwd2(A);        // 2D C2C forward
+//   MatrixXcf C = fft.inv2(B);        // 2D C2C inverse
+
+#ifndef EIGEN_GPU_FFT_H
+#define EIGEN_GPU_FFT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuFftSupport.h"
+#include "./CuBlasSupport.h"
+#include <map>
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class FFT {
+ public:
+  using Scalar = Scalar_;
+  using Complex = std::complex<Scalar>;
+  using ComplexVector = Matrix<Complex, Dynamic, 1>;
+  using RealVector = Matrix<Scalar, Dynamic, 1>;
+  using ComplexMatrix = Matrix<Complex, Dynamic, Dynamic, ColMajor>;
+
+  FFT() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+  }
+
+  ~FFT() {
+    for (auto& kv : plans_) (void)cufftDestroy(kv.second);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  FFT(const FFT&) = delete;
+  FFT& operator=(const FFT&) = delete;
+
+  // ---- 1D Complex-to-Complex ------------------------------------------------
+
+  /** Forward 1D C2C FFT. */
+  template <typename Derived>
+  ComplexVector fwd(const MatrixBase<Derived>& x,
+                    typename std::enable_if<NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
+    const ComplexVector input(x.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()),
+                                                      static_cast<Complex*>(d_out_.get()), CUFFT_FORWARD));
+
+    ComplexVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  /** Inverse 1D C2C FFT. Scaled by 1/n. */
+  template <typename Derived>
+  ComplexVector inv(const MatrixBase<Derived>& X) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    ensure_buffers(n * sizeof(Complex), n * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()),
+                                                      static_cast<Complex*>(d_out_.get()), CUFFT_INVERSE));
+
+    // Scale by 1/n.
+    scale_device(static_cast<Complex*>(d_out_.get()), n, Scalar(1) / Scalar(n));
+
+    ComplexVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Real-to-Complex ---------------------------------------------------
+
+  /** Forward 1D R2C FFT. Returns n/2+1 complex values (half-spectrum). */
+  template <typename Derived>
+  ComplexVector fwd(const MatrixBase<Derived>& x,
+                    typename std::enable_if<!NumTraits<typename Derived::Scalar>::IsComplex>::type* = nullptr) {
+    const RealVector input(x.derived());
+    const int n = static_cast<int>(input.size());
+    if (n == 0) return ComplexVector(0);
+
+    const int n_complex = n / 2 + 1;
+    ensure_buffers(n * sizeof(Scalar), n_complex * sizeof(Complex));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_r2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecR2C_dispatch(plan, static_cast<Scalar*>(d_in_.get()), static_cast<Complex*>(d_out_.get())));
+
+    ComplexVector result(n_complex);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n_complex * sizeof(Complex), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 1D Complex-to-Real ---------------------------------------------------
+
+  /** Inverse 1D C2R FFT. Input is n/2+1 complex values, output is nfft real values.
+   * Scaled by 1/nfft. Caller must specify nfft (original real signal length). */
+  template <typename Derived>
+  RealVector invReal(const MatrixBase<Derived>& X, Index nfft) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "invReal() requires complex input");
+    const ComplexVector input(X.derived());
+    const int n = static_cast<int>(nfft);
+    const int n_complex = n / 2 + 1;
+    eigen_assert(input.size() == n_complex);
+    if (n == 0) return RealVector(0);
+
+    ensure_buffers(n_complex * sizeof(Complex), n * sizeof(Scalar));
+    // cuFFT C2R may overwrite the input, so we copy to d_in_.
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_in_.get(), input.data(), n_complex * sizeof(Complex), cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_1d(n, internal::cufft_c2r_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(
+        internal::cufftExecC2R_dispatch(plan, static_cast<Complex*>(d_in_.get()), static_cast<Scalar*>(d_out_.get())));
+
+    // Scale by 1/n.
+    scale_device_real(static_cast<Scalar*>(d_out_.get()), n, Scalar(1) / Scalar(n));
+
+    RealVector result(n);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(result.data(), d_out_.get(), n * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- 2D Complex-to-Complex ------------------------------------------------
+
+  /** Forward 2D C2C FFT. Input and output are rows x cols complex matrices. */
+  template <typename Derived>
+  ComplexMatrix fwd2(const MatrixBase<Derived>& A) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "fwd2() requires complex input");
+    const ComplexMatrix input(A.derived());
+    const int rows = static_cast<int>(input.rows());
+    const int cols = static_cast<int>(input.cols());
+    if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
+
+    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
+    ensure_buffers(total, total);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.get(), input.data(), total, cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()),
+                                                      static_cast<Complex*>(d_out_.get()), CUFFT_FORWARD));
+
+    ComplexMatrix result(rows, cols);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.get(), total, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  /** Inverse 2D C2C FFT. Scaled by 1/(rows*cols). */
+  template <typename Derived>
+  ComplexMatrix inv2(const MatrixBase<Derived>& A) {
+    static_assert(NumTraits<typename Derived::Scalar>::IsComplex, "inv2() requires complex input");
+    const ComplexMatrix input(A.derived());
+    const int rows = static_cast<int>(input.rows());
+    const int cols = static_cast<int>(input.cols());
+    if (rows == 0 || cols == 0) return ComplexMatrix(rows, cols);
+
+    const size_t total = static_cast<size_t>(rows) * static_cast<size_t>(cols) * sizeof(Complex);
+    ensure_buffers(total, total);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_in_.get(), input.data(), total, cudaMemcpyHostToDevice, stream_));
+
+    cufftHandle plan = get_plan_2d(rows, cols, internal::cufft_c2c_type<Scalar>::value);
+    EIGEN_CUFFT_CHECK(internal::cufftExecC2C_dispatch(plan, static_cast<Complex*>(d_in_.get()),
+                                                      static_cast<Complex*>(d_out_.get()), CUFFT_INVERSE));
+
+    // Scale by 1/(rows*cols).
+    const int total_elems = rows * cols;
+    scale_device(static_cast<Complex*>(d_out_.get()), total_elems, Scalar(1) / Scalar(total_elems));
+
+    ComplexMatrix result(rows, cols);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(result.data(), d_out_.get(), total, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+    return result;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  std::map<int64_t, cufftHandle> plans_;
+  internal::DeviceBuffer d_in_;
+  internal::DeviceBuffer d_out_;
+  size_t d_in_size_ = 0;
+  size_t d_out_size_ = 0;
+
+  void ensure_buffers(size_t in_bytes, size_t out_bytes) {
+    if (in_bytes > d_in_size_) {
+      if (d_in_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_in_ = internal::DeviceBuffer(in_bytes);
+      d_in_size_ = in_bytes;
+    }
+    if (out_bytes > d_out_size_) {
+      if (d_out_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_out_ = internal::DeviceBuffer(out_bytes);
+      d_out_size_ = out_bytes;
+    }
+  }
+
+  // Plan key encoding, from the LSB up: rank (1 bit) | type (4 bits) | dims.
+  // cufftType uses 7 bits; the top 3 (precision discriminator) are redundant
+  // since Scalar fixes precision per FFT instance, so mask to 4 bits — without
+  // it, e.g. plan_key_1d(5, C2C) and plan_key_1d(7, C2C) collide.
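+  // Concretely, cufft.h defines CUFFT_C2C = 0x29, so without the mask both
+  // (5 << 5) | (0x29 << 1) and (7 << 5) | (0x29 << 1) evaluate to 0xF2, while
+  // the masked keys are 0xB2 and 0xF2 respectively.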
+  static constexpr int64_t kTypeMask = 0xF;
+  static constexpr int kCols2DBits = 30;  // bits 5..34
+  static constexpr int kRows2DBits = 29;  // bits 35..63
+  static int64_t plan_key_1d(int n, cufftType type) { return (int64_t(n) << 5) | (int64_t(type & kTypeMask) << 1) | 0; }
+
+  static int64_t plan_key_2d(int rows, int cols, cufftType type) {
+    eigen_assert(rows >= 0 && int64_t(rows) < (int64_t(1) << kRows2DBits) &&
+                 "FFT plan rows exceed plan-key bit budget");
+    eigen_assert(cols >= 0 && int64_t(cols) < (int64_t(1) << kCols2DBits) &&
+                 "FFT plan cols exceed plan-key bit budget");
+    return (int64_t(rows) << 35) | (int64_t(cols) << 5) | (int64_t(type & kTypeMask) << 1) | 1;
+  }
+
+  cufftHandle get_plan_1d(int n, cufftType type) {
+    int64_t key = plan_key_1d(n, type);
+    auto it = plans_.find(key);
+    if (it != plans_.end()) return it->second;
+
+    cufftHandle plan;
+    EIGEN_CUFFT_CHECK(cufftPlan1d(&plan, n, type, /*batch=*/1));
+    EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
+    plans_[key] = plan;
+    return plan;
+  }
+
+  cufftHandle get_plan_2d(int rows, int cols, cufftType type) {
+    int64_t key = plan_key_2d(rows, cols, type);
+    auto it = plans_.find(key);
+    if (it != plans_.end()) return it->second;
+
+    // cuFFT uses row-major (C order) for 2D: first dim = rows, second = cols.
+    // Eigen matrices are column-major, so we pass (cols, rows) to cuFFT
+    // to get the correct 2D transform.
+    cufftHandle plan;
+    EIGEN_CUFFT_CHECK(cufftPlan2d(&plan, cols, rows, type));
+    EIGEN_CUFFT_CHECK(cufftSetStream(plan, stream_));
+    plans_[key] = plan;
+    return plan;
+  }
+
+  // Scale complex array on device using cuBLAS scal.
+  void scale_device(Complex* d_ptr, int n, Scalar alpha) {
+    EIGEN_CUBLAS_CHECK(internal::cublasXscal(cublas_, n, &alpha, d_ptr, 1));
+  }
+
+  // Scale real array on device using cuBLAS scal.
+  void scale_device_real(Scalar* d_ptr, int n, Scalar alpha) {
+    EIGEN_CUBLAS_CHECK(internal::cublasXscal(cublas_, n, &alpha, d_ptr, 1));
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_FFT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSolverContext.h b/unsupported/Eigen/src/GPU/GpuSolverContext.h
index 87c4641..382a076 100644
--- a/unsupported/Eigen/src/GPU/GpuSolverContext.h
+++ b/unsupported/Eigen/src/GPU/GpuSolverContext.h
@@ -87,10 +87,10 @@
       // upcoming move of d_scratch_ doesn't free buffers that an in-flight
       // kernel is still touching. The asymmetric move-ctor doesn't need this —
       // it constructs onto uninitialized storage.
-      if (stream_) (void)cudaStreamSynchronize(stream_);
-      if (cublas_) (void)cublasDestroy(cublas_);
-      if (cusolver_) (void)cusolverDnDestroy(cusolver_);
-      if (stream_) (void)cudaStreamDestroy(stream_);
+      if (stream_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      if (cublas_) EIGEN_CUBLAS_CHECK(cublasDestroy(cublas_));
+      if (cusolver_) EIGEN_CUSOLVER_CHECK(cusolverDnDestroy(cusolver_));
+      if (stream_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream_));
       stream_ = o.stream_;
       cusolver_ = o.cusolver_;
       cublas_ = o.cublas_;
diff --git a/unsupported/Eigen/src/GPU/GpuSparseContext.h b/unsupported/Eigen/src/GPU/GpuSparseContext.h
new file mode 100644
index 0000000..45926f8
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSparseContext.h
@@ -0,0 +1,404 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
+// multiply (SpMM) via cuSPARSE.
+//
+// SparseContext manages a cuSPARSE handle and device buffers. It accepts
+// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
+// GPU. RowMajor input is implicitly converted to ColMajor.
+//
+// Thread safety: not thread-safe. Concurrent multiply* calls on a single
+// SparseContext race on the cuSPARSE handle, the bound stream, and the
+// cached device buffers. Use one SparseContext per thread.
+//
+// Usage:
+//   SparseContext<double> ctx;
+//   VectorXd y = ctx.multiply(A, x);                  // y = A * x
+//   ctx.multiply(A, x, y, 2.0, 1.0);                  // y = 2*A*x + y
+//   ctx.multiply(A, x, y, 1.0, 0.0, gpu::GpuOp::ConjTrans);  // y = A^H * x
+//   VectorXd z = ctx.multiplyT(A, x);                 // z = A^T * x
+//   VectorXcd w = ctx.multiplyAdjoint(A, x);          // w = A^H * x (complex)
+//   MatrixXd Y = ctx.multiplyMat(A, X);               // Y = A * X (multiple RHS)
+
+#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
+#define EIGEN_GPU_SPARSE_CONTEXT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSparseSupport.h"
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class SparseContext {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using StorageIndex = int;
+  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using DenseVector = Matrix<Scalar, Dynamic, 1>;
+  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  SparseContext() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
+    EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
+  }
+
+  ~SparseContext() {
+    destroy_descriptors_unchecked();
+    if (handle_) (void)cusparseDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  SparseContext(const SparseContext&) = delete;
+  SparseContext& operator=(const SparseContext&) = delete;
+
+  // ---- SpMV: y = A * x -----------------------------------------------------
+
+  /** Compute y = A * x. Returns y as a new dense vector. */
+  template <typename InputType, typename Rhs>
+  DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat mat(input);
+    DenseVector y(mat.rows());
+    y.setZero();
+    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
+    return y;
+  }
+
+  /** Compute y = alpha * op(A) * x + beta * y (in-place). */
+  template <typename InputType, typename Rhs, typename Dest>
+  void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
+                Scalar alpha = Scalar(1), Scalar beta = Scalar(0), GpuOp op = GpuOp::NoTrans) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat mat(input);
+    multiply_impl(mat, x.derived(), y.derived(), alpha, beta, internal::to_cusparse_op_for_scalar<Scalar>(op));
+  }
+
+  // ---- SpMV transpose: y = A^T * x -----------------------------------------
+
+  /** Compute y = A^T * x. Returns y as a new dense vector. */
+  template <typename InputType, typename Rhs>
+  DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat mat(input);
+    DenseVector y(mat.cols());
+    y.setZero();
+    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
+    return y;
+  }
+
+  // ---- SpMV adjoint: y = A^H * x -------------------------------------------
+
+  /** Compute y = A^H * x (conjugate transpose). For real Scalar this is equivalent to multiplyT. */
+  template <typename InputType, typename Rhs>
+  DenseVector multiplyAdjoint(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat mat(input);
+    DenseVector y(mat.cols());
+    y.setZero();
+    multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0),
+                  internal::to_cusparse_op_for_scalar<Scalar>(GpuOp::ConjTrans));
+    return y;
+  }
+
+  // ---- SpMM: Y = op(A) * X (multiple RHS) ----------------------------------
+
+  /** Compute Y = op(A) * X where X is a dense matrix (multiple RHS). Returns Y. */
+  template <typename InputType, typename Rhs>
+  DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X, GpuOp op = GpuOp::NoTrans) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat mat(input);
+    const DenseMatrix rhs(X.derived());
+
+    const cusparseOperation_t cu_op = internal::to_cusparse_op_for_scalar<Scalar>(op);
+    const Index m = (op == GpuOp::NoTrans) ? mat.rows() : mat.cols();
+    const Index k = (op == GpuOp::NoTrans) ? mat.cols() : mat.rows();
+    eigen_assert(k == rhs.rows());
+
+    const Index n = rhs.cols();
+    if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
+
+    DenseMatrix Y = DenseMatrix::Zero(m, n);
+    spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), cu_op);
+    return Y;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusparseHandle_t handle_ = nullptr;
+
+  // Cached device buffers (grow-only).
+  internal::DeviceBuffer d_outerPtr_;
+  internal::DeviceBuffer d_innerIdx_;
+  internal::DeviceBuffer d_values_;
+  internal::DeviceBuffer d_x_;
+  internal::DeviceBuffer d_y_;
+  internal::DeviceBuffer d_workspace_;
+  size_t d_outerPtr_size_ = 0;
+  size_t d_innerIdx_size_ = 0;
+  size_t d_values_size_ = 0;
+  size_t d_x_size_ = 0;
+  size_t d_y_size_ = 0;
+  size_t d_workspace_size_ = 0;
+
+  // Cached cuSPARSE descriptors.
+  cusparseSpMatDescr_t spmat_desc_ = nullptr;
+  Index cached_rows_ = -1;
+  Index cached_cols_ = -1;
+  Index cached_nnz_ = -1;
+
+  // ---- SpMV implementation --------------------------------------------------
+
+  template <typename RhsDerived, typename DestDerived>
+  void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
+                     cusparseOperation_t op) {
+    eigen_assert(A.isCompressed());
+
+    const Index m = A.rows();
+    const Index n = A.cols();
+    const Index nnz = A.nonZeros();
+    const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
+    const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
+
+    eigen_assert(x.size() == x_size);
+    eigen_assert(y.size() == y_size);
+
+    if (m == 0 || n == 0 || nnz == 0) {
+      if (beta == Scalar(0))
+        y.setZero();
+      else
+        y *= beta;
+      return;
+    }
+
+    // Upload sparse matrix to device.
+    upload_sparse(A);
+
+    // Upload x to device.
+    ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
+    const DenseVector x_tmp(x);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x_.get(), x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+
+    // Upload y to device (for beta != 0).
+    ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
+    if (beta != Scalar(0)) {
+      const DenseVector y_tmp(y);
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_y_.get(), y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
+    }
+
+    // Create dense vector descriptors.
+    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+    cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.get(), dtype));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.get(), dtype));
+
+    // Query workspace size.
+    size_t ws_size = 0;
+    EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
+                                                 CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
+    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
+
+    // Execute SpMV.
+    EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
+                                      CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.get()));
+
+    // Download result.
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(y.data(), d_y_.get(), y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(x_desc));
+    EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(y_desc));
+  }
+
+  // ---- SpMM implementation --------------------------------------------------
+
+  void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
+                 cusparseOperation_t op) {
+    eigen_assert(A.isCompressed());
+
+    // For op != NON_TRANSPOSE, Y = op(A) * X. The dense X / Y descriptors must
+    // describe the *post-op* shapes: X has k_op rows (= input dim of op(A)),
+    // Y has m_op rows (= output dim of op(A)).
+    const bool transposed = (op != CUSPARSE_OPERATION_NON_TRANSPOSE);
+    const Index m_op = transposed ? A.cols() : A.rows();
+    const Index k_op = transposed ? A.rows() : A.cols();
+    const Index n = X.cols();
+    const Index nnz = A.nonZeros();
+
+    if (m_op == 0 || n == 0 || k_op == 0 || nnz == 0) {
+      if (beta == Scalar(0))
+        Y.setZero();
+      else
+        Y *= beta;
+      return;
+    }
+
+    upload_sparse(A);
+
+    // Upload X to device. X is k_op x n, Y is m_op x n (column-major).
+    const size_t x_bytes = static_cast<size_t>(k_op) * static_cast<size_t>(n) * sizeof(Scalar);
+    const size_t y_bytes = static_cast<size_t>(m_op) * static_cast<size_t>(n) * sizeof(Scalar);
+    ensure_buffer(d_x_, d_x_size_, x_bytes);
+    ensure_buffer(d_y_, d_y_size_, y_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.get(), X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
+    if (beta != Scalar(0)) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.get(), Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
+    }
+
+    // Create dense matrix descriptors.
+    constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
+    cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
+    // Eigen is column-major, so ld = rows.
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k_op, n, k_op, d_x_.get(), dtype, CUSPARSE_ORDER_COL));
+    EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m_op, n, m_op, d_y_.get(), dtype, CUSPARSE_ORDER_COL));
+
+    // Query workspace.
+    size_t ws_size = 0;
+    EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
+                                                 x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
+    ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
+
+    // Execute SpMM.
+    EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
+                                      y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.get()));
+
+    // Download result.
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.get(), y_bytes, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(x_desc));
+    EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(y_desc));
+  }
+
+  // ---- Helpers --------------------------------------------------------------
+
+  static void check_storage_index_bounds(Index rows, Index cols, Index nnz) {
+    const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)());
+    eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index &&
+                 "gpu::SparseContext currently uses int StorageIndex; matrix dimensions or nonzeros exceed int range");
+    EIGEN_UNUSED_VARIABLE(rows);
+    EIGEN_UNUSED_VARIABLE(cols);
+    EIGEN_UNUSED_VARIABLE(nnz);
+    EIGEN_UNUSED_VARIABLE(max_storage_index);
+  }
+
+  void upload_sparse(const SpMat& A) {
+    // cuSPARSE 12.0+ accepts CSC directly for both SpMV and SpMM. cuSPARSE
+    // 11.x SpMM rejects CSC ("unsupported matrix format for matA (CSC)") and
+    // SpMV with CONJUGATE_TRANSPOSE on CSC+complex silently demotes to
+    // TRANSPOSE. CSR works on every version, so on 11.x we transpose-copy
+    // the user's ColMajor input into a RowMajor (CSR) representation.
+#if CUSPARSE_VERSION >= 12000
+    upload_compressed_arrays(A.rows(), A.cols(), A.nonZeros(),
+                             /*outer_count=*/A.cols() + 1, A.outerIndexPtr(), A.innerIndexPtr(), A.valuePtr(),
+                             /*is_csr=*/false);
+#else
+    using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
+    const CsrMat csr(A);
+    upload_compressed_arrays(csr.rows(), csr.cols(), csr.nonZeros(),
+                             /*outer_count=*/csr.rows() + 1, csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr(),
+                             /*is_csr=*/true);
+    // cudaMemcpyAsync from pageable host memory blocks the host until the
+    // source is consumed, so the CsrMat temporary's lifetime is sufficient.
+#endif
+  }
+
+  void upload_compressed_arrays(Index m, Index n, Index nnz, Index outer_count, const StorageIndex* host_outer,
+                                const StorageIndex* host_inner, const Scalar* host_values, bool is_csr) {
+    const size_t outer_bytes = static_cast<size_t>(outer_count) * sizeof(StorageIndex);
+    const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
+    const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
+
+    ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
+    ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
+    ensure_buffer(d_values_, d_values_size_, val_bytes);
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_outerPtr_.get(), host_outer, outer_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_innerIdx_.get(), host_inner, inner_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), host_values, val_bytes, cudaMemcpyHostToDevice, stream_));
+
+    if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
+      destroy_descriptors_checked();
+
+      constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
+      constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
+
+      if (is_csr) {
+        EIGEN_CUSPARSE_CHECK(cusparseCreateCsr(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
+                                               d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
+                                               val_type));
+      } else {
+        EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
+                                               d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
+                                               val_type));
+      }
+      cached_rows_ = m;
+      cached_cols_ = n;
+      cached_nnz_ = nnz;
+    } else if (is_csr) {
+      EIGEN_CUSPARSE_CHECK(cusparseCsrSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
+    } else {
+      EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
+    }
+  }
+
+  // Destructor-only cleanup: there is no useful recovery path for failures.
+  void destroy_descriptors_unchecked() {
+    if (spmat_desc_) {
+      (void)cusparseDestroySpMat(spmat_desc_);
+      spmat_desc_ = nullptr;
+    }
+    cached_rows_ = -1;
+    cached_cols_ = -1;
+    cached_nnz_ = -1;
+  }
+
+  void destroy_descriptors_checked() {
+    if (spmat_desc_) {
+      EIGEN_CUSPARSE_CHECK(cusparseDestroySpMat(spmat_desc_));
+      spmat_desc_ = nullptr;
+    }
+    cached_rows_ = -1;
+    cached_cols_ = -1;
+    cached_nnz_ = -1;
+  }
+
+  void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
+    if (needed > current_size) {
+      if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      buf = internal::DeviceBuffer(needed);
+      current_size = needed;
+    }
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_CONTEXT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLDLT.h b/unsupported/Eigen/src/GPU/GpuSparseLDLT.h
new file mode 100644
index 0000000..7a6e7b7
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSparseLDLT.h
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU sparse LDL^T / LDL^H factorization via cuDSS.
+//
+// For symmetric indefinite (or Hermitian indefinite) sparse matrices.
+// Same three-phase workflow as SparseLLT.
+//
+// Usage:
+//   SparseLDLT<double> ldlt(A);      // analyze + factorize
+//   VectorXd x = ldlt.solve(b);      // solve
+
+#ifndef EIGEN_GPU_SPARSE_LDLT_H
+#define EIGEN_GPU_SPARSE_LDLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+namespace gpu {
+
+/** GPU sparse LDL^T factorization (symmetric indefinite / Hermitian indefinite).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_SYMMETRIC (real) or CUDSS_MTYPE_HERMITIAN (complex).
+ * Uses pivoting for numerical stability.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class SparseLDLT : public internal::SparseSolverBase<Scalar_, SparseLDLT<Scalar_, UpLo_>> {
+  using Base = internal::SparseSolverBase<Scalar_, SparseLDLT>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  SparseLDLT() = default;
+
+  template <typename InputType>
+  explicit SparseLDLT(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return false; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_symmetric_type<Scalar>::value; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
+    return internal::cudss_view_type<UpLo, ColMajor>::value;
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LDLT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLLT.h b/unsupported/Eigen/src/GPU/GpuSparseLLT.h
new file mode 100644
index 0000000..9d3f98b
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSparseLLT.h
@@ -0,0 +1,65 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU sparse Cholesky (LL^T / LL^H) via cuDSS.
+//
+// Usage:
+//   SparseLLT<double> llt(A);        // analyze + factorize
+//   VectorXd x = llt.solve(b);       // solve
+//   llt.analyzePattern(A);           // or separate phases
+//   llt.factorize(A_new);            // reuse symbolic analysis
+
+#ifndef EIGEN_GPU_SPARSE_LLT_H
+#define EIGEN_GPU_SPARSE_LLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+namespace gpu {
+
+/** GPU sparse Cholesky factorization (LL^T for real, LL^H for complex).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_SPD (real) or CUDSS_MTYPE_HPD (complex).
+ * Accepts a ColMajor SparseMatrix (CSC) whose arrays are reinterpreted as CSR
+ * with a swapped triangle view, avoiding a host-side CSR conversion.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ * \tparam UpLo_    Lower (default) or Upper — which triangle of A is stored
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class SparseLLT : public internal::SparseSolverBase<Scalar_, SparseLLT<Scalar_, UpLo_>> {
+  using Base = internal::SparseSolverBase<Scalar_, SparseLLT>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  SparseLLT() = default;
+
+  template <typename InputType>
+  explicit SparseLLT(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return false; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return internal::cudss_spd_type<Scalar>::value; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() {
+    return internal::cudss_view_type<UpLo, ColMajor>::value;
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LLT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSparseLU.h b/unsupported/Eigen/src/GPU/GpuSparseLU.h
new file mode 100644
index 0000000..9964ee9
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSparseLU.h
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU sparse LU factorization via cuDSS.
+//
+// For general (non-symmetric) sparse matrices. Uses pivoting.
+// Same three-phase workflow as SparseLLT.
+//
+// Usage:
+//   SparseLU<double> lu(A);          // analyze + factorize
+//   VectorXd x = lu.solve(b);        // solve
+
+#ifndef EIGEN_GPU_SPARSE_LU_H
+#define EIGEN_GPU_SPARSE_LU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSparseSolverBase.h"
+
+namespace Eigen {
+namespace gpu {
+
+/** GPU sparse LU factorization (general matrices).
+ *
+ * Wraps cuDSS with CUDSS_MTYPE_GENERAL and CUDSS_MVIEW_FULL.
+ * Accepts ColMajor SparseMatrix (CSC); internally converts to RowMajor
+ * CSR since cuDSS requires CSR input.
+ *
+ * \tparam Scalar_  float, double, complex<float>, or complex<double>
+ */
+template <typename Scalar_>
+class SparseLU : public internal::SparseSolverBase<Scalar_, SparseLU<Scalar_>> {
+  using Base = internal::SparseSolverBase<Scalar_, SparseLU>;
+  friend Base;
+
+ public:
+  using Scalar = Scalar_;
+
+  SparseLU() = default;
+
+  template <typename InputType>
+  explicit SparseLU(const SparseMatrixBase<InputType>& A) {
+    this->compute(A);
+  }
+
+  static constexpr bool needs_csr_conversion() { return true; }
+  static constexpr cudssMatrixType_t cudss_matrix_type() { return CUDSS_MTYPE_GENERAL; }
+  static constexpr cudssMatrixViewType_t cudss_matrix_view() { return CUDSS_MVIEW_FULL; }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_LU_H
diff --git a/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h b/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h
new file mode 100644
index 0000000..6813741
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSparseSolverBase.h
@@ -0,0 +1,389 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Common base for GPU sparse direct solvers (LLT, LDLT, LU) via cuDSS.
+//
+// All three solver types share the same three-phase workflow
+// (analyzePattern → factorize → solve) and differ only in the
+// cudssMatrixType_t and cudssMatrixViewType_t passed to cuDSS.
+// This CRTP base implements the entire workflow; derived classes
+// provide the matrix type/view via static constexpr members.
+//
+// Thread safety: not thread-safe. Concurrent calls (including concurrent
+// solve() calls on the same instance, even though solve() is const) race
+// on the cuDSS handle, the bound stream, and the cached scratch buffers.
+// Use one solver instance per thread, matching Eigen::SimplicialLLT.
+
+#ifndef EIGEN_GPU_SPARSE_SOLVER_BASE_H
+#define EIGEN_GPU_SPARSE_SOLVER_BASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuDssSupport.h"
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+/** CRTP base for GPU sparse direct solvers.
+ *
+ * \tparam Scalar_  Element type (passed explicitly to avoid incomplete-type issues with CRTP).
+ * \tparam Derived  The concrete solver class (SparseLLT, SparseLDLT, SparseLU).
+ *                  Must provide:
+ *                  - `static constexpr bool needs_csr_conversion()`
+ *                  - `static constexpr cudssMatrixType_t cudss_matrix_type()`
+ *                  - `static constexpr cudssMatrixViewType_t cudss_matrix_view()`
+ */
+template <typename Scalar_, typename Derived>
+class SparseSolverBase {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using StorageIndex = int;
+  using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
+  using DenseVector = Matrix<Scalar, Dynamic, 1>;
+  using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  SparseSolverBase() { init_context(); }
+
+  ~SparseSolverBase() {
+    destroy_cudss_objects();
+    if (handle_) (void)cudssDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  SparseSolverBase(const SparseSolverBase&) = delete;
+  SparseSolverBase& operator=(const SparseSolverBase&) = delete;
+
+  // ---- Configuration --------------------------------------------------------
+
+  /** Set the fill-reducing ordering algorithm. Must be called before compute/analyzePattern. */
+  void setOrdering(GpuSparseOrdering ordering) { ordering_ = ordering; }
+
+  // ---- Factorization --------------------------------------------------------
+
+  /** Symbolic analysis + numeric factorization. */
+  template <typename InputType>
+  Derived& compute(const SparseMatrixBase<InputType>& A) {
+    analyzePattern(A);
+    if (info_ == Success) {
+      factorize(A);
+    }
+    return derived();
+  }
+
+  /** Symbolic analysis only. Uploads sparsity structure to device.
+   * This phase is synchronous (blocks until complete). */
+  template <typename InputType>
+  Derived& analyzePattern(const SparseMatrixBase<InputType>& A) {
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat csc(input);
+    eigen_assert(csc.rows() == csc.cols() && "GpuSparseSolver requires a square matrix");
+    eigen_assert(csc.isCompressed() && "GpuSparseSolver requires a compressed sparse matrix");
+
+    n_ = csc.rows();
+    info_ = InvalidInput;
+    analysis_done_ = false;
+
+    if (n_ == 0) {
+      nnz_ = 0;
+      info_ = Success;
+      analysis_done_ = true;
+      return derived();
+    }
+
+    // For symmetric solvers, ColMajor CSC can be reinterpreted as CSR with
+    // swapped triangle view (zero copy). For general solvers, we must convert
+    // to actual RowMajor CSR so cuDSS sees the correct matrix, not A^T.
+    if (Derived::needs_csr_conversion()) {
+      const CsrMat csr(csc);
+      nnz_ = csr.nonZeros();
+      upload_csr(csr);
+    } else {
+      nnz_ = csc.nonZeros();
+      upload_csr_from_csc(csc);
+    }
+    create_cudss_matrix();
+    apply_ordering_config();
+
+    if (data_) EIGEN_CUDSS_CHECK(cudssDataDestroy(handle_, data_));
+    EIGEN_CUDSS_CHECK(cudssDataCreate(handle_, &data_));
+
+    create_placeholder_dense();
+
+    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_ANALYSIS, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
+
+    analysis_done_ = true;
+    info_ = Success;
+    return derived();
+  }
+
+  /** Numeric factorization using the symbolic analysis from analyzePattern.
+   *
+   * \warning The sparsity pattern (outerIndexPtr, innerIndexPtr) must be
+   * identical to the one passed to analyzePattern(). Only the numerical
+   * values may change. Passing a different pattern is undefined behavior.
+   * This matches the contract of CHOLMOD, UMFPACK, and cuDSS's own API.
+   *
+   * This phase is asynchronous — info() lazily synchronizes. */
+  template <typename InputType>
+  Derived& factorize(const SparseMatrixBase<InputType>& A) {
+    eigen_assert(analysis_done_ && "factorize() requires analyzePattern() first");
+
+    if (n_ == 0) {
+      info_ = Success;
+      return derived();
+    }
+
+    // Convert to the same format used in analyzePattern. Both temporaries
+    // must outlive the async memcpy below (cudaMemcpyAsync from pageable host
+    // memory behaves synchronously w.r.t. the host, but we do not rely on that).
+    const InputType& input = A.derived();
+    check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
+    const SpMat csc(input);
+    eigen_assert(csc.rows() == n_ && csc.cols() == n_);
+
+    const Scalar* value_ptr;
+    Index value_nnz;
+    CsrMat csr_tmp;
+    if (Derived::needs_csr_conversion()) {
+      csr_tmp = CsrMat(csc);
+      value_ptr = csr_tmp.valuePtr();
+      value_nnz = csr_tmp.nonZeros();
+    } else {
+      value_ptr = csc.valuePtr();
+      value_nnz = csc.nonZeros();
+    }
+    eigen_assert(value_nnz == nnz_);
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), value_ptr, static_cast<size_t>(nnz_) * sizeof(Scalar),
+                                             cudaMemcpyHostToDevice, stream_));
+
+    EIGEN_CUDSS_CHECK(cudssMatrixSetValues(d_A_cudss_, d_values_.get()));
+
+    info_ = InvalidInput;
+    info_synced_ = false;
+    EIGEN_CUDSS_CHECK(
+        cudssExecute(handle_, CUDSS_PHASE_FACTORIZATION, config_, data_, d_A_cudss_, d_x_cudss_, d_b_cudss_));
+
+    return derived();
+  }
+
+  // ---- Solve ----------------------------------------------------------------
+
+  /** Solve A * X = B. Returns X as a dense matrix.
+   * Supports single or multiple right-hand sides. */
+  template <typename Rhs>
+  DenseMatrix solve(const MatrixBase<Rhs>& B) const {
+    sync_info();
+    eigen_assert(info_ == Success && "GpuSparseSolver::solve requires a successful factorization");
+    eigen_assert(B.rows() == n_);
+
+    const DenseMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+
+    if (n_ == 0) return DenseMatrix(0, rhs.cols());
+
+    // Reuse cached d_b/d_x scratch to avoid cudaMalloc/cudaFree per solve.
+    const size_t rhs_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+    ensure_solve_buffer(d_b_solve_, d_b_solve_size_, rhs_bytes);
+    ensure_solve_buffer(d_x_solve_, d_x_solve_size_, rhs_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_b_solve_.get(), rhs.data(), rhs_bytes, cudaMemcpyHostToDevice, stream_));
+
+    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+    cudssMatrix_t b_cudss = nullptr, x_cudss = nullptr;
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&b_cudss, n_, nrhs, n_, d_b_solve_.get(), dtype, CUDSS_LAYOUT_COL_MAJOR));
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&x_cudss, n_, nrhs, n_, d_x_solve_.get(), dtype, CUDSS_LAYOUT_COL_MAJOR));
+
+    EIGEN_CUDSS_CHECK(cudssExecute(handle_, CUDSS_PHASE_SOLVE, config_, data_, d_A_cudss_, x_cudss, b_cudss));
+
+    DenseMatrix X(n_, rhs.cols());
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_x_solve_.get(), rhs_bytes, cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    EIGEN_CUDSS_CHECK(cudssMatrixDestroy(b_cudss));
+    EIGEN_CUDSS_CHECK(cudssMatrixDestroy(x_cudss));
+
+    return X;
+  }
+
+  // ---- Accessors ------------------------------------------------------------
+
+  ComputationInfo info() const {
+    sync_info();
+    return info_;
+  }
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+
+  cudaStream_t stream() const { return stream_; }
+
+ protected:
+  // ---- CUDA / cuDSS handles -------------------------------------------------
+  cudaStream_t stream_ = nullptr;
+  cudssHandle_t handle_ = nullptr;
+  cudssConfig_t config_ = nullptr;
+  cudssData_t data_ = nullptr;
+  cudssMatrix_t d_A_cudss_ = nullptr;
+  cudssMatrix_t d_x_cudss_ = nullptr;
+  cudssMatrix_t d_b_cudss_ = nullptr;
+
+  // ---- Device buffers for CSR arrays ----------------------------------------
+  DeviceBuffer d_rowPtr_;
+  DeviceBuffer d_colIdx_;
+  DeviceBuffer d_values_;
+
+  // ---- Cached scratch for solve() (mutable so const solve() can grow them) --
+  mutable DeviceBuffer d_b_solve_;
+  mutable DeviceBuffer d_x_solve_;
+  mutable size_t d_b_solve_size_ = 0;
+  mutable size_t d_x_solve_size_ = 0;
+
+  // ---- State ----------------------------------------------------------------
+  int64_t n_ = 0;
+  int64_t nnz_ = 0;
+  mutable ComputationInfo info_ = InvalidInput;
+  mutable bool info_synced_ = true;
+  bool analysis_done_ = false;
+  GpuSparseOrdering ordering_ = GpuSparseOrdering::AMD;
+
+ private:
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUDSS_CHECK(cudssCreate(&handle_));
+    EIGEN_CUDSS_CHECK(cudssSetStream(handle_, stream_));
+    EIGEN_CUDSS_CHECK(cudssConfigCreate(&config_));
+  }
+
+  void ensure_solve_buffer(DeviceBuffer& buf, size_t& current_size, size_t needed) const {
+    if (needed > current_size) {
+      if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      buf = DeviceBuffer(needed);
+      current_size = needed;
+    }
+  }
+
+  void sync_info() const {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      int cudss_info = 0;
+      EIGEN_CUDSS_CHECK(cudssDataGet(handle_, data_, CUDSS_DATA_INFO, &cudss_info, sizeof(cudss_info), nullptr));
+      info_ = (cudss_info == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  // Destructor-only cleanup: there is no useful recovery path for failures.
+  void destroy_cudss_objects() {
+    if (d_A_cudss_) {
+      (void)cudssMatrixDestroy(d_A_cudss_);
+      d_A_cudss_ = nullptr;
+    }
+    if (d_x_cudss_) {
+      (void)cudssMatrixDestroy(d_x_cudss_);
+      d_x_cudss_ = nullptr;
+    }
+    if (d_b_cudss_) {
+      (void)cudssMatrixDestroy(d_b_cudss_);
+      d_b_cudss_ = nullptr;
+    }
+    if (data_) {
+      (void)cudssDataDestroy(handle_, data_);
+      data_ = nullptr;
+    }
+    if (config_) {
+      (void)cudssConfigDestroy(config_);
+      config_ = nullptr;
+    }
+  }
+
+  // Upload CSR from a RowMajor sparse matrix (native CSR).
+  void upload_csr(const CsrMat& csr) { upload_compressed(csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr()); }
+
+  // Upload CSC arrays reinterpreted as CSR (for symmetric matrices: CSC(A) = CSR(A^T) = CSR(A)).
+  void upload_csr_from_csc(const SpMat& csc) {
+    upload_compressed(csc.outerIndexPtr(), csc.innerIndexPtr(), csc.valuePtr());
+  }
+
+  static void check_storage_index_bounds(Index rows, Index cols, Index nnz) {
+    const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)());
+    eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index &&
+                 "gpu sparse solvers currently use int StorageIndex; matrix dimensions or nonzeros exceed int range");
+    EIGEN_UNUSED_VARIABLE(rows);
+    EIGEN_UNUSED_VARIABLE(cols);
+    EIGEN_UNUSED_VARIABLE(nnz);
+    EIGEN_UNUSED_VARIABLE(max_storage_index);
+  }
+
+  void upload_compressed(const StorageIndex* outer, const StorageIndex* inner, const Scalar* values) {
+    const size_t rowptr_bytes = static_cast<size_t>(n_ + 1) * sizeof(StorageIndex);
+    const size_t colidx_bytes = static_cast<size_t>(nnz_) * sizeof(StorageIndex);
+    const size_t values_bytes = static_cast<size_t>(nnz_) * sizeof(Scalar);
+
+    d_rowPtr_ = DeviceBuffer(rowptr_bytes);
+    d_colIdx_ = DeviceBuffer(colidx_bytes);
+    d_values_ = DeviceBuffer(values_bytes);
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_rowPtr_.get(), outer, rowptr_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_colIdx_.get(), inner, colidx_bytes, cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), values, values_bytes, cudaMemcpyHostToDevice, stream_));
+  }
+
+  void create_cudss_matrix() {
+    if (d_A_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_A_cudss_));
+
+    constexpr cudaDataType_t idx_type = cudss_index_type<StorageIndex>::value;
+    constexpr cudaDataType_t val_type = cuda_data_type<Scalar>::value;
+    constexpr cudssMatrixType_t mtype = Derived::cudss_matrix_type();
+    constexpr cudssMatrixViewType_t mview = Derived::cudss_matrix_view();
+
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateCsr(&d_A_cudss_, n_, n_, nnz_, d_rowPtr_.get(),
+                                           /*rowEnd=*/nullptr, d_colIdx_.get(), d_values_.get(), idx_type, val_type,
+                                           mtype, mview, CUDSS_BASE_ZERO));
+  }
+
+  void apply_ordering_config() {
+    cudssAlgType_t alg;
+    switch (ordering_) {
+      case GpuSparseOrdering::AMD:
+        alg = CUDSS_ALG_DEFAULT;
+        break;
+      case GpuSparseOrdering::METIS:
+        alg = CUDSS_ALG_2;
+        break;
+      case GpuSparseOrdering::RCM:
+        alg = CUDSS_ALG_3;
+        break;
+      default:
+        alg = CUDSS_ALG_DEFAULT;
+        break;
+    }
+    EIGEN_CUDSS_CHECK(cudssConfigSet(config_, CUDSS_CONFIG_REORDERING_ALG, &alg, sizeof(alg)));
+  }
+
+  void create_placeholder_dense() {
+    if (d_x_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_x_cudss_));
+    if (d_b_cudss_) EIGEN_CUDSS_CHECK(cudssMatrixDestroy(d_b_cudss_));
+    constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_x_cudss_, n_, 1, n_, nullptr, dtype, CUDSS_LAYOUT_COL_MAJOR));
+    EIGEN_CUDSS_CHECK(cudssMatrixCreateDn(&d_b_cudss_, n_, 1, n_, nullptr, dtype, CUDSS_LAYOUT_COL_MAJOR));
+  }
+};
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SPARSE_SOLVER_BASE_H
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h
index 2c9b0cc..4b02869 100644
--- a/unsupported/Eigen/src/GPU/GpuSupport.h
+++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -28,6 +28,14 @@
 
 namespace Eigen {
 namespace gpu {
+
+// ---- Generic operation flag -------------------------------------------------
+// Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls.
+// Each library's support header maps this to its own enum (cublasOperation_t,
+// cusparseOperation_t, etc.) via a small to_<lib>_op() helper.
+
+enum class GpuOp { NoTrans, Trans, ConjTrans };
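+
+// Illustrative sketch only (the real helpers live in the per-library support
+// headers); a cuSPARSE-flavoured mapping would look roughly like:
+//
+//   inline cusparseOperation_t to_cusparse_op(GpuOp op) {
+//     switch (op) {
+//       case GpuOp::NoTrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+//       case GpuOp::Trans:   return CUSPARSE_OPERATION_TRANSPOSE;
+//       default:             return CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE;
+//     }
+//   }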
+
 namespace internal {
 
 // ---- Error-checking macros --------------------------------------------------
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md
index b7c4269..d7411bb 100644
--- a/unsupported/Eigen/src/GPU/README.md
+++ b/unsupported/Eigen/src/GPU/README.md
@@ -1,8 +1,9 @@
 # Eigen GPU Module (`unsupported/Eigen/GPU`)
 
-GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
-CUDA Math Libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
-against CUDA runtime, cuBLAS, and cuSOLVER).
+GPU-accelerated linear algebra for Eigen users, dispatching to NVIDIA CUDA
+Math Libraries (cuBLAS, cuSOLVER, cuFFT, cuSPARSE, cuDSS). Requires CUDA
+11.4+; cuDSS features require CUDA 12.0+ and a separate cuDSS install.
+Header-only.
 
 ## Why this module
 
@@ -10,25 +11,31 @@
 in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
 Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
 projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
-without dropping down to raw CUDA library APIs. Third-party projects like
-[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
-[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
-this gap, and downstream projects like
-[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
-[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
-GPU-accelerated solvers through Eigen.
+without dropping down to raw CUDA library APIs.
 
-The `unsupported/Eigen/GPU` module aims to close this gap: Existing Eigen users should be
-able to move performance-critical dense linear algebra to the GPU with minimal
-code changes and without learning CUDA library APIs directly.
+GPU sparse solvers are a particularly acute gap. Sparse factorization is the
+bottleneck in SLAM, bundle adjustment, FEM, and nonlinear optimization --
+exactly the workloads where GPU acceleration matters most. Downstream projects
+like [Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
+[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
+GPU-accelerated sparse solvers, and third-party projects like
+[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically because
+Eigen lacks them. The `unsupported/Eigen/GPU` module provides GPU sparse Cholesky, LDL^T,
+and LU factorization via cuDSS, alongside dense solvers (cuSOLVER), matrix
+products (cuBLAS), FFT (cuFFT), and sparse matrix-vector products (cuSPARSE).
+
+Existing Eigen users should be able to move performance-critical dense or
+sparse linear algebra to the GPU with minimal code changes and without
+learning CUDA library APIs directly.
 
 ## Design philosophy
 
 **CPU and GPU coexist.** There is no global compile-time switch that replaces
 CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
-explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
-the same binary. This also lets users keep the factored matrix on device across
-multiple solves, something impossible with compile-time replacement.
+explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>`, `gpu::SparseLLT<double>` vs
+`SimplicialLLT<SparseMatrix<double>>` -- and both coexist in the same binary.
+This also lets users keep the factored matrix on device across multiple solves,
+something impossible with compile-time replacement.
 
 **Familiar syntax.** GPU operations use the same expression patterns as CPU
 Eigen. Here is a side-by-side comparison:
@@ -38,6 +45,7 @@
 #include <Eigen/Dense>                  #define EIGEN_USE_GPU
                                         #include <unsupported/Eigen/GPU>
 
+// Dense
 MatrixXd A = ...;                       auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
 MatrixXd B = ...;                       auto d_B = gpu::DeviceMatrix<double>::fromHost(B);
 
@@ -45,11 +53,15 @@
 MatrixXd X = A.llt().solve(B);          gpu::DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
 
                                         MatrixXd X = d_X.toHost();
+
+// Sparse (using SpMat = SparseMatrix<double>)
+SimplicialLLT<SpMat> llt(A);            gpu::SparseLLT<double> llt(A);
+VectorXd x = llt.solve(b);              VectorXd x = llt.solve(b);
 ```
 
-The GPU version reads like CPU Eigen with explicit upload/download.
-`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
-cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
+The GPU version reads like CPU Eigen with explicit upload/download for dense
+operations, and an almost identical API for sparse solvers. Unsupported
+expressions are compile errors.
 
 **Standalone module.** `unsupported/Eigen/GPU` does not modify or depend on Eigen's Core
 expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
@@ -169,15 +181,17 @@
 auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans);           // A^T Y = B
 
 // QR solve (overdetermined least squares)
-gpu::QR<double> qr(A);                // host matrix input
-MatrixXd X = qr.solve(B);           // Q^H * B via ormqr, then trsm on R
+gpu::QR<double> qr;
+qr.compute(d_A);                     // factorize on device (async)
+auto d_X = qr.solve(d_B);           // Q^H * B via ormqr, then trsm on R
+MatrixXd X = d_X.toHost();
 
-// SVD
-gpu::SVD<double> svd(A, ComputeThinU | ComputeThinV);
-VectorXd S = svd.singularValues();
-MatrixXd U = svd.matrixU();
-MatrixXd VT = svd.matrixVT();
-MatrixXd X = svd.solve(B);          // pseudoinverse solve
+// SVD (results downloaded on access)
+gpu::SVD<double> svd;
+svd.compute(d_A, ComputeThinU | ComputeThinV);
+VectorXd S = svd.singularValues();   // downloads to host
+MatrixXd U = svd.matrixU();          // downloads to host
+MatrixXd VT = svd.matrixVT();        // V^T (matches cuSOLVER)
 
 // SVD: device-side views (no D2H transfer; svd must outlive the views)
 auto d_S = svd.d_singularValues();   // DeviceMatrix view of singular values
@@ -185,17 +199,89 @@
 auto d_VT = svd.d_matrixVT();        // DeviceMatrix view of V^T
 
 // Self-adjoint eigenvalue decomposition
-gpu::SelfAdjointEigenSolver<double> es(A);
-VectorXd eigenvals = es.eigenvalues();
-MatrixXd eigenvecs = es.eigenvectors();
-auto d_W = es.d_eigenvalues();        // DeviceMatrix view of eigenvalues
-auto d_V = es.d_eigenvectors();       // DeviceMatrix view of eigenvectors
+gpu::SelfAdjointEigenSolver<double> es;
+es.compute(d_A);
+VectorXd eigenvals = es.eigenvalues();    // downloads to host
+MatrixXd eigenvecs = es.eigenvectors();   // downloads to host
+auto d_W = es.d_eigenvalues();            // DeviceMatrix view of eigenvalues
+auto d_V = es.d_eigenvectors();           // DeviceMatrix view of eigenvectors
 ```
 
 The cached API keeps the factored matrix on device, avoiding redundant
-host-device transfers and re-factorizations. The `d_*` accessors return
-non-owning `DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain
-without round-tripping through host memory.
+host-device transfers and re-factorizations. All solvers also accept host
+matrices directly as a convenience (e.g., `gpu::LLT<double> llt(A)` or
+`qr.solve(B)`), which handles upload/download internally. The `d_*` accessors
+on `gpu::SVD` and `gpu::SelfAdjointEigenSolver` return non-owning
+`DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain without
+round-tripping through host memory.
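+
+A minimal sketch of such chaining (assuming `d_A` and `d_B` are
+`DeviceMatrix<double>` uploads as in the dense examples above):
+
+```cpp
+gpu::SVD<double> svd;
+svd.compute(d_A, ComputeThinU | ComputeThinV);
+auto d_U = svd.d_matrixU();                           // non-owning device view
+gpu::DeviceMatrix<double> d_P = d_U.adjoint() * d_B;  // cuBLAS GEMM, no host round-trip
+MatrixXd P = d_P.toHost();                            // download only the final result
+```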
+
+### Sparse direct solvers (cuDSS)
+
+Requires cuDSS (separate install, CUDA 12.0+). Define `EIGEN_CUDSS` before
+including `unsupported/Eigen/GPU` and link with `-lcudss`.
+
+```cpp
+SparseMatrix<double> A = ...;  // symmetric positive definite
+VectorXd b = ...;
+
+// Sparse Cholesky -- one-liner
+gpu::SparseLLT<double> llt(A);
+VectorXd x = llt.solve(b);
+
+// Three-phase workflow for repeated solves with the same sparsity pattern
+gpu::SparseLLT<double> llt;
+llt.analyzePattern(A);               // symbolic analysis (once)
+llt.factorize(A);                    // numeric factorization
+VectorXd x = llt.solve(b);
+llt.factorize(A_new_values);         // refactorize (reuses symbolic analysis)
+VectorXd x2 = llt.solve(b);
+
+// Sparse LDL^T (symmetric indefinite)
+gpu::SparseLDLT<double> ldlt(A);
+VectorXd x = ldlt.solve(b);
+
+// Sparse LU (general non-symmetric)
+gpu::SparseLU<double> lu(A);
+VectorXd x = lu.solve(b);
+```
+
+### FFT (cuFFT)
+
+```cpp
+gpu::FFT<float> fft;
+
+// 1D complex-to-complex
+VectorXcf X = fft.fwd(x);           // forward
+VectorXcf y = fft.inv(X);           // inverse (scaled by 1/n)
+
+// 1D real-to-complex / complex-to-real
+VectorXcf R = fft.fwd(r);           // returns n/2+1 complex (half-spectrum)
+VectorXf  s = fft.invReal(R, n);    // C2R inverse, caller specifies n
+
+// 2D complex-to-complex
+MatrixXcf B = fft.fwd2(A);         // 2D forward
+MatrixXcf C = fft.inv2(B);         // 2D inverse (scaled by 1/(rows*cols))
+
+// Plans are cached and reused across calls with the same size/type.
+```
+
+### Sparse matrix-vector multiply (cuSPARSE)
+
+```cpp
+SparseMatrix<double> A = ...;
+VectorXd x = ...;
+
+gpu::SparseContext<double> ctx;
+VectorXd y = ctx.multiply(A, x);            // y = A * x
+VectorXd z = ctx.multiplyT(A, x);           // z = A^T * x
+ctx.multiply(A, x, y, 2.0, 1.0);            // y = 2*A*x + y
+ctx.multiply(A, x, y, 1.0, 0.0,             // y = A^H * x (adjoint SpMV)
+             gpu::GpuOp::ConjTrans);
+
+// Multiple RHS (SpMM)
+MatrixXd Y = ctx.multiplyMat(A, X);                       // Y = A * X
+MatrixXd Z = ctx.multiplyMat(A, X, gpu::GpuOp::Trans);    // Z = A^T * X
+```
 
 ### Precision control
 
@@ -238,8 +324,7 @@
 
 ```cpp
 gpu::LLT<double> llt(d_A);             // factor stays on device
-auto d_X = llt.solve(d_B);
-auto h_x = d_X.toHostAsync(llt.stream());
+auto h_x = llt.solve(d_B).toHostAsync(llt.stream());
 h_x.get();                             // sync: factor + result complete
 // llt may now be destroyed without stalling the device
 ```
@@ -248,7 +333,8 @@
 
 ### Supported scalar types
 
-`float`, `double`, `std::complex<float>`, `std::complex<double>`.
+`float`, `double`, `std::complex<float>`, `std::complex<double>` (unless
+noted otherwise).
 
 ### Expression -> library call mapping
 
@@ -270,31 +356,43 @@
 | `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
 | `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
 
-### `DeviceMatrix<Scalar>` API
+### `DeviceMatrix<Scalar>`
 
-| Method | Sync? | Description |
-|--------|-------|-------------|
-| `DeviceMatrix()` | -- | Empty (0x0) |
-| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
-| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
-| `fromHostAsync(ptr, rows, cols, stream)` | no | Async upload (caller manages lifetime) |
-| `adopt(ptr, rows, cols)` | -- | Owning wrapper over a device pointer (caller relinquishes ownership) |
-| `view(ptr, rows, cols)` | -- | Non-owning view over a device pointer (does not free on destruction) |
-| `toHost(stream)` | yes | Synchronous download |
-| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
-| `clone(stream)` | no | Device-to-device deep copy |
-| `resize(rows, cols)` | -- | Discard contents, reallocate |
-| `data()` | -- | Raw device pointer |
-| `rows()`, `cols()` | -- | Dimensions |
-| `sizeInBytes()` | -- | Total device allocation size in bytes |
-| `empty()` | -- | True if 0x0 |
-| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
-| `transpose()` | -- | Transpose view (GEMM Trans) |
-| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
-| `lu()` | -- | LU expression builder |
-| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
-| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
-| `device(ctx)` | -- | Assignment proxy bound to context |
+Typed RAII wrapper for a dense column-major matrix in GPU device memory.
+Always dense (leading dimension = rows). A vector is a `DeviceMatrix` with
+one column.
+
+```cpp
+// Construction
+DeviceMatrix<Scalar>()                                   // Empty (0x0)
+DeviceMatrix<Scalar>(rows, cols)                         // Allocate uninitialized
+
+// Upload / download / pointer adoption
+static DeviceMatrix fromHost(matrix, stream=nullptr)           // -> DeviceMatrix (syncs)
+static DeviceMatrix fromHostAsync(ptr, rows, cols, outerStride, s)  // -> DeviceMatrix (no sync, caller manages ptr lifetime)
+static DeviceMatrix adopt(Scalar* device_ptr, rows, cols)      // Owning wrapper over a raw device pointer
+static DeviceMatrix view(Scalar* device_ptr, rows, cols)       // Non-owning view (does not free on destruction)
+PlainMatrix        toHost(stream=nullptr)                      // -> host Matrix (syncs)
+HostTransfer       toHostAsync(stream=nullptr)                 // -> HostTransfer future (no sync)
+DeviceMatrix       clone(stream=nullptr)                       // -> DeviceMatrix (D2D copy, async)
+
+// Dimensions and access
+Index   rows()
+Index   cols()
+size_t  sizeInBytes()
+bool    empty()
+Scalar* data()                                           // Raw device pointer
+void    resize(Index rows, Index cols)                   // Discard contents, reallocate
+
+// Expression builders (return lightweight views, evaluated on assignment)
+AdjointView        adjoint()                             // GEMM with ConjTrans
+TransposeView      transpose()                           // GEMM with Trans
+LltExpr            llt() / llt<UpLo>()                   // -> .solve(d_B) -> DeviceMatrix
+LuExpr             lu()                                  // -> .solve(d_B) -> DeviceMatrix
+TriangularView     triangularView<UpLo>()                // -> .solve(d_B) -> DeviceMatrix (TRSM)
+SelfAdjointView    selfadjointView<UpLo>()               // -> * d_B (SYMM), .rankUpdate(d_A) (SYRK)
+Assignment         device(gpu::Context& ctx)             // Bind assignment to explicit stream
+```
 
 ### `gpu::Context`
 
@@ -311,103 +409,205 @@
 
 Non-copyable, non-movable (owns library handles).
 
-### `gpu::LLT<Scalar, UpLo>` API
+### `gpu::LLT<Scalar, UpLo>` -- Dense Cholesky (cuSOLVER)
 
-GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
+Caches the Cholesky factor on device for repeated solves.
 
-| Method | Sync? | Description |
-|--------|-------|-------------|
-| `gpu::LLT(A)` | deferred | Construct and factorize from host matrix |
-| `compute(host_matrix)` | deferred | Upload and factorize |
-| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
-| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
-| `solve(host_matrix)` | yes | Solve, return host matrix |
-| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
-| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
+```cpp
+gpu::LLT()                                                // Default construct, then call compute()
+gpu::LLT(const EigenBase<D>& A)                           // Convenience: upload + factorize
 
-### `gpu::LU<Scalar>` API
+gpu::LLT&            compute(const EigenBase<D>& A)       // Upload + factorize
+gpu::LLT&            compute(const DeviceMatrix& d_A)     // D2D copy + factorize
+gpu::LLT&            compute(DeviceMatrix&& d_A)          // Adopt + factorize (no copy)
 
-GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a
-`gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`).
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+DeviceMatrix       solve(const DeviceMatrix& d_B)        // -> DeviceMatrix (async, stays on device)
 
-### `gpu::QR<Scalar>` API
+ComputationInfo    info()                                // Lazy sync on first call: Success or NumericalIssue
+Index              rows() / cols()
+cudaStream_t       stream()
+```
 
-GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply
-Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly.
+### `gpu::LU<Scalar>` -- Dense LU (cuSOLVER)
 
-| Method | Description |
-|--------|-------------|
-| `gpu::QR()` | Default construct, then call `compute()` |
-| `gpu::QR(A)` | Construct and factorize from host matrix |
-| `compute(A)` | Upload + factorize |
-| `compute(DeviceMatrix)` | D2D copy + factorize |
-| `solve(host_matrix)` | Solve, return host matrix (syncs) |
-| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) |
-| `matrixR()` | Download upper-triangular factor R to host (m >= n only) |
-| `info()` | Lazy sync |
-| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+Same pattern as `gpu::LLT`. Adds a `gpu::GpuOp` parameter on `solve()`.
 
-### `gpu::SVD<Scalar>` API
+```cpp
+PlainMatrix        solve(const MatrixBase<D>& B, GpuOp op = GpuOp::NoTrans)  // -> host Matrix
+DeviceMatrix       solve(const DeviceMatrix& d_B, GpuOp op = GpuOp::NoTrans) // -> DeviceMatrix
+```
 
-GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only
-modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`,
-or `0` (values only).
+`gpu::GpuOp`: `NoTrans`, `Trans`, `ConjTrans`.
 
-| Method | Description |
-|--------|-------------|
-| `gpu::SVD()` | Default construct, then call `compute()` |
-| `gpu::SVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) |
-| `compute(A, options)` | Compute from host matrix |
-| `compute(DeviceMatrix, options)` | Compute from device matrix |
-| `singularValues()` | Download singular values to host |
-| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) |
-| `matrixV()` | Download V to host (matches `JacobiSVD::matrixV()`) |
-| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) |
-| `d_singularValues()` | `DeviceMatrix` view of singular values (zero-copy) |
-| `d_matrixU()` | `DeviceMatrix` view of U (zero-copy when m >= n) |
-| `d_matrixVT()` | `DeviceMatrix` view of V^T (zero-copy when m >= n) |
-| `solve(B)` | Pseudoinverse solve (returns host matrix) |
-| `solve(B, k)` | Truncated solve (top k singular triplets) |
-| `solve(B, lambda)` | Tikhonov regularized solve |
-| `rank(threshold)` | Count singular values above threshold |
-| `info()` | Lazy sync |
-| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+### `gpu::QR<Scalar>` -- Dense QR (cuSOLVER)
 
-Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`;
-in that case `d_matrixU()` and `d_matrixVT()` allocate an owning result via a
-single `cublasXgeam` adjoint pass. `d_singularValues()` is always zero-copy.
+QR factorization via `cusolverDnXgeqrf`. Solve uses ORMQR (apply Q^H) + TRSM
+(back-substitute on R) -- Q is never formed explicitly.
 
-### `gpu::SelfAdjointEigenSolver<Scalar>` API
+```cpp
+gpu::QR()                                                  // Default construct
+gpu::QR(const EigenBase<D>& A)                             // Convenience: upload + factorize
 
-GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`).
+gpu::QR&             compute(const EigenBase<D>& A)        // Upload + factorize
+gpu::QR&             compute(const DeviceMatrix& d_A)      // D2D copy + factorize
 
-| Method | Description |
-|--------|-------------|
-| `gpu::SelfAdjointEigenSolver()` | Default construct, then call `compute()` |
-| `gpu::SelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) |
-| `compute(A, mode)` | Compute from host matrix |
-| `compute(DeviceMatrix, mode)` | Compute from device matrix |
-| `eigenvalues()` | Download eigenvalues to host (ascending order) |
-| `eigenvectors()` | Download eigenvectors to host (columns) |
-| `d_eigenvalues()` | `DeviceMatrix` view of eigenvalues (zero-copy) |
-| `d_eigenvectors()` | `DeviceMatrix` view of eigenvectors (zero-copy, requires `ComputeEigenvectors`) |
-| `info()` | Lazy sync |
-| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+DeviceMatrix       solve(const DeviceMatrix& d_B)        // -> DeviceMatrix (async)
+PlainMatrix        matrixR()                             // -> host Matrix (m >= n only)
 
-`ComputeMode`: `gpu::SelfAdjointEigenSolver::EigenvaluesOnly` or
-`gpu::SelfAdjointEigenSolver::ComputeEigenvectors`.
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
 
-The `d_*` accessors return non-owning views into the solver's internal device
-buffers; the solver must outlive any view derived from it.
+### `gpu::SVD<Scalar>` -- Dense SVD (cuSOLVER)
 
-### `HostTransfer<Scalar>` API
+SVD via `cusolverDnXgesvd`. Supports `ComputeThinU | ComputeThinV`,
+`ComputeFullU | ComputeFullV`, or `0` (values only). Wide matrices (m < n)
+handled by internal transpose.
 
-Future for async device-to-host transfer.
+```cpp
+gpu::SVD()                                                 // Default construct, then call compute()
+gpu::SVD(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)  // Convenience
 
-| Method | Description |
-|--------|-------------|
-| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
-| `ready()` | Non-blocking poll |
+gpu::SVD&            compute(const EigenBase<D>& A, unsigned options = ComputeThinU | ComputeThinV)
+gpu::SVD&            compute(const DeviceMatrix& d_A, unsigned options = ComputeThinU | ComputeThinV)
+
+RealVector         singularValues()                      // -> host vector (syncs, downloads)
+PlainMatrix        matrixU()                             // -> host Matrix (syncs, downloads)
+PlainMatrix        matrixVT()                            // -> host Matrix (syncs, downloads V^T)
+
+DeviceMatrix       d_singularValues()                    // -> DeviceMatrix view (zero-copy)
+DeviceMatrix       d_matrixU()                           // -> DeviceMatrix view (zero-copy when m >= n)
+DeviceMatrix       d_matrixVT()                          // -> DeviceMatrix view (zero-copy when m >= n)
+
+PlainMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (pseudoinverse)
+PlainMatrix        solve(const MatrixBase<D>& B, Index k)       // Truncated (top k triplets)
+PlainMatrix        solve(const MatrixBase<D>& B, RealScalar l)  // Tikhonov regularized
+
+Index              rank(RealScalar threshold = -1)
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+**Note:** `singularValues()`, `matrixU()`, and `matrixVT()` download to host
+on each call. The `d_*` accessors return non-owning `DeviceMatrix` views into
+the solver's internal buffers; the `gpu::SVD` object must outlive any view
+derived from it. For wide matrices (m < n) the U/V^T views are owning (one
+`cublasXgeam` adjoint pass).
+
+### `gpu::SelfAdjointEigenSolver<Scalar>` -- Eigendecomposition (cuSOLVER)
+
+Symmetric/Hermitian eigenvalue decomposition via `cusolverDnXsyevd`.
+`ComputeMode` enum: `EigenvaluesOnly`, `ComputeEigenvectors`.
+
+```cpp
+gpu::SelfAdjointEigenSolver()                              // Default construct, then call compute()
+gpu::SelfAdjointEigenSolver(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)  // Convenience
+
+gpu::SelfAdjointEigenSolver& compute(const EigenBase<D>& A, ComputeMode mode = ComputeEigenvectors)
+gpu::SelfAdjointEigenSolver& compute(const DeviceMatrix& d_A, ComputeMode mode = ComputeEigenvectors)
+
+RealVector         eigenvalues()                         // -> host vector (syncs, downloads, ascending order)
+PlainMatrix        eigenvectors()                        // -> host Matrix (syncs, downloads, columns)
+
+DeviceMatrix       d_eigenvalues()                       // -> DeviceMatrix view (zero-copy)
+DeviceMatrix       d_eigenvectors()                      // -> DeviceMatrix view (zero-copy, requires ComputeEigenvectors)
+
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+**Note:** `eigenvalues()` and `eigenvectors()` download to host on each call.
+The `d_*` accessors return non-owning `DeviceMatrix` views into the solver's
+internal buffers; the `gpu::SelfAdjointEigenSolver` object must outlive any
+view derived from it.
+
+### `HostTransfer<Scalar>`
+
+Future for async device-to-host transfer. Returned by
+`DeviceMatrix::toHostAsync()`.
+
+```cpp
+PlainMatrix&       get()                                 // Block until complete, return host Matrix ref. Idempotent.
+bool               ready()                               // Non-blocking poll
+```
+
+### `gpu::SparseLLT<Scalar, UpLo>` -- Sparse Cholesky (cuDSS)
+
+Requires cuDSS (CUDA 12.0+, `#define EIGEN_CUDSS`). Three-phase workflow
+with symbolic reuse. Accepts `SparseMatrix<Scalar, ColMajor, int>` (CSC).
+Matrix dimensions and nonzero count must fit in `int` (cuDSS limitation;
+debug builds assert).
+
+```cpp
+gpu::SparseLLT()                                           // Default construct
+gpu::SparseLLT(const SparseMatrixBase<D>& A)               // Analyze + factorize
+
+gpu::SparseLLT&      analyzePattern(const SparseMatrixBase<D>& A)  // Symbolic analysis (reusable)
+gpu::SparseLLT&      factorize(const SparseMatrixBase<D>& A)       // Numeric factorization
+gpu::SparseLLT&      compute(const SparseMatrixBase<D>& A)         // analyzePattern + factorize
+void               setOrdering(GpuSparseOrdering ord)             // AMD (default), METIS, or RCM
+
+DenseMatrix        solve(const MatrixBase<D>& B)         // -> host Matrix (syncs)
+
+ComputationInfo    info()                                // Lazy sync
+Index              rows() / cols()
+cudaStream_t       stream()
+```
+
+### `gpu::SparseLDLT<Scalar, UpLo>` -- Sparse LDL^T (cuDSS)
+
+Symmetric indefinite. Same API as `gpu::SparseLLT`.
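+
+A minimal sketch, reusing the three-phase workflow shown above and also
+exercising `setOrdering` (which must be called before `analyzePattern`):
+
+```cpp
+gpu::SparseLDLT<double> ldlt;
+ldlt.setOrdering(gpu::GpuSparseOrdering::METIS);  // optional; default is AMD
+ldlt.analyzePattern(A);
+ldlt.factorize(A);
+VectorXd x = ldlt.solve(b);
+```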
+
+### `gpu::SparseLU<Scalar>` -- Sparse LU (cuDSS)
+
+General non-symmetric. Same API as `gpu::SparseLLT` (without `UpLo`).
+
+### `gpu::FFT<Scalar>` -- FFT (cuFFT)
+
+Plans cached by (size, type) and reused. Inverse transforms scaled so
+`inv(fwd(x)) == x`. Supported scalars: `float`, `double`.
+
+```cpp
+// 1D transforms (host vectors in and out)
+ComplexVector      fwd(const MatrixBase<D>& x)           // C2C forward (complex input)
+ComplexVector      fwd(const MatrixBase<D>& x)           // R2C forward (real input, returns n/2+1)
+ComplexVector      inv(const MatrixBase<D>& X)           // C2C inverse, scaled by 1/n
+RealVector         invReal(const MatrixBase<D>& X, Index n)  // C2R inverse, scaled by 1/n
+
+// 2D transforms (host matrices in and out)
+ComplexMatrix      fwd2(const MatrixBase<D>& A)         // 2D C2C forward
+ComplexMatrix      inv2(const MatrixBase<D>& A)         // 2D C2C inverse, scaled by 1/(rows*cols)
+
+cudaStream_t       stream()
+```
+
+All FFT methods accept host data and return host data. Upload/download is
+handled internally. The C2C and R2C overloads of `fwd()` are distinguished by
+the input scalar type (complex vs real).
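+
+A minimal sketch of the overload selection (sizes are illustrative):
+
+```cpp
+gpu::FFT<float> fft;
+VectorXcf xc = VectorXcf::Random(1024);
+VectorXf  xr = VectorXf::Random(1024);
+
+VectorXcf Xc = fft.fwd(xc);            // complex input -> C2C, 1024 bins
+VectorXcf Xr = fft.fwd(xr);            // real input    -> R2C, 513 bins (n/2+1)
+VectorXf  yr = fft.invReal(Xr, 1024);  // C2R inverse back to a length-1024 real signal
+```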
+
+### `gpu::SparseContext<Scalar>` -- SpMV/SpMM (cuSPARSE)
+
+Accepts `SparseMatrix<Scalar, ColMajor>`. All methods accept host data and
+return host data. Matrix dimensions and nonzero count must fit in `int`
+(cuSPARSE limitation; debug builds assert).
+
+```cpp
+gpu::SparseContext()                                       // Creates own stream + cuSPARSE handle
+
+DenseVector        multiply(A, x)                                       // y = A * x
+void               multiply(A, x, y, alpha=1, beta=0,                   // y = alpha*op(A)*x + beta*y
+                     op=GpuOp::NoTrans)
+DenseVector        multiplyT(A, x)                                      // y = A^T * x
+DenseVector        multiplyAdjoint(A, x)                                // y = A^H * x
+DenseMatrix        multiplyMat(A, X, op=GpuOp::NoTrans)                 // Y = op(A) * X (SpMM)
+
+cudaStream_t       stream()
+```
 
 ### Aliasing
 
@@ -434,7 +634,7 @@
 
 | File | Depends on | Contents |
 |------|-----------|----------|
-| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `PinnedHostBuffer`, `cuda_data_type<>` |
+| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `cuda_data_type<>` |
 | `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
 | `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
 | `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
@@ -449,18 +649,46 @@
 | `GpuQR.h` | `GpuSolverContext.h` | Dense QR decomposition |
 | `GpuSVD.h` | `GpuSolverContext.h` | Dense SVD decomposition |
 | `GpuEigenSolver.h` | `GpuSolverContext.h` | Self-adjoint eigenvalue decomposition |
+| `CuFftSupport.h` | `GpuSupport.h`, `<cufft.h>` | cuFFT error macro, type-dispatch wrappers |
+| `GpuFFT.h` | `CuFftSupport.h`, `CuBlasSupport.h` | 1D/2D FFT with plan caching |
+| `CuSparseSupport.h` | `GpuSupport.h`, `<cusparse.h>` | cuSPARSE error macro |
+| `GpuSparseContext.h` | `CuSparseSupport.h` | SpMV/SpMM via cuSPARSE |
+| `CuDssSupport.h` | `GpuSupport.h`, `<cudss.h>` | cuDSS error macro, type traits (optional) |
+| `GpuSparseSolverBase.h` | `CuDssSupport.h` | CRTP base for sparse solvers (optional) |
+| `GpuSparseLLT.h` | `GpuSparseSolverBase.h` | Sparse Cholesky via cuDSS (optional) |
+| `GpuSparseLDLT.h` | `GpuSparseSolverBase.h` | Sparse LDL^T via cuDSS (optional) |
+| `GpuSparseLU.h` | `GpuSparseSolverBase.h` | Sparse LU via cuDSS (optional) |
 
 ## Building and testing
 
 ```bash
 cmake -G Ninja -B build -S . \
   -DEIGEN_TEST_CUDA=ON \
-  -DEIGEN_CUDA_COMPUTE_ARCH="70"
+  -DEIGEN_CUDA_COMPUTE_ARCH="70" \
+  -DEIGEN_TEST_CUBLAS=ON \
+  -DEIGEN_TEST_CUSOLVER=ON
 
 cmake --build build --target cublas cusolver_llt cusolver_lu \
-  cusolver_qr cusolver_svd cusolver_eigen device_matrix
+  cusolver_qr cusolver_svd cusolver_eigen \
+  device_matrix cufft cusparse_spmv
 ctest --test-dir build -L gpu --output-on-failure
+
+# Sparse solvers (cuDSS -- separate install required)
+cmake -G Ninja -B build -S . \
+  -DEIGEN_TEST_CUDA=ON \
+  -DEIGEN_CUDA_COMPUTE_ARCH="70" \
+  -DEIGEN_TEST_CUDSS=ON
+
+cmake --build build --target cudss_llt cudss_ldlt cudss_lu
+ctest --test-dir build -R '^cudss_' --output-on-failure
 ```
 
-`EIGEN_TEST_CUBLAS` and `EIGEN_TEST_CUSOLVER` default to ON when CUDA is enabled
-(cuBLAS and cuSOLVER are part of the CUDA toolkit).
+## Future enhancements
+
+- **Device-resident sparse matrix-vector products.** `gpu::SparseContext`
+  currently operates on host vectors and matrices, uploading and downloading
+  on each call. The key missing piece is a `DeviceSparseView` that holds a
+  sparse matrix on device and supports operator syntax (`d_y = d_A * d_x`)
+  with `DeviceMatrix` operands -- keeping the entire SpMV/SpMM pipeline on
+  device. This is essential for iterative solvers and any workflow that chains
+  sparse and dense operations without returning to the host.
diff --git a/unsupported/benchmarks/GPU/CMakeLists.txt b/unsupported/benchmarks/GPU/CMakeLists.txt
index 5447154..1f42082 100644
--- a/unsupported/benchmarks/GPU/CMakeLists.txt
+++ b/unsupported/benchmarks/GPU/CMakeLists.txt
@@ -54,3 +54,7 @@
 # Batching benchmarks: multi-stream concurrency for many small systems.
 eigen_add_gpu_benchmark(bench_batching bench_batching.cpp)
 eigen_add_gpu_benchmark(bench_batching_float bench_batching.cpp DEFINITIONS SCALAR=float)
+
+# FFT benchmarks: 1D/2D C2C, R2C, C2R throughput and plan reuse.
+eigen_add_gpu_benchmark(bench_fft bench_fft.cpp LIBRARIES CUDA::cufft)
+eigen_add_gpu_benchmark(bench_fft_double bench_fft.cpp LIBRARIES CUDA::cufft DEFINITIONS SCALAR=double)
diff --git a/unsupported/benchmarks/GPU/bench_batching.cpp b/unsupported/benchmarks/GPU/bench_batching.cpp
index 67c3e3b..db61cbc 100644
--- a/unsupported/benchmarks/GPU/bench_batching.cpp
+++ b/unsupported/benchmarks/GPU/bench_batching.cpp
@@ -42,8 +42,8 @@
   static bool done = false;
   if (!done) {
     void* p;
-    cudaMalloc(&p, 1);
-    cudaFree(p);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
     done = true;
   }
 }
diff --git a/unsupported/benchmarks/GPU/bench_chaining.cpp b/unsupported/benchmarks/GPU/bench_chaining.cpp
index 60a4f2f..4a09d8a 100644
--- a/unsupported/benchmarks/GPU/bench_chaining.cpp
+++ b/unsupported/benchmarks/GPU/bench_chaining.cpp
@@ -35,8 +35,8 @@
   static bool done = false;
   if (!done) {
     void* p;
-    cudaMalloc(&p, 1);
-    cudaFree(p);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
     done = true;
   }
 }
@@ -159,7 +159,7 @@
     for (int i = 1; i < chain_len; ++i) {
       d_X = llt.solve(d_X);
     }
-    cudaStreamSynchronize(llt.stream());
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(llt.stream()));
     benchmark::DoNotOptimize(d_X.data());
   }
 
diff --git a/unsupported/benchmarks/GPU/bench_fft.cpp b/unsupported/benchmarks/GPU/bench_fft.cpp
new file mode 100644
index 0000000..c8d8a37
--- /dev/null
+++ b/unsupported/benchmarks/GPU/bench_fft.cpp
@@ -0,0 +1,188 @@
+// GPU FFT benchmarks: GpuFFT 1D and 2D throughput.
+//
+// Measures forward and inverse FFT performance across a range of sizes,
+// including plan-amortized (reuse) and cold-start (new plan) scenarios.
+//
+// Usage:
+//   cmake --build build-bench-gpu --target bench_fft bench_fft_double
+//   ./build-bench-gpu/bench_fft
+//   ./build-bench-gpu/bench_fft_double
+//
+// Profiling:
+//   nsys profile --trace=cuda ./build-bench-gpu/bench_fft
+// SPDX-FileCopyrightText: The Eigen Authors
+// SPDX-License-Identifier: MPL-2.0
+
+#include <benchmark/benchmark.h>
+
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR float
+#endif
+
+using Scalar = SCALAR;
+using Complex = std::complex<Scalar>;
+using CVec = Matrix<Complex, Dynamic, 1>;
+using RVec = Matrix<Scalar, Dynamic, 1>;
+using CMat = Matrix<Complex, Dynamic, Dynamic>;
+
+// CUDA warm-up: ensure the GPU is initialized before timing.
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// 1D C2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+  gpu::FFT<Scalar> fft;
+
+  // Warm up plan.
+  CVec tmp = fft.fwd(x);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd(x));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);  // read + write
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D C2C Inverse
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2C_Inv(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+  gpu::FFT<Scalar> fft;
+  CVec X = fft.fwd(x);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.inv(X));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * n * sizeof(Complex) * 2);
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2C_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D R2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_R2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  RVec r = RVec::Random(n);
+  gpu::FFT<Scalar> fft;
+
+  // Warm up plan.
+  CVec tmp = fft.fwd(r);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd(r));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * (n * sizeof(Scalar) + (n / 2 + 1) * sizeof(Complex)));
+}
+
+BENCHMARK(BM_GpuFFT_1D_R2C_Fwd)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 1D C2R Inverse
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_C2R_Inv(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  RVec r = RVec::Random(n);
+  gpu::FFT<Scalar> fft;
+  CVec R = fft.fwd(r);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.invReal(R, n));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+  state.SetBytesProcessed(state.iterations() * ((n / 2 + 1) * sizeof(Complex) + n * sizeof(Scalar)));
+}
+
+BENCHMARK(BM_GpuFFT_1D_C2R_Inv)->RangeMultiplier(4)->Range(1 << 10, 1 << 22);
+
+// --------------------------------------------------------------------------
+// 2D C2C Forward
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_2D_C2C_Fwd(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);  // square n x n
+  CMat A = CMat::Random(n, n);
+  gpu::FFT<Scalar> fft;
+
+  // Warm up plan.
+  CMat tmp = fft.fwd2(A);
+
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(fft.fwd2(A));
+  }
+  state.SetItemsProcessed(state.iterations() * n * n);
+  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 2);
+}
+
+BENCHMARK(BM_GpuFFT_2D_C2C_Fwd)->RangeMultiplier(2)->Range(64, 4096);
+
+// --------------------------------------------------------------------------
+// 2D C2C Roundtrip (fwd + inv)
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_2D_C2C_Roundtrip(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CMat A = CMat::Random(n, n);
+  gpu::FFT<Scalar> fft;
+
+  // Warm up plans.
+  CMat tmp = fft.inv2(fft.fwd2(A));
+
+  for (auto _ : state) {
+    CMat B = fft.fwd2(A);
+    benchmark::DoNotOptimize(fft.inv2(B));
+  }
+  state.SetItemsProcessed(state.iterations() * n * n * 2);  // fwd + inv
+  state.SetBytesProcessed(state.iterations() * n * n * sizeof(Complex) * 4);
+}
+
+BENCHMARK(BM_GpuFFT_2D_C2C_Roundtrip)->RangeMultiplier(2)->Range(64, 4096);
+
+// --------------------------------------------------------------------------
+// 1D Cold start (includes plan creation)
+// --------------------------------------------------------------------------
+
+static void BM_GpuFFT_1D_ColdStart(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  CVec x = CVec::Random(n);
+
+  for (auto _ : state) {
+    gpu::FFT<Scalar> fft;  // new object = new plans
+    benchmark::DoNotOptimize(fft.fwd(x));
+  }
+  state.SetItemsProcessed(state.iterations() * n);
+}
+
+BENCHMARK(BM_GpuFFT_1D_ColdStart)->RangeMultiplier(4)->Range(1 << 10, 1 << 20);
diff --git a/unsupported/benchmarks/GPU/bench_solvers.cpp b/unsupported/benchmarks/GPU/bench_solvers.cpp
index 7e0952e..17490f1 100644
--- a/unsupported/benchmarks/GPU/bench_solvers.cpp
+++ b/unsupported/benchmarks/GPU/bench_solvers.cpp
@@ -40,8 +40,8 @@
   static bool done = false;
   if (!done) {
     void* p;
-    cudaMalloc(&p, 1);
-    cudaFree(p);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, 1));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaFree(p));
     done = true;
   }
 }
@@ -157,7 +157,7 @@
   for (auto _ : state) {
     gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
     // Force completion without D2H transfer.
-    cudaStreamSynchronize(llt.stream());
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(llt.stream()));
     benchmark::DoNotOptimize(d_X.data());
   }
 
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt
index 706efef..f0b07c3 100644
--- a/unsupported/test/GPU/CMakeLists.txt
+++ b/unsupported/test/GPU/CMakeLists.txt
@@ -73,3 +73,34 @@
     ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas)
   endforeach()
 endif()
+
+# cuFFT (part of the CUDA toolkit, no separate option needed).
+if(TARGET CUDA::cufft)
+  ei_add_gpu_test(cufft EXTRA_LIBS CUDA::cufft CUDA::cublas)
+endif()
+
+# cuSPARSE SpMV (part of the CUDA toolkit).
+if(TARGET CUDA::cusparse)
+  ei_add_gpu_test(cusparse_spmv
+    EXTRA_LIBS CUDA::cusparse CUDA::cublas CUDA::cusolver)
+endif()
+
+# cuDSS sparse direct solvers -- distributed separately from the CUDA Toolkit.
+option(EIGEN_TEST_CUDSS "Test cuDSS sparse solver integration" OFF)
+if(EIGEN_TEST_CUDSS)
+  find_path(CUDSS_INCLUDE_DIR cudss.h
+    HINTS ${CUDSS_DIR}/include ${CUDA_TOOLKIT_ROOT_DIR}/include /usr/include)
+  find_library(CUDSS_LIBRARY cudss
+    HINTS ${CUDSS_DIR}/lib ${CUDSS_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib/x86_64-linux-gnu)
+  if(CUDSS_INCLUDE_DIR AND CUDSS_LIBRARY)
+    message(STATUS "cuDSS found: ${CUDSS_LIBRARY}")
+    foreach(_cudss_test IN ITEMS cudss_llt cudss_ldlt cudss_lu)
+      ei_add_gpu_test(${_cudss_test}
+        EXTRA_LIBS CUDA::cusolver CUDA::cublas ${CUDSS_LIBRARY}
+        EXTRA_INCLUDES ${CUDSS_INCLUDE_DIR}
+        EXTRA_DEFINES EIGEN_CUDSS=1)
+    endforeach()
+  else()
+    message(WARNING "EIGEN_TEST_CUDSS=ON but cuDSS not found. Set CUDSS_DIR.")
+  endif()
+endif()
diff --git a/unsupported/test/GPU/cudss_ldlt.cpp b/unsupported/test/GPU/cudss_ldlt.cpp
new file mode 100644
index 0000000..c979065
--- /dev/null
+++ b/unsupported/test/GPU/cudss_ldlt.cpp
@@ -0,0 +1,168 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSparseLDLT: GPU sparse LDL^T via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse symmetric indefinite matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_symmetric_indefinite(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Diagonal has mixed signs for indefiniteness. For complex Scalar the
+  // off-diagonals carry nonzero imaginary parts so the Hermitian path is
+  // genuinely exercised; the diagonal of R is kept real, and in any case
+  // A = R + R.adjoint() has a real diagonal since (R + R^H)_ii = 2*Re(R_ii).
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = (i == j) ? RealScalar(0) : RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  R.makeCompressed();
+
+  // A = R + R^H (Hermitian), then add a diagonal with alternating signs +/-n for indefiniteness.
+  SpMat A = R + SparseMatrix<Scalar, ColMajor, int>(R.adjoint());
+  for (Index i = 0; i < n; ++i) {
+    Scalar diag_val = Scalar((i % 2 == 0) ? n : -n);
+    A.coeffRef(i, i) += diag_val;
+  }
+  A.makeCompressed();
+  return A;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLDLT<Scalar> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  Vec x = ldlt.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::SparseLDLT<Scalar> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  Mat X = ldlt.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Refactorize ------------------------------------------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_symmetric_indefinite<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLDLT<Scalar> ldlt;
+  ldlt.analyzePattern(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+
+  ldlt.factorize(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  Vec x1 = ldlt.solve(b);
+
+  // Modify values, keep pattern.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  ldlt.factorize(A2);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  Vec x2 = ldlt.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+  // Diagonal scaled 2x; x1 and x2 must differ by a substantial fraction.
+  VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm());
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  gpu::SparseLDLT<Scalar> ldlt(A);
+  VERIFY_IS_EQUAL(ldlt.info(), Success);
+  VERIFY_IS_EQUAL(ldlt.rows(), 0);
+  VERIFY_IS_EQUAL(ldlt.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_ldlt) {
+  gpu_test::require_cudss_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_empty<float>());
+  CALL_SUBTEST_5(test_empty<double>());
+  CALL_SUBTEST_5(test_empty<std::complex<float>>());
+  CALL_SUBTEST_5(test_empty<std::complex<double>>());
+}
diff --git a/unsupported/test/GPU/cudss_llt.cpp b/unsupported/test/GPU/cudss_llt.cpp
new file mode 100644
index 0000000..b01bcca
--- /dev/null
+++ b/unsupported/test/GPU/cudss_llt.cpp
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSparseLLT: GPU sparse Cholesky via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse SPD matrix -------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Off-diagonal entries carry nonzero imaginary parts for complex Scalar so
+  // the cuDSS HPD path (CUDSS_MTYPE_HPD) is genuinely exercised; A = R^H * R
+  // is Hermitian (with a real diagonal) for any R, and the +nI shift makes it
+  // safely positive definite.
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  R.makeCompressed();
+
+  // A = R^H * R + n * I: x^H A x = ||R x||^2 + n ||x||^2 > 0 for x != 0,
+  // so A is Hermitian positive definite.
+  SpMat A = R.adjoint() * R;
+  for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
+  A.makeCompressed();
+  return A;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Vec x = llt.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  // Check residual: ||Ax - b|| / ||b||.
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Compare with CPU SimplicialLLT -----------------------------------------
+
+template <typename Scalar>
+void test_vs_cpu(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLLT<Scalar> gpu_llt(A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Success);
+  Vec x_gpu = gpu_llt.solve(b);
+
+  SimplicialLLT<SpMat> cpu_llt(A);
+  VERIFY_IS_EQUAL(cpu_llt.info(), Success);
+  Vec x_cpu = cpu_llt.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((x_gpu - x_cpu).norm() / x_cpu.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::SparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Mat X = llt.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Separate analyze + factorize (refactorization) -------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLLT<Scalar> llt;
+  llt.analyzePattern(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  // First factorize + solve.
+  llt.factorize(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  Vec x1 = llt.solve(b);
+
+  // Modify values (keep same pattern): scale diagonal.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  // Refactorize with same pattern.
+  llt.factorize(A2);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  Vec x2 = llt.solve(b);
+
+  // Both solutions should satisfy their respective systems.
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+
+  // Solutions should differ meaningfully: diagonal was scaled 2x, so x1 vs x2
+  // should differ by a substantial fraction of x1's magnitude. A near-epsilon
+  // bound here would pass even if refactorize silently reused the stale factor.
+  VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm());
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  gpu::SparseLLT<Scalar> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+  VERIFY_IS_EQUAL(llt.rows(), 0);
+  VERIFY_IS_EQUAL(llt.cols(), 0);
+}
+
+// ---- Upper triangle ---------------------------------------------------------
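+// The Upper variant presumably mirrors SimplicialLLT's UpLo convention and reads
+// only the upper triangular part of A; passing the full SPD matrix must still
+// yield the correct solution.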
+
+template <typename Scalar>
+void test_upper(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_spd<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLLT<Scalar, Upper> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), Success);
+
+  Vec x = llt.solve(b);
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_vs_cpu<Scalar>(64));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+  CALL_SUBTEST(test_upper<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_llt) {
+  gpu_test::require_cudss_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_empty<float>());
+  CALL_SUBTEST_5(test_empty<double>());
+  CALL_SUBTEST_5(test_empty<std::complex<float>>());
+  CALL_SUBTEST_5(test_empty<std::complex<double>>());
+}
diff --git a/unsupported/test/GPU/cudss_lu.cpp b/unsupported/test/GPU/cudss_lu.cpp
new file mode 100644
index 0000000..d88900e
--- /dev/null
+++ b/unsupported/test/GPU/cudss_lu.cpp
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSparseLU: GPU sparse LU via cuDSS.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse non-singular general matrix ---------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_general(Index n, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(n, n);
+  R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = 0; i < n; ++i) {
+      if (i == j || (std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  // Add a large diagonal shift (+n) so the matrix is strictly diagonally dominant, hence non-singular.
+  for (Index i = 0; i < n; ++i) R.coeffRef(i, i) += Scalar(RealScalar(n));
+  R.makeCompressed();
+  return R;
+}
+
+// ---- Solve and check residual -----------------------------------------------
+
+template <typename Scalar>
+void test_solve(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Vec x = lu.solve(b);
+  VERIFY_IS_EQUAL(x.rows(), n);
+
+  Vec r = A * x - b;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(r.norm() / b.norm() < tol);
+}
+
+// ---- Multiple RHS -----------------------------------------------------------
+
+template <typename Scalar>
+void test_multiple_rhs(Index n, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  Mat X = lu.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  Mat R = A * X - B;
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY(R.norm() / B.norm() < tol);
+}
+
+// ---- Refactorize ------------------------------------------------------------
+
+template <typename Scalar>
+void test_refactorize(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_general<Scalar>(n);
+  Vec b = Vec::Random(n);
+
+  gpu::SparseLU<Scalar> lu;
+  lu.analyzePattern(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  lu.factorize(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x1 = lu.solve(b);
+
+  // Modify values, keep pattern.
+  SpMat A2 = A;
+  for (Index i = 0; i < n; ++i) A2.coeffRef(i, i) *= Scalar(RealScalar(2));
+
+  lu.factorize(A2);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  Vec x2 = lu.solve(b);
+
+  RealScalar tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * x1 - b).norm() / b.norm() < tol);
+  VERIFY((A2 * x2 - b).norm() / b.norm() < tol);
+  // Diagonal scaled 2x; x1 and x2 must differ by a substantial fraction.
+  VERIFY((x1 - x2).norm() > RealScalar(0.01) * x1.norm());
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  SpMat A(0, 0);
+  A.makeCompressed();
+  gpu::SparseLU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  VERIFY_IS_EQUAL(lu.rows(), 0);
+  VERIFY_IS_EQUAL(lu.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_solve<Scalar>(64));
+  CALL_SUBTEST(test_solve<Scalar>(256));
+  CALL_SUBTEST(test_multiple_rhs<Scalar>(64, 4));
+  CALL_SUBTEST(test_refactorize<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cudss_lu) {
+  gpu_test::require_cudss_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_empty<float>());
+  CALL_SUBTEST_5(test_empty<double>());
+  CALL_SUBTEST_5(test_empty<std::complex<float>>());
+  CALL_SUBTEST_5(test_empty<std::complex<double>>());
+}
diff --git a/unsupported/test/GPU/cufft.cpp b/unsupported/test/GPU/cufft.cpp
new file mode 100644
index 0000000..7dc87c7
--- /dev/null
+++ b/unsupported/test/GPU/cufft.cpp
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuFFT: GPU FFT via cuFFT.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- 1D C2C roundtrip: inv(fwd(x)) ≈ x -------------------------------------
+
+template <typename Scalar>
+void test_c2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  Vec x = Vec::Random(n);
+
+  gpu::FFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), n);
+
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() / x.norm() < tol);
+}
+
+// ---- 1D C2C known signal: FFT of constant = delta --------------------------
+
+template <typename Scalar>
+void test_c2c_constant() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  const int n = 64;
+  Vec x = Vec::Constant(n, Complex(3.0, 0.0));
+
+  gpu::FFT<Scalar> fft;
+  Vec X = fft.fwd(x);
+
+  // FFT of constant c: X[0] = c*n, X[k] = 0 for k > 0.
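+  // (X[k] = sum_{j=0}^{n-1} c * exp(-2*pi*i*j*k/n), and the geometric sum is
+  // n for k = 0 and 0 otherwise.)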
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(n);
+  VERIFY(std::abs(X(0) - Complex(3.0 * n, 0.0)) < tol);
+  for (int k = 1; k < n; ++k) {
+    VERIFY(std::abs(X(k)) < tol);
+  }
+}
+
+// ---- 1D R2C/C2R roundtrip: invReal(fwd(r), n) ≈ r --------------------------
+
+template <typename Scalar>
+void test_r2c_roundtrip(Index n) {
+  using Complex = std::complex<Scalar>;
+  using CVec = Matrix<Complex, Dynamic, 1>;
+  using RVec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  RVec r = RVec::Random(n);
+
+  gpu::FFT<Scalar> fft;
+  CVec R = fft.fwd(r);
+
+  // R2C returns n/2+1 complex values.
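+  // (A real signal has a Hermitian spectrum, X[n-k] = conj(X[k]), so cuFFT
+  // stores only the non-redundant half.)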
+  VERIFY_IS_EQUAL(R.size(), n / 2 + 1);
+
+  RVec s = fft.invReal(R, n);
+  VERIFY_IS_EQUAL(s.size(), n);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((s - r).norm() / r.norm() < tol);
+}
+
+// ---- 2D C2C roundtrip: inv2(fwd2(A)) ≈ A ---------------------------------
+
+template <typename Scalar>
+void test_2d_roundtrip(Index rows, Index cols) {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  Mat A = Mat::Random(rows, cols);
+
+  gpu::FFT<Scalar> fft;
+  Mat B = fft.fwd2(A);
+  VERIFY_IS_EQUAL(B.rows(), rows);
+  VERIFY_IS_EQUAL(B.cols(), cols);
+
+  Mat C = fft.inv2(B);
+  VERIFY_IS_EQUAL(C.rows(), rows);
+  VERIFY_IS_EQUAL(C.cols(), cols);
+
+  RealScalar tol = RealScalar(10) * RealScalar(rows * cols) * NumTraits<Scalar>::epsilon();
+  VERIFY((C - A).norm() / A.norm() < tol);
+}
+
+// ---- 2D C2C known signal: constant matrix -----------------------------------
+
+template <typename Scalar>
+void test_2d_constant() {
+  using Complex = std::complex<Scalar>;
+  using Mat = Matrix<Complex, Dynamic, Dynamic>;
+  using RealScalar = Scalar;
+
+  const int rows = 16, cols = 32;
+  Mat A = Mat::Constant(rows, cols, Complex(2.0, 0.0));
+
+  gpu::FFT<Scalar> fft;
+  Mat B = fft.fwd2(A);
+
+  // 2D FFT of constant c: B(0,0) = c*rows*cols, all others = 0.
+  RealScalar tol = RealScalar(10) * NumTraits<Scalar>::epsilon() * RealScalar(rows * cols);
+  VERIFY(std::abs(B(0, 0) - Complex(2.0 * rows * cols, 0.0)) < tol);
+  for (int j = 0; j < cols; ++j) {
+    for (int i = 0; i < rows; ++i) {
+      if (i == 0 && j == 0) continue;
+      VERIFY(std::abs(B(i, j)) < tol);
+    }
+  }
+}
+
+// ---- Plan reuse: repeated calls should work ---------------------------------
+
+template <typename Scalar>
+void test_plan_reuse() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+  using RealScalar = Scalar;
+
+  gpu::FFT<Scalar> fft;
+  for (int trial = 0; trial < 5; ++trial) {
+    Vec x = Vec::Random(128);
+    Vec X = fft.fwd(x);
+    Vec y = fft.inv(X);
+    RealScalar tol = RealScalar(10) * RealScalar(128) * NumTraits<Scalar>::epsilon();
+    VERIFY((y - x).norm() / x.norm() < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using Complex = std::complex<Scalar>;
+  using Vec = Matrix<Complex, Dynamic, 1>;
+
+  gpu::FFT<Scalar> fft;
+  Vec x(0);
+  Vec X = fft.fwd(x);
+  VERIFY_IS_EQUAL(X.size(), 0);
+  Vec y = fft.inv(X);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_c2c_roundtrip<Scalar>(1000));  // non-power-of-2
+  CALL_SUBTEST(test_c2c_constant<Scalar>());
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(64));
+  CALL_SUBTEST(test_r2c_roundtrip<Scalar>(256));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(32, 32));
+  CALL_SUBTEST(test_2d_roundtrip<Scalar>(16, 64));  // non-square
+  CALL_SUBTEST(test_2d_constant<Scalar>());
+  CALL_SUBTEST(test_plan_reuse<Scalar>());
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cufft) {
+  gpu_test::require_cufft_context();
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+}
diff --git a/unsupported/test/GPU/cusparse_spmv.cpp b/unsupported/test/GPU/cusparse_spmv.cpp
new file mode 100644
index 0000000..122ddf9
--- /dev/null
+++ b/unsupported/test/GPU/cusparse_spmv.cpp
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for gpu::SparseContext: GPU SpMV/SpMM via cuSPARSE.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Sparse>
+#include <unsupported/Eigen/GPU>
+#include "gpu_test_helpers.h"
+
+using namespace Eigen;
+
+// ---- Helper: build a random sparse matrix -----------------------------------
+
+template <typename Scalar>
+SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat R(rows, cols);
+  R.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
+  for (Index j = 0; j < cols; ++j) {
+    for (Index i = 0; i < rows; ++i) {
+      if ((std::rand() / double(RAND_MAX)) < density) {
+        const RealScalar re = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        const RealScalar im = RealScalar(std::rand() / double(RAND_MAX) - 0.5);
+        R.insert(i, j) = gpu_test::make_test_value<Scalar>(re, im);
+      }
+    }
+  }
+  R.makeCompressed();
+  return R;
+}
+
+// ---- SpMV: y = A * x -------------------------------------------------------
+
+template <typename Scalar>
+void test_spmv(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(cols);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiply(A, x);
+  Vec y_cpu = A * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), rows);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
+
+template <typename Scalar>
+void test_spmv_alpha_beta(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(n, n);
+  Vec x = Vec::Random(n);
+  Vec y_init = Vec::Random(n);
+
+  Scalar alpha(2);
+  Scalar beta(3);
+
+  Vec y_cpu = alpha * (A * x) + beta * y_init;
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = y_init;
+  ctx.multiply(A, x, y_gpu, alpha, beta);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Transpose: y = A^T * x ------------------------------------------------
+
+template <typename Scalar>
+void test_spmv_transpose(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(rows);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiplyT(A, x);
+  Vec y_cpu = A.transpose() * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), cols);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMV adjoint: y = A^H * x ----------------------------------------------
+
+template <typename Scalar>
+void test_spmv_adjoint(Index rows, Index cols) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Vec x = Vec::Random(rows);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y_gpu = ctx.multiplyAdjoint(A, x);
+  Vec y_cpu = A.adjoint() * x;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(y_gpu.size(), cols);
+  VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
+
+template <typename Scalar>
+void test_spmm(Index rows, Index cols, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Mat X = Mat::Random(cols, nrhs);
+
+  gpu::SparseContext<Scalar> ctx;
+  Mat Y_gpu = ctx.multiplyMat(A, X);
+  Mat Y_cpu = A * X;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(Y_gpu.rows(), rows);
+  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
+  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- SpMM transpose: Y = A^T * X --------------------------------------------
+
+template <typename Scalar>
+void test_spmm_transpose(Index rows, Index cols, Index nrhs) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  SpMat A = make_sparse<Scalar>(rows, cols);
+  Mat X = Mat::Random(rows, nrhs);
+
+  gpu::SparseContext<Scalar> ctx;
+  Mat Y_gpu = ctx.multiplyMat(A, X, gpu::GpuOp::Trans);
+  Mat Y_cpu = A.transpose() * X;
+
+  RealScalar tol = RealScalar(10) * RealScalar((std::max)(rows, cols)) * NumTraits<Scalar>::epsilon();
+  VERIFY_IS_EQUAL(Y_gpu.rows(), cols);
+  VERIFY_IS_EQUAL(Y_gpu.cols(), nrhs);
+  VERIFY((Y_gpu - Y_cpu).norm() / (Y_cpu.norm() + RealScalar(1)) < tol);
+}
+
+// ---- Identity matrix: I * x = x --------------------------------------------
+
+template <typename Scalar>
+void test_identity(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build sparse identity.
+  SpMat eye(n, n);
+  eye.setIdentity();
+  eye.makeCompressed();
+
+  Vec x = Vec::Random(n);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(eye, x);
+
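+  // Identity SpMV only multiplies each entry by 1 and adds 0, so the GPU
+  // result should match x exactly, hence the bare epsilon bound.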
+  RealScalar tol = NumTraits<Scalar>::epsilon();
+  VERIFY((y - x).norm() < tol);
+}
+
+// ---- Context reuse ----------------------------------------------------------
+
+template <typename Scalar>
+void test_reuse(Index n) {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  gpu::SparseContext<Scalar> ctx;
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  for (int trial = 0; trial < 3; ++trial) {
+    SpMat A = make_sparse<Scalar>(n, n);
+    Vec x = Vec::Random(n);
+    Vec y_gpu = ctx.multiply(A, x);
+    Vec y_cpu = A * x;
+    VERIFY((y_gpu - y_cpu).norm() / (y_cpu.norm() + RealScalar(1)) < tol);
+  }
+}
+
+// ---- Empty ------------------------------------------------------------------
+
+template <typename Scalar>
+void test_empty() {
+  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
+  using Vec = Matrix<Scalar, Dynamic, 1>;
+
+  SpMat A(0, 0);
+  A.makeCompressed();
+  Vec x(0);
+
+  gpu::SparseContext<Scalar> ctx;
+  Vec y = ctx.multiply(A, x);
+  VERIFY_IS_EQUAL(y.size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_spmv<Scalar>(64, 64));
+  CALL_SUBTEST(test_spmv<Scalar>(128, 64));  // non-square
+  CALL_SUBTEST(test_spmv<Scalar>(64, 128));  // wide
+  CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
+  CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
+  CALL_SUBTEST(test_spmv_adjoint<Scalar>(128, 64));
+  CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
+  CALL_SUBTEST(test_spmm_transpose<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_identity<Scalar>(64));
+  CALL_SUBTEST(test_reuse<Scalar>(64));
+  CALL_SUBTEST(test_empty<Scalar>());
+}
+
+EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
+  gpu_test::require_cusparse_context();
+
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+}
diff --git a/unsupported/test/GPU/gpu_test_helpers.h b/unsupported/test/GPU/gpu_test_helpers.h
new file mode 100644
index 0000000..1ef4f9c
--- /dev/null
+++ b/unsupported/test/GPU/gpu_test_helpers.h
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Helpers shared across GPU library tests:
+//   * runtime probes that exit(77) (CI skip) when a library is unavailable
+//   * a small make_test_value() that constructs a Scalar with an imaginary
+//     component for complex types so the complex code paths are genuinely
+//     exercised — without this, Scalar(real_value) silently zeros the imag.
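+// (Exit code 77 is the conventional "test skipped" status, understood by
+// automake and by CTest's SKIP_RETURN_CODE property.)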
+
+#ifndef EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H
+#define EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H
+
+#include <Eigen/Core>
+#include <cstdlib>
+#include <iostream>
+#include <type_traits>
+
+namespace gpu_test {
+
+template <typename Scalar, typename RealScalar>
+inline std::enable_if_t<Eigen::NumTraits<Scalar>::IsComplex, Scalar> make_test_value(RealScalar re, RealScalar im) {
+  return Scalar(re, im);
+}
+template <typename Scalar, typename RealScalar>
+inline std::enable_if_t<!Eigen::NumTraits<Scalar>::IsComplex, Scalar> make_test_value(RealScalar re,
+                                                                                      RealScalar /*im*/) {
+  return Scalar(re);
+}
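+// For example, make_test_value<std::complex<float>>(0.5f, -0.25f) yields
+// (0.5, -0.25), whereas make_test_value<double>(0.5, -0.25) ignores the
+// imaginary argument and yields 0.5.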
+
+#ifdef CUDSS_VERSION
+inline void require_cudss_context() {
+  cudssHandle_t handle = nullptr;
+  const cudssStatus_t status = cudssCreate(&handle);
+  if (status != CUDSS_STATUS_SUCCESS) {
+    std::cout << "SKIP: cuDSS tests require an initialized cuDSS context. cudssCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUDSS_CHECK(cudssDestroy(handle));
+}
+#endif
+
+#ifdef CUSPARSE_VERSION
+inline void require_cusparse_context() {
+  cusparseHandle_t handle = nullptr;
+  const cusparseStatus_t status = cusparseCreate(&handle);
+  if (status != CUSPARSE_STATUS_SUCCESS) {
+    std::cout << "SKIP: cuSPARSE tests require an initialized cuSPARSE context. cusparseCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUSPARSE_CHECK(cusparseDestroy(handle));
+}
+#endif
+
+#ifdef CUFFT_VERSION
+inline void require_cufft_context() {
+  cufftHandle plan = 0;
+  // cufftCreate allocates a plan handle without configuring it; succeeds only
+  // when the cuFFT runtime is loadable.
+  const cufftResult status = cufftCreate(&plan);
+  if (status != CUFFT_SUCCESS) {
+    std::cout << "SKIP: cuFFT tests require a working cuFFT runtime. cufftCreate failed with status "
+              << static_cast<int>(status) << std::endl;
+    std::exit(77);
+  }
+  EIGEN_CUFFT_CHECK(cufftDestroy(plan));
+}
+#endif
+
+}  // namespace gpu_test
+
+#endif  // EIGEN_UNSUPPORTED_TEST_GPU_TEST_HELPERS_H