Eigen/GPU [3/5]: Add dense cuSOLVER solvers (QR, SVD, EigenSolver)

libeigen/eigen!2413

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU
index f5d20ba..4c15d7d 100644
--- a/unsupported/Eigen/GPU
+++ b/unsupported/Eigen/GPU
@@ -47,8 +47,12 @@
 #include "src/GPU/DeviceBlasExpr.h"
 #include "src/GPU/DeviceSolverExpr.h"
 #include "src/GPU/DeviceDispatch.h"
+#include "src/GPU/GpuSolverContext.h"
 #include "src/GPU/GpuLLT.h"
 #include "src/GPU/GpuLU.h"
+#include "src/GPU/GpuQR.h"
+#include "src/GPU/GpuSVD.h"
+#include "src/GPU/GpuEigenSolver.h"
 // IWYU pragma: end_exports
 #endif
 
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h
index ae30553..994d95a 100644
--- a/unsupported/Eigen/src/GPU/CuBlasSupport.h
+++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -163,6 +163,38 @@
                      reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc);
 }
 
+// GEAM wrappers: C = alpha * op(A) + beta * op(B)
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  const float* alpha, const float* A, int lda, const float* beta, const float* B,
+                                  int ldb, float* C, int ldc) {
+  return cublasSgeam(h, transA, transB, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  const double* alpha, const double* A, int lda, const double* beta, const double* B,
+                                  int ldb, double* C, int ldc) {
+  return cublasDgeam(h, transA, transB, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  const std::complex<float>* beta, const std::complex<float>* B, int ldb,
+                                  std::complex<float>* C, int ldc) {
+  cuComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasCgeam(h, transA, transB, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda, &b,
+                     reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  const std::complex<double>* beta, const std::complex<double>* B, int ldb,
+                                  std::complex<double>* C, int ldc) {
+  cuDoubleComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasZgeam(h, transA, transB, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda, &b,
+                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
 // SYRK wrappers (real → syrk, complex → herk)
 inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
                                   const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
@@ -186,6 +218,28 @@
                      reinterpret_cast<cuDoubleComplex*>(C), ldc);
 }
 
+// DGMM wrappers: C = A * diag(x)  (side=RIGHT) or C = diag(x) * A  (side=LEFT).
+// Useful for applying a diagonal scaling without materialising diag(x) as a
+// dense matrix. cuBLAS docs guarantee in-place is safe when C == A.
+inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const float* A, int lda,
+                                  const float* x, int incx, float* C, int ldc) {
+  return cublasSdgmm(h, side, m, n, A, lda, x, incx, C, ldc);
+}
+inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const double* A, int lda,
+                                  const double* x, int incx, double* C, int ldc) {
+  return cublasDdgmm(h, side, m, n, A, lda, x, incx, C, ldc);
+}
+inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const std::complex<float>* A,
+                                  int lda, const std::complex<float>* x, int incx, std::complex<float>* C, int ldc) {
+  return cublasCdgmm(h, side, m, n, reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(x),
+                     incx, reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const std::complex<double>* A,
+                                  int lda, const std::complex<double>* x, int incx, std::complex<double>* C, int ldc) {
+  return cublasZdgmm(h, side, m, n, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                     reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
 }  // namespace internal
 }  // namespace gpu
 }  // namespace Eigen
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h
index 7f8c33d..41f7b31 100644
--- a/unsupported/Eigen/src/GPU/CuSolverSupport.h
+++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -124,6 +124,68 @@
   static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
 };
 
+// ---- Type-specific cuSOLVER wrappers ----------------------------------------
+// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded
+// wrappers dispatch to the correct type-specific function.
+// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q).
+
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const float* A, int lda, const float* tau, float* C, int ldc,
+                                         float* work, int lwork, int* info) {
+  return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const double* A, int lda, const double* tau, double* C, int ldc,
+                                         double* work, int lwork, int* info) {
+  return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const std::complex<float>* A, int lda,
+                                         const std::complex<float>* tau, std::complex<float>* C, int ldc,
+                                         std::complex<float>* work, int lwork, int* info) {
+  return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
+                          reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc,
+                          reinterpret_cast<cuComplex*>(work), lwork, info);
+}
+inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m,
+                                         int n, int k, const std::complex<double>* A, int lda,
+                                         const std::complex<double>* tau, std::complex<double>* C, int ldc,
+                                         std::complex<double>* work, int lwork, int* info) {
+  return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                          reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc,
+                          reinterpret_cast<cuDoubleComplex*>(work), lwork, info);
+}
+
+// Buffer size wrappers for ormqr/unmqr.
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k, const float* A,
+                                                    int lda, const float* tau, const float* C, int ldc, int* lwork) {
+  return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k, const double* A,
+                                                    int lda, const double* tau, const double* C, int ldc, int* lwork) {
+  return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k,
+                                                    const std::complex<float>* A, int lda,
+                                                    const std::complex<float>* tau, const std::complex<float>* C,
+                                                    int ldc, int* lwork) {
+  return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda,
+                                     reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C),
+                                     ldc, lwork);
+}
+inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side,
+                                                    cublasOperation_t trans, int m, int n, int k,
+                                                    const std::complex<double>* A, int lda,
+                                                    const std::complex<double>* tau, const std::complex<double>* C,
+                                                    int ldc, int* lwork) {
+  return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                                     reinterpret_cast<const cuDoubleComplex*>(tau),
+                                     reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork);
+}
+
 }  // namespace internal
 }  // namespace gpu
 }  // namespace Eigen
diff --git a/unsupported/Eigen/src/GPU/DeviceDispatch.h b/unsupported/Eigen/src/GPU/DeviceDispatch.h
index 82c9112..8c6465c 100644
--- a/unsupported/Eigen/src/GPU/DeviceDispatch.h
+++ b/unsupported/Eigen/src/GPU/DeviceDispatch.h
@@ -8,12 +8,6 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0
 
-// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
-//
-// dispatch_gemm()  — GemmExpr → cublasXgemm
-//
-// Each function documents the exact library call and parameters.
-
 #ifndef EIGEN_GPU_DEVICE_DISPATCH_H
 #define EIGEN_GPU_DEVICE_DISPATCH_H
 
@@ -37,9 +31,6 @@
   return a.data() != nullptr && a.data() == b.data();
 }
 
-// ---- GEMM dispatch ----------------------------------------------------------
-// GemmExpr<Lhs, Rhs> → cublasXgemm (type-specific Sgemm/Dgemm/Cgemm/Zgemm).
-
 template <typename Lhs, typename Rhs>
 void dispatch_gemm(
     Context& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
@@ -55,7 +46,6 @@
   constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
   constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
 
-  // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
   const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
   const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
   const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
@@ -63,8 +53,8 @@
 
   eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
 
-  const int64_t lda = A.outerStride();
-  const int64_t ldb = B.outerStride();
+  const int64_t lda = A.rows();
+  const int64_t ldb = B.rows();
 
   eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix GEMM destination aliases lhs operand");
   eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix GEMM destination aliases rhs operand");
@@ -77,7 +67,7 @@
   if (resized) {
     dst.resize(m, n);
   }
-  const int64_t ldc = dst.outerStride();
+  const int64_t ldc = dst.rows();
 
   Scalar alpha_local = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
 
@@ -91,6 +81,10 @@
   eigen_assert(m <= INT_MAX && n <= INT_MAX && k <= INT_MAX && lda <= INT_MAX && ldb <= INT_MAX && ldc <= INT_MAX &&
                "cublasXgemm dimensions exceed int range");
 
+  // cuBLAS reads alpha and beta through host pointers. Store them in an array
+  // to prevent the compiler from eliding their stack slots — clang and MSVC
+  // at -O1+ otherwise optimise away the stores for complex types, leaving
+  // cuBLAS with a dangling pointer.
   Scalar scalars[2] = {alpha_local, beta_val};
   EIGEN_CUBLAS_CHECK(cublasXgemm(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
                                  static_cast<int>(k), &scalars[0], A.data(), static_cast<int>(lda), B.data(),
@@ -99,8 +93,6 @@
   dst.recordReady(ctx.stream());
 }
 
-// ---- LLT solve dispatch -----------------------------------------------------
-
 template <typename Scalar, int UpLo>
 void dispatch_llt_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
   const DeviceMatrix<Scalar>& A = expr.matrix();
@@ -124,8 +116,8 @@
 
   constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
   constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo>::value;
-  const int64_t lda = static_cast<int64_t>(A.outerStride());
-  const int64_t ldb = static_cast<int64_t>(B.outerStride());
+  const int64_t lda = static_cast<int64_t>(A.rows());
+  const int64_t ldb = static_cast<int64_t>(B.rows());
   const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
   const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
 
@@ -161,8 +153,9 @@
   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
 
   EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, n, nrhs, dtype, d_factor.get(), lda,
-                                        dtype, dst.data(), static_cast<int64_t>(dst.outerStride()), d_info_potrs));
+                                        dtype, dst.data(), static_cast<int64_t>(dst.rows()), d_info_potrs));
 
+  // Workspace locals must outlive the async kernels — sync before they unwind.
   EIGEN_CUDA_RUNTIME_CHECK(
       cudaMemcpyAsync(&info_words[1], d_info_potrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
@@ -172,8 +165,6 @@
   dst.recordReady(ctx.stream());
 }
 
-// ---- LU solve dispatch ------------------------------------------------------
-
 template <typename Scalar>
 void dispatch_lu_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
   const DeviceMatrix<Scalar>& A = expr.matrix();
@@ -196,8 +187,8 @@
   if (!dst.empty()) dst.waitReady(ctx.stream());
 
   constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
-  const int64_t lda = static_cast<int64_t>(A.outerStride());
-  const int64_t ldb = static_cast<int64_t>(B.outerStride());
+  const int64_t lda = static_cast<int64_t>(A.rows());
+  const int64_t ldb = static_cast<int64_t>(B.rows());
   const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
   const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
   const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
@@ -207,7 +198,6 @@
 
   DeviceBuffer d_ipiv(ipiv_bytes);
 
-  // See dispatch_llt_solve: two info slots + single end-of-chain sync.
   PinnedHostBuffer h_info(2 * sizeof(int));
   int* info_words = static_cast<int*>(h_info.get());
 
@@ -234,8 +224,9 @@
 
   EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, n, nrhs, dtype, d_lu.get(), lda,
                                         static_cast<const int64_t*>(d_ipiv.get()), dtype, dst.data(),
-                                        static_cast<int64_t>(dst.outerStride()), d_info_getrs));
+                                        static_cast<int64_t>(dst.rows()), d_info_getrs));
 
+  // Workspace locals must outlive the async kernels — sync before they unwind.
   EIGEN_CUDA_RUNTIME_CHECK(
       cudaMemcpyAsync(&info_words[1], d_info_getrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
@@ -245,8 +236,6 @@
   dst.recordReady(ctx.stream());
 }
 
-// ---- TRSM dispatch ----------------------------------------------------------
-
 template <typename Scalar, int UpLo>
 void dispatch_trsm(Context& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
   const DeviceMatrix<Scalar>& A = expr.matrix();
@@ -255,8 +244,7 @@
   eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
   eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
 
-  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX &&
-               "cublasXtrsm dimensions exceed int range");
+  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && "cublasXtrsm dimensions exceed int range");
 
   const int n = static_cast<int>(A.rows());
   const int nrhs = static_cast<int>(B.cols());
@@ -274,21 +262,19 @@
   if (!dst.empty()) dst.waitReady(ctx.stream());
 
   dst.resize(n, B.cols());
-  const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+  const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
 
   constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
   Scalar alpha(1);
 
   EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
-                                 &alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
-                                 static_cast<int>(dst.outerStride())));
+                                 &alpha, A.data(), static_cast<int>(A.rows()), dst.data(),
+                                 static_cast<int>(dst.rows())));
 
   dst.recordReady(ctx.stream());
 }
 
-// ---- SYMM/HEMM dispatch -----------------------------------------------------
-
 template <typename Scalar, int UpLo>
 void dispatch_symm(Context& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
   const DeviceMatrix<Scalar>& A = expr.matrix();
@@ -296,7 +282,7 @@
 
   eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
   eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
-  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && B.outerStride() <= INT_MAX &&
+  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && B.rows() <= INT_MAX &&
                "cublasXsymm dimensions exceed int range");
 
   const int m = static_cast<int>(A.rows());
@@ -304,7 +290,7 @@
 
   if (m == 0 || n == 0) {
     if (!dst.empty()) dst.waitReady(ctx.stream());
-    dst.resize(m == 0 ? 0 : m, B.cols());
+    dst.resize(m, B.cols());
     return;
   }
 
@@ -317,25 +303,23 @@
   dst.resize(m, n);
 
   constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  // See dispatch_gemm: array prevents compiler from eliding host-pointer stack slots.
   Scalar scalars[2] = {Scalar(1), Scalar(0)};
 
   EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &scalars[0], A.data(),
-                                 static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()),
-                                 &scalars[1], dst.data(), static_cast<int>(dst.outerStride())));
+                                 static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &scalars[1],
+                                 dst.data(), static_cast<int>(dst.rows())));
 
   dst.recordReady(ctx.stream());
 }
 
-// ---- SYRK/HERK dispatch -----------------------------------------------------
-
 template <typename Scalar, int UpLo>
 void dispatch_syrk(Context& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
                    typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
   using RealScalar = typename NumTraits<Scalar>::Real;
   const DeviceMatrix<Scalar>& A = expr.matrix();
 
-  eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && A.outerStride() <= INT_MAX &&
-               "cublasXsyrk dimensions exceed int range");
+  eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && "cublasXsyrk dimensions exceed int range");
 
   const int n = static_cast<int>(A.rows());
   const int k = static_cast<int>(A.cols());
@@ -360,16 +344,13 @@
   constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
 
   EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
-                                 static_cast<int>(A.outerStride()), &beta_val, dst.data(),
-                                 static_cast<int>(dst.outerStride())));
+                                 static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows())));
 
   dst.recordReady(ctx.stream());
 }
 
 }  // namespace internal
 
-// ---- Assignment: d_C.device(ctx) = expr ------------------------------------
-
 template <typename Scalar_>
 class Assignment {
  public:
@@ -432,8 +413,8 @@
   Context& ctx_;
 };
 
-// ---- Out-of-line Matrix expression operator= definitions ------------------
-// Declared in DeviceMatrix.h, defined here because they need Context::threadLocal().
+// Out-of-line definitions: these depend on Context::threadLocal(), which
+// requires the full Context definition unavailable in DeviceMatrix.h.
 
 template <typename Scalar_>
 template <typename Lhs, typename Rhs>
@@ -476,7 +457,6 @@
   return *this;
 }
 
-// SelfAdjointView::rankUpdate — defined here because it needs Context.
 template <typename Scalar_, int UpLo_>
 void SelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
   SyrkExpr<Scalar_, UpLo_> expr(A);
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h
index acc4ba5..869a5ac 100644
--- a/unsupported/Eigen/src/GPU/DeviceMatrix.h
+++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -26,7 +26,7 @@
 //   MatrixXd X = d_X.toHost();                       // download + block
 //
 // Async variants:
-//   auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
+//   auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream);
 //   auto transfer = d_X.toHostAsync(stream);         // enqueue D2H
 //   // ... overlap with other work ...
 //   MatrixXd X = transfer.get();                     // block + retrieve
@@ -191,7 +191,7 @@
   DeviceMatrix() = default;
 
   /** Allocate uninitialized device memory for a rows x cols matrix. */
-  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
+  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) {
     eigen_assert(rows >= 0 && cols >= 0);
     size_t bytes = sizeInBytes();
     if (bytes > 0) {
@@ -215,13 +215,11 @@
       : data_(std::move(o.data_)),
         rows_(o.rows_),
         cols_(o.cols_),
-        outerStride_(o.outerStride_),
         ready_event_(o.ready_event_),
         ready_stream_(o.ready_stream_),
         retained_buffer_(std::move(o.retained_buffer_)) {
     o.rows_ = 0;
     o.cols_ = 0;
-    o.outerStride_ = 0;
     o.ready_event_ = nullptr;
     o.ready_stream_ = nullptr;
   }
@@ -232,13 +230,11 @@
       data_ = std::move(o.data_);
       rows_ = o.rows_;
       cols_ = o.cols_;
-      outerStride_ = o.outerStride_;
       ready_event_ = o.ready_event_;
       ready_stream_ = o.ready_stream_;
       retained_buffer_ = std::move(o.retained_buffer_);
       o.rows_ = 0;
       o.cols_ = 0;
-      o.outerStride_ = 0;
       o.ready_event_ = nullptr;
       o.ready_stream_ = nullptr;
     }
@@ -276,29 +272,18 @@
    * The caller must keep \p host_data alive until the transfer completes
    * (check via the internal event or synchronize the stream).
    *
-   * \param host_data    Pointer to contiguous column-major host data.
-   * \param rows         Number of rows.
-   * \param cols         Number of columns.
-   * \param outerStride  Leading dimension (>= rows). Use rows for dense.
-   * \param stream       CUDA stream for the transfer.
+   * \param host_data  Pointer to contiguous column-major host data.
+   * \param rows       Number of rows.
+   * \param cols       Number of columns.
+   * \param stream     CUDA stream for the transfer.
    */
-  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
-                                    cudaStream_t stream) {
-    eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
+  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) {
+    eigen_assert(rows >= 0 && cols >= 0);
     eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
     DeviceMatrix dm(rows, cols);
     if (dm.sizeInBytes() > 0) {
-      // If outerStride == rows (dense), single contiguous copy.
-      // Otherwise, copy column by column (strided layout).
-      if (outerStride == rows) {
-        EIGEN_CUDA_RUNTIME_CHECK(
-            cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
-      } else {
-        EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_.get(), static_cast<size_t>(rows) * sizeof(Scalar),
-                                                   host_data, static_cast<size_t>(outerStride) * sizeof(Scalar),
-                                                   static_cast<size_t>(rows) * sizeof(Scalar),
-                                                   static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
-      }
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
       dm.recordReady(stream);
     }
     return dm;
@@ -380,7 +365,6 @@
     retained_buffer_ = internal::DeviceBuffer();
     rows_ = rows;
     cols_ = cols;
-    outerStride_ = rows;
     size_t bytes = sizeInBytes();
     if (bytes > 0) {
       void* p = nullptr;
@@ -395,11 +379,10 @@
   const Scalar* data() const { return data_.get(); }
   Index rows() const { return rows_; }
   Index cols() const { return cols_; }
-  Index outerStride() const { return outerStride_; }
   bool empty() const { return rows_ == 0 || cols_ == 0; }
 
   /** Size of the device allocation in bytes. */
-  size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
+  size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
 
   // ---- Event synchronization (public for library dispatch interop) ---------
 
@@ -493,7 +476,23 @@
     dm.data_.reset(device_ptr);
     dm.rows_ = rows;
     dm.cols_ = cols;
-    dm.outerStride_ = rows;
+    return dm;
+  }
+
+  /** Construct a non-owning view over an existing device pointer.
+   *
+   * The pointer is *borrowed*: destruction does not free, and the underlying
+   * storage must outlive this view. Use this to chain decomposition outputs
+   * (e.g. `svd.d_matrixU()`) into downstream cuBLAS expressions without an
+   * intervening D2D copy. The view supports the full read interface (toHost,
+   * gemm operands, adjoint(), etc.). Do not assign through a view: the borrowed
+   * pointer would be silently replaced and the underlying owner left intact. */
+  static DeviceMatrix view(Scalar* device_ptr, Index rows, Index cols) {
+    DeviceMatrix dm;
+    dm.data_ =
+        std::unique_ptr<Scalar, internal::CudaFreeDeleter>(device_ptr, internal::CudaFreeDeleter{/*borrow=*/true});
+    dm.rows_ = rows;
+    dm.cols_ = cols;
     return dm;
   }
 
@@ -502,7 +501,6 @@
     Scalar* p = data_.release();
     rows_ = 0;
     cols_ = 0;
-    outerStride_ = 0;
     if (ready_event_) {
       (void)cudaEventDestroy(ready_event_);
       ready_event_ = nullptr;
@@ -527,7 +525,6 @@
   std::unique_ptr<Scalar, internal::CudaFreeDeleter> data_;
   Index rows_ = 0;
   Index cols_ = 0;
-  Index outerStride_ = 0;
   cudaEvent_t ready_event_ = nullptr;       // internal: tracks last write completion
   cudaStream_t ready_stream_ = nullptr;     // stream that recorded ready_event_ (for same-stream skip)
   internal::DeviceBuffer retained_buffer_;  // internal: keeps async aux buffers alive
diff --git a/unsupported/Eigen/src/GPU/GpuEigenSolver.h b/unsupported/Eigen/src/GPU/GpuEigenSolver.h
new file mode 100644
index 0000000..e863390
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuEigenSolver.h
@@ -0,0 +1,219 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU self-adjoint eigenvalue decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer).
+// Stores eigenvalues and eigenvectors on device.
+//
+// Usage:
+//   SelfAdjointEigenSolver<double> es(A);
+//   VectorXd eigenvals = es.eigenvalues();
+//   MatrixXd eigenvecs = es.eigenvectors();
+
+#ifndef EIGEN_GPU_EIGENSOLVER_H
+#define EIGEN_GPU_EIGENSOLVER_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSolverContext.h"
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class SelfAdjointEigenSolver {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+  using RealVector = Eigen::Matrix<RealScalar, Dynamic, 1>;
+
+  SelfAdjointEigenSolver() = default;
+
+  /** \param options  Eigen::ComputeEigenvectors (default) or Eigen::EigenvaluesOnly. */
+  template <typename InputType>
+  explicit SelfAdjointEigenSolver(const EigenBase<InputType>& A, int options = ComputeEigenvectors) {
+    compute(A, options);
+  }
+
+  explicit SelfAdjointEigenSolver(const DeviceMatrix<Scalar>& d_A, int options = ComputeEigenvectors) {
+    compute(d_A, options);
+  }
+
+  ~SelfAdjointEigenSolver() = default;
+
+  SelfAdjointEigenSolver(const SelfAdjointEigenSolver&) = delete;
+  SelfAdjointEigenSolver& operator=(const SelfAdjointEigenSolver&) = delete;
+
+  SelfAdjointEigenSolver(SelfAdjointEigenSolver&& o) noexcept
+      : solver_ctx_(std::move(o.solver_ctx_)),
+        d_A_(std::move(o.d_A_)),
+        d_W_(std::move(o.d_W_)),
+        compute_eigenvectors_(o.compute_eigenvectors_),
+        n_(o.n_),
+        lda_(o.lda_) {
+    o.compute_eigenvectors_ = true;
+    o.n_ = 0;
+    o.lda_ = 0;
+  }
+
+  SelfAdjointEigenSolver& operator=(SelfAdjointEigenSolver&& o) noexcept {
+    if (this != &o) {
+      solver_ctx_ = std::move(o.solver_ctx_);
+      d_A_ = std::move(o.d_A_);
+      d_W_ = std::move(o.d_W_);
+      compute_eigenvectors_ = o.compute_eigenvectors_;
+      n_ = o.n_;
+      lda_ = o.lda_;
+      o.compute_eigenvectors_ = true;
+      o.n_ = 0;
+      o.lda_ = 0;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  SelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, int options = ComputeEigenvectors) {
+    return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_), options);
+  }
+
+  SelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, int options = ComputeEigenvectors) {
+    eigen_assert(d_A.rows() == d_A.cols() && "SelfAdjointEigenSolver requires a square matrix");
+    eigen_assert((options == ComputeEigenvectors || options == EigenvaluesOnly) &&
+                 "options must be ComputeEigenvectors or EigenvaluesOnly");
+    compute_eigenvectors_ = (options == ComputeEigenvectors);
+    n_ = d_A.rows();
+
+    if (n_ == 0) {
+      solver_ctx_.info_ = Success;
+      solver_ctx_.info_synced_ = true;
+      return *this;
+    }
+
+    d_A.waitReady(solver_ctx_.stream_);
+    lda_ = n_;
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+
+    d_A_ = internal::DeviceBuffer(mat_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_A_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const { return solver_ctx_.info(); }
+
+  Index cols() const { return n_; }
+  Index rows() const { return n_; }
+
+  /** Eigenvalues in ascending order. Downloads from device. */
+  RealVector eigenvalues() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    RealVector W(n_);
+    if (n_ > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpy(W.data(), d_W_.get(), static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
+    }
+    return W;
+  }
+
+  /** Eigenvectors (columns). Downloads from device.
+   * Requires ComputeEigenvectors mode. */
+  PlainMatrix eigenvectors() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert(compute_eigenvectors_ && "eigenvectors() requires ComputeEigenvectors option");
+    PlainMatrix V(n_, n_);
+    if (n_ > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.get(),
+                                          static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+    }
+    return V;
+  }
+
+  // ---- Device-side accessors (zero-copy views; chain into cuBLAS without D2D) ---------
+  //
+  // These return non-owning DeviceMatrix views over this solver's internal storage. The
+  // view borrows the pointer: destruction does not free; this solver must outlive any
+  // view derived from it. Both accessors are pure metadata — zero kernel launches.
+
+  /** Eigenvalues as an n × 1 view on this solver's stream. */
+  DeviceMatrix<RealScalar> d_eigenvalues() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    auto v = DeviceMatrix<RealScalar>::view(static_cast<RealScalar*>(d_W_.get()), n_, 1);
+    v.recordReady(solver_ctx_.stream_);
+    return v;
+  }
+
+  /** Eigenvectors (columns) as an n × n view on this solver's stream. */
+  DeviceMatrix<Scalar> d_eigenvectors() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert(compute_eigenvectors_ && "d_eigenvectors() requires ComputeEigenvectors option");
+    auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_A_.get()), n_, n_);
+    v.recordReady(solver_ctx_.stream_);
+    return v;
+  }
+
+  cudaStream_t stream() const { return solver_ctx_.stream_; }
+
+ private:
+  mutable internal::GpuSolverContext solver_ctx_;
+  internal::DeviceBuffer d_A_;  // overwritten with eigenvectors by syevd
+  internal::DeviceBuffer d_W_;  // eigenvalues (RealScalar, length n)
+  bool compute_eigenvectors_ = true;
+  int64_t n_ = 0;
+  int64_t lda_ = 0;
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
+
+    solver_ctx_.mark_pending();
+
+    d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar));
+
+    const cusolverEigMode_t jobz = compute_eigenvectors_ ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
+
+    // Use lower triangle (standard convention).
+    constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER;
+
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobz, uplo, n_,
+                                                     dtype, d_A_.get(), lda_, rtype, d_W_.get(), dtype, &dev_ws,
+                                                     &host_ws));
+
+    solver_ctx_.ensure_scratch(dev_ws);
+    solver_ctx_.h_workspace_.resize(host_ws);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobz, uplo, n_, dtype,
+                                          d_A_.get(), lda_, rtype, d_W_.get(), dtype, solver_ctx_.scratch_workspace(),
+                                          dev_ws, host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws,
+                                          solver_ctx_.scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_EIGENSOLVER_H
diff --git a/unsupported/Eigen/src/GPU/GpuLLT.h b/unsupported/Eigen/src/GPU/GpuLLT.h
index 2dd7299..b112543 100644
--- a/unsupported/Eigen/src/GPU/GpuLLT.h
+++ b/unsupported/Eigen/src/GPU/GpuLLT.h
@@ -30,8 +30,7 @@
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
-#include "./CuSolverSupport.h"
-#include <vector>
+#include "./GpuSolverContext.h"
 
 namespace Eigen {
 namespace gpu {
@@ -64,22 +63,15 @@
   // ---- Construction / destruction ------------------------------------------
 
   /** Default constructor. Does not factorize; call compute() before solve(). */
-  LLT() { init_context(); }
+  LLT() = default;
 
   /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
   template <typename InputType>
-  explicit LLT(const EigenBase<InputType>& A) : LLT() {
+  explicit LLT(const EigenBase<InputType>& A) {
     compute(A);
   }
 
-  ~LLT() {
-    // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
-    // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
-    // noexcept dtor terminates the program. The trailing cudaFree() of the
-    // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
-    if (handle_) (void)cusolverDnDestroy(handle_);
-    if (stream_) (void)cudaStreamDestroy(stream_);
-  }
+  ~LLT() = default;
 
   // Non-copyable (owns device memory and library handles).
   LLT(const LLT&) = delete;
@@ -87,81 +79,43 @@
 
   // Movable.
   LLT(LLT&& o) noexcept
-      : stream_(o.stream_),
-        handle_(o.handle_),
-        params_(std::move(o.params_)),
+      : solver_ctx_(std::move(o.solver_ctx_)),
         d_factor_(std::move(o.d_factor_)),
         factor_alloc_size_(o.factor_alloc_size_),
-        d_scratch_(std::move(o.d_scratch_)),
-        scratch_size_(o.scratch_size_),
-        h_workspace_(std::move(o.h_workspace_)),
         n_(o.n_),
-        lda_(o.lda_),
-        info_(o.info_),
-        pinned_info_(std::move(o.pinned_info_)),
-        info_synced_(o.info_synced_) {
-    o.stream_ = nullptr;
-    o.handle_ = nullptr;
+        lda_(o.lda_) {
     o.factor_alloc_size_ = 0;
-    o.scratch_size_ = 0;
     o.n_ = 0;
     o.lda_ = 0;
-    o.info_ = InvalidInput;
-    o.info_synced_ = true;
   }
 
   LLT& operator=(LLT&& o) noexcept {
     if (this != &o) {
-      // Drain pending work on the old stream before tearing it down so the
-      // upcoming move of d_factor_/d_scratch_ doesn't free buffers that an
-      // in-flight kernel is still touching. The asymmetric move-ctor doesn't
-      // need this — it constructs onto uninitialized storage.
-      if (stream_) (void)cudaStreamSynchronize(stream_);
-      if (handle_) (void)cusolverDnDestroy(handle_);
-      if (stream_) (void)cudaStreamDestroy(stream_);
-      stream_ = o.stream_;
-      handle_ = o.handle_;
-      params_ = std::move(o.params_);
+      solver_ctx_ = std::move(o.solver_ctx_);
       d_factor_ = std::move(o.d_factor_);
       factor_alloc_size_ = o.factor_alloc_size_;
-      d_scratch_ = std::move(o.d_scratch_);
-      scratch_size_ = o.scratch_size_;
-      h_workspace_ = std::move(o.h_workspace_);
       n_ = o.n_;
       lda_ = o.lda_;
-      info_ = o.info_;
-      pinned_info_ = std::move(o.pinned_info_);
-      info_synced_ = o.info_synced_;
-      o.stream_ = nullptr;
-      o.handle_ = nullptr;
       o.factor_alloc_size_ = 0;
-      o.scratch_size_ = 0;
       o.n_ = 0;
       o.lda_ = 0;
-      o.info_ = InvalidInput;
-      o.info_synced_ = true;
     }
     return *this;
   }
 
   // ---- Factorization -------------------------------------------------------
 
-  /** Compute the Cholesky factorization of A (host matrix).
-   *
-   * Uploads A to device memory, calls cusolverDnXpotrf, and retains the
-   * factored matrix on device. Any previous factorization is overwritten.
-   */
+  /** Compute the Cholesky factorization of A (host matrix). */
   template <typename InputType>
   LLT& compute(const EigenBase<InputType>& A) {
     eigen_assert(A.rows() == A.cols());
     if (!begin_compute(A.rows())) return *this;
 
-    // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
     const PlainMatrix mat(A.derived());
-    lda_ = static_cast<int64_t>(mat.outerStride());
+    lda_ = static_cast<int64_t>(mat.rows());
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
+        cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_));
 
     factorize();
     return *this;
@@ -172,11 +126,11 @@
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;
 
-    lda_ = static_cast<int64_t>(d_A.outerStride());
-    d_A.waitReady(stream_);
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(solver_ctx_.stream_);
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
+        cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
 
     factorize();
     return *this;
@@ -187,8 +141,8 @@
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;
 
-    lda_ = static_cast<int64_t>(d_A.outerStride());
-    d_A.waitReady(stream_);
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(solver_ctx_.stream_);
     d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
     factor_alloc_size_ = factorBytes();
 
@@ -198,109 +152,79 @@
 
   // ---- Solve ---------------------------------------------------------------
 
-  /** Solve A * X = B using the cached Cholesky factor (host → host).
-   *
-   * Uploads B to device memory, calls cusolverDnXpotrs using the factor
-   * retained from compute(), and returns the solution X on the host.
-   * The factor is not re-transferred; only B goes up and X comes down.
-   *
-   * \pre compute() must have been called and info() == Success.
-   * \returns X such that A * X ≈ B
-   */
+  /** Solve A * X = B using the cached Cholesky factor (host → host). */
   template <typename Rhs>
   PlainMatrix solve(const MatrixBase<Rhs>& B) const {
-    const_cast<LLT*>(this)->sync_info();
-    eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
     eigen_assert(B.rows() == n_);
 
     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
-    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
     internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+        cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_));
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));
 
     PlainMatrix X(n_, B.cols());
     int solve_info = 0;
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
-    EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
-    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+        cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
 
     eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
     return X;
   }
 
-  /** Solve A * X = B with device-resident RHS. Fully async.
-   *
-   * All work is enqueued on this solver's stream. Returns a Matrix
-   * with a recorded ready event — no host synchronization occurs.
-   * The caller should check info() after compute() to verify the
-   * factorization succeeded; this method does not check.
-   */
+  /** Solve A * X = B with device-resident RHS. Returns immediately after enqueuing
+   * the solve; the sync_info()/info_ check at entry forces a host wait on the
+   * factorization status so a singular A causes a clean assert rather than a
+   * silently-bad result. */
   DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
     eigen_assert(d_B.rows() == n_);
-    d_B.waitReady(stream_);
+    d_B.waitReady(solver_ctx_.stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
-    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
     internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+        cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
     return solve_impl(nrhs, ldb, std::move(d_x));
   }
 
   // ---- Accessors -----------------------------------------------------------
 
-  /** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
-   * Lazily synchronizes the stream on first call after compute(). */
-  ComputationInfo info() const {
-    const_cast<LLT*>(this)->sync_info();
-    return info_;
-  }
-
+  ComputationInfo info() const { return solver_ctx_.info(); }
   Index rows() const { return n_; }
   Index cols() const { return n_; }
-
-  /** Returns the CUDA stream owned by this object.
-   *  Advanced users may submit additional GPU work on this stream
-   *  to overlap with or chain after LLT operations. */
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const { return solver_ctx_.stream_; }
 
  private:
-  cudaStream_t stream_ = nullptr;
-  cusolverDnHandle_t handle_ = nullptr;
-  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
-  internal::DeviceBuffer d_factor_;   // factored L (or U) on device (grows, never shrinks)
-  size_t factor_alloc_size_ = 0;      // current d_factor_ allocation size
-  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
-  size_t scratch_size_ = 0;           // current scratch allocation size
-  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  mutable internal::GpuSolverContext solver_ctx_;
+  internal::DeviceBuffer d_factor_;
+  size_t factor_alloc_size_ = 0;
   int64_t n_ = 0;
   int64_t lda_ = 0;
-  ComputationInfo info_ = InvalidInput;
-  internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
-  bool info_synced_ = true;                              // has the stream been synced for info?
-
-  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
-  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
 
   bool begin_compute(Index rows) {
     n_ = rows;
+    solver_ctx_.info_ = InvalidInput;
     if (n_ == 0) {
-      info_ = Success;
-      info_synced_ = true;
+      solver_ctx_.info_ = Success;
+      solver_ctx_.info_synced_ = true;
       return false;
     }
-    info_ = InvalidInput;
     return true;
   }
 
   size_t factorBytes() const { return rhsBytes(n_, lda_); }
 
-  static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
-    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  static size_t rhsBytes(int64_t cols, int64_t ld) {
+    return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar);
   }
 
   void allocate_factor_storage() {
@@ -311,90 +235,43 @@
     }
   }
 
-  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
-  // Workspace size is rounded up to 16 bytes so the info word lands aligned.
-  static constexpr size_t kInfoBytes = sizeof(int);
-  static constexpr size_t kScratchAlign = 16;
-
-  static size_t scratchBytesFor(size_t workspace_bytes) {
-    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
-    return workspace_bytes + kInfoBytes;
-  }
-
-  // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
-  // Syncs the stream before reallocating to avoid freeing memory that
-  // async kernels may still be using.
-  void ensure_scratch(size_t bytes) {
-    if (bytes > scratch_size_) {
-      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
-      d_scratch_ = internal::DeviceBuffer(bytes);
-      scratch_size_ = bytes;
-    }
-  }
-
-  void* scratch_workspace() const { return d_scratch_.get(); }
-  int* scratch_info() const {
-    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
-    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
-  }
-
-  // Solve in place on `d_x` (which already holds B), then re-wrap as a
-  // typed DeviceMatrix carrying shape and a ready event. The release/adopt
-  // hop just hands ownership of the raw cudaMalloc pointer from the untyped
-  // DeviceBuffer to the typed DeviceMatrix without copying.
+  // Solve in place on `d_x` (which already holds B), then re-wrap as a typed
+  // DeviceMatrix carrying shape and a ready event. The release/adopt hop hands
+  // ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to
+  // the typed DeviceMatrix without copying.
   DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
 
-    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype,
-                                          d_x.get(), ldb, scratch_info()));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, nrhs, dtype,
+                                          d_factor_.get(), lda_, dtype, d_x.get(), ldb, solver_ctx_.scratch_info()));
 
     DeviceMatrix<Scalar> result =
         DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
-    result.recordReady(stream_);
+    result.recordReady(solver_ctx_.stream_);
     return result;
   }
 
-  void init_context() {
-    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
-    ensure_scratch(kInfoBytes);
-  }
-
-  // Synchronize stream and interpret the info word. No-op if already synced.
-  void sync_info() {
-    if (!info_synced_) {
-      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
-      info_ = (info_word() == 0) ? Success : NumericalIssue;
-      info_synced_ = true;
-    }
-  }
-
-  // Run cusolverDnXpotrf on d_factor_ (already on device).
-  // Enqueues factorization + async info download. Does NOT sync.
-  // Workspaces are stored as members to ensure they outlive the async kernels.
   void factorize() {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
 
-    info_synced_ = false;
-    info_ = InvalidInput;
+    solver_ctx_.mark_pending();
 
     size_t dev_ws_bytes = 0, host_ws_bytes = 0;
-    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype,
-                                                     &dev_ws_bytes, &host_ws_bytes));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
+                                                     d_factor_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
 
-    ensure_scratch(scratchBytesFor(dev_ws_bytes));
-    h_workspace_.resize(host_ws_bytes);
+    solver_ctx_.ensure_scratch(dev_ws_bytes);
+    solver_ctx_.h_workspace_.resize(host_ws_bytes);
 
-    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
-        handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes,
-        host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
+                                          d_factor_.get(), lda_, dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes,
+                                          host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes,
+                                          solver_ctx_.scratch_info()));
 
-    // Enqueue async download of info word — sync deferred to info() or solve().
-    EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
   }
 };
 
diff --git a/unsupported/Eigen/src/GPU/GpuLU.h b/unsupported/Eigen/src/GPU/GpuLU.h
index afc941b..92c10a9 100644
--- a/unsupported/Eigen/src/GPU/GpuLU.h
+++ b/unsupported/Eigen/src/GPU/GpuLU.h
@@ -20,7 +20,7 @@
 //   gpu::LU<double> lu(A);            // upload A, getrf, LU+ipiv on device
 //   if (lu.info() != Success) { ... }
 //   MatrixXd x = lu.solve(b);         // getrs NoTrans, only b transferred
-//   MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans);             // A^T x = b
+//   MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans);   // A^T x = b
 
 #ifndef EIGEN_GPU_LU_H
 #define EIGEN_GPU_LU_H
@@ -28,9 +28,7 @@
 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"
 
-#include "./CuBlasSupport.h"
-#include "./CuSolverSupport.h"
-#include <vector>
+#include "./GpuSolverContext.h"
 
 namespace Eigen {
 namespace gpu {
@@ -56,81 +54,41 @@
 
   // ---- Construction / destruction ------------------------------------------
 
-  LU() { init_context(); }
+  LU() = default;
 
   template <typename InputType>
-  explicit LU(const EigenBase<InputType>& A) : LU() {
+  explicit LU(const EigenBase<InputType>& A) {
     compute(A);
   }
 
-  ~LU() {
-    // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
-    // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
-    // noexcept dtor terminates the program. The trailing cudaFree() of the
-    // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
-    if (handle_) (void)cusolverDnDestroy(handle_);
-    if (stream_) (void)cudaStreamDestroy(stream_);
-  }
+  ~LU() = default;
 
   LU(const LU&) = delete;
   LU& operator=(const LU&) = delete;
 
   LU(LU&& o) noexcept
-      : stream_(o.stream_),
-        handle_(o.handle_),
-        params_(std::move(o.params_)),
+      : solver_ctx_(std::move(o.solver_ctx_)),
         d_lu_(std::move(o.d_lu_)),
         lu_alloc_size_(o.lu_alloc_size_),
         d_ipiv_(std::move(o.d_ipiv_)),
-        d_scratch_(std::move(o.d_scratch_)),
-        scratch_size_(o.scratch_size_),
-        h_workspace_(std::move(o.h_workspace_)),
         n_(o.n_),
-        lda_(o.lda_),
-        info_(o.info_),
-        pinned_info_(std::move(o.pinned_info_)),
-        info_synced_(o.info_synced_) {
-    o.stream_ = nullptr;
-    o.handle_ = nullptr;
+        lda_(o.lda_) {
     o.lu_alloc_size_ = 0;
-    o.scratch_size_ = 0;
     o.n_ = 0;
     o.lda_ = 0;
-    o.info_ = InvalidInput;
-    o.info_synced_ = true;
   }
 
   LU& operator=(LU&& o) noexcept {
     if (this != &o) {
-      // Drain pending work on the old stream before tearing it down so the
-      // upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an
-      // in-flight kernel is still touching. The asymmetric move-ctor doesn't
-      // need this — it constructs onto uninitialized storage.
-      if (stream_) (void)cudaStreamSynchronize(stream_);
-      if (handle_) (void)cusolverDnDestroy(handle_);
-      if (stream_) (void)cudaStreamDestroy(stream_);
-      stream_ = o.stream_;
-      handle_ = o.handle_;
-      params_ = std::move(o.params_);
+      solver_ctx_ = std::move(o.solver_ctx_);
       d_lu_ = std::move(o.d_lu_);
       lu_alloc_size_ = o.lu_alloc_size_;
       d_ipiv_ = std::move(o.d_ipiv_);
-      d_scratch_ = std::move(o.d_scratch_);
-      scratch_size_ = o.scratch_size_;
-      h_workspace_ = std::move(o.h_workspace_);
       n_ = o.n_;
       lda_ = o.lda_;
-      info_ = o.info_;
-      pinned_info_ = std::move(o.pinned_info_);
-      info_synced_ = o.info_synced_;
-      o.stream_ = nullptr;
-      o.handle_ = nullptr;
       o.lu_alloc_size_ = 0;
-      o.scratch_size_ = 0;
       o.n_ = 0;
       o.lda_ = 0;
-      o.info_ = InvalidInput;
-      o.info_synced_ = true;
     }
     return *this;
   }
@@ -144,9 +102,10 @@
     if (!begin_compute(A.rows())) return *this;
 
     const PlainMatrix mat(A.derived());
-    lda_ = static_cast<int64_t>(mat.outerStride());
+    lda_ = static_cast<int64_t>(mat.rows());
     allocate_lu_storage();
-    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_));
 
     factorize();
     return *this;
@@ -157,11 +116,11 @@
     eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;
 
-    lda_ = static_cast<int64_t>(d_A.outerStride());
-    d_A.waitReady(stream_);
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(solver_ctx_.stream_);
     allocate_lu_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
+        cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
 
     factorize();
     return *this;
@@ -172,8 +131,8 @@
     eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;
 
-    lda_ = static_cast<int64_t>(d_A.outerStride());
-    d_A.waitReady(stream_);
+    lda_ = static_cast<int64_t>(d_A.rows());
+    d_A.waitReady(solver_ctx_.stream_);
     d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
     lu_alloc_size_ = matrixBytes();
 
@@ -190,78 +149,68 @@
    */
   template <typename Rhs>
   PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const {
-    const_cast<LU*>(this)->sync_info();
-    eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization");
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "LU::solve called on a failed or uninitialized factorization");
     eigen_assert(B.rows() == n_);
 
     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
-    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    const int64_t ldb = static_cast<int64_t>(rhs.rows());
     internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+        cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_));
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x));
 
     PlainMatrix X(n_, B.cols());
     int solve_info = 0;
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
-    EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
-    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+        cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
 
     eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
     return X;
   }
 
-  /** Solve op(A) * X = B with device-resident RHS. Fully async. */
+  /** Solve op(A) * X = B with device-resident RHS. Returns immediately after
+   * enqueuing the solve; the sync_info()/info_ check at entry forces a host
+   * wait on the factorization status so a singular A causes a clean assert
+   * rather than a silently-bad result. */
   DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "LU::solve called on a failed or uninitialized factorization");
     eigen_assert(d_B.rows() == n_);
-    d_B.waitReady(stream_);
+    d_B.waitReady(solver_ctx_.stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
-    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    const int64_t ldb = static_cast<int64_t>(d_B.rows());
     internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+        cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
     return solve_impl(nrhs, ldb, op, std::move(d_x));
   }
 
   // ---- Accessors -----------------------------------------------------------
 
-  /** Lazily synchronizes the stream on first call after compute(). */
-  ComputationInfo info() const {
-    const_cast<LU*>(this)->sync_info();
-    return info_;
-  }
+  ComputationInfo info() const { return solver_ctx_.info(); }
   Index rows() const { return n_; }
   Index cols() const { return n_; }
-  cudaStream_t stream() const { return stream_; }
+  cudaStream_t stream() const { return solver_ctx_.stream_; }
 
  private:
-  cudaStream_t stream_ = nullptr;
-  cusolverDnHandle_t handle_ = nullptr;
-  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
-  internal::DeviceBuffer d_lu_;       // LU factors on device (grows, never shrinks)
-  size_t lu_alloc_size_ = 0;          // current d_lu_ allocation size
-  internal::DeviceBuffer d_ipiv_;     // pivot indices (int64_t) on device
-  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
-  size_t scratch_size_ = 0;           // current scratch allocation size
-  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  mutable internal::GpuSolverContext solver_ctx_;
+  internal::DeviceBuffer d_lu_;
+  size_t lu_alloc_size_ = 0;
+  internal::DeviceBuffer d_ipiv_;
   int64_t n_ = 0;
   int64_t lda_ = 0;
-  ComputationInfo info_ = InvalidInput;
-  internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
-  bool info_synced_ = true;                              // has the stream been synced for info?
-
-  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
-  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
 
   bool begin_compute(Index rows) {
     n_ = rows;
-    info_ = InvalidInput;
+    solver_ctx_.info_ = InvalidInput;
     if (n_ == 0) {
-      info_ = Success;
-      info_synced_ = true;
+      solver_ctx_.info_ = Success;
+      solver_ctx_.info_synced_ = true;
       return false;
     }
     return true;
@@ -269,8 +218,8 @@
 
   size_t matrixBytes() const { return matrixBytes(n_, lda_); }
 
-  static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
-    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  static size_t matrixBytes(int64_t cols, int64_t ld) {
+    return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar);
   }
 
   void allocate_lu_storage() {
@@ -281,92 +230,46 @@
     }
   }
 
-  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
-  // Workspace size is rounded up to 16 bytes so the info word lands aligned.
-  static constexpr size_t kInfoBytes = sizeof(int);
-  static constexpr size_t kScratchAlign = 16;
-
-  static size_t scratchBytesFor(size_t workspace_bytes) {
-    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
-    return workspace_bytes + kInfoBytes;
-  }
-
-  // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
-  // Syncs the stream before reallocating to avoid freeing memory that
-  // async kernels may still be using.
-  void ensure_scratch(size_t bytes) {
-    if (bytes > scratch_size_) {
-      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
-      d_scratch_ = internal::DeviceBuffer(bytes);
-      scratch_size_ = bytes;
-    }
-  }
-
-  void* scratch_workspace() const { return d_scratch_.get(); }
-  int* scratch_info() const {
-    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
-    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
-  }
-
-  // Solve in place on `d_x` (which already holds B), then re-wrap as a
-  // typed DeviceMatrix carrying shape and a ready event. The release/adopt
-  // hop just hands ownership of the raw cudaMalloc pointer from the untyped
-  // DeviceBuffer to the typed DeviceMatrix without copying.
+  // Solve in place on `d_x` (which already holds B), then re-wrap as a typed
+  // DeviceMatrix carrying shape and a ready event. The release/adopt hop hands
+  // ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to
+  // the typed DeviceMatrix without copying.
   DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     const cublasOperation_t trans = internal::to_cublas_op(op);
 
-    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_,
-                                          static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb,
-                                          scratch_info()));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, trans, n_, nrhs, dtype,
+                                          d_lu_.get(), lda_, static_cast<const int64_t*>(d_ipiv_.get()), dtype,
+                                          d_x.get(), ldb, solver_ctx_.scratch_info()));
 
     DeviceMatrix<Scalar> result =
         DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
-    result.recordReady(stream_);
+    result.recordReady(solver_ctx_.stream_);
     return result;
   }
 
-  void init_context() {
-    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
-    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
-    ensure_scratch(kInfoBytes);
-  }
-
-  void sync_info() {
-    if (!info_synced_) {
-      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
-      info_ = (info_word() == 0) ? Success : NumericalIssue;
-      info_synced_ = true;
-    }
-  }
-
-  // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
-  // Enqueues factorization + async info download. Does NOT sync.
-  // Workspaces are stored as members to ensure they outlive the async kernels.
   void factorize() {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
 
-    info_synced_ = false;
-    info_ = InvalidInput;
+    solver_ctx_.mark_pending();
 
     d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
 
     size_t dev_ws_bytes = 0, host_ws_bytes = 0;
-    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype,
-                                                     &dev_ws_bytes, &host_ws_bytes));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, n_, n_, dtype,
+                                                     d_lu_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
 
-    ensure_scratch(scratchBytesFor(dev_ws_bytes));
-    h_workspace_.resize(host_ws_bytes);
+    solver_ctx_.ensure_scratch(dev_ws_bytes);
+    solver_ctx_.h_workspace_.resize(host_ws_bytes);
 
-    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_,
-                                          static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(),
-                                          dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr,
-                                          host_ws_bytes, scratch_info()));
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(
+        solver_ctx_.cusolver_, solver_ctx_.params_.p, n_, n_, dtype, d_lu_.get(), lda_,
+        static_cast<int64_t*>(d_ipiv_.get()), dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes,
+        host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes, solver_ctx_.scratch_info()));
 
-    EIGEN_CUDA_RUNTIME_CHECK(
-        cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
   }
 };
 
diff --git a/unsupported/Eigen/src/GPU/GpuQR.h b/unsupported/Eigen/src/GPU/GpuQR.h
new file mode 100644
index 0000000..821c458
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuQR.h
@@ -0,0 +1,379 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU QR decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q),
+// and cublasXtrsm (triangular solve on R). Q is never formed explicitly.
+//
+// Handles both shapes transparently:
+//   m >= n (overdetermined or square): factor A = Q R; least-squares solve.
+//   m  < n (underdetermined):           factor A^H = Q R internally; min-norm solve.
+//
+// Usage:
+//   QR<double> qr(A);              // upload A, geqrf (transparent transpose if m<n)
+//   if (qr.info() != Success) { ... }
+//   MatrixXd X = qr.solve(B);      // least-squares (m>=n) or min-norm (m<n)
+//   MatrixXd R = qr.matrixR();     // upper-triangular factor (m>=n only)
+
+#ifndef EIGEN_GPU_QR_H
+#define EIGEN_GPU_QR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSolverContext.h"
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class QR {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  QR() = default;
+
+  template <typename InputType>
+  explicit QR(const EigenBase<InputType>& A) {
+    compute(A);
+  }
+
+  explicit QR(const DeviceMatrix<Scalar>& d_A) { compute(d_A); }
+
+  ~QR() = default;
+
+  QR(const QR&) = delete;
+  QR& operator=(const QR&) = delete;
+
+  QR(QR&& o) noexcept
+      : solver_ctx_(std::move(o.solver_ctx_)),
+        d_qr_(std::move(o.d_qr_)),
+        d_tau_(std::move(o.d_tau_)),
+        m_(o.m_),
+        n_(o.n_),
+        lda_(o.lda_),
+        transposed_(o.transposed_) {
+    o.m_ = 0;
+    o.n_ = 0;
+    o.lda_ = 0;
+    o.transposed_ = false;
+  }
+
+  QR& operator=(QR&& o) noexcept {
+    if (this != &o) {
+      solver_ctx_ = std::move(o.solver_ctx_);
+      d_qr_ = std::move(o.d_qr_);
+      d_tau_ = std::move(o.d_tau_);
+      m_ = o.m_;
+      n_ = o.n_;
+      lda_ = o.lda_;
+      transposed_ = o.transposed_;
+      o.m_ = 0;
+      o.n_ = 0;
+      o.lda_ = 0;
+      o.transposed_ = false;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  QR& compute(const EigenBase<InputType>& A) {
+    // Upload to device, then delegate. The wide-matrix transpose runs on the
+    // GPU (via cublasXgeam) inside the device-input path; no host transpose.
+    return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_));
+  }
+
+  QR& compute(const DeviceMatrix<Scalar>& d_A) {
+    m_ = d_A.rows();
+    n_ = d_A.cols();
+
+    if (m_ == 0 || n_ == 0) {
+      solver_ctx_.info_ = Success;
+      solver_ctx_.info_synced_ = true;
+      return *this;
+    }
+
+    transposed_ = (m_ < n_);
+    d_A.waitReady(solver_ctx_.stream_);
+    lda_ = static_cast<int64_t>(transposed_ ? n_ : m_);
+    const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(factor_cols()) * sizeof(Scalar);
+    const size_t tau_bytes = static_cast<size_t>(k()) * sizeof(Scalar);
+
+    d_qr_ = internal::DeviceBuffer(mat_bytes);
+    d_tau_ = internal::DeviceBuffer(tau_bytes);
+
+    if (transposed_) {
+      // Transpose-on-device via cuBLAS geam: d_qr_ = A^H.
+      Scalar alpha_one(1), beta_zero(0);
+      EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
+          solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_), internal::to_blas_int(m_),
+          &alpha_one, d_A.data(), internal::to_blas_int(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr),
+          internal::to_blas_int(n_), static_cast<Scalar*>(d_qr_.get()), internal::to_blas_int(n_)));
+    } else {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_qr_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+    }
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve A * X = B.
+   * For m >= n (over-/exactly-determined): least-squares X = R^{-1} Q^H B (residual A^H r ≈ 0).
+   * For m  < n (underdetermined):          minimum-norm  X = Q R^{-H} B (||X|| minimized). */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "QR::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == m_);
+
+    const PlainMatrix rhs(B);
+    const Index nrhs = rhs.cols();
+
+    if (!transposed_) {
+      return solve_overdetermined_host(rhs);
+    }
+    return solve_underdetermined_host(rhs, nrhs);
+  }
+
+  /** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "QR::solve called on a failed or uninitialized factorization");
+    eigen_assert(d_B.rows() == m_);
+    d_B.waitReady(solver_ctx_.stream_);
+
+    if (!transposed_) {
+      return solve_overdetermined_device(d_B);
+    }
+    return solve_underdetermined_device(d_B);
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const { return solver_ctx_.info(); }
+
+  Index rows() const { return m_; }
+  Index cols() const { return n_; }
+  cudaStream_t stream() const { return solver_ctx_.stream_; }
+
+  /** Upper-triangular factor R (k × n) of A = Q R. Available only for m >= n. */
+  PlainMatrix matrixR() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert(!transposed_ && "matrixR() not available when m < n (we factored A^H internally)");
+    PlainMatrix qr_full(m_, n_);
+    if (m_ > 0 && n_ > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(qr_full.data(), d_qr_.get(),
+                                          static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+    }
+    PlainMatrix R = qr_full.topRows(k()).template triangularView<Upper>();
+    return R;
+  }
+
+ private:
+  mutable internal::GpuSolverContext solver_ctx_;
+  internal::DeviceBuffer d_qr_;   // QR factors (reflectors below diag, R above)
+  internal::DeviceBuffer d_tau_;  // Householder scalars (length k)
+  int64_t m_ = 0;                 // original A.rows()
+  int64_t n_ = 0;                 // original A.cols()
+  int64_t lda_ = 0;               // factor leading dim = max(m_, n_)
+  bool transposed_ = false;       // true iff m_ < n_ (we factored A^H instead of A)
+
+  // Factor matrix dimensions (we always factor a "tall" matrix: rows >= cols).
+  int64_t factor_rows() const { return transposed_ ? n_ : m_; }
+  int64_t factor_cols() const { return transposed_ ? m_ : n_; }
+  int64_t k() const { return (std::min)(m_, n_); }
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+
+    solver_ctx_.mark_pending();
+
+    const int64_t fm = factor_rows();
+    const int64_t fn = factor_cols();
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, fm, fn, dtype,
+                                                     d_qr_.get(), lda_, dtype, d_tau_.get(), dtype, &dev_ws, &host_ws));
+
+    solver_ctx_.ensure_scratch(dev_ws);
+    solver_ctx_.h_workspace_.resize(host_ws);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, fm, fn, dtype, d_qr_.get(),
+                                          lda_, dtype, d_tau_.get(), dtype, solver_ctx_.scratch_workspace(), dev_ws,
+                                          host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws,
+                                          solver_ctx_.scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+  }
+
+  // Apply Q (op = CUBLAS_OP_N) or Q^H (op = CUBLAS_OP_T/C) to a device buffer in-place.
+  // For real types Q^H = Q^T -> CUBLAS_OP_T. For complex -> CUBLAS_OP_C.
+  // Workspace lives in solver_ctx_.scratch (grows but never shrinks): no per-call malloc/free.
+  void apply_Q(cublasOperation_t op, void* d_B, int64_t ldb, int64_t nrhs) const {
+    const int im = internal::to_blas_int(factor_rows());
+    const int in = internal::to_blas_int(nrhs);
+    const int ik = internal::to_blas_int(k());
+    const int ilda = internal::to_blas_int(lda_);
+    const int ildb = internal::to_blas_int(ldb);
+
+    int lwork = 0;
+    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize(
+        solver_ctx_.cusolver_, CUBLAS_SIDE_LEFT, op, im, in, ik, static_cast<const Scalar*>(d_qr_.get()), ilda,
+        static_cast<const Scalar*>(d_tau_.get()), static_cast<const Scalar*>(d_B), ildb, &lwork));
+
+    solver_ctx_.ensure_scratch(static_cast<size_t>(lwork) * sizeof(Scalar));
+
+    EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr(
+        solver_ctx_.cusolver_, CUBLAS_SIDE_LEFT, op, im, in, ik, static_cast<const Scalar*>(d_qr_.get()), ilda,
+        static_cast<const Scalar*>(d_tau_.get()), static_cast<Scalar*>(d_B), ildb,
+        static_cast<Scalar*>(solver_ctx_.scratch_workspace()), lwork, solver_ctx_.scratch_info()));
+  }
+
+  void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const {
+    constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T;
+    apply_Q(trans, d_B, ldb, nrhs);
+  }
+
+  // ---- Solve helpers (overdetermined: m >= n) ------------------------------
+
+  PlainMatrix solve_overdetermined_host(const PlainMatrix& rhs) const {
+    const Index nrhs = rhs.cols();
+    const size_t b_bytes = static_cast<size_t>(m_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    internal::DeviceBuffer d_B(b_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_B.get(), rhs.data(), b_bytes, cudaMemcpyHostToDevice, solver_ctx_.stream_));
+
+    apply_QH(d_B.get(), m_, nrhs);
+    trsm_R(d_B.get(), m_, nrhs, /*op=*/CUBLAS_OP_N);
+
+    PlainMatrix X(n_, nrhs);
+    if (m_ == n_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.get(),
+                                               static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                               cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    } else {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.get(),
+                                                 static_cast<size_t>(m_) * sizeof(Scalar),
+                                                 static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
+                                                 cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    }
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
+    return X;
+  }
+
+  DeviceMatrix<Scalar> solve_overdetermined_device(const DeviceMatrix<Scalar>& d_B) const {
+    const Index nrhs = d_B.cols();
+    const size_t b_bytes = static_cast<size_t>(m_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    internal::DeviceBuffer d_work(b_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_work.get(), d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+
+    apply_QH(d_work.get(), m_, nrhs);
+    trsm_R(d_work.get(), m_, nrhs, /*op=*/CUBLAS_OP_N);
+
+    if (m_ == n_) {
+      DeviceMatrix<Scalar> result =
+          DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_work.release()), n_, static_cast<Index>(nrhs));
+      result.recordReady(solver_ctx_.stream_);
+      return result;
+    }
+    DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.get(),
+                                               static_cast<size_t>(m_) * sizeof(Scalar),
+                                               static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs),
+                                               cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+    result.recordReady(solver_ctx_.stream_);
+    return result;
+  }
+
+  // ---- Solve helpers (underdetermined: m < n) ------------------------------
+  //
+  // We factored A^H = Q R, so A = R^H Q^H. Solving A X = B for X with min ||X||:
+  //   z = R^{-H} B            (m × nrhs, occupies top m rows of an n × nrhs buffer)
+  //   X = Q [z; 0]            (n × nrhs)
+
+  PlainMatrix solve_underdetermined_host(const PlainMatrix& rhs, Index nrhs) const {
+    const size_t x_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    internal::DeviceBuffer d_X(x_bytes);
+    // Zero the full n × nrhs buffer; B will overwrite the top m × nrhs block.
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(d_X.get(), 0, x_bytes, solver_ctx_.stream_));
+
+    // 2D copy: B (m × nrhs, leading dim m) into top of d_X (leading dim n).
+    if (m_ > 0 && nrhs > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(d_X.get(), static_cast<size_t>(n_) * sizeof(Scalar), rhs.data(),
+                                                 static_cast<size_t>(m_) * sizeof(Scalar),
+                                                 static_cast<size_t>(m_) * sizeof(Scalar), static_cast<size_t>(nrhs),
+                                                 cudaMemcpyHostToDevice, solver_ctx_.stream_));
+    }
+
+    trsm_R(d_X.get(), n_, nrhs, trsm_op_conj_trans());
+    apply_Q(CUBLAS_OP_N, d_X.get(), n_, nrhs);
+
+    PlainMatrix X(n_, nrhs);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(X.data(), d_X.get(), x_bytes, cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
+    return X;
+  }
+
+  DeviceMatrix<Scalar> solve_underdetermined_device(const DeviceMatrix<Scalar>& d_B) const {
+    const Index nrhs = d_B.cols();
+    const size_t x_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+    internal::DeviceBuffer d_X(x_bytes);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(d_X.get(), 0, x_bytes, solver_ctx_.stream_));
+
+    if (m_ > 0 && nrhs > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(d_X.get(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.data(),
+                                                 static_cast<size_t>(m_) * sizeof(Scalar),
+                                                 static_cast<size_t>(m_) * sizeof(Scalar), static_cast<size_t>(nrhs),
+                                                 cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+    }
+
+    trsm_R(d_X.get(), n_, nrhs, trsm_op_conj_trans());
+    apply_Q(CUBLAS_OP_N, d_X.get(), n_, nrhs);
+
+    DeviceMatrix<Scalar> result =
+        DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_X.release()), n_, static_cast<Index>(nrhs));
+    result.recordReady(solver_ctx_.stream_);
+    return result;
+  }
+
+  static cublasOperation_t trsm_op_conj_trans() { return NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T; }
+
+  // Triangular solve on R: X := op(R)^{-1} B (in-place on B).
+  // op = CUBLAS_OP_N        for the m>=n branch (R X = (Q^H B)[:k,:])
+  // op = CUBLAS_OP_T or _C  for the m<n branch  (R^H z = B)
+  void trsm_R(void* d_B, int64_t ldb, int64_t nrhs, cublasOperation_t op) const {
+    Scalar alpha(1);
+    EIGEN_CUBLAS_CHECK(internal::cublasXtrsm(
+        solver_ctx_.cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, op, CUBLAS_DIAG_NON_UNIT,
+        internal::to_blas_int(k()), internal::to_blas_int(nrhs), &alpha, static_cast<const Scalar*>(d_qr_.get()),
+        internal::to_blas_int(lda_), static_cast<Scalar*>(d_B), internal::to_blas_int(ldb)));
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_QR_H
diff --git a/unsupported/Eigen/src/GPU/GpuSVD.h b/unsupported/Eigen/src/GPU/GpuSVD.h
new file mode 100644
index 0000000..9b16cab
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSVD.h
@@ -0,0 +1,531 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// GPU SVD decomposition using cuSOLVER (divide-and-conquer).
+//
+// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses
+// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B.
+//
+// cuSOLVER returns VT (not V). We store and expose VT directly.
+//
+// Usage:
+//   SVD<double> svd(A, ComputeThinU | ComputeThinV);
+//   VectorXd S = svd.singularValues();
+//   MatrixXd U = svd.matrixU();       // m×k or m×m
+//   MatrixXd VT = svd.matrixVT();      // k×n or n×n (this is V^T)
+//   MatrixXd X = svd.solve(B);        // pseudoinverse
+//   MatrixXd X = svd.solve(B, k);     // truncated (top k triplets)
+//   MatrixXd X = svd.solve(B, 0.1);   // Tikhonov regularized
+
+#ifndef EIGEN_GPU_SVD_H
+#define EIGEN_GPU_SVD_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSolverContext.h"
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class SVD {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+  using PlainVector = Eigen::Matrix<Scalar, Dynamic, 1>;
+  using RealVector = Eigen::Matrix<RealScalar, Dynamic, 1>;
+
+  SVD() = default;
+
+  template <typename InputType>
+  explicit SVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
+    compute(A, options);
+  }
+
+  explicit SVD(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
+    compute(d_A, options);
+  }
+
+  ~SVD() = default;
+
+  SVD(const SVD&) = delete;
+  SVD& operator=(const SVD&) = delete;
+
+  SVD(SVD&& o) noexcept
+      : solver_ctx_(std::move(o.solver_ctx_)),
+        d_A_(std::move(o.d_A_)),
+        d_U_(std::move(o.d_U_)),
+        d_S_(std::move(o.d_S_)),
+        d_VT_(std::move(o.d_VT_)),
+        options_(o.options_),
+        m_(o.m_),
+        n_(o.n_),
+        lda_(o.lda_),
+        transposed_(o.transposed_) {
+    o.options_ = 0;
+    o.m_ = 0;
+    o.n_ = 0;
+    o.lda_ = 0;
+    o.transposed_ = false;
+  }
+
+  SVD& operator=(SVD&& o) noexcept {
+    if (this != &o) {
+      solver_ctx_ = std::move(o.solver_ctx_);
+      d_A_ = std::move(o.d_A_);
+      d_U_ = std::move(o.d_U_);
+      d_S_ = std::move(o.d_S_);
+      d_VT_ = std::move(o.d_VT_);
+      options_ = o.options_;
+      m_ = o.m_;
+      n_ = o.n_;
+      lda_ = o.lda_;
+      transposed_ = o.transposed_;
+      o.options_ = 0;
+      o.m_ = 0;
+      o.n_ = 0;
+      o.lda_ = 0;
+      o.transposed_ = false;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  template <typename InputType>
+  SVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) {
+    // Upload to device, then delegate. The wide-matrix transpose runs on the
+    // GPU (via cublasXgeam) inside the device-input path; no host transpose.
+    return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_), options);
+  }
+
+  SVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) {
+    options_ = options;
+    m_ = d_A.rows();
+    n_ = d_A.cols();
+    lda_ = 0;
+    transposed_ = false;
+
+    if (m_ == 0 || n_ == 0) {
+      d_A_ = internal::DeviceBuffer();
+      d_U_ = internal::DeviceBuffer();
+      d_S_ = internal::DeviceBuffer();
+      d_VT_ = internal::DeviceBuffer();
+      solver_ctx_.info_ = Success;
+      solver_ctx_.info_synced_ = true;
+      return *this;
+    }
+
+    transposed_ = (m_ < n_);
+    d_A.waitReady(solver_ctx_.stream_);
+
+    if (transposed_) {
+      // Transpose on device via cuBLAS geam: d_A_ = A^H.
+      std::swap(m_, n_);
+      lda_ = m_;
+      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+      d_A_ = internal::DeviceBuffer(mat_bytes);
+      // geam: C(m×n) = alpha * op(A) + beta * op(B). beta=0, B=nullptr.
+      Scalar alpha_one(1), beta_zero(0);
+      EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
+          solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(m_), internal::to_blas_int(n_),
+          &alpha_one, d_A.data(), internal::to_blas_int(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr),
+          internal::to_blas_int(m_), static_cast<Scalar*>(d_A_.get()), internal::to_blas_int(m_)));
+    } else {
+      lda_ = static_cast<int64_t>(d_A.rows());
+      const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar);
+      d_A_ = internal::DeviceBuffer(mat_bytes);
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(d_A_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
+    }
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  ComputationInfo info() const { return solver_ctx_.info(); }
+
+  Index rows() const { return transposed_ ? n_ : m_; }
+  Index cols() const { return transposed_ ? m_ : n_; }
+
+  /** Singular values (always available). Downloads from device on each call. */
+  RealVector singularValues() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    const Index k = (std::min)(m_, n_);
+    RealVector S(k);
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpy(S.data(), d_S_.get(), static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost));
+    return S;
+  }
+
+  /** Left singular vectors U. */
+  PlainMatrix matrixU() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index ucols = (options_ & ComputeFullU) ? m_ : k;
+      PlainMatrix U(m_, ucols);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.get(),
+                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return U;
+    } else {
+      const Index vtrows = (options_ & ComputeFullU) ? m_orig : k;
+      PlainMatrix VT_stored(vtrows, n_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.get(),
+                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return VT_stored.adjoint();
+    }
+  }
+
+  /** Right singular vectors V (matches host JacobiSVD/BDCSVD). */
+  PlainMatrix matrixV() const { return matrixVT().adjoint(); }
+
+  /** Right singular vectors transposed V^T. */
+  PlainMatrix matrixVT() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
+      PlainMatrix VT(vtrows, n_);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.get(),
+                                          static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return VT;
+    } else {
+      const Index ucols = (options_ & ComputeFullV) ? n_orig : k;
+      PlainMatrix U_stored(m_, ucols);
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.get(),
+                                          static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar),
+                                          cudaMemcpyDeviceToHost));
+      return U_stored.adjoint();
+    }
+  }
+
+  // ---- Device-side accessors (zero-copy views; chain into cuBLAS without D2D) ---------
+  //
+  // These return non-owning DeviceMatrix views over the SVD's internal device storage.
+  // The view borrows the pointer: destruction does not free; the SVD object must outlive
+  // any view derived from it. For the common case (m >= n) all three accessors are pure
+  // metadata: zero kernel launches, zero allocations.
+  //
+  // For wide matrices (m < n, internally factored as A^H), original U and V^T are the
+  // adjoints of the stored buffers, so d_matrixU() / d_matrixVT() build them via a
+  // cublasXgeam into an owning temporary. d_singularValues() remains zero-copy.
+
+  /** Singular values as a k × 1 view on this solver's stream. */
+  DeviceMatrix<RealScalar> d_singularValues() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    const Index k = (std::min)(m_, n_);
+    auto v = DeviceMatrix<RealScalar>::view(static_cast<RealScalar*>(d_S_.get()), k, 1);
+    v.recordReady(solver_ctx_.stream_);
+    return v;
+  }
+
+  /** Left singular vectors U as a DeviceMatrix on this solver's stream.
+   * For m >= n: zero-copy view. For m < n: owning (one cublasXgeam adjoint pass). */
+  DeviceMatrix<Scalar> d_matrixU() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "d_matrixU() requires ComputeThinU or ComputeFullU");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index ucols = (options_ & ComputeFullU) ? m_ : k;
+      auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_U_.get()), m_, ucols);
+      v.recordReady(solver_ctx_.stream_);
+      return v;
+    }
+    // transposed: U_orig = VT_stored^H -> conjugate-transpose via cublasXgeam.
+    const Index vtrows_stored = (options_ & ComputeFullU) ? n_ : k;
+    DeviceMatrix<Scalar> result(n_, vtrows_stored);
+    if (n_ > 0 && vtrows_stored > 0) {
+      Scalar alpha_one(1), beta_zero(0);
+      EIGEN_CUBLAS_CHECK(internal::cublasXgeam(
+          solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_),
+          internal::to_blas_int(vtrows_stored), &alpha_one, static_cast<const Scalar*>(d_VT_.get()),
+          internal::to_blas_int(vtrows_stored), &beta_zero, static_cast<const Scalar*>(nullptr),
+          internal::to_blas_int(n_), result.data(), internal::to_blas_int(n_)));
+      result.recordReady(solver_ctx_.stream_);
+    }
+    return result;
+  }
+
+  /** Right singular vectors transposed V^T as a DeviceMatrix on this solver's stream.
+   * For m >= n: zero-copy view. For m < n: owning (one cublasXgeam adjoint pass). */
+  DeviceMatrix<Scalar> d_matrixVT() const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success);
+    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "d_matrixVT() requires ComputeThinV or ComputeFullV");
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    const Index k = (std::min)(m_orig, n_orig);
+    if (!transposed_) {
+      const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
+      auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_VT_.get()), vtrows, n_);
+      v.recordReady(solver_ctx_.stream_);
+      return v;
+    }
+    // transposed: VT_orig = U_stored^H.
+    const Index ucols = (options_ & ComputeFullV) ? n_orig : k;
+    DeviceMatrix<Scalar> result(ucols, m_);
+    if (ucols > 0 && m_ > 0) {
+      Scalar alpha_one(1), beta_zero(0);
+      EIGEN_CUBLAS_CHECK(
+          internal::cublasXgeam(solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(ucols),
+                                internal::to_blas_int(m_), &alpha_one, static_cast<const Scalar*>(d_U_.get()),
+                                internal::to_blas_int(m_), &beta_zero, static_cast<const Scalar*>(nullptr),
+                                internal::to_blas_int(ucols), result.data(), internal::to_blas_int(ucols)));
+      result.recordReady(solver_ctx_.stream_);
+    }
+    return result;
+  }
+
+  /** Number of singular values above threshold. */
+  Index rank(RealScalar threshold = RealScalar(-1)) const {
+    RealVector S = singularValues();
+    if (S.size() == 0) return 0;
+    if (threshold < 0) {
+      threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon();
+    }
+    return (S.array() > threshold).count();
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    return solve_impl(B, (std::min)(m_, n_), RealScalar(0));
+  }
+
+  /** Truncated solve: use only top trunc singular triplets. */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const {
+    eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_));
+    return solve_impl(B, trunc, RealScalar(0));
+  }
+
+  /** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const {
+    eigen_assert(lambda > 0);
+    return solve_impl(B, (std::min)(m_, n_), lambda);
+  }
+
+  cudaStream_t stream() const { return solver_ctx_.stream_; }
+
+ private:
+  mutable internal::GpuSolverContext solver_ctx_;
+  internal::DeviceBuffer d_A_;
+  internal::DeviceBuffer d_U_;
+  internal::DeviceBuffer d_S_;
+  internal::DeviceBuffer d_VT_;
+  unsigned int options_ = 0;
+  int64_t m_ = 0;
+  int64_t n_ = 0;
+  int64_t lda_ = 0;
+  bool transposed_ = false;
+
+  // Swap U↔V flags for the transposed case.
+  static unsigned int swap_uv_options(unsigned int opts) {
+    unsigned int result = 0;
+    if (opts & ComputeThinU) result |= ComputeThinV;
+    if (opts & ComputeFullU) result |= ComputeFullV;
+    if (opts & ComputeThinV) result |= ComputeThinU;
+    if (opts & ComputeFullV) result |= ComputeFullU;
+    return result;
+  }
+
+  static signed char jobu(unsigned int opts) {
+    if (opts & ComputeFullU) return 'A';
+    if (opts & ComputeThinU) return 'S';
+    return 'N';
+  }
+
+  static signed char jobvt(unsigned int opts) {
+    if (opts & ComputeFullV) return 'A';
+    if (opts & ComputeThinV) return 'S';
+    return 'N';
+  }
+
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value;
+    const Index k = (std::min)(m_, n_);
+
+    solver_ctx_.mark_pending();
+
+    d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar));
+
+    const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_;
+
+    const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0);
+    const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0);
+    const int64_t ldu = m_;
+    const int64_t ldvt = vtrows > 0 ? vtrows : 1;
+
+    d_U_ = internal::DeviceBuffer();
+    d_VT_ = internal::DeviceBuffer();
+    if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar));
+    if (vtrows > 0)
+      d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar));
+
+    eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()");
+    size_t dev_ws = 0, host_ws = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
+        solver_ctx_.cusolver_, solver_ctx_.params_.p, jobu(int_opts), jobvt(int_opts), m_, n_, dtype, d_A_.get(), lda_,
+        rtype, d_S_.get(), dtype, ucols > 0 ? d_U_.get() : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.get() : nullptr,
+        ldvt, dtype, &dev_ws, &host_ws));
+
+    solver_ctx_.ensure_scratch(dev_ws);
+    solver_ctx_.h_workspace_.resize(host_ws);
+
+    EIGEN_CUSOLVER_CHECK(
+        cusolverDnXgesvd(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobu(int_opts), jobvt(int_opts), m_, n_, dtype,
+                         d_A_.get(), lda_, rtype, d_S_.get(), dtype, ucols > 0 ? d_U_.get() : nullptr, ldu, dtype,
+                         vtrows > 0 ? d_VT_.get() : nullptr, ldvt, dtype, solver_ctx_.scratch_workspace(), dev_ws,
+                         host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws, solver_ctx_.scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+  }
+
+  template <typename Rhs>
+  PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const {
+    solver_ctx_.sync_info();
+    eigen_assert(solver_ctx_.info_ == Success && "SVD::solve called on a failed or uninitialized decomposition");
+    eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U");
+    eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V");
+
+    const Index m_orig = transposed_ ? n_ : m_;
+    const Index n_orig = transposed_ ? m_ : n_;
+    eigen_assert(B.rows() == m_orig);
+
+    const Index k = (std::min)(m_, n_);
+    const Index kk = (std::min)(trunc, k);
+    const Index nrhs = B.cols();
+
+    // Empty problem: no rank, no RHS, or zero domain -> result is the zero matrix.
+    // Returning early avoids reading S(0) below when k == 0 and prevents zero-extent
+    // GEMM/dgmm calls.
+    if (kk == 0 || nrhs == 0 || n_orig == 0) {
+      return PlainMatrix::Zero(n_orig, nrhs);
+    }
+
+    // Enqueue both transfers on solver_ctx_.stream_ in one batch and sync once. Issuing the
+    // B upload before reading S means B's H2D is already in flight while we wait for
+    // gesvd-then-S-D2H, instead of two back-to-back blocking syncs.
+    const PlainMatrix rhs(B);
+    internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.get(), rhs.data(),
+                                             static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                             cudaMemcpyHostToDevice, solver_ctx_.stream_));
+    RealVector S(k);
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(S.data(), d_S_.get(), static_cast<size_t>(k) * sizeof(RealScalar),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
+
+    // Typed pointers for device buffers.
+    auto* U_dev = static_cast<const Scalar*>(d_U_.get());
+    auto* VT_dev = static_cast<const Scalar*>(d_VT_.get());
+    auto* B_dev = static_cast<const Scalar*>(d_B.get());
+
+    Scalar scalars[2] = {Scalar(1), Scalar(0)};
+
+    // Step 1: tmp = U_orig^H * B  (kk × nrhs).
+    internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+    auto* tmp_dev = static_cast<Scalar*>(d_tmp.get());
+    if (!transposed_) {
+      EIGEN_CUBLAS_CHECK(internal::cublasXgemm(solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(kk),
+                                               internal::to_blas_int(nrhs), internal::to_blas_int(m_), &scalars[0],
+                                               U_dev, internal::to_blas_int(m_), B_dev, internal::to_blas_int(m_orig),
+                                               &scalars[1], tmp_dev, internal::to_blas_int(kk)));
+    } else {
+      const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k;
+      EIGEN_CUBLAS_CHECK(internal::cublasXgemm(
+          solver_ctx_.cublas_, CUBLAS_OP_N, CUBLAS_OP_N, internal::to_blas_int(kk), internal::to_blas_int(nrhs),
+          internal::to_blas_int(m_orig), &scalars[0], VT_dev, internal::to_blas_int(vtrows_stored), B_dev,
+          internal::to_blas_int(m_orig), &scalars[1], tmp_dev, internal::to_blas_int(kk)));
+    }
+
+    // Step 2: Apply diag(D) to tmp on device via cublasXdgmm.
+    // D is built on host from the (small, k-entry) singular-values vector S
+    // and uploaded to a small device buffer; tmp itself stays on device.
+    //
+    // For lambda == 0 we mirror Eigen's SVDBase::_solve_impl: drop singular
+    // values below S(0) * k * eps (numerical-rank truncation), so this
+    // pseudoinverse solve agrees with CPU BDCSVD::solve on near-singular A.
+    // dgmm wants the diagonal in the matrix scalar type — for complex Scalar
+    // the diagonal is still real, so we build the real values then cast.
+    const RealScalar drop_threshold = S(0) * RealScalar(k) * NumTraits<RealScalar>::epsilon();
+    auto S_head = S.head(kk).array();
+    PlainVector D(kk);
+    if (lambda == RealScalar(0)) {
+      D = (S_head > drop_threshold).select(S_head.inverse(), RealScalar(0)).matrix().template cast<Scalar>();
+    } else {
+      D = (S_head / (S_head.square() + lambda * lambda)).matrix().template cast<Scalar>();
+    }
+
+    internal::DeviceBuffer d_D(static_cast<size_t>(kk) * sizeof(Scalar));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_D.get(), D.data(), static_cast<size_t>(kk) * sizeof(Scalar),
+                                             cudaMemcpyHostToDevice, solver_ctx_.stream_));
+
+    EIGEN_CUBLAS_CHECK(internal::cublasXdgmm(
+        solver_ctx_.cublas_, CUBLAS_SIDE_LEFT, internal::to_blas_int(kk), internal::to_blas_int(nrhs), tmp_dev,
+        internal::to_blas_int(kk), static_cast<const Scalar*>(d_D.get()), 1, tmp_dev, internal::to_blas_int(kk)));
+
+    // Step 3: X = V_orig * tmp  (n_orig × nrhs).
+    PlainMatrix X(n_orig, nrhs);
+    internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar));
+    auto* X_dev = static_cast<Scalar*>(d_X.get());
+
+    if (!transposed_) {
+      const Index vtrows = (options_ & ComputeFullV) ? n_ : k;
+      EIGEN_CUBLAS_CHECK(internal::cublasXgemm(
+          solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_orig), internal::to_blas_int(nrhs),
+          internal::to_blas_int(kk), &scalars[0], VT_dev, internal::to_blas_int(vtrows), tmp_dev,
+          internal::to_blas_int(kk), &scalars[1], X_dev, internal::to_blas_int(n_orig)));
+    } else {
+      EIGEN_CUBLAS_CHECK(internal::cublasXgemm(
+          solver_ctx_.cublas_, CUBLAS_OP_N, CUBLAS_OP_N, internal::to_blas_int(n_orig), internal::to_blas_int(nrhs),
+          internal::to_blas_int(kk), &scalars[0], U_dev, internal::to_blas_int(m_), tmp_dev, internal::to_blas_int(kk),
+          &scalars[1], X_dev, internal::to_blas_int(n_orig)));
+    }
+
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.get(),
+                                             static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar),
+                                             cudaMemcpyDeviceToHost, solver_ctx_.stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
+
+    return X;
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SVD_H
diff --git a/unsupported/Eigen/src/GPU/GpuSolverContext.h b/unsupported/Eigen/src/GPU/GpuSolverContext.h
new file mode 100644
index 0000000..87c4641
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSolverContext.h
@@ -0,0 +1,171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Shared context for GPU solvers (GpuLLT, GpuLU, GpuQR, GpuSVD, etc.).
+//
+// Owns a CUDA stream, cuSOLVER handle, cuBLAS handle, scratch buffer,
+// and info word. Each solver holds a GpuSolverContext by composition
+// and delegates lifecycle/scratch management to it.
+
+#ifndef EIGEN_GPU_SOLVER_CONTEXT_H
+#define EIGEN_GPU_SOLVER_CONTEXT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include "./CuBlasSupport.h"
+#include <vector>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+struct GpuSolverContext {
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t cusolver_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  CusolverParams params_;
+  DeviceBuffer d_scratch_;
+  size_t scratch_size_ = 0;
+  std::vector<char> h_workspace_;
+  ComputationInfo info_ = InvalidInput;
+  PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H of info word
+  bool info_synced_ = true;
+
+  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
+  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
+
+  GpuSolverContext() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    ensure_scratch(0);
+  }
+
+  ~GpuSolverContext() {
+    // Ignore errors here: dtors are noexcept, and EIGEN_CU{BLAS,SOLVER,DA_RUNTIME}_CHECK
+    // are eigen_assert-based — firing one from a noexcept dtor terminates the program.
+    // The trailing cudaFree() of the device buffers (via DeviceBuffer::~DeviceBuffer) is
+    // synchronous and waits for any in-flight kernel touching the buffer.
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  GpuSolverContext(GpuSolverContext&& o) noexcept
+      : stream_(o.stream_),
+        cusolver_(o.cusolver_),
+        cublas_(o.cublas_),
+        params_(std::move(o.params_)),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        info_(o.info_),
+        pinned_info_(std::move(o.pinned_info_)),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.cusolver_ = nullptr;
+    o.cublas_ = nullptr;
+    o.scratch_size_ = 0;
+    o.info_ = InvalidInput;
+    o.info_synced_ = true;
+  }
+
+  GpuSolverContext& operator=(GpuSolverContext&& o) noexcept {
+    if (this != &o) {
+      // Drain pending work on the old stream before tearing it down so the
+      // upcoming move of d_scratch_ doesn't free buffers that an in-flight
+      // kernel is still touching. The asymmetric move-ctor doesn't need this —
+      // it constructs onto uninitialized storage.
+      if (stream_) (void)cudaStreamSynchronize(stream_);
+      if (cublas_) (void)cublasDestroy(cublas_);
+      if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      cusolver_ = o.cusolver_;
+      cublas_ = o.cublas_;
+      params_ = std::move(o.params_);
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      info_ = o.info_;
+      pinned_info_ = std::move(o.pinned_info_);
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.cusolver_ = nullptr;
+      o.cublas_ = nullptr;
+      o.scratch_size_ = 0;
+      o.info_ = InvalidInput;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  GpuSolverContext(const GpuSolverContext&) = delete;
+  GpuSolverContext& operator=(const GpuSolverContext&) = delete;
+
+  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
+  // Workspace size is rounded up to 16 bytes so the info word lands aligned.
+  static constexpr size_t kInfoBytes = sizeof(int);
+  static constexpr size_t kScratchAlign = 16;
+
+  static size_t scratchBytesFor(size_t workspace_bytes) {
+    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
+    return workspace_bytes + kInfoBytes;
+  }
+
+  // Ensure d_scratch_ holds at least `workspace_bytes` of scratch plus the trailing
+  // info word. Grows but never shrinks. Syncs the stream before reallocating to
+  // avoid freeing memory that async kernels may still be using.
+  void ensure_scratch(size_t workspace_bytes) {
+    size_t needed = scratchBytesFor(workspace_bytes);
+    if (needed > scratch_size_) {
+      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = DeviceBuffer(needed);
+      scratch_size_ = needed;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.get(); }
+
+  int* scratch_info() const {
+    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
+  }
+
+  // Mark a factorization as pending (info not yet available).
+  void mark_pending() {
+    info_synced_ = false;
+    info_ = InvalidInput;
+  }
+
+  // Synchronize the stream and interpret the info word. No-op if already synced.
+  void sync_info() {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      info_ = (info_word() == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  ComputationInfo info() {
+    sync_info();
+    return info_;
+  }
+};
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SOLVER_CONTEXT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h
index d2c129f..2c9b0cc 100644
--- a/unsupported/Eigen/src/GPU/GpuSupport.h
+++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -23,6 +23,7 @@
 
 #include <cuda_runtime.h>
 
+#include <limits>
 #include <memory>
 
 namespace Eigen {
@@ -38,13 +39,30 @@
     eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
   } while (0)
 
+// ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------
+// cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as
+// `int` (32-bit signed). Modern GPUs can host allocations whose dimensions
+// exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at
+// every narrowing call site so an out-of-range value triggers an assert
+// instead of silently overflowing the BLAS argument.
+
+inline int to_blas_int(int64_t v) {
+  eigen_assert(v >= 0 && v <= static_cast<int64_t>((std::numeric_limits<int>::max)()) &&
+               "dimension exceeds the int range supported by cuBLAS / cuSOLVER");
+  return static_cast<int>(v);
+}
+
 // ---- Custom deleters for CUDA-allocated memory ------------------------------
 // Used with std::unique_ptr to give CUDA allocations RAII semantics with no
 // hand-rolled move/dtor boilerplate.
 
 struct CudaFreeDeleter {
+  // When `borrow == true`, the unique_ptr does not free the pointer. Used by
+  // DeviceMatrix::view() to wrap a non-owning device pointer with the same
+  // smart-pointer machinery as owning storage, without changing the type.
+  bool borrow = false;
   void operator()(void* p) const noexcept {
-    if (p) (void)cudaFree(p);
+    if (p && !borrow) (void)cudaFree(p);
   }
 };
 
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md
index 2928a7c..b7c4269 100644
--- a/unsupported/Eigen/src/GPU/README.md
+++ b/unsupported/Eigen/src/GPU/README.md
@@ -26,7 +26,7 @@
 
 **CPU and GPU coexist.** There is no global compile-time switch that replaces
 CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
-explicitly -- `gpu::LLT<double>` vs `Eigen::LLT<MatrixXd>` -- and both coexist in
+explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>` -- and both coexist in
 the same binary. This also lets users keep the factored matrix on device across
 multiple solves, something impossible with compile-time replacement.
 
@@ -167,10 +167,43 @@
 gpu::LU<double> lu;
 lu.compute(d_A);
 auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans);           // A^T Y = B
+
+// QR solve (overdetermined least squares)
+gpu::QR<double> qr(A);                // host matrix input
+MatrixXd X = qr.solve(B);           // Q^H * B via ormqr, then trsm on R
+
+// SVD
+gpu::SVD<double> svd(A, ComputeThinU | ComputeThinV);
+VectorXd S = svd.singularValues();
+MatrixXd U = svd.matrixU();
+MatrixXd VT = svd.matrixVT();
+MatrixXd X = svd.solve(B);          // pseudoinverse solve
+
+// SVD: device-side views (no D2H transfer; svd must outlive the views)
+auto d_S = svd.d_singularValues();   // DeviceMatrix view of singular values
+auto d_U = svd.d_matrixU();          // DeviceMatrix view of U
+auto d_VT = svd.d_matrixVT();        // DeviceMatrix view of V^T
+
+// Self-adjoint eigenvalue decomposition
+gpu::SelfAdjointEigenSolver<double> es(A);
+VectorXd eigenvals = es.eigenvalues();
+MatrixXd eigenvecs = es.eigenvectors();
+auto d_W = es.d_eigenvalues();        // DeviceMatrix view of eigenvalues
+auto d_V = es.d_eigenvectors();       // DeviceMatrix view of eigenvectors
 ```
 
 The cached API keeps the factored matrix on device, avoiding redundant
-host-device transfers and re-factorizations.
+host-device transfers and re-factorizations. The `d_*` accessors return
+non-owning `DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain
+without round-tripping through host memory.
+
+### Precision control
+
+GEMM dispatch uses `cublasXgemm` (type-specific Sgemm/Dgemm/Cgemm/Zgemm).
+cuBLAS may internally use tensor cores depending on the GPU architecture,
+matrix dimensions, and CUDA math mode settings. No Eigen-specific macros
+control this; use the standard `CUDA_MATH_MODE` environment variable or
+`cublasSetMathMode()` to configure tensor core behavior if needed.
 
 ### Stream control and async execution
 
@@ -221,15 +254,15 @@
 
 | DeviceMatrix expression | Library call | Parameters |
 |---|---|---|
-| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
-| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
-| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
-| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
-| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
-| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
-| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
-| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
-| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
+| `C = A * B` | `cublasXgemm` | transA=N, transB=N, alpha=1, beta=0 |
+| `C = A.adjoint() * B` | `cublasXgemm` | transA=C, transB=N |
+| `C = A.transpose() * B` | `cublasXgemm` | transA=T, transB=N |
+| `C = A * B.adjoint()` | `cublasXgemm` | transA=N, transB=C |
+| `C = A * B.transpose()` | `cublasXgemm` | transA=N, transB=T |
+| `C = alpha * A * B` | `cublasXgemm` | alpha from LHS |
+| `C = A * (alpha * B)` | `cublasXgemm` | alpha from RHS |
+| `C += A * B` | `cublasXgemm` | alpha=1, beta=1 |
+| `C.device(ctx) -= A * B` | `cublasXgemm` | alpha=-1, beta=1 |
 | `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
 | `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
 | `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
@@ -244,7 +277,9 @@
 | `DeviceMatrix()` | -- | Empty (0x0) |
 | `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
 | `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
-| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
+| `fromHostAsync(ptr, rows, cols, stream)` | no | Async upload (caller manages lifetime) |
+| `adopt(ptr, rows, cols)` | -- | Owning wrapper over a device pointer (caller relinquishes ownership) |
+| `view(ptr, rows, cols)` | -- | Non-owning view over a device pointer (does not free on destruction) |
 | `toHost(stream)` | yes | Synchronous download |
 | `toHostAsync(stream)` | no | Returns `HostTransfer` future |
 | `clone(stream)` | no | Device-to-device deep copy |
@@ -295,6 +330,76 @@
 GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a
 `gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`).
 
+### `gpu::QR<Scalar>` API
+
+GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply
+Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly.
+
+| Method | Description |
+|--------|-------------|
+| `gpu::QR()` | Default construct, then call `compute()` |
+| `gpu::QR(A)` | Construct and factorize from host matrix |
+| `compute(A)` | Upload + factorize |
+| `compute(DeviceMatrix)` | D2D copy + factorize |
+| `solve(host_matrix)` | Solve, return host matrix (syncs) |
+| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) |
+| `matrixR()` | Download upper-triangular factor R to host (m >= n only) |
+| `info()` | Lazy sync |
+| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+
+### `gpu::SVD<Scalar>` API
+
+GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only
+modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`,
+or `0` (values only).
+
+| Method | Description |
+|--------|-------------|
+| `gpu::SVD()` | Default construct, then call `compute()` |
+| `gpu::SVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) |
+| `compute(A, options)` | Compute from host matrix |
+| `compute(DeviceMatrix, options)` | Compute from device matrix |
+| `singularValues()` | Download singular values to host |
+| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) |
+| `matrixV()` | Download V to host (matches `JacobiSVD::matrixV()`) |
+| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) |
+| `d_singularValues()` | `DeviceMatrix` view of singular values (zero-copy) |
+| `d_matrixU()` | `DeviceMatrix` view of U (zero-copy when m >= n) |
+| `d_matrixVT()` | `DeviceMatrix` view of V^T (zero-copy when m >= n) |
+| `solve(B)` | Pseudoinverse solve (returns host matrix) |
+| `solve(B, k)` | Truncated solve (top k singular triplets) |
+| `solve(B, lambda)` | Tikhonov regularized solve |
+| `rank(threshold)` | Count singular values above threshold |
+| `info()` | Lazy sync |
+| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+
+Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`;
+in that case `d_matrixU()` and `d_matrixVT()` allocate an owning result via a
+single `cublasXgeam` adjoint pass. `d_singularValues()` is always zero-copy.
+
+### `gpu::SelfAdjointEigenSolver<Scalar>` API
+
+GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`).
+
+| Method | Description |
+|--------|-------------|
+| `gpu::SelfAdjointEigenSolver()` | Default construct, then call `compute()` |
+| `gpu::SelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) |
+| `compute(A, mode)` | Compute from host matrix |
+| `compute(DeviceMatrix, mode)` | Compute from device matrix |
+| `eigenvalues()` | Download eigenvalues to host (ascending order) |
+| `eigenvectors()` | Download eigenvectors to host (columns) |
+| `d_eigenvalues()` | `DeviceMatrix` view of eigenvalues (zero-copy) |
+| `d_eigenvectors()` | `DeviceMatrix` view of eigenvectors (zero-copy, requires `ComputeEigenvectors`) |
+| `info()` | Lazy sync |
+| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream |
+
+`ComputeMode`: `gpu::SelfAdjointEigenSolver::EigenvaluesOnly` or
+`gpu::SelfAdjointEigenSolver::ComputeEigenvectors`.
+
+The `d_*` accessors return non-owning views into the solver's internal device
+buffers; the solver must outlive any view derived from it.
+
 ### `HostTransfer<Scalar>` API
 
 Future for async device-to-host transfer.
@@ -313,6 +418,18 @@
 SYMM/HEMM, and SYRK/HERK. Debug builds assert on these violations before
 dispatching to cuBLAS.
 
+## Future work
+
+- **Reassess host-input vs. device-input API surface.** Each solver currently
+  exposes both host-input (`compute(MatrixXd)`, `solve(MatrixXd)`) and
+  device-input (`compute(DeviceMatrix)`, `solve(DeviceMatrix)`) overloads, plus
+  host- and device-side accessors (`matrixU()` vs `d_matrixU()`). This eases
+  migration from CPU Eigen but may invite accidental host ↔ device round-trips
+  when users mix the two without realising the cost. Revisit after the initial
+  GPU module roll-out (MRs !2408, !2412, !2413, !2414, !2415) is in users'
+  hands; if the convenience overloads cause more confusion than they save,
+  narrow toward a single explicit `fromHost` / `toHost` boundary.
+
 ## File layout
 
 | File | Depends on | Contents |
@@ -326,8 +443,12 @@
 | `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `gpu::Context` |
 | `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
 | `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
-| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
-| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
+| `GpuSolverContext.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Shared solver context (stream, handles, scratch) |
+| `GpuLLT.h` | `GpuSolverContext.h` | Cached dense Cholesky factorization |
+| `GpuLU.h` | `GpuSolverContext.h` | Cached dense LU factorization |
+| `GpuQR.h` | `GpuSolverContext.h` | Dense QR decomposition |
+| `GpuSVD.h` | `GpuSolverContext.h` | Dense SVD decomposition |
+| `GpuEigenSolver.h` | `GpuSolverContext.h` | Self-adjoint eigenvalue decomposition |
 
 ## Building and testing
 
@@ -336,7 +457,8 @@
   -DEIGEN_TEST_CUDA=ON \
   -DEIGEN_CUDA_COMPUTE_ARCH="70"
 
-cmake --build build --target cublas cusolver_llt cusolver_lu device_matrix
+cmake --build build --target cublas cusolver_llt cusolver_lu \
+  cusolver_qr cusolver_svd cusolver_eigen device_matrix
 ctest --test-dir build -L gpu --output-on-failure
 ```
 
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt
index cdc96ac..706efef 100644
--- a/unsupported/test/GPU/CMakeLists.txt
+++ b/unsupported/test/GPU/CMakeLists.txt
@@ -69,7 +69,7 @@
 # cuSOLVER dense factorizations (LLT, LU).
 option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" ON)
 if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
-  foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu)
+  foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu cusolver_qr cusolver_svd cusolver_eigen)
     ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas)
   endforeach()
 endif()
diff --git a/unsupported/test/GPU/cusolver_eigen.cpp b/unsupported/test/GPU/cusolver_eigen.cpp
new file mode 100644
index 0000000..3a6535d
--- /dev/null
+++ b/unsupported/test/GPU/cusolver_eigen.cpp
@@ -0,0 +1,328 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue
+// decomposition via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Eigenvalues>
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Reconstruction: V * diag(W) * V^H ≈ A ---------------------------------
+
+template <typename Scalar>
+void test_eigen_reconstruction(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build a symmetric/Hermitian matrix.
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> es(A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  auto W = es.eigenvalues();
+  Mat V = es.eigenvectors();
+
+  VERIFY_IS_EQUAL(W.size(), n);
+  VERIFY_IS_EQUAL(V.rows(), n);
+  VERIFY_IS_EQUAL(V.cols(), n);
+
+  // Reconstruct: A_hat = V * diag(W) * V^H.
+  Mat A_hat = V * W.asDiagonal() * V.adjoint();
+  RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+
+  // Orthogonality: V^H * V ≈ I.
+  Mat VhV = V.adjoint() * V;
+  Mat eye = Mat::Identity(n, n);
+  VERIFY((VhV - eye).norm() < tol);
+}
+
+// ---- Eigenvalues match CPU SelfAdjointEigenSolver ---------------------------
+
+template <typename Scalar>
+void test_eigen_values(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> gpu_es(A);
+  VERIFY_IS_EQUAL(gpu_es.info(), Success);
+  auto W_gpu = gpu_es.eigenvalues();
+
+  SelfAdjointEigenSolver<Mat> cpu_es(A);
+  auto W_cpu = cpu_es.eigenvalues();
+
+  RealScalar tol =
+      RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_cpu.cwiseAbs().maxCoeff();
+  VERIFY((W_gpu - W_cpu).norm() < tol);
+}
+
+// ---- Eigenvalues-only mode --------------------------------------------------
+
+template <typename Scalar>
+void test_eigen_values_only(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> gpu_es(A, EigenvaluesOnly);
+  VERIFY_IS_EQUAL(gpu_es.info(), Success);
+  auto W_gpu = gpu_es.eigenvalues();
+
+  SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly);
+  auto W_cpu = cpu_es.eigenvalues();
+
+  RealScalar tol =
+      RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_cpu.cwiseAbs().maxCoeff();
+  VERIFY((W_gpu - W_cpu).norm() < tol);
+}
+
+// ---- DeviceMatrix input path ------------------------------------------------
+
+template <typename Scalar>
+void test_eigen_device_matrix(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  gpu::SelfAdjointEigenSolver<Scalar> es;
+  es.compute(d_A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  auto W_gpu = es.eigenvalues();
+  Mat V = es.eigenvectors();
+
+  // Verify reconstruction.
+  Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint();
+  RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+}
+
+// ---- Recompute (reuse solver object) ----------------------------------------
+
+template <typename Scalar>
+void test_eigen_recompute(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  gpu::SelfAdjointEigenSolver<Scalar> es;
+
+  for (int trial = 0; trial < 3; ++trial) {
+    Mat R = Mat::Random(n, n);
+    Mat A = R + R.adjoint();
+    es.compute(A);
+    VERIFY_IS_EQUAL(es.info(), Success);
+
+    auto W = es.eigenvalues();
+    Mat V = es.eigenvectors();
+    Mat A_hat = V * W.asDiagonal() * V.adjoint();
+    RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm();
+    VERIFY((A_hat - A).norm() < tol);
+  }
+}
+
+// ---- Repeated eigenvalues ---------------------------------------------------
+//
+// Builds A with a degenerate eigenvalue cluster. Eigenvectors within the cluster
+// are not uniquely determined, but eigenvalues, reconstruction, and orthogonality
+// must still hold.
+
+template <typename Scalar>
+void test_eigen_repeated_values(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<typename NumTraits<Scalar>::Real, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build a unitary V via QR of a random matrix.
+  Mat seed = Mat::Random(n, n);
+  Mat V = HouseholderQR<Mat>(seed).householderQ();
+
+  // Spectrum: a 4-fold cluster at value 1, then increasing distinct values.
+  Vec eigs(n);
+  for (Index i = 0; i < n; ++i) {
+    eigs(i) = (i < 4) ? RealScalar(1) : RealScalar(i);
+  }
+  Mat A = V * eigs.asDiagonal() * V.adjoint();
+  Mat A_sym = (A + A.adjoint()) * RealScalar(0.5);  // symmetrize numerically
+
+  gpu::SelfAdjointEigenSolver<Scalar> es(A_sym);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  // Eigenvalues must match the constructed spectrum (cuSOLVER returns ascending).
+  Vec W_gpu = es.eigenvalues();
+  Vec W_expected = eigs;
+  std::sort(W_expected.data(), W_expected.data() + n);
+
+  RealScalar tol_w =
+      RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_expected.cwiseAbs().maxCoeff();
+  VERIFY((W_gpu - W_expected).norm() < tol_w);
+
+  // Reconstruction must hold even with a degenerate cluster (eigenvectors form a
+  // valid orthonormal basis for the cluster's invariant subspace).
+  Mat V_gpu = es.eigenvectors();
+  Mat A_hat = V_gpu * W_gpu.asDiagonal() * V_gpu.adjoint();
+  RealScalar tol = RealScalar(20) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A_sym).norm() < tol);
+
+  // Orthogonality of computed eigenvectors must hold.
+  VERIFY((V_gpu.adjoint() * V_gpu - Mat::Identity(n, n)).norm() < tol);
+}
+
+// ---- Device-side accessors --------------------------------------------------
+//
+// d_eigenvalues / d_eigenvectors return non-owning views over the solver's
+// internal d_W_ / d_A_ buffers. Verify they agree with the host accessors and
+// that repeated invocations leave the solver state intact.
+
+template <typename Scalar>
+void test_eigen_device_accessors(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> es(A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  auto d_W = es.d_eigenvalues();
+  auto d_V = es.d_eigenvectors();
+
+  VERIFY_IS_APPROX(d_W.toHost(), es.eigenvalues());
+  VERIFY_IS_APPROX(d_V.toHost(), es.eigenvectors());
+
+  // Re-fetch must keep working (view destruction must not free the underlying buffers).
+  (void)es.d_eigenvalues();
+  (void)es.d_eigenvectors();
+  VERIFY_IS_APPROX(es.eigenvalues(), d_W.toHost());
+  VERIFY_IS_APPROX(es.eigenvectors(), d_V.toHost());
+}
+
+// ---- Chain device views into a downstream cuBLAS GEMM (no D2D copy) ---------
+
+template <typename Scalar>
+void test_eigen_chain_orthogonality(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> es(A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  // V^H * V = I — V is the device view of d_A_; the GEMM consumes the view
+  // without an intervening D2D copy.
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_VtV;
+  {
+    auto d_V = es.d_eigenvectors();
+    d_VtV.device(ctx) = d_V.adjoint() * d_V;
+  }
+  Mat VtV = d_VtV.toHost();
+  RealScalar tol = RealScalar(20) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((VtV - Mat::Identity(n, n)).norm() < tol);
+
+  // After view destruction, the solver's state must remain valid.
+  Mat V_again = es.eigenvectors();
+  VERIFY_IS_EQUAL(V_again.rows(), n);
+}
+
+// ---- Move support -------------------------------------------------------------
+
+template <typename Scalar>
+void test_eigen_move(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat R = Mat::Random(n, n);
+  Mat A = R + R.adjoint();
+
+  gpu::SelfAdjointEigenSolver<Scalar> es(A);
+  VERIFY_IS_EQUAL(es.info(), Success);
+
+  gpu::SelfAdjointEigenSolver<Scalar> moved(std::move(es));
+  VERIFY_IS_EQUAL(moved.info(), Success);
+  Mat V = moved.eigenvectors();
+  auto W = moved.eigenvalues();
+  RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((V * W.asDiagonal() * V.adjoint() - A).norm() < tol);
+
+  gpu::SelfAdjointEigenSolver<Scalar> assigned;
+  assigned = std::move(moved);
+  VERIFY_IS_EQUAL(assigned.info(), Success);
+  V = assigned.eigenvectors();
+  W = assigned.eigenvalues();
+  VERIFY((V * W.asDiagonal() * V.adjoint() - A).norm() < tol);
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+void test_eigen_empty() {
+  gpu::SelfAdjointEigenSolver<double> es(MatrixXd(0, 0));
+  VERIFY_IS_EQUAL(es.info(), Success);
+  VERIFY_IS_EQUAL(es.rows(), 0);
+  VERIFY_IS_EQUAL(es.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Reconstruction + orthogonality.
+  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64));
+  CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128));
+
+  // Eigenvalues match CPU.
+  CALL_SUBTEST(test_eigen_values<Scalar>(64));
+  CALL_SUBTEST(test_eigen_values<Scalar>(128));
+
+  // Values-only mode.
+  CALL_SUBTEST(test_eigen_values_only<Scalar>(64));
+
+  // DeviceMatrix input.
+  CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64));
+
+  // Recompute.
+  CALL_SUBTEST(test_eigen_recompute<Scalar>(32));
+
+  // Repeated eigenvalues (degenerate cluster).
+  CALL_SUBTEST(test_eigen_repeated_values<Scalar>(64));
+
+  // Device accessors.
+  CALL_SUBTEST(test_eigen_device_accessors<Scalar>(64));
+
+  // Chain device view into a downstream GEMM.
+  CALL_SUBTEST(test_eigen_chain_orthogonality<Scalar>(64));
+
+  // Move constructor/assignment.
+  CALL_SUBTEST(test_eigen_move<Scalar>(32));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_eigen) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_eigen_empty());
+}
diff --git a/unsupported/test/GPU/cusolver_llt.cpp b/unsupported/test/GPU/cusolver_llt.cpp
index 9e763af..772333b 100644
--- a/unsupported/test/GPU/cusolver_llt.cpp
+++ b/unsupported/test/GPU/cusolver_llt.cpp
@@ -106,6 +106,17 @@
   VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue);
 }
 
+// solve(DeviceMatrix) must not silently return garbage when the factorization
+// failed: it must sync the info word and assert just like solve(MatrixBase).
+void test_not_spd_device_solve_asserts() {
+  Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8);
+  Eigen::MatrixXd h_B = Eigen::MatrixXd::Random(8, 4);
+  Eigen::gpu::LLT<double> gpu_llt(h_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue);
+  auto d_B = Eigen::gpu::DeviceMatrix<double>::fromHost(h_B);
+  VERIFY_RAISES_ASSERT(gpu_llt.solve(d_B));
+}
+
 // ---- DeviceMatrix-native API --------------------------------------------
 // These tests exercise the device-resident path: compute(DeviceMatrix) +
 // solve(DeviceMatrix) -> DeviceMatrix, with the user explicitly managing
@@ -218,4 +229,5 @@
   CALL_SUBTEST_3(test_scalar<std::complex<float>>());
   CALL_SUBTEST_4(test_scalar<std::complex<double>>());
   CALL_SUBTEST_5(test_not_spd());
+  CALL_SUBTEST_5(test_not_spd_device_solve_asserts());
 }
diff --git a/unsupported/test/GPU/cusolver_lu.cpp b/unsupported/test/GPU/cusolver_lu.cpp
index 8c4e4be..195f6d1 100644
--- a/unsupported/test/GPU/cusolver_lu.cpp
+++ b/unsupported/test/GPU/cusolver_lu.cpp
@@ -105,6 +105,17 @@
   VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
 }
 
+// solve(DeviceMatrix) must not silently return garbage when the factorization
+// failed: it must sync the info word and assert just like solve(MatrixBase).
+void test_singular_device_solve_asserts() {
+  MatrixXd A = MatrixXd::Zero(8, 8);
+  MatrixXd B = MatrixXd::Random(8, 4);
+  gpu::LU<double> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
+  auto d_B = gpu::DeviceMatrix<double>::fromHost(B);
+  VERIFY_RAISES_ASSERT(lu.solve(d_B));
+}
+
 // ---- DeviceMatrix integration tests -----------------------------------------
 
 template <typename Scalar>
@@ -202,4 +213,5 @@
   CALL_SUBTEST_3(test_scalar<std::complex<float>>());
   CALL_SUBTEST_4(test_scalar<std::complex<double>>());
   CALL_SUBTEST_5(test_singular());
+  CALL_SUBTEST_5(test_singular_device_solve_asserts());
 }
diff --git a/unsupported/test/GPU/cusolver_qr.cpp b/unsupported/test/GPU/cusolver_qr.cpp
new file mode 100644
index 0000000..c1f7f12
--- /dev/null
+++ b/unsupported/test/GPU/cusolver_qr.cpp
@@ -0,0 +1,309 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/QR>
+#include <Eigen/SVD>
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Solve square system: A * X = B -----------------------------------------
+
+template <typename Scalar>
+void test_qr_solve_square(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::QR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat X = qr.solve(B);
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Solve overdetermined system: m > n (least-squares) ---------------------
+
+template <typename Scalar>
+void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m >= n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  gpu::QR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat X = qr.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  // For an overdetermined system the residual r = A X - B is generally large
+  // (O(||B||) when B is not in col(A)). What backward-stable least-squares
+  // makes small is the gradient of the LS objective, A^H r, which is zero at
+  // the optimum. Higham's bound for QR-based LS gives ||A^H r|| <= O(m * eps)
+  // * ||A|| * (||A|| ||X|| + ||B||), regardless of kappa(A).
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+  RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon();
+  RealScalar A_norm = A.norm();
+  RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm());
+  RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm());
+  VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol);
+  VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol);
+}
+
+// ---- Solve with DeviceMatrix input ------------------------------------------
+
+template <typename Scalar>
+void test_qr_solve_device(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::QR<Scalar> qr;
+  qr.compute(d_A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
+  Mat X = d_X.toHost();
+
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Solve overdetermined via device path -----------------------------------
+
+template <typename Scalar>
+void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m >= n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::QR<Scalar> qr;
+  qr.compute(d_A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), nrhs);
+
+  Mat X = d_X.toHost();
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+  // See test_qr_solve_overdetermined: backward error for LS is bounded on
+  // the gradient A^H r, not on r itself.
+  RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon();
+  RealScalar A_norm = A.norm();
+  RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm());
+  RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm());
+  VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol);
+  VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol);
+}
+
+// ---- Solve underdetermined system: m < n (minimum norm) ---------------------
+
+template <typename Scalar>
+void test_qr_solve_underdetermined(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m < n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  gpu::QR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat X = qr.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  // The minimum-norm solution exactly satisfies A X = B (m equations, n > m
+  // unknowns). Backward error is O(m * eps) * ||A|| * ||X||.
+  RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol);
+
+  // Min-norm property: X ⟂ null(A), so X ∈ row space(A) = col space(A^H).
+  // ||X|| <= ||X_any|| for any other solution. Compare to the SVD min-norm reference.
+  Mat X_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
+  // Both should have the same norm (up to rounding).
+  VERIFY(numext::abs(X.norm() - X_svd.norm()) / X_svd.norm() <
+         RealScalar(50) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+template <typename Scalar>
+void test_qr_solve_underdetermined_device(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m < n);
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::QR<Scalar> qr;
+  qr.compute(d_A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), nrhs);
+
+  Mat X = d_X.toHost();
+  RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol);
+}
+
+// ---- matrixR() returns the upper-triangular factor --------------------------
+
+template <typename Scalar>
+void test_qr_matrixR(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m >= n);
+  Mat A = Mat::Random(m, n);
+  gpu::QR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  Mat R = qr.matrixR();
+  VERIFY_IS_EQUAL(R.rows(), n);
+  VERIFY_IS_EQUAL(R.cols(), n);
+
+  // Verify R is upper-triangular (lower triangle exactly zero).
+  for (Index j = 0; j < n; ++j) {
+    for (Index i = j + 1; i < n; ++i) {
+      VERIFY_IS_EQUAL(R(i, j), Scalar(0));
+    }
+  }
+
+  // Verify that the diagonal of R matches the absolute values of CPU R diagonal
+  // up to sign (Householder QR has free diagonal sign).
+  Mat R_cpu = HouseholderQR<Mat>(A).matrixQR().topRows(n).template triangularView<Upper>();
+  RealScalar tol = RealScalar(20) * RealScalar(m) * NumTraits<Scalar>::epsilon() * A.norm();
+  for (Index i = 0; i < n; ++i) {
+    VERIFY(numext::abs(numext::abs(R(i, i)) - numext::abs(R_cpu(i, i))) < tol);
+  }
+}
+
+// ---- Multiple solves reuse the factorization --------------------------------
+
+template <typename Scalar>
+void test_qr_multiple_solves(Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  gpu::QR<Scalar> qr(A);
+  VERIFY_IS_EQUAL(qr.info(), Success);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    Mat B = Mat::Random(n, 3);
+    Mat X = qr.solve(B);
+    RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+    VERIFY(residual < tol);
+  }
+}
+
+// ---- Agreement with CPU HouseholderQR ---------------------------------------
+
+template <typename Scalar>
+void test_qr_vs_cpu(Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  gpu::QR<Scalar> gpu_qr(A);
+  VERIFY_IS_EQUAL(gpu_qr.info(), Success);
+
+  Mat X_gpu = gpu_qr.solve(B);
+  Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
+
+  // Compare via residual rather than directly between X_gpu and X_cpu: for an
+  // ill-conditioned A (a random N(0,1) n*n matrix easily reaches kappa(A) ~ n),
+  // the forward error of each correct solve is bounded by O(kappa * eps), so
+  // ||X_gpu - X_cpu|| / ||X_cpu|| can legitimately exceed n*eps even though
+  // both are correct. The relative residual ||A X - B|| / (||A||*||X|| + ||B||)
+  // is bounded by Higham's backward-stable solve result at O(n * eps),
+  // independent of kappa.
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  RealScalar A_norm = A.norm();
+  RealScalar B_norm = B.norm();
+  RealScalar denom_gpu = A_norm * X_gpu.norm() + B_norm;
+  RealScalar denom_cpu = A_norm * X_cpu.norm() + B_norm;
+  VERIFY((A * X_gpu - B).norm() / denom_gpu < tol);
+  VERIFY((A * X_cpu - B).norm() / denom_cpu < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
+
+  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
+
+  CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 4));
+  CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 1));
+  CALL_SUBTEST(test_qr_solve_underdetermined_device<Scalar>(64, 128, 4));
+
+  CALL_SUBTEST(test_qr_matrixR<Scalar>(64, 64));
+  CALL_SUBTEST(test_qr_matrixR<Scalar>(128, 64));
+
+  CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
+  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
+  CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
+}
+
+void test_qr_empty() {
+  gpu::QR<double> qr(MatrixXd(0, 0));
+  VERIFY_IS_EQUAL(qr.info(), Success);
+  VERIFY_IS_EQUAL(qr.rows(), 0);
+  VERIFY_IS_EQUAL(qr.cols(), 0);
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_qr_empty());
+}
diff --git a/unsupported/test/GPU/cusolver_svd.cpp b/unsupported/test/GPU/cusolver_svd.cpp
new file mode 100644
index 0000000..5d756b1
--- /dev/null
+++ b/unsupported/test/GPU/cusolver_svd.cpp
@@ -0,0 +1,431 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+// SPDX-License-Identifier: MPL-2.0
+
+// Tests for GpuSVD: GPU SVD via cuSOLVER.
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/SVD>
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------
+
+template <typename Scalar, unsigned int Options>
+void test_svd_reconstruction(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, Options);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto S = svd.singularValues();
+  Mat U = svd.matrixU();
+  Mat VT = svd.matrixVT();
+
+  const Index k = (std::min)(m, n);
+
+  // Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:].
+  Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+
+  // Orthogonality: U^H * U ≈ I.
+  Mat UtU = U.adjoint() * U;
+  Mat I_u = Mat::Identity(U.cols(), U.cols());
+  VERIFY((UtU - I_u).norm() < tol);
+
+  // Orthogonality: VT * VT^H ≈ I.
+  Mat VtVh = VT * VT.adjoint();
+  Mat I_v = Mat::Identity(VT.rows(), VT.rows());
+  VERIFY((VtVh - I_v).norm() < tol);
+}
+
+// ---- Singular values match CPU BDCSVD ---------------------------------------
+
+template <typename Scalar>
+void test_svd_singular_values(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, 0);  // values only
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto S_gpu = svd.singularValues();
+  auto S_cpu = BDCSVD<Mat>(A, 0).singularValues();
+
+  // Weyl's perturbation bound (Higham, Accuracy and Stability of Numerical
+  // Algorithms, 2nd ed., §10.2.3): |σ_i(A) - σ_i(A+δA)| ≤ ||δA||. Both cuSOLVER
+  // and BDCSVD have backward error ||δA|| / ||A|| ≤ p(m,n) · u; the difference
+  // of the two computed singular value vectors is bounded by 2·p(m,n)·u·S_max,
+  // and √(min) · ||·||_∞ ≥ ||·||_2. Across 5k trials per case in
+  // {float, double, complex<float>, complex<double>} × {(64,64), (128,64)},
+  // worst observed err / (√min · u · S_max) is ≈ 6.2; we use 12 for headroom
+  // against future runs hitting the tail of the distribution.
+  RealScalar tol =
+      RealScalar(12) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0);
+  VERIFY((S_gpu - S_cpu).norm() < tol);
+}
+
+// ---- Solve: pseudoinverse ---------------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve(Index m, Index n, Index nrhs) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, nrhs);
+
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  Mat X = svd.solve(B);
+  VERIFY_IS_EQUAL(X.rows(), n);
+  VERIFY_IS_EQUAL(X.cols(), nrhs);
+
+  // Compare with CPU BDCSVD solve. Wedin's perturbation theorem (Higham,
+  // Accuracy and Stability of Numerical Algorithms, 2nd ed., §20.1) bounds
+  // the forward error of a backward-stable SVD-based pseudoinverse solve by
+  // c · κ(A) · u with c = O(1). Comparing two such solvers doubles the
+  // constant. Across 6k trials over {float, double, complex<float>,
+  // complex<double>} and {square, over-/underdetermined} shapes, the worst
+  // observed err / (κ · u) is 5.3.
+  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
+  Mat X_cpu = cpu_svd.solve(B);
+  auto S = cpu_svd.singularValues();
+  const RealScalar cond = S(0) / S(S.size() - 1);
+  const RealScalar tol = RealScalar(8) * cond * NumTraits<Scalar>::epsilon();
+  VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol);
+}
+
+// ---- Solve: truncated -------------------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve_truncated(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, 1);
+  const Index k = (std::min)(m, n);
+  const Index trunc = k / 2;
+  eigen_assert(trunc > 0);
+
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  Mat X_trunc = svd.solve(B, trunc);
+
+  // Build CPU reference: truncated pseudoinverse.
+  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
+  auto S = cpu_svd.singularValues();
+  Mat U = cpu_svd.matrixU();
+  Mat V = cpu_svd.matrixV();
+
+  // D_ii = 1/S_i for i < trunc, 0 otherwise.
+  Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k);
+  for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i);
+  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
+
+  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol);
+}
+
+// ---- Solve: Tikhonov regularized --------------------------------------------
+
+template <typename Scalar>
+void test_svd_solve_regularized(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  Mat B = Mat::Random(m, 1);
+  RealScalar lambda = RealScalar(0.1);
+  const Index k = (std::min)(m, n);
+
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  Mat X_reg = svd.solve(B, lambda);
+
+  // CPU reference: D_ii = S_i / (S_i^2 + lambda^2).
+  auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV);
+  auto S = cpu_svd.singularValues();
+  Mat U = cpu_svd.matrixU();
+  Mat V = cpu_svd.matrixV();
+
+  Matrix<RealScalar, Dynamic, 1> D(k);
+  for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda);
+  Mat X_ref = V * D.asDiagonal() * U.adjoint() * B;
+
+  RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol);
+}
+
+// ---- Solve: rank-deficient (exercise drop_threshold pseudoinverse) ----------
+
+template <typename Scalar>
+void test_svd_solve_rank_deficient(Index m, Index n, Index r) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using Vec = Matrix<typename NumTraits<Scalar>::Real, Dynamic, 1>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(r > 0 && r < (std::min)(m, n));
+
+  // Build A of rank exactly r: A = U[:,:r] * diag(s) * V[:,:r]^H.
+  Mat U_seed = Mat::Random(m, m);
+  Mat U_full = HouseholderQR<Mat>(U_seed).householderQ();
+  Mat V_seed = Mat::Random(n, n);
+  Mat V_full = HouseholderQR<Mat>(V_seed).householderQ();
+  Vec sigma(r);
+  for (Index i = 0; i < r; ++i) sigma(i) = RealScalar(1) + RealScalar(i);  // distinct, well-spaced
+  Mat A = U_full.leftCols(r) * sigma.asDiagonal() * V_full.leftCols(r).adjoint();
+
+  Mat B = Mat::Random(m, 1);
+
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+  Mat X_gpu = svd.solve(B);
+
+  // Reference: CPU BDCSVD with the same drop_threshold semantics (its default solve()
+  // also drops near-zero singular values relative to S(0) * (m, n)_max * epsilon).
+  Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
+
+  // Both should produce the minimum-norm least-squares solution; compare via the
+  // "useful" residual A^H r: zero up to the rank-r component plus rounding.
+  RealScalar tol = RealScalar(50) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon();
+  VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol);
+
+  // Norm of X should also be minimum (X has no component in null(A) up to rounding).
+  VERIFY(numext::abs(X_gpu.norm() - X_cpu.norm()) / X_cpu.norm() < tol);
+}
+
+// ---- Reconstruction: full U / full V on a wide matrix (m < n) ---------------
+//
+// Exercises the transposed-internal-representation path through gesvd combined
+// with the matrixU/matrixVT swap logic.
+
+template <typename Scalar>
+void test_svd_reconstruction_full_wide(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m < n);
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, ComputeFullU | ComputeFullV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto S = svd.singularValues();
+  Mat U = svd.matrixU();    // m × m
+  Mat VT = svd.matrixVT();  // n × n
+  Mat V = svd.matrixV();    // n × n (matrixV alias should match matrixVT().adjoint())
+
+  VERIFY_IS_EQUAL(U.rows(), m);
+  VERIFY_IS_EQUAL(U.cols(), m);
+  VERIFY_IS_EQUAL(VT.rows(), n);
+  VERIFY_IS_EQUAL(VT.cols(), n);
+  VERIFY_IS_EQUAL(V.rows(), n);
+  VERIFY_IS_EQUAL(V.cols(), n);
+  VERIFY_IS_APPROX(V, VT.adjoint());
+
+  const Index k = (std::min)(m, n);
+  Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k);
+  RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm();
+  VERIFY((A_hat - A).norm() < tol);
+
+  Mat UtU = U.adjoint() * U;
+  VERIFY((UtU - Mat::Identity(m, m)).norm() < tol);
+  Mat VtVh = VT * VT.adjoint();
+  VERIFY((VtVh - Mat::Identity(n, n)).norm() < tol);
+}
+
+// ---- Device-side accessors --------------------------------------------------
+//
+// Verify that d_singularValues / d_matrixU / d_matrixVT return views over the
+// SVD's internal storage that agree with the host accessors after toHost().
+
+template <typename Scalar>
+void test_svd_device_accessors(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto d_S = svd.d_singularValues();
+  auto d_U = svd.d_matrixU();
+  auto d_VT = svd.d_matrixVT();
+
+  Matrix<RealScalar, Dynamic, 1> S_host = d_S.toHost();
+  Mat U_host = d_U.toHost();
+  Mat VT_host = d_VT.toHost();
+
+  VERIFY_IS_APPROX(S_host, svd.singularValues());
+  VERIFY_IS_APPROX(U_host, svd.matrixU());
+  VERIFY_IS_APPROX(VT_host, svd.matrixVT());
+
+  // Multiple invocations must keep returning views without disturbing the SVD's state:
+  // call again, then verify the SVD's host accessors still produce correct results.
+  (void)svd.d_matrixU();
+  (void)svd.d_matrixVT();
+  VERIFY_IS_APPROX(svd.matrixU(), U_host);
+  VERIFY_IS_APPROX(svd.matrixVT(), VT_host);
+}
+
+template <typename Scalar>
+void test_svd_device_accessors_full_wide(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  eigen_assert(m < n);
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, ComputeFullU | ComputeFullV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  auto d_S = svd.d_singularValues();
+  auto d_U = svd.d_matrixU();
+  auto d_VT = svd.d_matrixVT();
+
+  VERIFY_IS_EQUAL(d_U.rows(), m);
+  VERIFY_IS_EQUAL(d_U.cols(), m);
+  VERIFY_IS_EQUAL(d_VT.rows(), n);
+  VERIFY_IS_EQUAL(d_VT.cols(), n);
+
+  Matrix<RealScalar, Dynamic, 1> S_host = d_S.toHost();
+  Mat U_host = d_U.toHost();
+  Mat VT_host = d_VT.toHost();
+
+  VERIFY_IS_APPROX(S_host, svd.singularValues());
+  VERIFY_IS_APPROX(U_host, svd.matrixU());
+  VERIFY_IS_APPROX(VT_host, svd.matrixVT());
+}
+
+// ---- Chain device views into a downstream cuBLAS GEMM (no D2D copy) ---------
+//
+// d_matrixU() returns a non-owning view over the SVD's internal d_U_ buffer.
+// Feeding the view straight into a Context-driven GEMM exercises cross-stream
+// event sync and confirms the borrow-deleter does not double-free on temporary
+// destruction.
+
+template <typename Scalar>
+void test_svd_chain_orthogonality(Index m, Index n) {
+  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, n);
+  gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  const Index k = (std::min)(m, n);
+
+  // U^H * U = I_k, all on device, no host roundtrip on U.
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_UtU;
+  {
+    auto d_U = svd.d_matrixU();
+    d_UtU.device(ctx) = d_U.adjoint() * d_U;
+  }
+  Mat UtU = d_UtU.toHost();
+  RealScalar tol = RealScalar(20) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon();
+  VERIFY((UtU - Mat::Identity(k, k)).norm() < tol);
+
+  // VT * VT^H = I_k.
+  gpu::DeviceMatrix<Scalar> d_VVt;
+  {
+    auto d_VT = svd.d_matrixVT();
+    d_VVt.device(ctx) = d_VT * d_VT.adjoint();
+  }
+  Mat VVt = d_VVt.toHost();
+  VERIFY((VVt - Mat::Identity(k, k)).norm() < tol);
+
+  // After view destruction, SVD's state must remain valid (the deleter is a no-op
+  // for views, so the underlying d_U_ / d_VT_ are not freed).
+  Mat U_again = svd.matrixU();
+  Mat VT_again = svd.matrixVT();
+  VERIFY_IS_EQUAL(U_again.rows(), m);
+  VERIFY_IS_EQUAL(VT_again.cols(), n);
+}
+
+// ---- Empty matrix -----------------------------------------------------------
+
+void test_svd_empty() {
+  gpu::SVD<double> svd(MatrixXd(0, 0), 0);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+  VERIFY_IS_EQUAL(svd.rows(), 0);
+  VERIFY_IS_EQUAL(svd.cols(), 0);
+
+  svd.compute(MatrixXd::Random(4, 6), ComputeThinU | ComputeThinV);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+
+  svd.compute(MatrixXd(0, 7), 0);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+  VERIFY_IS_EQUAL(svd.rows(), 0);
+  VERIFY_IS_EQUAL(svd.cols(), 7);
+  VERIFY_IS_EQUAL(svd.singularValues().size(), 0);
+
+  svd.compute(MatrixXd(5, 0), 0);
+  VERIFY_IS_EQUAL(svd.info(), Success);
+  VERIFY_IS_EQUAL(svd.rows(), 5);
+  VERIFY_IS_EQUAL(svd.cols(), 0);
+  VERIFY_IS_EQUAL(svd.singularValues().size(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Reconstruction + orthogonality (thin and full, identical test logic).
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128)));  // wide (m < n)
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64)));
+  CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64)));
+
+  // Singular values.
+  CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64));
+
+  // Solve.
+  CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4));
+  CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4));
+  CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4));  // wide (m < n)
+
+  // Truncated and regularized solve.
+  CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64));
+
+  // Rank-deficient solve (exercises drop_threshold pseudoinverse).
+  CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(64, 64, 32));
+  CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(96, 64, 16));
+  CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(64, 96, 16));
+
+  // Wide matrix with full U/V (transposed-internal path).
+  CALL_SUBTEST((test_svd_reconstruction_full_wide<Scalar>(64, 96)));
+
+  // Device accessors.
+  CALL_SUBTEST(test_svd_device_accessors<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_device_accessors<Scalar>(96, 64));
+  CALL_SUBTEST(test_svd_device_accessors<Scalar>(64, 96));
+  CALL_SUBTEST(test_svd_device_accessors_full_wide<Scalar>(64, 96));
+
+  // Chain device views into a downstream GEMM (orthogonality check).
+  CALL_SUBTEST(test_svd_chain_orthogonality<Scalar>(64, 64));
+  CALL_SUBTEST(test_svd_chain_orthogonality<Scalar>(96, 64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_svd) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_svd_empty());
+}
diff --git a/unsupported/test/GPU/device_matrix.cpp b/unsupported/test/GPU/device_matrix.cpp
index 4070f49..ac69983 100644
--- a/unsupported/test/GPU/device_matrix.cpp
+++ b/unsupported/test/GPU/device_matrix.cpp
@@ -36,7 +36,6 @@
   VERIFY(!dm.empty());
   VERIFY_IS_EQUAL(dm.rows(), rows);
   VERIFY_IS_EQUAL(dm.cols(), cols);
-  VERIFY_IS_EQUAL(dm.outerStride(), rows);
   VERIFY(dm.data() != nullptr);
   VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
 }
@@ -70,7 +69,7 @@
   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
 
   // Async upload from raw pointer.
-  auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream);
   VERIFY_IS_EQUAL(dm.rows(), rows);
   VERIFY_IS_EQUAL(dm.cols(), cols);
 
@@ -88,7 +87,7 @@
   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&producer_stream));
   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&consumer_stream));
 
-  auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, producer_stream);
+  auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, producer_stream);
   auto cross_stream_transfer = cross_stream_dm.toHostAsync(consumer_stream);
   MatrixType cross_stream_result = cross_stream_transfer.get();
   VERIFY_IS_APPROX(cross_stream_result, host);
@@ -199,7 +198,6 @@
   dm.resize(50, 30);
   VERIFY_IS_EQUAL(dm.rows(), 50);
   VERIFY_IS_EQUAL(dm.cols(), 30);
-  VERIFY_IS_EQUAL(dm.outerStride(), 50);
   VERIFY(dm.data() != nullptr);
 
   // Resize to same dimensions is a no-op.