Eigen/GPU [3/5]: Add dense cuSOLVER solvers (QR, SVD, EigenSolver) libeigen/eigen!2413 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com> Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU index f5d20ba..4c15d7d 100644 --- a/unsupported/Eigen/GPU +++ b/unsupported/Eigen/GPU
@@ -47,8 +47,12 @@ #include "src/GPU/DeviceBlasExpr.h" #include "src/GPU/DeviceSolverExpr.h" #include "src/GPU/DeviceDispatch.h" +#include "src/GPU/GpuSolverContext.h" #include "src/GPU/GpuLLT.h" #include "src/GPU/GpuLU.h" +#include "src/GPU/GpuQR.h" +#include "src/GPU/GpuSVD.h" +#include "src/GPU/GpuEigenSolver.h" // IWYU pragma: end_exports #endif
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h index ae30553..994d95a 100644 --- a/unsupported/Eigen/src/GPU/CuBlasSupport.h +++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -163,6 +163,38 @@ reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc); } +// GEAM wrappers: C = alpha * op(A) + beta * op(B) +inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + const float* alpha, const float* A, int lda, const float* beta, const float* B, + int ldb, float* C, int ldc) { + return cublasSgeam(h, transA, transB, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} +inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + const double* alpha, const double* A, int lda, const double* beta, const double* B, + int ldb, double* C, int ldc) { + return cublasDgeam(h, transA, transB, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} +inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + const std::complex<float>* alpha, const std::complex<float>* A, int lda, + const std::complex<float>* beta, const std::complex<float>* B, int ldb, + std::complex<float>* C, int ldc) { + cuComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasCgeam(h, transA, transB, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda, &b, + reinterpret_cast<const cuComplex*>(B), ldb, reinterpret_cast<cuComplex*>(C), ldc); +} +inline cublasStatus_t cublasXgeam(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n, + const std::complex<double>* alpha, const std::complex<double>* A, int lda, + const std::complex<double>* beta, const std::complex<double>* B, int ldb, + std::complex<double>* C, int ldc) { + cuDoubleComplex a, b; + std::memcpy(&a, alpha, sizeof(a)); + std::memcpy(&b, beta, sizeof(b)); + return cublasZgeam(h, transA, transB, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda, &b, + reinterpret_cast<const cuDoubleComplex*>(B), ldb, reinterpret_cast<cuDoubleComplex*>(C), ldc); +} + // SYRK wrappers (real → syrk, complex → herk) inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k, const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) { @@ -186,6 +218,28 @@ reinterpret_cast<cuDoubleComplex*>(C), ldc); } +// DGMM wrappers: C = A * diag(x) (side=RIGHT) or C = diag(x) * A (side=LEFT). +// Useful for applying a diagonal scaling without materialising diag(x) as a +// dense matrix. cuBLAS docs guarantee in-place is safe when C == A. +inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const float* A, int lda, + const float* x, int incx, float* C, int ldc) { + return cublasSdgmm(h, side, m, n, A, lda, x, incx, C, ldc); +} +inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const double* A, int lda, + const double* x, int incx, double* C, int ldc) { + return cublasDdgmm(h, side, m, n, A, lda, x, incx, C, ldc); +} +inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const std::complex<float>* A, + int lda, const std::complex<float>* x, int incx, std::complex<float>* C, int ldc) { + return cublasCdgmm(h, side, m, n, reinterpret_cast<const cuComplex*>(A), lda, reinterpret_cast<const cuComplex*>(x), + incx, reinterpret_cast<cuComplex*>(C), ldc); +} +inline cublasStatus_t cublasXdgmm(cublasHandle_t h, cublasSideMode_t side, int m, int n, const std::complex<double>* A, + int lda, const std::complex<double>* x, int incx, std::complex<double>* C, int ldc) { + return cublasZdgmm(h, side, m, n, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<const cuDoubleComplex*>(x), incx, reinterpret_cast<cuDoubleComplex*>(C), ldc); +} + } // namespace internal } // namespace gpu } // namespace Eigen
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h index 7f8c33d..41f7b31 100644 --- a/unsupported/Eigen/src/GPU/CuSolverSupport.h +++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -124,6 +124,68 @@ static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER; }; +// ---- Type-specific cuSOLVER wrappers ---------------------------------------- +// cuSOLVER does not provide generic X variants for ormqr/unmqr. These overloaded +// wrappers dispatch to the correct type-specific function. +// For real types: ormqr (orthogonal Q). For complex types: unmqr (unitary Q). + +inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m, + int n, int k, const float* A, int lda, const float* tau, float* C, int ldc, + float* work, int lwork, int* info) { + return cusolverDnSormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info); +} +inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m, + int n, int k, const double* A, int lda, const double* tau, double* C, int ldc, + double* work, int lwork, int* info) { + return cusolverDnDormqr(h, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, info); +} +inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m, + int n, int k, const std::complex<float>* A, int lda, + const std::complex<float>* tau, std::complex<float>* C, int ldc, + std::complex<float>* work, int lwork, int* info) { + return cusolverDnCunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda, + reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<cuComplex*>(C), ldc, + reinterpret_cast<cuComplex*>(work), lwork, info); +} +inline cusolverStatus_t cusolverDnXormqr(cusolverDnHandle_t h, cublasSideMode_t side, cublasOperation_t trans, int m, + int n, int k, const std::complex<double>* A, int lda, + const std::complex<double>* tau, std::complex<double>* C, int ldc, + std::complex<double>* work, int lwork, int* info) { + return cusolverDnZunmqr(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<const cuDoubleComplex*>(tau), reinterpret_cast<cuDoubleComplex*>(C), ldc, + reinterpret_cast<cuDoubleComplex*>(work), lwork, info); +} + +// Buffer size wrappers for ormqr/unmqr. +inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side, + cublasOperation_t trans, int m, int n, int k, const float* A, + int lda, const float* tau, const float* C, int ldc, int* lwork) { + return cusolverDnSormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} +inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side, + cublasOperation_t trans, int m, int n, int k, const double* A, + int lda, const double* tau, const double* C, int ldc, int* lwork) { + return cusolverDnDormqr_bufferSize(h, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} +inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side, + cublasOperation_t trans, int m, int n, int k, + const std::complex<float>* A, int lda, + const std::complex<float>* tau, const std::complex<float>* C, + int ldc, int* lwork) { + return cusolverDnCunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuComplex*>(A), lda, + reinterpret_cast<const cuComplex*>(tau), reinterpret_cast<const cuComplex*>(C), + ldc, lwork); +} +inline cusolverStatus_t cusolverDnXormqr_bufferSize(cusolverDnHandle_t h, cublasSideMode_t side, + cublasOperation_t trans, int m, int n, int k, + const std::complex<double>* A, int lda, + const std::complex<double>* tau, const std::complex<double>* C, + int ldc, int* lwork) { + return cusolverDnZunmqr_bufferSize(h, side, trans, m, n, k, reinterpret_cast<const cuDoubleComplex*>(A), lda, + reinterpret_cast<const cuDoubleComplex*>(tau), + reinterpret_cast<const cuDoubleComplex*>(C), ldc, lwork); +} + } // namespace internal } // namespace gpu } // namespace Eigen
diff --git a/unsupported/Eigen/src/GPU/DeviceDispatch.h b/unsupported/Eigen/src/GPU/DeviceDispatch.h index 82c9112..8c6465c 100644 --- a/unsupported/Eigen/src/GPU/DeviceDispatch.h +++ b/unsupported/Eigen/src/GPU/DeviceDispatch.h
@@ -8,12 +8,6 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. // SPDX-License-Identifier: MPL-2.0 -// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls. -// -// dispatch_gemm() — GemmExpr → cublasXgemm -// -// Each function documents the exact library call and parameters. - #ifndef EIGEN_GPU_DEVICE_DISPATCH_H #define EIGEN_GPU_DEVICE_DISPATCH_H @@ -37,9 +31,6 @@ return a.data() != nullptr && a.data() == b.data(); } -// ---- GEMM dispatch ---------------------------------------------------------- -// GemmExpr<Lhs, Rhs> → cublasXgemm (type-specific Sgemm/Dgemm/Cgemm/Zgemm). - template <typename Lhs, typename Rhs> void dispatch_gemm( Context& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr, @@ -55,7 +46,6 @@ constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op); constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op); - // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n) const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols(); const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows(); const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows(); @@ -63,8 +53,8 @@ eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch"); - const int64_t lda = A.outerStride(); - const int64_t ldb = B.outerStride(); + const int64_t lda = A.rows(); + const int64_t ldb = B.rows(); eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix GEMM destination aliases lhs operand"); eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix GEMM destination aliases rhs operand"); @@ -77,7 +67,7 @@ if (resized) { dst.resize(m, n); } - const int64_t ldc = dst.outerStride(); + const int64_t ldc = dst.rows(); Scalar alpha_local = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs()); @@ -91,6 +81,10 @@ eigen_assert(m <= INT_MAX && n <= INT_MAX && k <= INT_MAX && lda <= INT_MAX && ldb <= INT_MAX && ldc <= INT_MAX && "cublasXgemm dimensions exceed int range"); + // cuBLAS reads alpha and beta through host pointers. Store them in an array + // to prevent the compiler from eliding their stack slots — clang and MSVC + // at -O1+ otherwise optimise away the stores for complex types, leaving + // cuBLAS with a dangling pointer. Scalar scalars[2] = {alpha_local, beta_val}; EIGEN_CUBLAS_CHECK(cublasXgemm(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n), static_cast<int>(k), &scalars[0], A.data(), static_cast<int>(lda), B.data(), @@ -99,8 +93,6 @@ dst.recordReady(ctx.stream()); } -// ---- LLT solve dispatch ----------------------------------------------------- - template <typename Scalar, int UpLo> void dispatch_llt_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) { const DeviceMatrix<Scalar>& A = expr.matrix(); @@ -124,8 +116,8 @@ constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo>::value; - const int64_t lda = static_cast<int64_t>(A.outerStride()); - const int64_t ldb = static_cast<int64_t>(B.outerStride()); + const int64_t lda = static_cast<int64_t>(A.rows()); + const int64_t ldb = static_cast<int64_t>(B.rows()); const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); @@ -161,8 +153,9 @@ EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, n, nrhs, dtype, d_factor.get(), lda, - dtype, dst.data(), static_cast<int64_t>(dst.outerStride()), d_info_potrs)); + dtype, dst.data(), static_cast<int64_t>(dst.rows()), d_info_potrs)); + // Workspace locals must outlive the async kernels — sync before they unwind. EIGEN_CUDA_RUNTIME_CHECK( cudaMemcpyAsync(&info_words[1], d_info_potrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream())); @@ -172,8 +165,6 @@ dst.recordReady(ctx.stream()); } -// ---- LU solve dispatch ------------------------------------------------------ - template <typename Scalar> void dispatch_lu_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) { const DeviceMatrix<Scalar>& A = expr.matrix(); @@ -196,8 +187,8 @@ if (!dst.empty()) dst.waitReady(ctx.stream()); constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value; - const int64_t lda = static_cast<int64_t>(A.outerStride()); - const int64_t ldb = static_cast<int64_t>(B.outerStride()); + const int64_t lda = static_cast<int64_t>(A.rows()); + const int64_t ldb = static_cast<int64_t>(B.rows()); const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar); const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar); const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t); @@ -207,7 +198,6 @@ DeviceBuffer d_ipiv(ipiv_bytes); - // See dispatch_llt_solve: two info slots + single end-of-chain sync. PinnedHostBuffer h_info(2 * sizeof(int)); int* info_words = static_cast<int*>(h_info.get()); @@ -234,8 +224,9 @@ EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, n, nrhs, dtype, d_lu.get(), lda, static_cast<const int64_t*>(d_ipiv.get()), dtype, dst.data(), - static_cast<int64_t>(dst.outerStride()), d_info_getrs)); + static_cast<int64_t>(dst.rows()), d_info_getrs)); + // Workspace locals must outlive the async kernels — sync before they unwind. EIGEN_CUDA_RUNTIME_CHECK( cudaMemcpyAsync(&info_words[1], d_info_getrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream())); @@ -245,8 +236,6 @@ dst.recordReady(ctx.stream()); } -// ---- TRSM dispatch ---------------------------------------------------------- - template <typename Scalar, int UpLo> void dispatch_trsm(Context& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) { const DeviceMatrix<Scalar>& A = expr.matrix(); @@ -255,8 +244,7 @@ eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix"); eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size"); - eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && - "cublasXtrsm dimensions exceed int range"); + eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && "cublasXtrsm dimensions exceed int range"); const int n = static_cast<int>(A.rows()); const int nrhs = static_cast<int>(B.cols()); @@ -274,21 +262,19 @@ if (!dst.empty()) dst.waitReady(ctx.stream()); dst.resize(n, B.cols()); - const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar); + const size_t rhs_bytes = static_cast<size_t>(dst.rows()) * static_cast<size_t>(nrhs) * sizeof(Scalar); EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream())); constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; Scalar alpha(1); EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs, - &alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(), - static_cast<int>(dst.outerStride()))); + &alpha, A.data(), static_cast<int>(A.rows()), dst.data(), + static_cast<int>(dst.rows()))); dst.recordReady(ctx.stream()); } -// ---- SYMM/HEMM dispatch ----------------------------------------------------- - template <typename Scalar, int UpLo> void dispatch_symm(Context& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) { const DeviceMatrix<Scalar>& A = expr.matrix(); @@ -296,7 +282,7 @@ eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix"); eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size"); - eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && B.outerStride() <= INT_MAX && + eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && B.rows() <= INT_MAX && "cublasXsymm dimensions exceed int range"); const int m = static_cast<int>(A.rows()); @@ -304,7 +290,7 @@ if (m == 0 || n == 0) { if (!dst.empty()) dst.waitReady(ctx.stream()); - dst.resize(m == 0 ? 0 : m, B.cols()); + dst.resize(m, B.cols()); return; } @@ -317,25 +303,23 @@ dst.resize(m, n); constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; + // See dispatch_gemm: array prevents compiler from eliding host-pointer stack slots. Scalar scalars[2] = {Scalar(1), Scalar(0)}; EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &scalars[0], A.data(), - static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()), - &scalars[1], dst.data(), static_cast<int>(dst.outerStride()))); + static_cast<int>(A.rows()), B.data(), static_cast<int>(B.rows()), &scalars[1], + dst.data(), static_cast<int>(dst.rows()))); dst.recordReady(ctx.stream()); } -// ---- SYRK/HERK dispatch ----------------------------------------------------- - template <typename Scalar, int UpLo> void dispatch_syrk(Context& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr, typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) { using RealScalar = typename NumTraits<Scalar>::Real; const DeviceMatrix<Scalar>& A = expr.matrix(); - eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && A.outerStride() <= INT_MAX && - "cublasXsyrk dimensions exceed int range"); + eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && "cublasXsyrk dimensions exceed int range"); const int n = static_cast<int>(A.rows()); const int k = static_cast<int>(A.cols()); @@ -360,16 +344,13 @@ constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(), - static_cast<int>(A.outerStride()), &beta_val, dst.data(), - static_cast<int>(dst.outerStride()))); + static_cast<int>(A.rows()), &beta_val, dst.data(), static_cast<int>(dst.rows()))); dst.recordReady(ctx.stream()); } } // namespace internal -// ---- Assignment: d_C.device(ctx) = expr ------------------------------------ - template <typename Scalar_> class Assignment { public: @@ -432,8 +413,8 @@ Context& ctx_; }; -// ---- Out-of-line Matrix expression operator= definitions ------------------ -// Declared in DeviceMatrix.h, defined here because they need Context::threadLocal(). +// Out-of-line definitions: these depend on Context::threadLocal(), which +// requires the full Context definition unavailable in DeviceMatrix.h. template <typename Scalar_> template <typename Lhs, typename Rhs> @@ -476,7 +457,6 @@ return *this; } -// SelfAdjointView::rankUpdate — defined here because it needs Context. template <typename Scalar_, int UpLo_> void SelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) { SyrkExpr<Scalar_, UpLo_> expr(A);
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h index acc4ba5..869a5ac 100644 --- a/unsupported/Eigen/src/GPU/DeviceMatrix.h +++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -26,7 +26,7 @@ // MatrixXd X = d_X.toHost(); // download + block // // Async variants: -// auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream); +// auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, stream); // auto transfer = d_X.toHostAsync(stream); // enqueue D2H // // ... overlap with other work ... // MatrixXd X = transfer.get(); // block + retrieve @@ -191,7 +191,7 @@ DeviceMatrix() = default; /** Allocate uninitialized device memory for a rows x cols matrix. */ - DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) { + DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols) { eigen_assert(rows >= 0 && cols >= 0); size_t bytes = sizeInBytes(); if (bytes > 0) { @@ -215,13 +215,11 @@ : data_(std::move(o.data_)), rows_(o.rows_), cols_(o.cols_), - outerStride_(o.outerStride_), ready_event_(o.ready_event_), ready_stream_(o.ready_stream_), retained_buffer_(std::move(o.retained_buffer_)) { o.rows_ = 0; o.cols_ = 0; - o.outerStride_ = 0; o.ready_event_ = nullptr; o.ready_stream_ = nullptr; } @@ -232,13 +230,11 @@ data_ = std::move(o.data_); rows_ = o.rows_; cols_ = o.cols_; - outerStride_ = o.outerStride_; ready_event_ = o.ready_event_; ready_stream_ = o.ready_stream_; retained_buffer_ = std::move(o.retained_buffer_); o.rows_ = 0; o.cols_ = 0; - o.outerStride_ = 0; o.ready_event_ = nullptr; o.ready_stream_ = nullptr; } @@ -276,29 +272,18 @@ * The caller must keep \p host_data alive until the transfer completes * (check via the internal event or synchronize the stream). * - * \param host_data Pointer to contiguous column-major host data. - * \param rows Number of rows. - * \param cols Number of columns. - * \param outerStride Leading dimension (>= rows). Use rows for dense. - * \param stream CUDA stream for the transfer. + * \param host_data Pointer to contiguous column-major host data. + * \param rows Number of rows. + * \param cols Number of columns. + * \param stream CUDA stream for the transfer. */ - static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride, - cudaStream_t stream) { - eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows); + static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, cudaStream_t stream) { + eigen_assert(rows >= 0 && cols >= 0); eigen_assert(host_data != nullptr || (rows == 0 || cols == 0)); DeviceMatrix dm(rows, cols); if (dm.sizeInBytes() > 0) { - // If outerStride == rows (dense), single contiguous copy. - // Otherwise, copy column by column (strided layout). - if (outerStride == rows) { - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream)); - } else { - EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_.get(), static_cast<size_t>(rows) * sizeof(Scalar), - host_data, static_cast<size_t>(outerStride) * sizeof(Scalar), - static_cast<size_t>(rows) * sizeof(Scalar), - static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream)); - } + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream)); dm.recordReady(stream); } return dm; @@ -380,7 +365,6 @@ retained_buffer_ = internal::DeviceBuffer(); rows_ = rows; cols_ = cols; - outerStride_ = rows; size_t bytes = sizeInBytes(); if (bytes > 0) { void* p = nullptr; @@ -395,11 +379,10 @@ const Scalar* data() const { return data_.get(); } Index rows() const { return rows_; } Index cols() const { return cols_; } - Index outerStride() const { return outerStride_; } bool empty() const { return rows_ == 0 || cols_ == 0; } /** Size of the device allocation in bytes. */ - size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); } + size_t sizeInBytes() const { return static_cast<size_t>(rows_) * static_cast<size_t>(cols_) * sizeof(Scalar); } // ---- Event synchronization (public for library dispatch interop) --------- @@ -493,7 +476,23 @@ dm.data_.reset(device_ptr); dm.rows_ = rows; dm.cols_ = cols; - dm.outerStride_ = rows; + return dm; + } + + /** Construct a non-owning view over an existing device pointer. + * + * The pointer is *borrowed*: destruction does not free, and the underlying + * storage must outlive this view. Use this to chain decomposition outputs + * (e.g. `svd.d_matrixU()`) into downstream cuBLAS expressions without an + * intervening D2D copy. The view supports the full read interface (toHost, + * gemm operands, adjoint(), etc.). Do not assign through a view: the borrowed + * pointer would be silently replaced and the underlying owner left intact. */ + static DeviceMatrix view(Scalar* device_ptr, Index rows, Index cols) { + DeviceMatrix dm; + dm.data_ = + std::unique_ptr<Scalar, internal::CudaFreeDeleter>(device_ptr, internal::CudaFreeDeleter{/*borrow=*/true}); + dm.rows_ = rows; + dm.cols_ = cols; return dm; } @@ -502,7 +501,6 @@ Scalar* p = data_.release(); rows_ = 0; cols_ = 0; - outerStride_ = 0; if (ready_event_) { (void)cudaEventDestroy(ready_event_); ready_event_ = nullptr; @@ -527,7 +525,6 @@ std::unique_ptr<Scalar, internal::CudaFreeDeleter> data_; Index rows_ = 0; Index cols_ = 0; - Index outerStride_ = 0; cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip) internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
diff --git a/unsupported/Eigen/src/GPU/GpuEigenSolver.h b/unsupported/Eigen/src/GPU/GpuEigenSolver.h new file mode 100644 index 0000000..e863390 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuEigenSolver.h
@@ -0,0 +1,219 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU self-adjoint eigenvalue decomposition using cuSOLVER. +// +// Wraps cusolverDnXsyevd (symmetric/Hermitian divide-and-conquer). +// Stores eigenvalues and eigenvectors on device. +// +// Usage: +// SelfAdjointEigenSolver<double> es(A); +// VectorXd eigenvals = es.eigenvalues(); +// MatrixXd eigenvecs = es.eigenvectors(); + +#ifndef EIGEN_GPU_EIGENSOLVER_H +#define EIGEN_GPU_EIGENSOLVER_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSolverContext.h" + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class SelfAdjointEigenSolver { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + using RealVector = Eigen::Matrix<RealScalar, Dynamic, 1>; + + SelfAdjointEigenSolver() = default; + + /** \param options Eigen::ComputeEigenvectors (default) or Eigen::EigenvaluesOnly. */ + template <typename InputType> + explicit SelfAdjointEigenSolver(const EigenBase<InputType>& A, int options = ComputeEigenvectors) { + compute(A, options); + } + + explicit SelfAdjointEigenSolver(const DeviceMatrix<Scalar>& d_A, int options = ComputeEigenvectors) { + compute(d_A, options); + } + + ~SelfAdjointEigenSolver() = default; + + SelfAdjointEigenSolver(const SelfAdjointEigenSolver&) = delete; + SelfAdjointEigenSolver& operator=(const SelfAdjointEigenSolver&) = delete; + + SelfAdjointEigenSolver(SelfAdjointEigenSolver&& o) noexcept + : solver_ctx_(std::move(o.solver_ctx_)), + d_A_(std::move(o.d_A_)), + d_W_(std::move(o.d_W_)), + compute_eigenvectors_(o.compute_eigenvectors_), + n_(o.n_), + lda_(o.lda_) { + o.compute_eigenvectors_ = true; + o.n_ = 0; + o.lda_ = 0; + } + + SelfAdjointEigenSolver& operator=(SelfAdjointEigenSolver&& o) noexcept { + if (this != &o) { + solver_ctx_ = std::move(o.solver_ctx_); + d_A_ = std::move(o.d_A_); + d_W_ = std::move(o.d_W_); + compute_eigenvectors_ = o.compute_eigenvectors_; + n_ = o.n_; + lda_ = o.lda_; + o.compute_eigenvectors_ = true; + o.n_ = 0; + o.lda_ = 0; + } + return *this; + } + + // ---- Factorization ------------------------------------------------------- + + template <typename InputType> + SelfAdjointEigenSolver& compute(const EigenBase<InputType>& A, int options = ComputeEigenvectors) { + return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_), options); + } + + SelfAdjointEigenSolver& compute(const DeviceMatrix<Scalar>& d_A, int options = ComputeEigenvectors) { + eigen_assert(d_A.rows() == d_A.cols() && "SelfAdjointEigenSolver requires a square matrix"); + eigen_assert((options == ComputeEigenvectors || options == EigenvaluesOnly) && + "options must be ComputeEigenvectors or EigenvaluesOnly"); + compute_eigenvectors_ = (options == ComputeEigenvectors); + n_ = d_A.rows(); + + if (n_ == 0) { + solver_ctx_.info_ = Success; + solver_ctx_.info_synced_ = true; + return *this; + } + + d_A.waitReady(solver_ctx_.stream_); + lda_ = n_; + const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar); + + d_A_ = internal::DeviceBuffer(mat_bytes); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_A_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + + factorize(); + return *this; + } + + // ---- Accessors ----------------------------------------------------------- + + ComputationInfo info() const { return solver_ctx_.info(); } + + Index cols() const { return n_; } + Index rows() const { return n_; } + + /** Eigenvalues in ascending order. Downloads from device. */ + RealVector eigenvalues() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + RealVector W(n_); + if (n_ > 0) { + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpy(W.data(), d_W_.get(), static_cast<size_t>(n_) * sizeof(RealScalar), cudaMemcpyDeviceToHost)); + } + return W; + } + + /** Eigenvectors (columns). Downloads from device. + * Requires ComputeEigenvectors mode. */ + PlainMatrix eigenvectors() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert(compute_eigenvectors_ && "eigenvectors() requires ComputeEigenvectors option"); + PlainMatrix V(n_, n_); + if (n_ > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(V.data(), d_A_.get(), + static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + } + return V; + } + + // ---- Device-side accessors (zero-copy views; chain into cuBLAS without D2D) --------- + // + // These return non-owning DeviceMatrix views over this solver's internal storage. The + // view borrows the pointer: destruction does not free; this solver must outlive any + // view derived from it. Both accessors are pure metadata — zero kernel launches. + + /** Eigenvalues as an n × 1 view on this solver's stream. */ + DeviceMatrix<RealScalar> d_eigenvalues() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + auto v = DeviceMatrix<RealScalar>::view(static_cast<RealScalar*>(d_W_.get()), n_, 1); + v.recordReady(solver_ctx_.stream_); + return v; + } + + /** Eigenvectors (columns) as an n × n view on this solver's stream. */ + DeviceMatrix<Scalar> d_eigenvectors() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert(compute_eigenvectors_ && "d_eigenvectors() requires ComputeEigenvectors option"); + auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_A_.get()), n_, n_); + v.recordReady(solver_ctx_.stream_); + return v; + } + + cudaStream_t stream() const { return solver_ctx_.stream_; } + + private: + mutable internal::GpuSolverContext solver_ctx_; + internal::DeviceBuffer d_A_; // overwritten with eigenvectors by syevd + internal::DeviceBuffer d_W_; // eigenvalues (RealScalar, length n) + bool compute_eigenvectors_ = true; + int64_t n_ = 0; + int64_t lda_ = 0; + + void factorize() { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value; + + solver_ctx_.mark_pending(); + + d_W_ = internal::DeviceBuffer(static_cast<size_t>(n_) * sizeof(RealScalar)); + + const cusolverEigMode_t jobz = compute_eigenvectors_ ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; + + // Use lower triangle (standard convention). + constexpr cublasFillMode_t uplo = CUBLAS_FILL_MODE_LOWER; + + size_t dev_ws = 0, host_ws = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobz, uplo, n_, + dtype, d_A_.get(), lda_, rtype, d_W_.get(), dtype, &dev_ws, + &host_ws)); + + solver_ctx_.ensure_scratch(dev_ws); + solver_ctx_.h_workspace_.resize(host_ws); + + EIGEN_CUSOLVER_CHECK(cusolverDnXsyevd(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobz, uplo, n_, dtype, + d_A_.get(), lda_, rtype, d_W_.get(), dtype, solver_ctx_.scratch_workspace(), + dev_ws, host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws, + solver_ctx_.scratch_info())); + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_EIGENSOLVER_H
diff --git a/unsupported/Eigen/src/GPU/GpuLLT.h b/unsupported/Eigen/src/GPU/GpuLLT.h index 2dd7299..b112543 100644 --- a/unsupported/Eigen/src/GPU/GpuLLT.h +++ b/unsupported/Eigen/src/GPU/GpuLLT.h
@@ -30,8 +30,7 @@ // IWYU pragma: private #include "./InternalHeaderCheck.h" -#include "./CuSolverSupport.h" -#include <vector> +#include "./GpuSolverContext.h" namespace Eigen { namespace gpu { @@ -64,22 +63,15 @@ // ---- Construction / destruction ------------------------------------------ /** Default constructor. Does not factorize; call compute() before solve(). */ - LLT() { init_context(); } + LLT() = default; /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */ template <typename InputType> - explicit LLT(const EigenBase<InputType>& A) : LLT() { + explicit LLT(const EigenBase<InputType>& A) { compute(A); } - ~LLT() { - // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK / - // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a - // noexcept dtor terminates the program. The trailing cudaFree() of the - // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync. - if (handle_) (void)cusolverDnDestroy(handle_); - if (stream_) (void)cudaStreamDestroy(stream_); - } + ~LLT() = default; // Non-copyable (owns device memory and library handles). LLT(const LLT&) = delete; @@ -87,81 +79,43 @@ // Movable. LLT(LLT&& o) noexcept - : stream_(o.stream_), - handle_(o.handle_), - params_(std::move(o.params_)), + : solver_ctx_(std::move(o.solver_ctx_)), d_factor_(std::move(o.d_factor_)), factor_alloc_size_(o.factor_alloc_size_), - d_scratch_(std::move(o.d_scratch_)), - scratch_size_(o.scratch_size_), - h_workspace_(std::move(o.h_workspace_)), n_(o.n_), - lda_(o.lda_), - info_(o.info_), - pinned_info_(std::move(o.pinned_info_)), - info_synced_(o.info_synced_) { - o.stream_ = nullptr; - o.handle_ = nullptr; + lda_(o.lda_) { o.factor_alloc_size_ = 0; - o.scratch_size_ = 0; o.n_ = 0; o.lda_ = 0; - o.info_ = InvalidInput; - o.info_synced_ = true; } LLT& operator=(LLT&& o) noexcept { if (this != &o) { - // Drain pending work on the old stream before tearing it down so the - // upcoming move of d_factor_/d_scratch_ doesn't free buffers that an - // in-flight kernel is still touching. The asymmetric move-ctor doesn't - // need this — it constructs onto uninitialized storage. - if (stream_) (void)cudaStreamSynchronize(stream_); - if (handle_) (void)cusolverDnDestroy(handle_); - if (stream_) (void)cudaStreamDestroy(stream_); - stream_ = o.stream_; - handle_ = o.handle_; - params_ = std::move(o.params_); + solver_ctx_ = std::move(o.solver_ctx_); d_factor_ = std::move(o.d_factor_); factor_alloc_size_ = o.factor_alloc_size_; - d_scratch_ = std::move(o.d_scratch_); - scratch_size_ = o.scratch_size_; - h_workspace_ = std::move(o.h_workspace_); n_ = o.n_; lda_ = o.lda_; - info_ = o.info_; - pinned_info_ = std::move(o.pinned_info_); - info_synced_ = o.info_synced_; - o.stream_ = nullptr; - o.handle_ = nullptr; o.factor_alloc_size_ = 0; - o.scratch_size_ = 0; o.n_ = 0; o.lda_ = 0; - o.info_ = InvalidInput; - o.info_synced_ = true; } return *this; } // ---- Factorization ------------------------------------------------------- - /** Compute the Cholesky factorization of A (host matrix). - * - * Uploads A to device memory, calls cusolverDnXpotrf, and retains the - * factored matrix on device. Any previous factorization is overwritten. - */ + /** Compute the Cholesky factorization of A (host matrix). */ template <typename InputType> LLT& compute(const EigenBase<InputType>& A) { eigen_assert(A.rows() == A.cols()); if (!begin_compute(A.rows())) return *this; - // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions). const PlainMatrix mat(A.derived()); - lda_ = static_cast<int64_t>(mat.outerStride()); + lda_ = static_cast<int64_t>(mat.rows()); allocate_factor_storage(); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_)); + cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_)); factorize(); return *this; @@ -172,11 +126,11 @@ eigen_assert(d_A.rows() == d_A.cols()); if (!begin_compute(d_A.rows())) return *this; - lda_ = static_cast<int64_t>(d_A.outerStride()); - d_A.waitReady(stream_); + lda_ = static_cast<int64_t>(d_A.rows()); + d_A.waitReady(solver_ctx_.stream_); allocate_factor_storage(); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_)); + cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); factorize(); return *this; @@ -187,8 +141,8 @@ eigen_assert(d_A.rows() == d_A.cols()); if (!begin_compute(d_A.rows())) return *this; - lda_ = static_cast<int64_t>(d_A.outerStride()); - d_A.waitReady(stream_); + lda_ = static_cast<int64_t>(d_A.rows()); + d_A.waitReady(solver_ctx_.stream_); d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); factor_alloc_size_ = factorBytes(); @@ -198,109 +152,79 @@ // ---- Solve --------------------------------------------------------------- - /** Solve A * X = B using the cached Cholesky factor (host → host). - * - * Uploads B to device memory, calls cusolverDnXpotrs using the factor - * retained from compute(), and returns the solution X on the host. - * The factor is not re-transferred; only B goes up and X comes down. - * - * \pre compute() must have been called and info() == Success. - * \returns X such that A * X ≈ B - */ + /** Solve A * X = B using the cached Cholesky factor (host → host). */ template <typename Rhs> PlainMatrix solve(const MatrixBase<Rhs>& B) const { - const_cast<LLT*>(this)->sync_info(); - eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization"); + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization"); eigen_assert(B.rows() == n_); const PlainMatrix rhs(B); const int64_t nrhs = static_cast<int64_t>(rhs.cols()); - const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); + const int64_t ldb = static_cast<int64_t>(rhs.rows()); internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb)); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); + cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_)); DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x)); PlainMatrix X(n_, B.cols()); int solve_info = 0; EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_)); - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error"); return X; } - /** Solve A * X = B with device-resident RHS. Fully async. - * - * All work is enqueued on this solver's stream. Returns a Matrix - * with a recorded ready event — no host synchronization occurs. - * The caller should check info() after compute() to verify the - * factorization succeeded; this method does not check. - */ + /** Solve A * X = B with device-resident RHS. Returns immediately after enqueuing + * the solve; the sync_info()/info_ check at entry forces a host wait on the + * factorization status so a singular A causes a clean assert rather than a + * silently-bad result. */ DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization"); eigen_assert(d_B.rows() == n_); - d_B.waitReady(stream_); + d_B.waitReady(solver_ctx_.stream_); const int64_t nrhs = static_cast<int64_t>(d_B.cols()); - const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); + const int64_t ldb = static_cast<int64_t>(d_B.rows()); internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb)); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); + cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); return solve_impl(nrhs, ldb, std::move(d_x)); } // ---- Accessors ----------------------------------------------------------- - /** Returns Success if the last compute() succeeded, NumericalIssue otherwise. - * Lazily synchronizes the stream on first call after compute(). */ - ComputationInfo info() const { - const_cast<LLT*>(this)->sync_info(); - return info_; - } - + ComputationInfo info() const { return solver_ctx_.info(); } Index rows() const { return n_; } Index cols() const { return n_; } - - /** Returns the CUDA stream owned by this object. - * Advanced users may submit additional GPU work on this stream - * to overlap with or chain after LLT operations. */ - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const { return solver_ctx_.stream_; } private: - cudaStream_t stream_ = nullptr; - cusolverDnHandle_t handle_ = nullptr; - internal::CusolverParams params_; // cuSOLVER params (created once, reused) - internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks) - size_t factor_alloc_size_ = 0; // current d_factor_ allocation size - internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks) - size_t scratch_size_ = 0; // current scratch allocation size - std::vector<char> h_workspace_; // host workspace (kept alive until next compute) + mutable internal::GpuSolverContext solver_ctx_; + internal::DeviceBuffer d_factor_; + size_t factor_alloc_size_ = 0; int64_t n_ = 0; int64_t lda_ = 0; - ComputationInfo info_ = InvalidInput; - internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H - bool info_synced_ = true; // has the stream been synced for info? - - int& info_word() { return *static_cast<int*>(pinned_info_.get()); } - int info_word() const { return *static_cast<const int*>(pinned_info_.get()); } bool begin_compute(Index rows) { n_ = rows; + solver_ctx_.info_ = InvalidInput; if (n_ == 0) { - info_ = Success; - info_synced_ = true; + solver_ctx_.info_ = Success; + solver_ctx_.info_synced_ = true; return false; } - info_ = InvalidInput; return true; } size_t factorBytes() const { return rhsBytes(n_, lda_); } - static size_t rhsBytes(int64_t cols, int64_t outer_stride) { - return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar); + static size_t rhsBytes(int64_t cols, int64_t ld) { + return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar); } void allocate_factor_storage() { @@ -311,90 +235,43 @@ } } - // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ]. - // Workspace size is rounded up to 16 bytes so the info word lands aligned. - static constexpr size_t kInfoBytes = sizeof(int); - static constexpr size_t kScratchAlign = 16; - - static size_t scratchBytesFor(size_t workspace_bytes) { - workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1); - return workspace_bytes + kInfoBytes; - } - - // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks. - // Syncs the stream before reallocating to avoid freeing memory that - // async kernels may still be using. - void ensure_scratch(size_t bytes) { - if (bytes > scratch_size_) { - if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); - d_scratch_ = internal::DeviceBuffer(bytes); - scratch_size_ = bytes; - } - } - - void* scratch_workspace() const { return d_scratch_.get(); } - int* scratch_info() const { - eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes); - return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes); - } - - // Solve in place on `d_x` (which already holds B), then re-wrap as a - // typed DeviceMatrix carrying shape and a ready event. The release/adopt - // hop just hands ownership of the raw cudaMalloc pointer from the untyped - // DeviceBuffer to the typed DeviceMatrix without copying. + // Solve in place on `d_x` (which already holds B), then re-wrap as a typed + // DeviceMatrix carrying shape and a ready event. The release/adopt hop hands + // ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to + // the typed DeviceMatrix without copying. DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const { constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value; - EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype, - d_x.get(), ldb, scratch_info())); + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, nrhs, dtype, + d_factor_.get(), lda_, dtype, d_x.get(), ldb, solver_ctx_.scratch_info())); DeviceMatrix<Scalar> result = DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs)); - result.recordReady(stream_); + result.recordReady(solver_ctx_.stream_); return result; } - void init_context() { - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); - EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_)); - EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); - ensure_scratch(kInfoBytes); - } - - // Synchronize stream and interpret the info word. No-op if already synced. - void sync_info() { - if (!info_synced_) { - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); - info_ = (info_word() == 0) ? Success : NumericalIssue; - info_synced_ = true; - } - } - - // Run cusolverDnXpotrf on d_factor_ (already on device). - // Enqueues factorization + async info download. Does NOT sync. - // Workspaces are stored as members to ensure they outlive the async kernels. void factorize() { constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value; - info_synced_ = false; - info_ = InvalidInput; + solver_ctx_.mark_pending(); size_t dev_ws_bytes = 0, host_ws_bytes = 0; - EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, - &dev_ws_bytes, &host_ws_bytes)); + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype, + d_factor_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes)); - ensure_scratch(scratchBytesFor(dev_ws_bytes)); - h_workspace_.resize(host_ws_bytes); + solver_ctx_.ensure_scratch(dev_ws_bytes); + solver_ctx_.h_workspace_.resize(host_ws_bytes); - EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf( - handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes, - host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info())); + EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype, + d_factor_.get(), lda_, dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes, + host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes, + solver_ctx_.scratch_info())); - // Enqueue async download of info word — sync deferred to info() or solve(). - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); } };
diff --git a/unsupported/Eigen/src/GPU/GpuLU.h b/unsupported/Eigen/src/GPU/GpuLU.h index afc941b..92c10a9 100644 --- a/unsupported/Eigen/src/GPU/GpuLU.h +++ b/unsupported/Eigen/src/GPU/GpuLU.h
@@ -20,7 +20,7 @@ // gpu::LU<double> lu(A); // upload A, getrf, LU+ipiv on device // if (lu.info() != Success) { ... } // MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred -// MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans); // A^T x = b +// MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans); // A^T x = b #ifndef EIGEN_GPU_LU_H #define EIGEN_GPU_LU_H @@ -28,9 +28,7 @@ // IWYU pragma: private #include "./InternalHeaderCheck.h" -#include "./CuBlasSupport.h" -#include "./CuSolverSupport.h" -#include <vector> +#include "./GpuSolverContext.h" namespace Eigen { namespace gpu { @@ -56,81 +54,41 @@ // ---- Construction / destruction ------------------------------------------ - LU() { init_context(); } + LU() = default; template <typename InputType> - explicit LU(const EigenBase<InputType>& A) : LU() { + explicit LU(const EigenBase<InputType>& A) { compute(A); } - ~LU() { - // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK / - // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a - // noexcept dtor terminates the program. The trailing cudaFree() of the - // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync. - if (handle_) (void)cusolverDnDestroy(handle_); - if (stream_) (void)cudaStreamDestroy(stream_); - } + ~LU() = default; LU(const LU&) = delete; LU& operator=(const LU&) = delete; LU(LU&& o) noexcept - : stream_(o.stream_), - handle_(o.handle_), - params_(std::move(o.params_)), + : solver_ctx_(std::move(o.solver_ctx_)), d_lu_(std::move(o.d_lu_)), lu_alloc_size_(o.lu_alloc_size_), d_ipiv_(std::move(o.d_ipiv_)), - d_scratch_(std::move(o.d_scratch_)), - scratch_size_(o.scratch_size_), - h_workspace_(std::move(o.h_workspace_)), n_(o.n_), - lda_(o.lda_), - info_(o.info_), - pinned_info_(std::move(o.pinned_info_)), - info_synced_(o.info_synced_) { - o.stream_ = nullptr; - o.handle_ = nullptr; + lda_(o.lda_) { o.lu_alloc_size_ = 0; - o.scratch_size_ = 0; o.n_ = 0; o.lda_ = 0; - o.info_ = InvalidInput; - o.info_synced_ = true; } LU& operator=(LU&& o) noexcept { if (this != &o) { - // Drain pending work on the old stream before tearing it down so the - // upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an - // in-flight kernel is still touching. The asymmetric move-ctor doesn't - // need this — it constructs onto uninitialized storage. - if (stream_) (void)cudaStreamSynchronize(stream_); - if (handle_) (void)cusolverDnDestroy(handle_); - if (stream_) (void)cudaStreamDestroy(stream_); - stream_ = o.stream_; - handle_ = o.handle_; - params_ = std::move(o.params_); + solver_ctx_ = std::move(o.solver_ctx_); d_lu_ = std::move(o.d_lu_); lu_alloc_size_ = o.lu_alloc_size_; d_ipiv_ = std::move(o.d_ipiv_); - d_scratch_ = std::move(o.d_scratch_); - scratch_size_ = o.scratch_size_; - h_workspace_ = std::move(o.h_workspace_); n_ = o.n_; lda_ = o.lda_; - info_ = o.info_; - pinned_info_ = std::move(o.pinned_info_); - info_synced_ = o.info_synced_; - o.stream_ = nullptr; - o.handle_ = nullptr; o.lu_alloc_size_ = 0; - o.scratch_size_ = 0; o.n_ = 0; o.lda_ = 0; - o.info_ = InvalidInput; - o.info_synced_ = true; } return *this; } @@ -144,9 +102,10 @@ if (!begin_compute(A.rows())) return *this; const PlainMatrix mat(A.derived()); - lda_ = static_cast<int64_t>(mat.outerStride()); + lda_ = static_cast<int64_t>(mat.rows()); allocate_lu_storage(); - EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_)); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_)); factorize(); return *this; @@ -157,11 +116,11 @@ eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix"); if (!begin_compute(d_A.rows())) return *this; - lda_ = static_cast<int64_t>(d_A.outerStride()); - d_A.waitReady(stream_); + lda_ = static_cast<int64_t>(d_A.rows()); + d_A.waitReady(solver_ctx_.stream_); allocate_lu_storage(); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_)); + cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); factorize(); return *this; @@ -172,8 +131,8 @@ eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix"); if (!begin_compute(d_A.rows())) return *this; - lda_ = static_cast<int64_t>(d_A.outerStride()); - d_A.waitReady(stream_); + lda_ = static_cast<int64_t>(d_A.rows()); + d_A.waitReady(solver_ctx_.stream_); d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release())); lu_alloc_size_ = matrixBytes(); @@ -190,78 +149,68 @@ */ template <typename Rhs> PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const { - const_cast<LU*>(this)->sync_info(); - eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization"); + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "LU::solve called on a failed or uninitialized factorization"); eigen_assert(B.rows() == n_); const PlainMatrix rhs(B); const int64_t nrhs = static_cast<int64_t>(rhs.cols()); - const int64_t ldb = static_cast<int64_t>(rhs.outerStride()); + const int64_t ldb = static_cast<int64_t>(rhs.rows()); internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb)); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_)); + cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_)); DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x)); PlainMatrix X(n_, B.cols()); int solve_info = 0; EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_)); - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error"); return X; } - /** Solve op(A) * X = B with device-resident RHS. Fully async. */ + /** Solve op(A) * X = B with device-resident RHS. Returns immediately after + * enqueuing the solve; the sync_info()/info_ check at entry forces a host + * wait on the factorization status so a singular A causes a clean assert + * rather than a silently-bad result. */ DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "LU::solve called on a failed or uninitialized factorization"); eigen_assert(d_B.rows() == n_); - d_B.waitReady(stream_); + d_B.waitReady(solver_ctx_.stream_); const int64_t nrhs = static_cast<int64_t>(d_B.cols()); - const int64_t ldb = static_cast<int64_t>(d_B.outerStride()); + const int64_t ldb = static_cast<int64_t>(d_B.rows()); internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb)); EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_)); + cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); return solve_impl(nrhs, ldb, op, std::move(d_x)); } // ---- Accessors ----------------------------------------------------------- - /** Lazily synchronizes the stream on first call after compute(). */ - ComputationInfo info() const { - const_cast<LU*>(this)->sync_info(); - return info_; - } + ComputationInfo info() const { return solver_ctx_.info(); } Index rows() const { return n_; } Index cols() const { return n_; } - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const { return solver_ctx_.stream_; } private: - cudaStream_t stream_ = nullptr; - cusolverDnHandle_t handle_ = nullptr; - internal::CusolverParams params_; // cuSOLVER params (created once, reused) - internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks) - size_t lu_alloc_size_ = 0; // current d_lu_ allocation size - internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device - internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks) - size_t scratch_size_ = 0; // current scratch allocation size - std::vector<char> h_workspace_; // host workspace (kept alive until next compute) + mutable internal::GpuSolverContext solver_ctx_; + internal::DeviceBuffer d_lu_; + size_t lu_alloc_size_ = 0; + internal::DeviceBuffer d_ipiv_; int64_t n_ = 0; int64_t lda_ = 0; - ComputationInfo info_ = InvalidInput; - internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H - bool info_synced_ = true; // has the stream been synced for info? - - int& info_word() { return *static_cast<int*>(pinned_info_.get()); } - int info_word() const { return *static_cast<const int*>(pinned_info_.get()); } bool begin_compute(Index rows) { n_ = rows; - info_ = InvalidInput; + solver_ctx_.info_ = InvalidInput; if (n_ == 0) { - info_ = Success; - info_synced_ = true; + solver_ctx_.info_ = Success; + solver_ctx_.info_synced_ = true; return false; } return true; @@ -269,8 +218,8 @@ size_t matrixBytes() const { return matrixBytes(n_, lda_); } - static size_t matrixBytes(int64_t cols, int64_t outer_stride) { - return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar); + static size_t matrixBytes(int64_t cols, int64_t ld) { + return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar); } void allocate_lu_storage() { @@ -281,92 +230,46 @@ } } - // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ]. - // Workspace size is rounded up to 16 bytes so the info word lands aligned. - static constexpr size_t kInfoBytes = sizeof(int); - static constexpr size_t kScratchAlign = 16; - - static size_t scratchBytesFor(size_t workspace_bytes) { - workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1); - return workspace_bytes + kInfoBytes; - } - - // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks. - // Syncs the stream before reallocating to avoid freeing memory that - // async kernels may still be using. - void ensure_scratch(size_t bytes) { - if (bytes > scratch_size_) { - if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); - d_scratch_ = internal::DeviceBuffer(bytes); - scratch_size_ = bytes; - } - } - - void* scratch_workspace() const { return d_scratch_.get(); } - int* scratch_info() const { - eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes); - return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes); - } - - // Solve in place on `d_x` (which already holds B), then re-wrap as a - // typed DeviceMatrix carrying shape and a ready event. The release/adopt - // hop just hands ownership of the raw cudaMalloc pointer from the untyped - // DeviceBuffer to the typed DeviceMatrix without copying. + // Solve in place on `d_x` (which already holds B), then re-wrap as a typed + // DeviceMatrix carrying shape and a ready event. The release/adopt hop hands + // ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to + // the typed DeviceMatrix without copying. DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const { constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; const cublasOperation_t trans = internal::to_cublas_op(op); - EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_, - static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb, - scratch_info())); + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, trans, n_, nrhs, dtype, + d_lu_.get(), lda_, static_cast<const int64_t*>(d_ipiv_.get()), dtype, + d_x.get(), ldb, solver_ctx_.scratch_info())); DeviceMatrix<Scalar> result = DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs)); - result.recordReady(stream_); + result.recordReady(solver_ctx_.stream_); return result; } - void init_context() { - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); - EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_)); - EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_)); - ensure_scratch(kInfoBytes); - } - - void sync_info() { - if (!info_synced_) { - EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); - info_ = (info_word() == 0) ? Success : NumericalIssue; - info_synced_ = true; - } - } - - // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_. - // Enqueues factorization + async info download. Does NOT sync. - // Workspaces are stored as members to ensure they outlive the async kernels. void factorize() { constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t); - info_synced_ = false; - info_ = InvalidInput; + solver_ctx_.mark_pending(); d_ipiv_ = internal::DeviceBuffer(ipiv_bytes); size_t dev_ws_bytes = 0, host_ws_bytes = 0; - EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype, - &dev_ws_bytes, &host_ws_bytes)); + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, n_, n_, dtype, + d_lu_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes)); - ensure_scratch(scratchBytesFor(dev_ws_bytes)); - h_workspace_.resize(host_ws_bytes); + solver_ctx_.ensure_scratch(dev_ws_bytes); + solver_ctx_.h_workspace_.resize(host_ws_bytes); - EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, - static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(), - dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr, - host_ws_bytes, scratch_info())); + EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf( + solver_ctx_.cusolver_, solver_ctx_.params_.p, n_, n_, dtype, d_lu_.get(), lda_, + static_cast<int64_t*>(d_ipiv_.get()), dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes, + host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes, solver_ctx_.scratch_info())); - EIGEN_CUDA_RUNTIME_CHECK( - cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); } };
diff --git a/unsupported/Eigen/src/GPU/GpuQR.h b/unsupported/Eigen/src/GPU/GpuQR.h new file mode 100644 index 0000000..821c458 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuQR.h
@@ -0,0 +1,379 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU QR decomposition using cuSOLVER. +// +// Wraps cusolverDnXgeqrf (factorization), cusolverDnXormqr (apply Q), +// and cublasXtrsm (triangular solve on R). Q is never formed explicitly. +// +// Handles both shapes transparently: +// m >= n (overdetermined or square): factor A = Q R; least-squares solve. +// m < n (underdetermined): factor A^H = Q R internally; min-norm solve. +// +// Usage: +// QR<double> qr(A); // upload A, geqrf (transparent transpose if m<n) +// if (qr.info() != Success) { ... } +// MatrixXd X = qr.solve(B); // least-squares (m>=n) or min-norm (m<n) +// MatrixXd R = qr.matrixR(); // upper-triangular factor (m>=n only) + +#ifndef EIGEN_GPU_QR_H +#define EIGEN_GPU_QR_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSolverContext.h" + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class QR { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + + QR() = default; + + template <typename InputType> + explicit QR(const EigenBase<InputType>& A) { + compute(A); + } + + explicit QR(const DeviceMatrix<Scalar>& d_A) { compute(d_A); } + + ~QR() = default; + + QR(const QR&) = delete; + QR& operator=(const QR&) = delete; + + QR(QR&& o) noexcept + : solver_ctx_(std::move(o.solver_ctx_)), + d_qr_(std::move(o.d_qr_)), + d_tau_(std::move(o.d_tau_)), + m_(o.m_), + n_(o.n_), + lda_(o.lda_), + transposed_(o.transposed_) { + o.m_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.transposed_ = false; + } + + QR& operator=(QR&& o) noexcept { + if (this != &o) { + solver_ctx_ = std::move(o.solver_ctx_); + d_qr_ = std::move(o.d_qr_); + d_tau_ = std::move(o.d_tau_); + m_ = o.m_; + n_ = o.n_; + lda_ = o.lda_; + transposed_ = o.transposed_; + o.m_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.transposed_ = false; + } + return *this; + } + + // ---- Factorization ------------------------------------------------------- + + template <typename InputType> + QR& compute(const EigenBase<InputType>& A) { + // Upload to device, then delegate. The wide-matrix transpose runs on the + // GPU (via cublasXgeam) inside the device-input path; no host transpose. + return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_)); + } + + QR& compute(const DeviceMatrix<Scalar>& d_A) { + m_ = d_A.rows(); + n_ = d_A.cols(); + + if (m_ == 0 || n_ == 0) { + solver_ctx_.info_ = Success; + solver_ctx_.info_synced_ = true; + return *this; + } + + transposed_ = (m_ < n_); + d_A.waitReady(solver_ctx_.stream_); + lda_ = static_cast<int64_t>(transposed_ ? n_ : m_); + const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(factor_cols()) * sizeof(Scalar); + const size_t tau_bytes = static_cast<size_t>(k()) * sizeof(Scalar); + + d_qr_ = internal::DeviceBuffer(mat_bytes); + d_tau_ = internal::DeviceBuffer(tau_bytes); + + if (transposed_) { + // Transpose-on-device via cuBLAS geam: d_qr_ = A^H. + Scalar alpha_one(1), beta_zero(0); + EIGEN_CUBLAS_CHECK(internal::cublasXgeam( + solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_), internal::to_blas_int(m_), + &alpha_one, d_A.data(), internal::to_blas_int(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), + internal::to_blas_int(n_), static_cast<Scalar*>(d_qr_.get()), internal::to_blas_int(n_))); + } else { + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_qr_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + } + + factorize(); + return *this; + } + + // ---- Solve --------------------------------------------------------------- + + /** Solve A * X = B. + * For m >= n (over-/exactly-determined): least-squares X = R^{-1} Q^H B (residual A^H r ≈ 0). + * For m < n (underdetermined): minimum-norm X = Q R^{-H} B (||X|| minimized). */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B) const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "QR::solve called on a failed or uninitialized factorization"); + eigen_assert(B.rows() == m_); + + const PlainMatrix rhs(B); + const Index nrhs = rhs.cols(); + + if (!transposed_) { + return solve_overdetermined_host(rhs); + } + return solve_underdetermined_host(rhs, nrhs); + } + + /** Solve with device-resident RHS. Returns n × nrhs DeviceMatrix. */ + DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "QR::solve called on a failed or uninitialized factorization"); + eigen_assert(d_B.rows() == m_); + d_B.waitReady(solver_ctx_.stream_); + + if (!transposed_) { + return solve_overdetermined_device(d_B); + } + return solve_underdetermined_device(d_B); + } + + // ---- Accessors ----------------------------------------------------------- + + ComputationInfo info() const { return solver_ctx_.info(); } + + Index rows() const { return m_; } + Index cols() const { return n_; } + cudaStream_t stream() const { return solver_ctx_.stream_; } + + /** Upper-triangular factor R (k × n) of A = Q R. Available only for m >= n. */ + PlainMatrix matrixR() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert(!transposed_ && "matrixR() not available when m < n (we factored A^H internally)"); + PlainMatrix qr_full(m_, n_); + if (m_ > 0 && n_ > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(qr_full.data(), d_qr_.get(), + static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + } + PlainMatrix R = qr_full.topRows(k()).template triangularView<Upper>(); + return R; + } + + private: + mutable internal::GpuSolverContext solver_ctx_; + internal::DeviceBuffer d_qr_; // QR factors (reflectors below diag, R above) + internal::DeviceBuffer d_tau_; // Householder scalars (length k) + int64_t m_ = 0; // original A.rows() + int64_t n_ = 0; // original A.cols() + int64_t lda_ = 0; // factor leading dim = max(m_, n_) + bool transposed_ = false; // true iff m_ < n_ (we factored A^H instead of A) + + // Factor matrix dimensions (we always factor a "tall" matrix: rows >= cols). + int64_t factor_rows() const { return transposed_ ? n_ : m_; } + int64_t factor_cols() const { return transposed_ ? m_ : n_; } + int64_t k() const { return (std::min)(m_, n_); } + + void factorize() { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + + solver_ctx_.mark_pending(); + + const int64_t fm = factor_rows(); + const int64_t fn = factor_cols(); + size_t dev_ws = 0, host_ws = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, fm, fn, dtype, + d_qr_.get(), lda_, dtype, d_tau_.get(), dtype, &dev_ws, &host_ws)); + + solver_ctx_.ensure_scratch(dev_ws); + solver_ctx_.h_workspace_.resize(host_ws); + + EIGEN_CUSOLVER_CHECK(cusolverDnXgeqrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, fm, fn, dtype, d_qr_.get(), + lda_, dtype, d_tau_.get(), dtype, solver_ctx_.scratch_workspace(), dev_ws, + host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws, + solver_ctx_.scratch_info())); + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + } + + // Apply Q (op = CUBLAS_OP_N) or Q^H (op = CUBLAS_OP_T/C) to a device buffer in-place. + // For real types Q^H = Q^T -> CUBLAS_OP_T. For complex -> CUBLAS_OP_C. + // Workspace lives in solver_ctx_.scratch (grows but never shrinks): no per-call malloc/free. + void apply_Q(cublasOperation_t op, void* d_B, int64_t ldb, int64_t nrhs) const { + const int im = internal::to_blas_int(factor_rows()); + const int in = internal::to_blas_int(nrhs); + const int ik = internal::to_blas_int(k()); + const int ilda = internal::to_blas_int(lda_); + const int ildb = internal::to_blas_int(ldb); + + int lwork = 0; + EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr_bufferSize( + solver_ctx_.cusolver_, CUBLAS_SIDE_LEFT, op, im, in, ik, static_cast<const Scalar*>(d_qr_.get()), ilda, + static_cast<const Scalar*>(d_tau_.get()), static_cast<const Scalar*>(d_B), ildb, &lwork)); + + solver_ctx_.ensure_scratch(static_cast<size_t>(lwork) * sizeof(Scalar)); + + EIGEN_CUSOLVER_CHECK(internal::cusolverDnXormqr( + solver_ctx_.cusolver_, CUBLAS_SIDE_LEFT, op, im, in, ik, static_cast<const Scalar*>(d_qr_.get()), ilda, + static_cast<const Scalar*>(d_tau_.get()), static_cast<Scalar*>(d_B), ildb, + static_cast<Scalar*>(solver_ctx_.scratch_workspace()), lwork, solver_ctx_.scratch_info())); + } + + void apply_QH(void* d_B, int64_t ldb, int64_t nrhs) const { + constexpr cublasOperation_t trans = NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T; + apply_Q(trans, d_B, ldb, nrhs); + } + + // ---- Solve helpers (overdetermined: m >= n) ------------------------------ + + PlainMatrix solve_overdetermined_host(const PlainMatrix& rhs) const { + const Index nrhs = rhs.cols(); + const size_t b_bytes = static_cast<size_t>(m_) * static_cast<size_t>(nrhs) * sizeof(Scalar); + + internal::DeviceBuffer d_B(b_bytes); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_B.get(), rhs.data(), b_bytes, cudaMemcpyHostToDevice, solver_ctx_.stream_)); + + apply_QH(d_B.get(), m_, nrhs); + trsm_R(d_B.get(), m_, nrhs, /*op=*/CUBLAS_OP_N); + + PlainMatrix X(n_, nrhs); + if (m_ == n_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_B.get(), + static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + } else { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(X.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.get(), + static_cast<size_t>(m_) * sizeof(Scalar), + static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + } + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); + return X; + } + + DeviceMatrix<Scalar> solve_overdetermined_device(const DeviceMatrix<Scalar>& d_B) const { + const Index nrhs = d_B.cols(); + const size_t b_bytes = static_cast<size_t>(m_) * static_cast<size_t>(nrhs) * sizeof(Scalar); + + internal::DeviceBuffer d_work(b_bytes); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_work.get(), d_B.data(), b_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + + apply_QH(d_work.get(), m_, nrhs); + trsm_R(d_work.get(), m_, nrhs, /*op=*/CUBLAS_OP_N); + + if (m_ == n_) { + DeviceMatrix<Scalar> result = + DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_work.release()), n_, static_cast<Index>(nrhs)); + result.recordReady(solver_ctx_.stream_); + return result; + } + DeviceMatrix<Scalar> result(n_, static_cast<Index>(nrhs)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(result.data(), static_cast<size_t>(n_) * sizeof(Scalar), d_work.get(), + static_cast<size_t>(m_) * sizeof(Scalar), + static_cast<size_t>(n_) * sizeof(Scalar), static_cast<size_t>(nrhs), + cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + result.recordReady(solver_ctx_.stream_); + return result; + } + + // ---- Solve helpers (underdetermined: m < n) ------------------------------ + // + // We factored A^H = Q R, so A = R^H Q^H. Solving A X = B for X with min ||X||: + // z = R^{-H} B (m × nrhs, occupies top m rows of an n × nrhs buffer) + // X = Q [z; 0] (n × nrhs) + + PlainMatrix solve_underdetermined_host(const PlainMatrix& rhs, Index nrhs) const { + const size_t x_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar); + + internal::DeviceBuffer d_X(x_bytes); + // Zero the full n × nrhs buffer; B will overwrite the top m × nrhs block. + EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(d_X.get(), 0, x_bytes, solver_ctx_.stream_)); + + // 2D copy: B (m × nrhs, leading dim m) into top of d_X (leading dim n). + if (m_ > 0 && nrhs > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(d_X.get(), static_cast<size_t>(n_) * sizeof(Scalar), rhs.data(), + static_cast<size_t>(m_) * sizeof(Scalar), + static_cast<size_t>(m_) * sizeof(Scalar), static_cast<size_t>(nrhs), + cudaMemcpyHostToDevice, solver_ctx_.stream_)); + } + + trsm_R(d_X.get(), n_, nrhs, trsm_op_conj_trans()); + apply_Q(CUBLAS_OP_N, d_X.get(), n_, nrhs); + + PlainMatrix X(n_, nrhs); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(X.data(), d_X.get(), x_bytes, cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); + return X; + } + + DeviceMatrix<Scalar> solve_underdetermined_device(const DeviceMatrix<Scalar>& d_B) const { + const Index nrhs = d_B.cols(); + const size_t x_bytes = static_cast<size_t>(n_) * static_cast<size_t>(nrhs) * sizeof(Scalar); + + internal::DeviceBuffer d_X(x_bytes); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(d_X.get(), 0, x_bytes, solver_ctx_.stream_)); + + if (m_ > 0 && nrhs > 0) { + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(d_X.get(), static_cast<size_t>(n_) * sizeof(Scalar), d_B.data(), + static_cast<size_t>(m_) * sizeof(Scalar), + static_cast<size_t>(m_) * sizeof(Scalar), static_cast<size_t>(nrhs), + cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + } + + trsm_R(d_X.get(), n_, nrhs, trsm_op_conj_trans()); + apply_Q(CUBLAS_OP_N, d_X.get(), n_, nrhs); + + DeviceMatrix<Scalar> result = + DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_X.release()), n_, static_cast<Index>(nrhs)); + result.recordReady(solver_ctx_.stream_); + return result; + } + + static cublasOperation_t trsm_op_conj_trans() { return NumTraits<Scalar>::IsComplex ? CUBLAS_OP_C : CUBLAS_OP_T; } + + // Triangular solve on R: X := op(R)^{-1} B (in-place on B). + // op = CUBLAS_OP_N for the m>=n branch (R X = (Q^H B)[:k,:]) + // op = CUBLAS_OP_T or _C for the m<n branch (R^H z = B) + void trsm_R(void* d_B, int64_t ldb, int64_t nrhs, cublasOperation_t op) const { + Scalar alpha(1); + EIGEN_CUBLAS_CHECK(internal::cublasXtrsm( + solver_ctx_.cublas_, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, op, CUBLAS_DIAG_NON_UNIT, + internal::to_blas_int(k()), internal::to_blas_int(nrhs), &alpha, static_cast<const Scalar*>(d_qr_.get()), + internal::to_blas_int(lda_), static_cast<Scalar*>(d_B), internal::to_blas_int(ldb))); + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_QR_H
diff --git a/unsupported/Eigen/src/GPU/GpuSVD.h b/unsupported/Eigen/src/GPU/GpuSVD.h new file mode 100644 index 0000000..9b16cab --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSVD.h
@@ -0,0 +1,531 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// GPU SVD decomposition using cuSOLVER (divide-and-conquer). +// +// Wraps cusolverDnXgesvd. Stores U, S, VT on device. Solve uses +// cuBLAS GEMM: X = VT^H * diag(D) * U^H * B. +// +// cuSOLVER returns VT (not V). We store and expose VT directly. +// +// Usage: +// SVD<double> svd(A, ComputeThinU | ComputeThinV); +// VectorXd S = svd.singularValues(); +// MatrixXd U = svd.matrixU(); // m×k or m×m +// MatrixXd VT = svd.matrixVT(); // k×n or n×n (this is V^T) +// MatrixXd X = svd.solve(B); // pseudoinverse +// MatrixXd X = svd.solve(B, k); // truncated (top k triplets) +// MatrixXd X = svd.solve(B, 0.1); // Tikhonov regularized + +#ifndef EIGEN_GPU_SVD_H +#define EIGEN_GPU_SVD_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./GpuSolverContext.h" + +namespace Eigen { +namespace gpu { + +template <typename Scalar_> +class SVD { + public: + using Scalar = Scalar_; + using RealScalar = typename NumTraits<Scalar>::Real; + using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>; + using PlainVector = Eigen::Matrix<Scalar, Dynamic, 1>; + using RealVector = Eigen::Matrix<RealScalar, Dynamic, 1>; + + SVD() = default; + + template <typename InputType> + explicit SVD(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) { + compute(A, options); + } + + explicit SVD(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) { + compute(d_A, options); + } + + ~SVD() = default; + + SVD(const SVD&) = delete; + SVD& operator=(const SVD&) = delete; + + SVD(SVD&& o) noexcept + : solver_ctx_(std::move(o.solver_ctx_)), + d_A_(std::move(o.d_A_)), + d_U_(std::move(o.d_U_)), + d_S_(std::move(o.d_S_)), + d_VT_(std::move(o.d_VT_)), + options_(o.options_), + m_(o.m_), + n_(o.n_), + lda_(o.lda_), + transposed_(o.transposed_) { + o.options_ = 0; + o.m_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.transposed_ = false; + } + + SVD& operator=(SVD&& o) noexcept { + if (this != &o) { + solver_ctx_ = std::move(o.solver_ctx_); + d_A_ = std::move(o.d_A_); + d_U_ = std::move(o.d_U_); + d_S_ = std::move(o.d_S_); + d_VT_ = std::move(o.d_VT_); + options_ = o.options_; + m_ = o.m_; + n_ = o.n_; + lda_ = o.lda_; + transposed_ = o.transposed_; + o.options_ = 0; + o.m_ = 0; + o.n_ = 0; + o.lda_ = 0; + o.transposed_ = false; + } + return *this; + } + + // ---- Factorization ------------------------------------------------------- + + template <typename InputType> + SVD& compute(const EigenBase<InputType>& A, unsigned int options = ComputeThinU | ComputeThinV) { + // Upload to device, then delegate. The wide-matrix transpose runs on the + // GPU (via cublasXgeam) inside the device-input path; no host transpose. + return compute(DeviceMatrix<Scalar>::fromHost(A.derived(), solver_ctx_.stream_), options); + } + + SVD& compute(const DeviceMatrix<Scalar>& d_A, unsigned int options = ComputeThinU | ComputeThinV) { + options_ = options; + m_ = d_A.rows(); + n_ = d_A.cols(); + lda_ = 0; + transposed_ = false; + + if (m_ == 0 || n_ == 0) { + d_A_ = internal::DeviceBuffer(); + d_U_ = internal::DeviceBuffer(); + d_S_ = internal::DeviceBuffer(); + d_VT_ = internal::DeviceBuffer(); + solver_ctx_.info_ = Success; + solver_ctx_.info_synced_ = true; + return *this; + } + + transposed_ = (m_ < n_); + d_A.waitReady(solver_ctx_.stream_); + + if (transposed_) { + // Transpose on device via cuBLAS geam: d_A_ = A^H. + std::swap(m_, n_); + lda_ = m_; + const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar); + d_A_ = internal::DeviceBuffer(mat_bytes); + // geam: C(m×n) = alpha * op(A) + beta * op(B). beta=0, B=nullptr. + Scalar alpha_one(1), beta_zero(0); + EIGEN_CUBLAS_CHECK(internal::cublasXgeam( + solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(m_), internal::to_blas_int(n_), + &alpha_one, d_A.data(), internal::to_blas_int(d_A.rows()), &beta_zero, static_cast<const Scalar*>(nullptr), + internal::to_blas_int(m_), static_cast<Scalar*>(d_A_.get()), internal::to_blas_int(m_))); + } else { + lda_ = static_cast<int64_t>(d_A.rows()); + const size_t mat_bytes = static_cast<size_t>(lda_) * static_cast<size_t>(n_) * sizeof(Scalar); + d_A_ = internal::DeviceBuffer(mat_bytes); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpyAsync(d_A_.get(), d_A.data(), mat_bytes, cudaMemcpyDeviceToDevice, solver_ctx_.stream_)); + } + + factorize(); + return *this; + } + + // ---- Accessors ----------------------------------------------------------- + + ComputationInfo info() const { return solver_ctx_.info(); } + + Index rows() const { return transposed_ ? n_ : m_; } + Index cols() const { return transposed_ ? m_ : n_; } + + /** Singular values (always available). Downloads from device on each call. */ + RealVector singularValues() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + const Index k = (std::min)(m_, n_); + RealVector S(k); + EIGEN_CUDA_RUNTIME_CHECK( + cudaMemcpy(S.data(), d_S_.get(), static_cast<size_t>(k) * sizeof(RealScalar), cudaMemcpyDeviceToHost)); + return S; + } + + /** Left singular vectors U. */ + PlainMatrix matrixU() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "matrixU() requires ComputeThinU or ComputeFullU"); + const Index m_orig = transposed_ ? n_ : m_; + const Index n_orig = transposed_ ? m_ : n_; + const Index k = (std::min)(m_orig, n_orig); + if (!transposed_) { + const Index ucols = (options_ & ComputeFullU) ? m_ : k; + PlainMatrix U(m_, ucols); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U.data(), d_U_.get(), + static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + return U; + } else { + const Index vtrows = (options_ & ComputeFullU) ? m_orig : k; + PlainMatrix VT_stored(vtrows, n_); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT_stored.data(), d_VT_.get(), + static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + return VT_stored.adjoint(); + } + } + + /** Right singular vectors V (matches host JacobiSVD/BDCSVD). */ + PlainMatrix matrixV() const { return matrixVT().adjoint(); } + + /** Right singular vectors transposed V^T. */ + PlainMatrix matrixVT() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "matrixVT() requires ComputeThinV or ComputeFullV"); + const Index m_orig = transposed_ ? n_ : m_; + const Index n_orig = transposed_ ? m_ : n_; + const Index k = (std::min)(m_orig, n_orig); + if (!transposed_) { + const Index vtrows = (options_ & ComputeFullV) ? n_ : k; + PlainMatrix VT(vtrows, n_); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(VT.data(), d_VT_.get(), + static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + return VT; + } else { + const Index ucols = (options_ & ComputeFullV) ? n_orig : k; + PlainMatrix U_stored(m_, ucols); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(U_stored.data(), d_U_.get(), + static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar), + cudaMemcpyDeviceToHost)); + return U_stored.adjoint(); + } + } + + // ---- Device-side accessors (zero-copy views; chain into cuBLAS without D2D) --------- + // + // These return non-owning DeviceMatrix views over the SVD's internal device storage. + // The view borrows the pointer: destruction does not free; the SVD object must outlive + // any view derived from it. For the common case (m >= n) all three accessors are pure + // metadata: zero kernel launches, zero allocations. + // + // For wide matrices (m < n, internally factored as A^H), original U and V^T are the + // adjoints of the stored buffers, so d_matrixU() / d_matrixVT() build them via a + // cublasXgeam into an owning temporary. d_singularValues() remains zero-copy. + + /** Singular values as a k × 1 view on this solver's stream. */ + DeviceMatrix<RealScalar> d_singularValues() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + const Index k = (std::min)(m_, n_); + auto v = DeviceMatrix<RealScalar>::view(static_cast<RealScalar*>(d_S_.get()), k, 1); + v.recordReady(solver_ctx_.stream_); + return v; + } + + /** Left singular vectors U as a DeviceMatrix on this solver's stream. + * For m >= n: zero-copy view. For m < n: owning (one cublasXgeam adjoint pass). */ + DeviceMatrix<Scalar> d_matrixU() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "d_matrixU() requires ComputeThinU or ComputeFullU"); + const Index m_orig = transposed_ ? n_ : m_; + const Index n_orig = transposed_ ? m_ : n_; + const Index k = (std::min)(m_orig, n_orig); + if (!transposed_) { + const Index ucols = (options_ & ComputeFullU) ? m_ : k; + auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_U_.get()), m_, ucols); + v.recordReady(solver_ctx_.stream_); + return v; + } + // transposed: U_orig = VT_stored^H -> conjugate-transpose via cublasXgeam. + const Index vtrows_stored = (options_ & ComputeFullU) ? n_ : k; + DeviceMatrix<Scalar> result(n_, vtrows_stored); + if (n_ > 0 && vtrows_stored > 0) { + Scalar alpha_one(1), beta_zero(0); + EIGEN_CUBLAS_CHECK(internal::cublasXgeam( + solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_), + internal::to_blas_int(vtrows_stored), &alpha_one, static_cast<const Scalar*>(d_VT_.get()), + internal::to_blas_int(vtrows_stored), &beta_zero, static_cast<const Scalar*>(nullptr), + internal::to_blas_int(n_), result.data(), internal::to_blas_int(n_))); + result.recordReady(solver_ctx_.stream_); + } + return result; + } + + /** Right singular vectors transposed V^T as a DeviceMatrix on this solver's stream. + * For m >= n: zero-copy view. For m < n: owning (one cublasXgeam adjoint pass). */ + DeviceMatrix<Scalar> d_matrixVT() const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success); + eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "d_matrixVT() requires ComputeThinV or ComputeFullV"); + const Index m_orig = transposed_ ? n_ : m_; + const Index n_orig = transposed_ ? m_ : n_; + const Index k = (std::min)(m_orig, n_orig); + if (!transposed_) { + const Index vtrows = (options_ & ComputeFullV) ? n_ : k; + auto v = DeviceMatrix<Scalar>::view(static_cast<Scalar*>(d_VT_.get()), vtrows, n_); + v.recordReady(solver_ctx_.stream_); + return v; + } + // transposed: VT_orig = U_stored^H. + const Index ucols = (options_ & ComputeFullV) ? n_orig : k; + DeviceMatrix<Scalar> result(ucols, m_); + if (ucols > 0 && m_ > 0) { + Scalar alpha_one(1), beta_zero(0); + EIGEN_CUBLAS_CHECK( + internal::cublasXgeam(solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(ucols), + internal::to_blas_int(m_), &alpha_one, static_cast<const Scalar*>(d_U_.get()), + internal::to_blas_int(m_), &beta_zero, static_cast<const Scalar*>(nullptr), + internal::to_blas_int(ucols), result.data(), internal::to_blas_int(ucols))); + result.recordReady(solver_ctx_.stream_); + } + return result; + } + + /** Number of singular values above threshold. */ + Index rank(RealScalar threshold = RealScalar(-1)) const { + RealVector S = singularValues(); + if (S.size() == 0) return 0; + if (threshold < 0) { + threshold = (std::max)(m_, n_) * S(0) * NumTraits<RealScalar>::epsilon(); + } + return (S.array() > threshold).count(); + } + + // ---- Solve --------------------------------------------------------------- + + /** Pseudoinverse solve: X = V * diag(1/S) * U^H * B. */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B) const { + return solve_impl(B, (std::min)(m_, n_), RealScalar(0)); + } + + /** Truncated solve: use only top trunc singular triplets. */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B, Index trunc) const { + eigen_assert(trunc > 0 && trunc <= (std::min)(m_, n_)); + return solve_impl(B, trunc, RealScalar(0)); + } + + /** Tikhonov-regularized solve: D_ii = S_i / (S_i^2 + lambda^2). */ + template <typename Rhs> + PlainMatrix solve(const MatrixBase<Rhs>& B, RealScalar lambda) const { + eigen_assert(lambda > 0); + return solve_impl(B, (std::min)(m_, n_), lambda); + } + + cudaStream_t stream() const { return solver_ctx_.stream_; } + + private: + mutable internal::GpuSolverContext solver_ctx_; + internal::DeviceBuffer d_A_; + internal::DeviceBuffer d_U_; + internal::DeviceBuffer d_S_; + internal::DeviceBuffer d_VT_; + unsigned int options_ = 0; + int64_t m_ = 0; + int64_t n_ = 0; + int64_t lda_ = 0; + bool transposed_ = false; + + // Swap U↔V flags for the transposed case. + static unsigned int swap_uv_options(unsigned int opts) { + unsigned int result = 0; + if (opts & ComputeThinU) result |= ComputeThinV; + if (opts & ComputeFullU) result |= ComputeFullV; + if (opts & ComputeThinV) result |= ComputeThinU; + if (opts & ComputeFullV) result |= ComputeFullU; + return result; + } + + static signed char jobu(unsigned int opts) { + if (opts & ComputeFullU) return 'A'; + if (opts & ComputeThinU) return 'S'; + return 'N'; + } + + static signed char jobvt(unsigned int opts) { + if (opts & ComputeFullV) return 'A'; + if (opts & ComputeThinV) return 'S'; + return 'N'; + } + + void factorize() { + constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value; + constexpr cudaDataType_t rtype = internal::cuda_data_type<RealScalar>::value; + const Index k = (std::min)(m_, n_); + + solver_ctx_.mark_pending(); + + d_S_ = internal::DeviceBuffer(static_cast<size_t>(k) * sizeof(RealScalar)); + + const unsigned int int_opts = transposed_ ? swap_uv_options(options_) : options_; + + const Index ucols = (int_opts & ComputeFullU) ? m_ : ((int_opts & ComputeThinU) ? k : 0); + const Index vtrows = (int_opts & ComputeFullV) ? n_ : ((int_opts & ComputeThinV) ? k : 0); + const int64_t ldu = m_; + const int64_t ldvt = vtrows > 0 ? vtrows : 1; + + d_U_ = internal::DeviceBuffer(); + d_VT_ = internal::DeviceBuffer(); + if (ucols > 0) d_U_ = internal::DeviceBuffer(static_cast<size_t>(m_) * static_cast<size_t>(ucols) * sizeof(Scalar)); + if (vtrows > 0) + d_VT_ = internal::DeviceBuffer(static_cast<size_t>(vtrows) * static_cast<size_t>(n_) * sizeof(Scalar)); + + eigen_assert(m_ >= n_ && "Internal error: m_ < n_ should have been handled by transpose in compute()"); + size_t dev_ws = 0, host_ws = 0; + EIGEN_CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize( + solver_ctx_.cusolver_, solver_ctx_.params_.p, jobu(int_opts), jobvt(int_opts), m_, n_, dtype, d_A_.get(), lda_, + rtype, d_S_.get(), dtype, ucols > 0 ? d_U_.get() : nullptr, ldu, dtype, vtrows > 0 ? d_VT_.get() : nullptr, + ldvt, dtype, &dev_ws, &host_ws)); + + solver_ctx_.ensure_scratch(dev_ws); + solver_ctx_.h_workspace_.resize(host_ws); + + EIGEN_CUSOLVER_CHECK( + cusolverDnXgesvd(solver_ctx_.cusolver_, solver_ctx_.params_.p, jobu(int_opts), jobvt(int_opts), m_, n_, dtype, + d_A_.get(), lda_, rtype, d_S_.get(), dtype, ucols > 0 ? d_U_.get() : nullptr, ldu, dtype, + vtrows > 0 ? d_VT_.get() : nullptr, ldvt, dtype, solver_ctx_.scratch_workspace(), dev_ws, + host_ws > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws, solver_ctx_.scratch_info())); + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + } + + template <typename Rhs> + PlainMatrix solve_impl(const MatrixBase<Rhs>& B, Index trunc, RealScalar lambda) const { + solver_ctx_.sync_info(); + eigen_assert(solver_ctx_.info_ == Success && "SVD::solve called on a failed or uninitialized decomposition"); + eigen_assert((options_ & (ComputeThinU | ComputeFullU)) && "solve requires U"); + eigen_assert((options_ & (ComputeThinV | ComputeFullV)) && "solve requires V"); + + const Index m_orig = transposed_ ? n_ : m_; + const Index n_orig = transposed_ ? m_ : n_; + eigen_assert(B.rows() == m_orig); + + const Index k = (std::min)(m_, n_); + const Index kk = (std::min)(trunc, k); + const Index nrhs = B.cols(); + + // Empty problem: no rank, no RHS, or zero domain -> result is the zero matrix. + // Returning early avoids reading S(0) below when k == 0 and prevents zero-extent + // GEMM/dgmm calls. + if (kk == 0 || nrhs == 0 || n_orig == 0) { + return PlainMatrix::Zero(n_orig, nrhs); + } + + // Enqueue both transfers on solver_ctx_.stream_ in one batch and sync once. Issuing the + // B upload before reading S means B's H2D is already in flight while we wait for + // gesvd-then-S-D2H, instead of two back-to-back blocking syncs. + const PlainMatrix rhs(B); + internal::DeviceBuffer d_B(static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_B.get(), rhs.data(), + static_cast<size_t>(m_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar), + cudaMemcpyHostToDevice, solver_ctx_.stream_)); + RealVector S(k); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(S.data(), d_S_.get(), static_cast<size_t>(k) * sizeof(RealScalar), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); + + // Typed pointers for device buffers. + auto* U_dev = static_cast<const Scalar*>(d_U_.get()); + auto* VT_dev = static_cast<const Scalar*>(d_VT_.get()); + auto* B_dev = static_cast<const Scalar*>(d_B.get()); + + Scalar scalars[2] = {Scalar(1), Scalar(0)}; + + // Step 1: tmp = U_orig^H * B (kk × nrhs). + internal::DeviceBuffer d_tmp(static_cast<size_t>(kk) * static_cast<size_t>(nrhs) * sizeof(Scalar)); + auto* tmp_dev = static_cast<Scalar*>(d_tmp.get()); + if (!transposed_) { + EIGEN_CUBLAS_CHECK(internal::cublasXgemm(solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(kk), + internal::to_blas_int(nrhs), internal::to_blas_int(m_), &scalars[0], + U_dev, internal::to_blas_int(m_), B_dev, internal::to_blas_int(m_orig), + &scalars[1], tmp_dev, internal::to_blas_int(kk))); + } else { + const Index vtrows_stored = (swap_uv_options(options_) & ComputeFullV) ? n_ : k; + EIGEN_CUBLAS_CHECK(internal::cublasXgemm( + solver_ctx_.cublas_, CUBLAS_OP_N, CUBLAS_OP_N, internal::to_blas_int(kk), internal::to_blas_int(nrhs), + internal::to_blas_int(m_orig), &scalars[0], VT_dev, internal::to_blas_int(vtrows_stored), B_dev, + internal::to_blas_int(m_orig), &scalars[1], tmp_dev, internal::to_blas_int(kk))); + } + + // Step 2: Apply diag(D) to tmp on device via cublasXdgmm. + // D is built on host from the (small, k-entry) singular-values vector S + // and uploaded to a small device buffer; tmp itself stays on device. + // + // For lambda == 0 we mirror Eigen's SVDBase::_solve_impl: drop singular + // values below S(0) * k * eps (numerical-rank truncation), so this + // pseudoinverse solve agrees with CPU BDCSVD::solve on near-singular A. + // dgmm wants the diagonal in the matrix scalar type — for complex Scalar + // the diagonal is still real, so we build the real values then cast. + const RealScalar drop_threshold = S(0) * RealScalar(k) * NumTraits<RealScalar>::epsilon(); + auto S_head = S.head(kk).array(); + PlainVector D(kk); + if (lambda == RealScalar(0)) { + D = (S_head > drop_threshold).select(S_head.inverse(), RealScalar(0)).matrix().template cast<Scalar>(); + } else { + D = (S_head / (S_head.square() + lambda * lambda)).matrix().template cast<Scalar>(); + } + + internal::DeviceBuffer d_D(static_cast<size_t>(kk) * sizeof(Scalar)); + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_D.get(), D.data(), static_cast<size_t>(kk) * sizeof(Scalar), + cudaMemcpyHostToDevice, solver_ctx_.stream_)); + + EIGEN_CUBLAS_CHECK(internal::cublasXdgmm( + solver_ctx_.cublas_, CUBLAS_SIDE_LEFT, internal::to_blas_int(kk), internal::to_blas_int(nrhs), tmp_dev, + internal::to_blas_int(kk), static_cast<const Scalar*>(d_D.get()), 1, tmp_dev, internal::to_blas_int(kk))); + + // Step 3: X = V_orig * tmp (n_orig × nrhs). + PlainMatrix X(n_orig, nrhs); + internal::DeviceBuffer d_X(static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar)); + auto* X_dev = static_cast<Scalar*>(d_X.get()); + + if (!transposed_) { + const Index vtrows = (options_ & ComputeFullV) ? n_ : k; + EIGEN_CUBLAS_CHECK(internal::cublasXgemm( + solver_ctx_.cublas_, CUBLAS_OP_C, CUBLAS_OP_N, internal::to_blas_int(n_orig), internal::to_blas_int(nrhs), + internal::to_blas_int(kk), &scalars[0], VT_dev, internal::to_blas_int(vtrows), tmp_dev, + internal::to_blas_int(kk), &scalars[1], X_dev, internal::to_blas_int(n_orig))); + } else { + EIGEN_CUBLAS_CHECK(internal::cublasXgemm( + solver_ctx_.cublas_, CUBLAS_OP_N, CUBLAS_OP_N, internal::to_blas_int(n_orig), internal::to_blas_int(nrhs), + internal::to_blas_int(kk), &scalars[0], U_dev, internal::to_blas_int(m_), tmp_dev, internal::to_blas_int(kk), + &scalars[1], X_dev, internal::to_blas_int(n_orig))); + } + + EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(X.data(), d_X.get(), + static_cast<size_t>(n_orig) * static_cast<size_t>(nrhs) * sizeof(Scalar), + cudaMemcpyDeviceToHost, solver_ctx_.stream_)); + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_)); + + return X; + } +}; + +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SVD_H
diff --git a/unsupported/Eigen/src/GPU/GpuSolverContext.h b/unsupported/Eigen/src/GPU/GpuSolverContext.h new file mode 100644 index 0000000..87c4641 --- /dev/null +++ b/unsupported/Eigen/src/GPU/GpuSolverContext.h
@@ -0,0 +1,171 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Shared context for GPU solvers (GpuLLT, GpuLU, GpuQR, GpuSVD, etc.). +// +// Owns a CUDA stream, cuSOLVER handle, cuBLAS handle, scratch buffer, +// and info word. Each solver holds a GpuSolverContext by composition +// and delegates lifecycle/scratch management to it. + +#ifndef EIGEN_GPU_SOLVER_CONTEXT_H +#define EIGEN_GPU_SOLVER_CONTEXT_H + +// IWYU pragma: private +#include "./InternalHeaderCheck.h" + +#include "./CuSolverSupport.h" +#include "./CuBlasSupport.h" +#include <vector> + +namespace Eigen { +namespace gpu { +namespace internal { + +struct GpuSolverContext { + cudaStream_t stream_ = nullptr; + cusolverDnHandle_t cusolver_ = nullptr; + cublasHandle_t cublas_ = nullptr; + CusolverParams params_; + DeviceBuffer d_scratch_; + size_t scratch_size_ = 0; + std::vector<char> h_workspace_; + ComputationInfo info_ = InvalidInput; + PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H of info word + bool info_synced_ = true; + + int& info_word() { return *static_cast<int*>(pinned_info_.get()); } + int info_word() const { return *static_cast<const int*>(pinned_info_.get()); } + + GpuSolverContext() { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_)); + EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_)); + EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_)); + EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_)); + EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_)); + ensure_scratch(0); + } + + ~GpuSolverContext() { + // Ignore errors here: dtors are noexcept, and EIGEN_CU{BLAS,SOLVER,DA_RUNTIME}_CHECK + // are eigen_assert-based — firing one from a noexcept dtor terminates the program. + // The trailing cudaFree() of the device buffers (via DeviceBuffer::~DeviceBuffer) is + // synchronous and waits for any in-flight kernel touching the buffer. + if (cublas_) (void)cublasDestroy(cublas_); + if (cusolver_) (void)cusolverDnDestroy(cusolver_); + if (stream_) (void)cudaStreamDestroy(stream_); + } + + GpuSolverContext(GpuSolverContext&& o) noexcept + : stream_(o.stream_), + cusolver_(o.cusolver_), + cublas_(o.cublas_), + params_(std::move(o.params_)), + d_scratch_(std::move(o.d_scratch_)), + scratch_size_(o.scratch_size_), + h_workspace_(std::move(o.h_workspace_)), + info_(o.info_), + pinned_info_(std::move(o.pinned_info_)), + info_synced_(o.info_synced_) { + o.stream_ = nullptr; + o.cusolver_ = nullptr; + o.cublas_ = nullptr; + o.scratch_size_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + + GpuSolverContext& operator=(GpuSolverContext&& o) noexcept { + if (this != &o) { + // Drain pending work on the old stream before tearing it down so the + // upcoming move of d_scratch_ doesn't free buffers that an in-flight + // kernel is still touching. The asymmetric move-ctor doesn't need this — + // it constructs onto uninitialized storage. + if (stream_) (void)cudaStreamSynchronize(stream_); + if (cublas_) (void)cublasDestroy(cublas_); + if (cusolver_) (void)cusolverDnDestroy(cusolver_); + if (stream_) (void)cudaStreamDestroy(stream_); + stream_ = o.stream_; + cusolver_ = o.cusolver_; + cublas_ = o.cublas_; + params_ = std::move(o.params_); + d_scratch_ = std::move(o.d_scratch_); + scratch_size_ = o.scratch_size_; + h_workspace_ = std::move(o.h_workspace_); + info_ = o.info_; + pinned_info_ = std::move(o.pinned_info_); + info_synced_ = o.info_synced_; + o.stream_ = nullptr; + o.cusolver_ = nullptr; + o.cublas_ = nullptr; + o.scratch_size_ = 0; + o.info_ = InvalidInput; + o.info_synced_ = true; + } + return *this; + } + + GpuSolverContext(const GpuSolverContext&) = delete; + GpuSolverContext& operator=(const GpuSolverContext&) = delete; + + // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ]. + // Workspace size is rounded up to 16 bytes so the info word lands aligned. + static constexpr size_t kInfoBytes = sizeof(int); + static constexpr size_t kScratchAlign = 16; + + static size_t scratchBytesFor(size_t workspace_bytes) { + workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1); + return workspace_bytes + kInfoBytes; + } + + // Ensure d_scratch_ holds at least `workspace_bytes` of scratch plus the trailing + // info word. Grows but never shrinks. Syncs the stream before reallocating to + // avoid freeing memory that async kernels may still be using. + void ensure_scratch(size_t workspace_bytes) { + size_t needed = scratchBytesFor(workspace_bytes); + if (needed > scratch_size_) { + if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + d_scratch_ = DeviceBuffer(needed); + scratch_size_ = needed; + } + } + + void* scratch_workspace() const { return d_scratch_.get(); } + + int* scratch_info() const { + eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes); + return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes); + } + + // Mark a factorization as pending (info not yet available). + void mark_pending() { + info_synced_ = false; + info_ = InvalidInput; + } + + // Synchronize the stream and interpret the info word. No-op if already synced. + void sync_info() { + if (!info_synced_) { + EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_)); + info_ = (info_word() == 0) ? Success : NumericalIssue; + info_synced_ = true; + } + } + + ComputationInfo info() { + sync_info(); + return info_; + } +}; + +} // namespace internal +} // namespace gpu +} // namespace Eigen + +#endif // EIGEN_GPU_SOLVER_CONTEXT_H
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h index d2c129f..2c9b0cc 100644 --- a/unsupported/Eigen/src/GPU/GpuSupport.h +++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -23,6 +23,7 @@ #include <cuda_runtime.h> +#include <limits> #include <memory> namespace Eigen { @@ -38,13 +39,30 @@ eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \ } while (0) +// ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------ +// cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as +// `int` (32-bit signed). Modern GPUs can host allocations whose dimensions +// exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at +// every narrowing call site so an out-of-range value triggers an assert +// instead of silently overflowing the BLAS argument. + +inline int to_blas_int(int64_t v) { + eigen_assert(v >= 0 && v <= static_cast<int64_t>((std::numeric_limits<int>::max)()) && + "dimension exceeds the int range supported by cuBLAS / cuSOLVER"); + return static_cast<int>(v); +} + // ---- Custom deleters for CUDA-allocated memory ------------------------------ // Used with std::unique_ptr to give CUDA allocations RAII semantics with no // hand-rolled move/dtor boilerplate. struct CudaFreeDeleter { + // When `borrow == true`, the unique_ptr does not free the pointer. Used by + // DeviceMatrix::view() to wrap a non-owning device pointer with the same + // smart-pointer machinery as owning storage, without changing the type. + bool borrow = false; void operator()(void* p) const noexcept { - if (p) (void)cudaFree(p); + if (p && !borrow) (void)cudaFree(p); } };
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md index 2928a7c..b7c4269 100644 --- a/unsupported/Eigen/src/GPU/README.md +++ b/unsupported/Eigen/src/GPU/README.md
@@ -26,7 +26,7 @@ **CPU and GPU coexist.** There is no global compile-time switch that replaces CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers -explicitly -- `gpu::LLT<double>` vs `Eigen::LLT<MatrixXd>` -- and both coexist in +explicitly -- `gpu::LLT<double>` vs `LLT<MatrixXd>` -- and both coexist in the same binary. This also lets users keep the factored matrix on device across multiple solves, something impossible with compile-time replacement. @@ -167,10 +167,43 @@ gpu::LU<double> lu; lu.compute(d_A); auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans); // A^T Y = B + +// QR solve (overdetermined least squares) +gpu::QR<double> qr(A); // host matrix input +MatrixXd X = qr.solve(B); // Q^H * B via ormqr, then trsm on R + +// SVD +gpu::SVD<double> svd(A, ComputeThinU | ComputeThinV); +VectorXd S = svd.singularValues(); +MatrixXd U = svd.matrixU(); +MatrixXd VT = svd.matrixVT(); +MatrixXd X = svd.solve(B); // pseudoinverse solve + +// SVD: device-side views (no D2H transfer; svd must outlive the views) +auto d_S = svd.d_singularValues(); // DeviceMatrix view of singular values +auto d_U = svd.d_matrixU(); // DeviceMatrix view of U +auto d_VT = svd.d_matrixVT(); // DeviceMatrix view of V^T + +// Self-adjoint eigenvalue decomposition +gpu::SelfAdjointEigenSolver<double> es(A); +VectorXd eigenvals = es.eigenvalues(); +MatrixXd eigenvecs = es.eigenvectors(); +auto d_W = es.d_eigenvalues(); // DeviceMatrix view of eigenvalues +auto d_V = es.d_eigenvectors(); // DeviceMatrix view of eigenvectors ``` The cached API keeps the factored matrix on device, avoiding redundant -host-device transfers and re-factorizations. +host-device transfers and re-factorizations. The `d_*` accessors return +non-owning `DeviceMatrix` views so downstream cuBLAS/cuSOLVER work can chain +without round-tripping through host memory. + +### Precision control + +GEMM dispatch uses `cublasXgemm` (type-specific Sgemm/Dgemm/Cgemm/Zgemm). +cuBLAS may internally use tensor cores depending on the GPU architecture, +matrix dimensions, and CUDA math mode settings. No Eigen-specific macros +control this; use the standard `CUDA_MATH_MODE` environment variable or +`cublasSetMathMode()` to configure tensor core behavior if needed. ### Stream control and async execution @@ -221,15 +254,15 @@ | DeviceMatrix expression | Library call | Parameters | |---|---|---| -| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 | -| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N | -| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N | -| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C | -| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T | -| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS | -| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS | -| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 | -| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 | +| `C = A * B` | `cublasXgemm` | transA=N, transB=N, alpha=1, beta=0 | +| `C = A.adjoint() * B` | `cublasXgemm` | transA=C, transB=N | +| `C = A.transpose() * B` | `cublasXgemm` | transA=T, transB=N | +| `C = A * B.adjoint()` | `cublasXgemm` | transA=N, transB=C | +| `C = A * B.transpose()` | `cublasXgemm` | transA=N, transB=T | +| `C = alpha * A * B` | `cublasXgemm` | alpha from LHS | +| `C = A * (alpha * B)` | `cublasXgemm` | alpha from RHS | +| `C += A * B` | `cublasXgemm` | alpha=1, beta=1 | +| `C.device(ctx) -= A * B` | `cublasXgemm` | alpha=-1, beta=1 | | `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs | | `X = A.llt<Upper>().solve(B)` | same | uplo=Upper | | `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs | @@ -244,7 +277,9 @@ | `DeviceMatrix()` | -- | Empty (0x0) | | `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized | | `fromHost(matrix, stream)` | yes | Upload from Eigen matrix | -| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) | +| `fromHostAsync(ptr, rows, cols, stream)` | no | Async upload (caller manages lifetime) | +| `adopt(ptr, rows, cols)` | -- | Owning wrapper over a device pointer (caller relinquishes ownership) | +| `view(ptr, rows, cols)` | -- | Non-owning view over a device pointer (does not free on destruction) | | `toHost(stream)` | yes | Synchronous download | | `toHostAsync(stream)` | no | Returns `HostTransfer` future | | `clone(stream)` | no | Device-to-device deep copy | @@ -295,6 +330,76 @@ GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a `gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`). +### `gpu::QR<Scalar>` API + +GPU dense QR decomposition via cuSOLVER (`geqrf`). Solve uses `ormqr` (apply +Q^H) + `trsm` (back-substitute on R) -- Q is never formed explicitly. + +| Method | Description | +|--------|-------------| +| `gpu::QR()` | Default construct, then call `compute()` | +| `gpu::QR(A)` | Construct and factorize from host matrix | +| `compute(A)` | Upload + factorize | +| `compute(DeviceMatrix)` | D2D copy + factorize | +| `solve(host_matrix)` | Solve, return host matrix (syncs) | +| `solve(DeviceMatrix)` | Solve, return `DeviceMatrix` (async) | +| `matrixR()` | Download upper-triangular factor R to host (m >= n only) | +| `info()` | Lazy sync | +| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | + +### `gpu::SVD<Scalar>` API + +GPU dense SVD via cuSOLVER (`gesvd`). Supports thin, full, and values-only +modes via Eigen's `ComputeThinU | ComputeThinV`, `ComputeFullU | ComputeFullV`, +or `0` (values only). + +| Method | Description | +|--------|-------------| +| `gpu::SVD()` | Default construct, then call `compute()` | +| `gpu::SVD(A, options)` | Construct and compute (options default: `ComputeThinU \| ComputeThinV`) | +| `compute(A, options)` | Compute from host matrix | +| `compute(DeviceMatrix, options)` | Compute from device matrix | +| `singularValues()` | Download singular values to host | +| `matrixU()` | Download U to host (requires `ComputeThinU` or `ComputeFullU`) | +| `matrixV()` | Download V to host (matches `JacobiSVD::matrixV()`) | +| `matrixVT()` | Download V^T to host (requires `ComputeThinV` or `ComputeFullV`) | +| `d_singularValues()` | `DeviceMatrix` view of singular values (zero-copy) | +| `d_matrixU()` | `DeviceMatrix` view of U (zero-copy when m >= n) | +| `d_matrixVT()` | `DeviceMatrix` view of V^T (zero-copy when m >= n) | +| `solve(B)` | Pseudoinverse solve (returns host matrix) | +| `solve(B, k)` | Truncated solve (top k singular triplets) | +| `solve(B, lambda)` | Tikhonov regularized solve | +| `rank(threshold)` | Count singular values above threshold | +| `info()` | Lazy sync | +| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | + +Wide matrices (m < n) are handled by internally transposing via cuBLAS `geam`; +in that case `d_matrixU()` and `d_matrixVT()` allocate an owning result via a +single `cublasXgeam` adjoint pass. `d_singularValues()` is always zero-copy. + +### `gpu::SelfAdjointEigenSolver<Scalar>` API + +GPU symmetric/Hermitian eigenvalue decomposition via cuSOLVER (`syevd`). + +| Method | Description | +|--------|-------------| +| `gpu::SelfAdjointEigenSolver()` | Default construct, then call `compute()` | +| `gpu::SelfAdjointEigenSolver(A, mode)` | Construct and compute (mode default: `ComputeEigenvectors`) | +| `compute(A, mode)` | Compute from host matrix | +| `compute(DeviceMatrix, mode)` | Compute from device matrix | +| `eigenvalues()` | Download eigenvalues to host (ascending order) | +| `eigenvectors()` | Download eigenvectors to host (columns) | +| `d_eigenvalues()` | `DeviceMatrix` view of eigenvalues (zero-copy) | +| `d_eigenvectors()` | `DeviceMatrix` view of eigenvectors (zero-copy, requires `ComputeEigenvectors`) | +| `info()` | Lazy sync | +| `rows()`, `cols()`, `stream()` | Dimensions and CUDA stream | + +`ComputeMode`: `gpu::SelfAdjointEigenSolver::EigenvaluesOnly` or +`gpu::SelfAdjointEigenSolver::ComputeEigenvectors`. + +The `d_*` accessors return non-owning views into the solver's internal device +buffers; the solver must outlive any view derived from it. + ### `HostTransfer<Scalar>` API Future for async device-to-host transfer. @@ -313,6 +418,18 @@ SYMM/HEMM, and SYRK/HERK. Debug builds assert on these violations before dispatching to cuBLAS. +## Future work + +- **Reassess host-input vs. device-input API surface.** Each solver currently + exposes both host-input (`compute(MatrixXd)`, `solve(MatrixXd)`) and + device-input (`compute(DeviceMatrix)`, `solve(DeviceMatrix)`) overloads, plus + host- and device-side accessors (`matrixU()` vs `d_matrixU()`). This eases + migration from CPU Eigen but may invite accidental host ↔ device round-trips + when users mix the two without realising the cost. Revisit after the initial + GPU module roll-out (MRs !2408, !2412, !2413, !2414, !2415) is in users' + hands; if the convenience overloads cause more confusion than they save, + narrow toward a single explicit `fromHost` / `toHost` boundary. + ## File layout | File | Depends on | Contents | @@ -326,8 +443,12 @@ | `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `gpu::Context` | | `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps | | `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping | -| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization | -| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization | +| `GpuSolverContext.h` | `CuSolverSupport.h`, `CuBlasSupport.h` | Shared solver context (stream, handles, scratch) | +| `GpuLLT.h` | `GpuSolverContext.h` | Cached dense Cholesky factorization | +| `GpuLU.h` | `GpuSolverContext.h` | Cached dense LU factorization | +| `GpuQR.h` | `GpuSolverContext.h` | Dense QR decomposition | +| `GpuSVD.h` | `GpuSolverContext.h` | Dense SVD decomposition | +| `GpuEigenSolver.h` | `GpuSolverContext.h` | Self-adjoint eigenvalue decomposition | ## Building and testing @@ -336,7 +457,8 @@ -DEIGEN_TEST_CUDA=ON \ -DEIGEN_CUDA_COMPUTE_ARCH="70" -cmake --build build --target cublas cusolver_llt cusolver_lu device_matrix +cmake --build build --target cublas cusolver_llt cusolver_lu \ + cusolver_qr cusolver_svd cusolver_eigen device_matrix ctest --test-dir build -L gpu --output-on-failure ```
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt index cdc96ac..706efef 100644 --- a/unsupported/test/GPU/CMakeLists.txt +++ b/unsupported/test/GPU/CMakeLists.txt
@@ -69,7 +69,7 @@ # cuSOLVER dense factorizations (LLT, LU). option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" ON) if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver) - foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu) + foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu cusolver_qr cusolver_svd cusolver_eigen) ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas) endforeach() endif()
diff --git a/unsupported/test/GPU/cusolver_eigen.cpp b/unsupported/test/GPU/cusolver_eigen.cpp new file mode 100644 index 0000000..3a6535d --- /dev/null +++ b/unsupported/test/GPU/cusolver_eigen.cpp
@@ -0,0 +1,328 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Tests for GpuSelfAdjointEigenSolver: GPU symmetric/Hermitian eigenvalue +// decomposition via cuSOLVER. + +#define EIGEN_USE_GPU +#include "main.h" +#include <Eigen/Eigenvalues> +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- Reconstruction: V * diag(W) * V^H ≈ A --------------------------------- + +template <typename Scalar> +void test_eigen_reconstruction(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + // Build a symmetric/Hermitian matrix. + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> es(A); + VERIFY_IS_EQUAL(es.info(), Success); + + auto W = es.eigenvalues(); + Mat V = es.eigenvectors(); + + VERIFY_IS_EQUAL(W.size(), n); + VERIFY_IS_EQUAL(V.rows(), n); + VERIFY_IS_EQUAL(V.cols(), n); + + // Reconstruct: A_hat = V * diag(W) * V^H. + Mat A_hat = V * W.asDiagonal() * V.adjoint(); + RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A).norm() < tol); + + // Orthogonality: V^H * V ≈ I. + Mat VhV = V.adjoint() * V; + Mat eye = Mat::Identity(n, n); + VERIFY((VhV - eye).norm() < tol); +} + +// ---- Eigenvalues match CPU SelfAdjointEigenSolver --------------------------- + +template <typename Scalar> +void test_eigen_values(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> gpu_es(A); + VERIFY_IS_EQUAL(gpu_es.info(), Success); + auto W_gpu = gpu_es.eigenvalues(); + + SelfAdjointEigenSolver<Mat> cpu_es(A); + auto W_cpu = cpu_es.eigenvalues(); + + RealScalar tol = + RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_cpu.cwiseAbs().maxCoeff(); + VERIFY((W_gpu - W_cpu).norm() < tol); +} + +// ---- Eigenvalues-only mode -------------------------------------------------- + +template <typename Scalar> +void test_eigen_values_only(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> gpu_es(A, EigenvaluesOnly); + VERIFY_IS_EQUAL(gpu_es.info(), Success); + auto W_gpu = gpu_es.eigenvalues(); + + SelfAdjointEigenSolver<Mat> cpu_es(A, EigenvaluesOnly); + auto W_cpu = cpu_es.eigenvalues(); + + RealScalar tol = + RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_cpu.cwiseAbs().maxCoeff(); + VERIFY((W_gpu - W_cpu).norm() < tol); +} + +// ---- DeviceMatrix input path ------------------------------------------------ + +template <typename Scalar> +void test_eigen_device_matrix(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + gpu::SelfAdjointEigenSolver<Scalar> es; + es.compute(d_A); + VERIFY_IS_EQUAL(es.info(), Success); + + auto W_gpu = es.eigenvalues(); + Mat V = es.eigenvectors(); + + // Verify reconstruction. + Mat A_hat = V * W_gpu.asDiagonal() * V.adjoint(); + RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A).norm() < tol); +} + +// ---- Recompute (reuse solver object) ---------------------------------------- + +template <typename Scalar> +void test_eigen_recompute(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + gpu::SelfAdjointEigenSolver<Scalar> es; + + for (int trial = 0; trial < 3; ++trial) { + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + es.compute(A); + VERIFY_IS_EQUAL(es.info(), Success); + + auto W = es.eigenvalues(); + Mat V = es.eigenvectors(); + Mat A_hat = V * W.asDiagonal() * V.adjoint(); + RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A).norm() < tol); + } +} + +// ---- Repeated eigenvalues --------------------------------------------------- +// +// Builds A with a degenerate eigenvalue cluster. Eigenvectors within the cluster +// are not uniquely determined, but eigenvalues, reconstruction, and orthogonality +// must still hold. + +template <typename Scalar> +void test_eigen_repeated_values(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using Vec = Matrix<typename NumTraits<Scalar>::Real, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + // Build a unitary V via QR of a random matrix. + Mat seed = Mat::Random(n, n); + Mat V = HouseholderQR<Mat>(seed).householderQ(); + + // Spectrum: a 4-fold cluster at value 1, then increasing distinct values. + Vec eigs(n); + for (Index i = 0; i < n; ++i) { + eigs(i) = (i < 4) ? RealScalar(1) : RealScalar(i); + } + Mat A = V * eigs.asDiagonal() * V.adjoint(); + Mat A_sym = (A + A.adjoint()) * RealScalar(0.5); // symmetrize numerically + + gpu::SelfAdjointEigenSolver<Scalar> es(A_sym); + VERIFY_IS_EQUAL(es.info(), Success); + + // Eigenvalues must match the constructed spectrum (cuSOLVER returns ascending). + Vec W_gpu = es.eigenvalues(); + Vec W_expected = eigs; + std::sort(W_expected.data(), W_expected.data() + n); + + RealScalar tol_w = + RealScalar(2) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * W_expected.cwiseAbs().maxCoeff(); + VERIFY((W_gpu - W_expected).norm() < tol_w); + + // Reconstruction must hold even with a degenerate cluster (eigenvectors form a + // valid orthonormal basis for the cluster's invariant subspace). + Mat V_gpu = es.eigenvectors(); + Mat A_hat = V_gpu * W_gpu.asDiagonal() * V_gpu.adjoint(); + RealScalar tol = RealScalar(20) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A_sym).norm() < tol); + + // Orthogonality of computed eigenvectors must hold. + VERIFY((V_gpu.adjoint() * V_gpu - Mat::Identity(n, n)).norm() < tol); +} + +// ---- Device-side accessors -------------------------------------------------- +// +// d_eigenvalues / d_eigenvectors return non-owning views over the solver's +// internal d_W_ / d_A_ buffers. Verify they agree with the host accessors and +// that repeated invocations leave the solver state intact. + +template <typename Scalar> +void test_eigen_device_accessors(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> es(A); + VERIFY_IS_EQUAL(es.info(), Success); + + auto d_W = es.d_eigenvalues(); + auto d_V = es.d_eigenvectors(); + + VERIFY_IS_APPROX(d_W.toHost(), es.eigenvalues()); + VERIFY_IS_APPROX(d_V.toHost(), es.eigenvectors()); + + // Re-fetch must keep working (view destruction must not free the underlying buffers). + (void)es.d_eigenvalues(); + (void)es.d_eigenvectors(); + VERIFY_IS_APPROX(es.eigenvalues(), d_W.toHost()); + VERIFY_IS_APPROX(es.eigenvectors(), d_V.toHost()); +} + +// ---- Chain device views into a downstream cuBLAS GEMM (no D2D copy) --------- + +template <typename Scalar> +void test_eigen_chain_orthogonality(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> es(A); + VERIFY_IS_EQUAL(es.info(), Success); + + // V^H * V = I — V is the device view of d_A_; the GEMM consumes the view + // without an intervening D2D copy. + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_VtV; + { + auto d_V = es.d_eigenvectors(); + d_VtV.device(ctx) = d_V.adjoint() * d_V; + } + Mat VtV = d_VtV.toHost(); + RealScalar tol = RealScalar(20) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon(); + VERIFY((VtV - Mat::Identity(n, n)).norm() < tol); + + // After view destruction, the solver's state must remain valid. + Mat V_again = es.eigenvectors(); + VERIFY_IS_EQUAL(V_again.rows(), n); +} + +// ---- Move support ------------------------------------------------------------- + +template <typename Scalar> +void test_eigen_move(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat R = Mat::Random(n, n); + Mat A = R + R.adjoint(); + + gpu::SelfAdjointEigenSolver<Scalar> es(A); + VERIFY_IS_EQUAL(es.info(), Success); + + gpu::SelfAdjointEigenSolver<Scalar> moved(std::move(es)); + VERIFY_IS_EQUAL(moved.info(), Success); + Mat V = moved.eigenvectors(); + auto W = moved.eigenvalues(); + RealScalar tol = RealScalar(8) * static_cast<RealScalar>(n) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((V * W.asDiagonal() * V.adjoint() - A).norm() < tol); + + gpu::SelfAdjointEigenSolver<Scalar> assigned; + assigned = std::move(moved); + VERIFY_IS_EQUAL(assigned.info(), Success); + V = assigned.eigenvectors(); + W = assigned.eigenvalues(); + VERIFY((V * W.asDiagonal() * V.adjoint() - A).norm() < tol); +} + +// ---- Empty matrix ----------------------------------------------------------- + +void test_eigen_empty() { + gpu::SelfAdjointEigenSolver<double> es(MatrixXd(0, 0)); + VERIFY_IS_EQUAL(es.info(), Success); + VERIFY_IS_EQUAL(es.rows(), 0); + VERIFY_IS_EQUAL(es.cols(), 0); +} + +// ---- Per-scalar driver ------------------------------------------------------ + +template <typename Scalar> +void test_scalar() { + // Reconstruction + orthogonality. + CALL_SUBTEST(test_eigen_reconstruction<Scalar>(64)); + CALL_SUBTEST(test_eigen_reconstruction<Scalar>(128)); + + // Eigenvalues match CPU. + CALL_SUBTEST(test_eigen_values<Scalar>(64)); + CALL_SUBTEST(test_eigen_values<Scalar>(128)); + + // Values-only mode. + CALL_SUBTEST(test_eigen_values_only<Scalar>(64)); + + // DeviceMatrix input. + CALL_SUBTEST(test_eigen_device_matrix<Scalar>(64)); + + // Recompute. + CALL_SUBTEST(test_eigen_recompute<Scalar>(32)); + + // Repeated eigenvalues (degenerate cluster). + CALL_SUBTEST(test_eigen_repeated_values<Scalar>(64)); + + // Device accessors. + CALL_SUBTEST(test_eigen_device_accessors<Scalar>(64)); + + // Chain device view into a downstream GEMM. + CALL_SUBTEST(test_eigen_chain_orthogonality<Scalar>(64)); + + // Move constructor/assignment. + CALL_SUBTEST(test_eigen_move<Scalar>(32)); +} + +EIGEN_DECLARE_TEST(gpu_cusolver_eigen) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_eigen_empty()); +}
diff --git a/unsupported/test/GPU/cusolver_llt.cpp b/unsupported/test/GPU/cusolver_llt.cpp index 9e763af..772333b 100644 --- a/unsupported/test/GPU/cusolver_llt.cpp +++ b/unsupported/test/GPU/cusolver_llt.cpp
@@ -106,6 +106,17 @@ VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue); } +// solve(DeviceMatrix) must not silently return garbage when the factorization +// failed: it must sync the info word and assert just like solve(MatrixBase). +void test_not_spd_device_solve_asserts() { + Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8); + Eigen::MatrixXd h_B = Eigen::MatrixXd::Random(8, 4); + Eigen::gpu::LLT<double> gpu_llt(h_A); + VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue); + auto d_B = Eigen::gpu::DeviceMatrix<double>::fromHost(h_B); + VERIFY_RAISES_ASSERT(gpu_llt.solve(d_B)); +} + // ---- DeviceMatrix-native API -------------------------------------------- // These tests exercise the device-resident path: compute(DeviceMatrix) + // solve(DeviceMatrix) -> DeviceMatrix, with the user explicitly managing @@ -218,4 +229,5 @@ CALL_SUBTEST_3(test_scalar<std::complex<float>>()); CALL_SUBTEST_4(test_scalar<std::complex<double>>()); CALL_SUBTEST_5(test_not_spd()); + CALL_SUBTEST_5(test_not_spd_device_solve_asserts()); }
diff --git a/unsupported/test/GPU/cusolver_lu.cpp b/unsupported/test/GPU/cusolver_lu.cpp index 8c4e4be..195f6d1 100644 --- a/unsupported/test/GPU/cusolver_lu.cpp +++ b/unsupported/test/GPU/cusolver_lu.cpp
@@ -105,6 +105,17 @@ VERIFY_IS_EQUAL(lu.info(), NumericalIssue); } +// solve(DeviceMatrix) must not silently return garbage when the factorization +// failed: it must sync the info word and assert just like solve(MatrixBase). +void test_singular_device_solve_asserts() { + MatrixXd A = MatrixXd::Zero(8, 8); + MatrixXd B = MatrixXd::Random(8, 4); + gpu::LU<double> lu(A); + VERIFY_IS_EQUAL(lu.info(), NumericalIssue); + auto d_B = gpu::DeviceMatrix<double>::fromHost(B); + VERIFY_RAISES_ASSERT(lu.solve(d_B)); +} + // ---- DeviceMatrix integration tests ----------------------------------------- template <typename Scalar> @@ -202,4 +213,5 @@ CALL_SUBTEST_3(test_scalar<std::complex<float>>()); CALL_SUBTEST_4(test_scalar<std::complex<double>>()); CALL_SUBTEST_5(test_singular()); + CALL_SUBTEST_5(test_singular_device_solve_asserts()); }
diff --git a/unsupported/test/GPU/cusolver_qr.cpp b/unsupported/test/GPU/cusolver_qr.cpp new file mode 100644 index 0000000..c1f7f12 --- /dev/null +++ b/unsupported/test/GPU/cusolver_qr.cpp
@@ -0,0 +1,309 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Tests for GpuQR: GPU QR decomposition via cuSOLVER. + +#define EIGEN_USE_GPU +#include "main.h" +#include <Eigen/QR> +#include <Eigen/SVD> +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- Solve square system: A * X = B ----------------------------------------- + +template <typename Scalar> +void test_qr_solve_square(Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + + gpu::QR<Scalar> qr(A); + VERIFY_IS_EQUAL(qr.info(), Success); + + Mat X = qr.solve(B); + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- Solve overdetermined system: m > n (least-squares) --------------------- + +template <typename Scalar> +void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m >= n); + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, nrhs); + + gpu::QR<Scalar> qr(A); + VERIFY_IS_EQUAL(qr.info(), Success); + + Mat X = qr.solve(B); + VERIFY_IS_EQUAL(X.rows(), n); + VERIFY_IS_EQUAL(X.cols(), nrhs); + + // For an overdetermined system the residual r = A X - B is generally large + // (O(||B||) when B is not in col(A)). What backward-stable least-squares + // makes small is the gradient of the LS objective, A^H r, which is zero at + // the optimum. Higham's bound for QR-based LS gives ||A^H r|| <= O(m * eps) + // * ||A|| * (||A|| ||X|| + ||B||), regardless of kappa(A). + Mat X_cpu = HouseholderQR<Mat>(A).solve(B); + RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon(); + RealScalar A_norm = A.norm(); + RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm()); + RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm()); + VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol); + VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol); +} + +// ---- Solve with DeviceMatrix input ------------------------------------------ + +template <typename Scalar> +void test_qr_solve_device(Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::QR<Scalar> qr; + qr.compute(d_A); + VERIFY_IS_EQUAL(qr.info(), Success); + + gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B); + Mat X = d_X.toHost(); + + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +// ---- Solve overdetermined via device path ----------------------------------- + +template <typename Scalar> +void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m >= n); + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::QR<Scalar> qr; + qr.compute(d_A); + VERIFY_IS_EQUAL(qr.info(), Success); + + gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B); + VERIFY_IS_EQUAL(d_X.rows(), n); + VERIFY_IS_EQUAL(d_X.cols(), nrhs); + + Mat X = d_X.toHost(); + Mat X_cpu = HouseholderQR<Mat>(A).solve(B); + // See test_qr_solve_overdetermined: backward error for LS is bounded on + // the gradient A^H r, not on r itself. + RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon(); + RealScalar A_norm = A.norm(); + RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm()); + RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm()); + VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol); + VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol); +} + +// ---- Solve underdetermined system: m < n (minimum norm) --------------------- + +template <typename Scalar> +void test_qr_solve_underdetermined(Index m, Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m < n); + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, nrhs); + + gpu::QR<Scalar> qr(A); + VERIFY_IS_EQUAL(qr.info(), Success); + + Mat X = qr.solve(B); + VERIFY_IS_EQUAL(X.rows(), n); + VERIFY_IS_EQUAL(X.cols(), nrhs); + + // The minimum-norm solution exactly satisfies A X = B (m equations, n > m + // unknowns). Backward error is O(m * eps) * ||A|| * ||X||. + RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol); + + // Min-norm property: X ⟂ null(A), so X ∈ row space(A) = col space(A^H). + // ||X|| <= ||X_any|| for any other solution. Compare to the SVD min-norm reference. + Mat X_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B); + // Both should have the same norm (up to rounding). + VERIFY(numext::abs(X.norm() - X_svd.norm()) / X_svd.norm() < + RealScalar(50) * RealScalar(n) * NumTraits<Scalar>::epsilon()); +} + +template <typename Scalar> +void test_qr_solve_underdetermined_device(Index m, Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m < n); + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, nrhs); + + auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A); + auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B); + + gpu::QR<Scalar> qr; + qr.compute(d_A); + VERIFY_IS_EQUAL(qr.info(), Success); + + gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B); + VERIFY_IS_EQUAL(d_X.rows(), n); + VERIFY_IS_EQUAL(d_X.cols(), nrhs); + + Mat X = d_X.toHost(); + RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol); +} + +// ---- matrixR() returns the upper-triangular factor -------------------------- + +template <typename Scalar> +void test_qr_matrixR(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m >= n); + Mat A = Mat::Random(m, n); + gpu::QR<Scalar> qr(A); + VERIFY_IS_EQUAL(qr.info(), Success); + + Mat R = qr.matrixR(); + VERIFY_IS_EQUAL(R.rows(), n); + VERIFY_IS_EQUAL(R.cols(), n); + + // Verify R is upper-triangular (lower triangle exactly zero). + for (Index j = 0; j < n; ++j) { + for (Index i = j + 1; i < n; ++i) { + VERIFY_IS_EQUAL(R(i, j), Scalar(0)); + } + } + + // Verify that the diagonal of R matches the absolute values of CPU R diagonal + // up to sign (Householder QR has free diagonal sign). + Mat R_cpu = HouseholderQR<Mat>(A).matrixQR().topRows(n).template triangularView<Upper>(); + RealScalar tol = RealScalar(20) * RealScalar(m) * NumTraits<Scalar>::epsilon() * A.norm(); + for (Index i = 0; i < n; ++i) { + VERIFY(numext::abs(numext::abs(R(i, i)) - numext::abs(R_cpu(i, i))) < tol); + } +} + +// ---- Multiple solves reuse the factorization -------------------------------- + +template <typename Scalar> +void test_qr_multiple_solves(Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + gpu::QR<Scalar> qr(A); + VERIFY_IS_EQUAL(qr.info(), Success); + + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + for (int k = 0; k < 5; ++k) { + Mat B = Mat::Random(n, 3); + Mat X = qr.solve(B); + RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm()); + VERIFY(residual < tol); + } +} + +// ---- Agreement with CPU HouseholderQR --------------------------------------- + +template <typename Scalar> +void test_qr_vs_cpu(Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(n, n); + Mat B = Mat::Random(n, nrhs); + + gpu::QR<Scalar> gpu_qr(A); + VERIFY_IS_EQUAL(gpu_qr.info(), Success); + + Mat X_gpu = gpu_qr.solve(B); + Mat X_cpu = HouseholderQR<Mat>(A).solve(B); + + // Compare via residual rather than directly between X_gpu and X_cpu: for an + // ill-conditioned A (a random N(0,1) n*n matrix easily reaches kappa(A) ~ n), + // the forward error of each correct solve is bounded by O(kappa * eps), so + // ||X_gpu - X_cpu|| / ||X_cpu|| can legitimately exceed n*eps even though + // both are correct. The relative residual ||A X - B|| / (||A||*||X|| + ||B||) + // is bounded by Higham's backward-stable solve result at O(n * eps), + // independent of kappa. + RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon(); + RealScalar A_norm = A.norm(); + RealScalar B_norm = B.norm(); + RealScalar denom_gpu = A_norm * X_gpu.norm() + B_norm; + RealScalar denom_cpu = A_norm * X_cpu.norm() + B_norm; + VERIFY((A * X_gpu - B).norm() / denom_gpu < tol); + VERIFY((A * X_cpu - B).norm() / denom_cpu < tol); +} + +// ---- Per-scalar driver ------------------------------------------------------ + +template <typename Scalar> +void test_scalar() { + CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1)); + CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1)); + CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4)); + CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8)); + + CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4)); + CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1)); + + CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 4)); + CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 1)); + CALL_SUBTEST(test_qr_solve_underdetermined_device<Scalar>(64, 128, 4)); + + CALL_SUBTEST(test_qr_matrixR<Scalar>(64, 64)); + CALL_SUBTEST(test_qr_matrixR<Scalar>(128, 64)); + + CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4)); + CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4)); + CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64)); + CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4)); + CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8)); +} + +void test_qr_empty() { + gpu::QR<double> qr(MatrixXd(0, 0)); + VERIFY_IS_EQUAL(qr.info(), Success); + VERIFY_IS_EQUAL(qr.rows(), 0); + VERIFY_IS_EQUAL(qr.cols(), 0); +} + +EIGEN_DECLARE_TEST(gpu_cusolver_qr) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_qr_empty()); +}
diff --git a/unsupported/test/GPU/cusolver_svd.cpp b/unsupported/test/GPU/cusolver_svd.cpp new file mode 100644 index 0000000..5d756b1 --- /dev/null +++ b/unsupported/test/GPU/cusolver_svd.cpp
@@ -0,0 +1,431 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// SPDX-License-Identifier: MPL-2.0 + +// Tests for GpuSVD: GPU SVD via cuSOLVER. + +#define EIGEN_USE_GPU +#include "main.h" +#include <Eigen/SVD> +#include <unsupported/Eigen/GPU> + +using namespace Eigen; + +// ---- SVD reconstruction: U * diag(S) * VT ≈ A ------------------------------ + +template <typename Scalar, unsigned int Options> +void test_svd_reconstruction(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, Options); + VERIFY_IS_EQUAL(svd.info(), Success); + + auto S = svd.singularValues(); + Mat U = svd.matrixU(); + Mat VT = svd.matrixVT(); + + const Index k = (std::min)(m, n); + + // Reconstruct: A_hat = U[:,:k] * diag(S) * VT[:k,:]. + Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k); + RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A).norm() < tol); + + // Orthogonality: U^H * U ≈ I. + Mat UtU = U.adjoint() * U; + Mat I_u = Mat::Identity(U.cols(), U.cols()); + VERIFY((UtU - I_u).norm() < tol); + + // Orthogonality: VT * VT^H ≈ I. + Mat VtVh = VT * VT.adjoint(); + Mat I_v = Mat::Identity(VT.rows(), VT.rows()); + VERIFY((VtVh - I_v).norm() < tol); +} + +// ---- Singular values match CPU BDCSVD --------------------------------------- + +template <typename Scalar> +void test_svd_singular_values(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, 0); // values only + VERIFY_IS_EQUAL(svd.info(), Success); + + auto S_gpu = svd.singularValues(); + auto S_cpu = BDCSVD<Mat>(A, 0).singularValues(); + + // Weyl's perturbation bound (Higham, Accuracy and Stability of Numerical + // Algorithms, 2nd ed., §10.2.3): |σ_i(A) - σ_i(A+δA)| ≤ ||δA||. Both cuSOLVER + // and BDCSVD have backward error ||δA|| / ||A|| ≤ p(m,n) · u; the difference + // of the two computed singular value vectors is bounded by 2·p(m,n)·u·S_max, + // and √(min) · ||·||_∞ ≥ ||·||_2. Across 5k trials per case in + // {float, double, complex<float>, complex<double>} × {(64,64), (128,64)}, + // worst observed err / (√min · u · S_max) is ≈ 6.2; we use 12 for headroom + // against future runs hitting the tail of the distribution. + RealScalar tol = + RealScalar(12) * std::sqrt(static_cast<RealScalar>((std::min)(m, n))) * NumTraits<Scalar>::epsilon() * S_cpu(0); + VERIFY((S_gpu - S_cpu).norm() < tol); +} + +// ---- Solve: pseudoinverse --------------------------------------------------- + +template <typename Scalar> +void test_svd_solve(Index m, Index n, Index nrhs) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, nrhs); + + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + VERIFY_IS_EQUAL(svd.info(), Success); + + Mat X = svd.solve(B); + VERIFY_IS_EQUAL(X.rows(), n); + VERIFY_IS_EQUAL(X.cols(), nrhs); + + // Compare with CPU BDCSVD solve. Wedin's perturbation theorem (Higham, + // Accuracy and Stability of Numerical Algorithms, 2nd ed., §20.1) bounds + // the forward error of a backward-stable SVD-based pseudoinverse solve by + // c · κ(A) · u with c = O(1). Comparing two such solvers doubles the + // constant. Across 6k trials over {float, double, complex<float>, + // complex<double>} and {square, over-/underdetermined} shapes, the worst + // observed err / (κ · u) is 5.3. + auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV); + Mat X_cpu = cpu_svd.solve(B); + auto S = cpu_svd.singularValues(); + const RealScalar cond = S(0) / S(S.size() - 1); + const RealScalar tol = RealScalar(8) * cond * NumTraits<Scalar>::epsilon(); + VERIFY((X - X_cpu).norm() / X_cpu.norm() < tol); +} + +// ---- Solve: truncated ------------------------------------------------------- + +template <typename Scalar> +void test_svd_solve_truncated(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, 1); + const Index k = (std::min)(m, n); + const Index trunc = k / 2; + eigen_assert(trunc > 0); + + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + Mat X_trunc = svd.solve(B, trunc); + + // Build CPU reference: truncated pseudoinverse. + auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV); + auto S = cpu_svd.singularValues(); + Mat U = cpu_svd.matrixU(); + Mat V = cpu_svd.matrixV(); + + // D_ii = 1/S_i for i < trunc, 0 otherwise. + Matrix<RealScalar, Dynamic, 1> D = Matrix<RealScalar, Dynamic, 1>::Zero(k); + for (Index i = 0; i < trunc; ++i) D(i) = RealScalar(1) / S(i); + Mat X_ref = V * D.asDiagonal() * U.adjoint() * B; + + RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon(); + VERIFY((X_trunc - X_ref).norm() / X_ref.norm() < tol); +} + +// ---- Solve: Tikhonov regularized -------------------------------------------- + +template <typename Scalar> +void test_svd_solve_regularized(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + Mat B = Mat::Random(m, 1); + RealScalar lambda = RealScalar(0.1); + const Index k = (std::min)(m, n); + + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + Mat X_reg = svd.solve(B, lambda); + + // CPU reference: D_ii = S_i / (S_i^2 + lambda^2). + auto cpu_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV); + auto S = cpu_svd.singularValues(); + Mat U = cpu_svd.matrixU(); + Mat V = cpu_svd.matrixV(); + + Matrix<RealScalar, Dynamic, 1> D(k); + for (Index i = 0; i < k; ++i) D(i) = S(i) / (S(i) * S(i) + lambda * lambda); + Mat X_ref = V * D.asDiagonal() * U.adjoint() * B; + + RealScalar tol = RealScalar(100) * RealScalar(k) * NumTraits<Scalar>::epsilon(); + VERIFY((X_reg - X_ref).norm() / X_ref.norm() < tol); +} + +// ---- Solve: rank-deficient (exercise drop_threshold pseudoinverse) ---------- + +template <typename Scalar> +void test_svd_solve_rank_deficient(Index m, Index n, Index r) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using Vec = Matrix<typename NumTraits<Scalar>::Real, Dynamic, 1>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(r > 0 && r < (std::min)(m, n)); + + // Build A of rank exactly r: A = U[:,:r] * diag(s) * V[:,:r]^H. + Mat U_seed = Mat::Random(m, m); + Mat U_full = HouseholderQR<Mat>(U_seed).householderQ(); + Mat V_seed = Mat::Random(n, n); + Mat V_full = HouseholderQR<Mat>(V_seed).householderQ(); + Vec sigma(r); + for (Index i = 0; i < r; ++i) sigma(i) = RealScalar(1) + RealScalar(i); // distinct, well-spaced + Mat A = U_full.leftCols(r) * sigma.asDiagonal() * V_full.leftCols(r).adjoint(); + + Mat B = Mat::Random(m, 1); + + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + VERIFY_IS_EQUAL(svd.info(), Success); + Mat X_gpu = svd.solve(B); + + // Reference: CPU BDCSVD with the same drop_threshold semantics (its default solve() + // also drops near-zero singular values relative to S(0) * (m, n)_max * epsilon). + Mat X_cpu = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B); + + // Both should produce the minimum-norm least-squares solution; compare via the + // "useful" residual A^H r: zero up to the rank-r component plus rounding. + RealScalar tol = RealScalar(50) * RealScalar((std::max)(m, n)) * NumTraits<Scalar>::epsilon(); + VERIFY((X_gpu - X_cpu).norm() / X_cpu.norm() < tol); + + // Norm of X should also be minimum (X has no component in null(A) up to rounding). + VERIFY(numext::abs(X_gpu.norm() - X_cpu.norm()) / X_cpu.norm() < tol); +} + +// ---- Reconstruction: full U / full V on a wide matrix (m < n) --------------- +// +// Exercises the transposed-internal-representation path through gesvd combined +// with the matrixU/matrixVT swap logic. + +template <typename Scalar> +void test_svd_reconstruction_full_wide(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m < n); + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, ComputeFullU | ComputeFullV); + VERIFY_IS_EQUAL(svd.info(), Success); + + auto S = svd.singularValues(); + Mat U = svd.matrixU(); // m × m + Mat VT = svd.matrixVT(); // n × n + Mat V = svd.matrixV(); // n × n (matrixV alias should match matrixVT().adjoint()) + + VERIFY_IS_EQUAL(U.rows(), m); + VERIFY_IS_EQUAL(U.cols(), m); + VERIFY_IS_EQUAL(VT.rows(), n); + VERIFY_IS_EQUAL(VT.cols(), n); + VERIFY_IS_EQUAL(V.rows(), n); + VERIFY_IS_EQUAL(V.cols(), n); + VERIFY_IS_APPROX(V, VT.adjoint()); + + const Index k = (std::min)(m, n); + Mat A_hat = U.leftCols(k) * S.asDiagonal() * VT.topRows(k); + RealScalar tol = RealScalar(5) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon() * A.norm(); + VERIFY((A_hat - A).norm() < tol); + + Mat UtU = U.adjoint() * U; + VERIFY((UtU - Mat::Identity(m, m)).norm() < tol); + Mat VtVh = VT * VT.adjoint(); + VERIFY((VtVh - Mat::Identity(n, n)).norm() < tol); +} + +// ---- Device-side accessors -------------------------------------------------- +// +// Verify that d_singularValues / d_matrixU / d_matrixVT return views over the +// SVD's internal storage that agree with the host accessors after toHost(). + +template <typename Scalar> +void test_svd_device_accessors(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + VERIFY_IS_EQUAL(svd.info(), Success); + + auto d_S = svd.d_singularValues(); + auto d_U = svd.d_matrixU(); + auto d_VT = svd.d_matrixVT(); + + Matrix<RealScalar, Dynamic, 1> S_host = d_S.toHost(); + Mat U_host = d_U.toHost(); + Mat VT_host = d_VT.toHost(); + + VERIFY_IS_APPROX(S_host, svd.singularValues()); + VERIFY_IS_APPROX(U_host, svd.matrixU()); + VERIFY_IS_APPROX(VT_host, svd.matrixVT()); + + // Multiple invocations must keep returning views without disturbing the SVD's state: + // call again, then verify the SVD's host accessors still produce correct results. + (void)svd.d_matrixU(); + (void)svd.d_matrixVT(); + VERIFY_IS_APPROX(svd.matrixU(), U_host); + VERIFY_IS_APPROX(svd.matrixVT(), VT_host); +} + +template <typename Scalar> +void test_svd_device_accessors_full_wide(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + eigen_assert(m < n); + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, ComputeFullU | ComputeFullV); + VERIFY_IS_EQUAL(svd.info(), Success); + + auto d_S = svd.d_singularValues(); + auto d_U = svd.d_matrixU(); + auto d_VT = svd.d_matrixVT(); + + VERIFY_IS_EQUAL(d_U.rows(), m); + VERIFY_IS_EQUAL(d_U.cols(), m); + VERIFY_IS_EQUAL(d_VT.rows(), n); + VERIFY_IS_EQUAL(d_VT.cols(), n); + + Matrix<RealScalar, Dynamic, 1> S_host = d_S.toHost(); + Mat U_host = d_U.toHost(); + Mat VT_host = d_VT.toHost(); + + VERIFY_IS_APPROX(S_host, svd.singularValues()); + VERIFY_IS_APPROX(U_host, svd.matrixU()); + VERIFY_IS_APPROX(VT_host, svd.matrixVT()); +} + +// ---- Chain device views into a downstream cuBLAS GEMM (no D2D copy) --------- +// +// d_matrixU() returns a non-owning view over the SVD's internal d_U_ buffer. +// Feeding the view straight into a Context-driven GEMM exercises cross-stream +// event sync and confirms the borrow-deleter does not double-free on temporary +// destruction. + +template <typename Scalar> +void test_svd_chain_orthogonality(Index m, Index n) { + using Mat = Matrix<Scalar, Dynamic, Dynamic>; + using RealScalar = typename NumTraits<Scalar>::Real; + + Mat A = Mat::Random(m, n); + gpu::SVD<Scalar> svd(A, ComputeThinU | ComputeThinV); + VERIFY_IS_EQUAL(svd.info(), Success); + + const Index k = (std::min)(m, n); + + // U^H * U = I_k, all on device, no host roundtrip on U. + gpu::Context ctx; + gpu::DeviceMatrix<Scalar> d_UtU; + { + auto d_U = svd.d_matrixU(); + d_UtU.device(ctx) = d_U.adjoint() * d_U; + } + Mat UtU = d_UtU.toHost(); + RealScalar tol = RealScalar(20) * std::sqrt(static_cast<RealScalar>(k)) * NumTraits<Scalar>::epsilon(); + VERIFY((UtU - Mat::Identity(k, k)).norm() < tol); + + // VT * VT^H = I_k. + gpu::DeviceMatrix<Scalar> d_VVt; + { + auto d_VT = svd.d_matrixVT(); + d_VVt.device(ctx) = d_VT * d_VT.adjoint(); + } + Mat VVt = d_VVt.toHost(); + VERIFY((VVt - Mat::Identity(k, k)).norm() < tol); + + // After view destruction, SVD's state must remain valid (the deleter is a no-op + // for views, so the underlying d_U_ / d_VT_ are not freed). + Mat U_again = svd.matrixU(); + Mat VT_again = svd.matrixVT(); + VERIFY_IS_EQUAL(U_again.rows(), m); + VERIFY_IS_EQUAL(VT_again.cols(), n); +} + +// ---- Empty matrix ----------------------------------------------------------- + +void test_svd_empty() { + gpu::SVD<double> svd(MatrixXd(0, 0), 0); + VERIFY_IS_EQUAL(svd.info(), Success); + VERIFY_IS_EQUAL(svd.rows(), 0); + VERIFY_IS_EQUAL(svd.cols(), 0); + + svd.compute(MatrixXd::Random(4, 6), ComputeThinU | ComputeThinV); + VERIFY_IS_EQUAL(svd.info(), Success); + + svd.compute(MatrixXd(0, 7), 0); + VERIFY_IS_EQUAL(svd.info(), Success); + VERIFY_IS_EQUAL(svd.rows(), 0); + VERIFY_IS_EQUAL(svd.cols(), 7); + VERIFY_IS_EQUAL(svd.singularValues().size(), 0); + + svd.compute(MatrixXd(5, 0), 0); + VERIFY_IS_EQUAL(svd.info(), Success); + VERIFY_IS_EQUAL(svd.rows(), 5); + VERIFY_IS_EQUAL(svd.cols(), 0); + VERIFY_IS_EQUAL(svd.singularValues().size(), 0); +} + +// ---- Per-scalar driver ------------------------------------------------------ + +template <typename Scalar> +void test_scalar() { + // Reconstruction + orthogonality (thin and full, identical test logic). + CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 64))); + CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(128, 64))); + CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeThinU | ComputeThinV>(64, 128))); // wide (m < n) + CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(64, 64))); + CALL_SUBTEST((test_svd_reconstruction<Scalar, ComputeFullU | ComputeFullV>(128, 64))); + + // Singular values. + CALL_SUBTEST(test_svd_singular_values<Scalar>(64, 64)); + CALL_SUBTEST(test_svd_singular_values<Scalar>(128, 64)); + + // Solve. + CALL_SUBTEST(test_svd_solve<Scalar>(64, 64, 4)); + CALL_SUBTEST(test_svd_solve<Scalar>(128, 64, 4)); + CALL_SUBTEST(test_svd_solve<Scalar>(64, 128, 4)); // wide (m < n) + + // Truncated and regularized solve. + CALL_SUBTEST(test_svd_solve_truncated<Scalar>(64, 64)); + CALL_SUBTEST(test_svd_solve_regularized<Scalar>(64, 64)); + + // Rank-deficient solve (exercises drop_threshold pseudoinverse). + CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(64, 64, 32)); + CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(96, 64, 16)); + CALL_SUBTEST(test_svd_solve_rank_deficient<Scalar>(64, 96, 16)); + + // Wide matrix with full U/V (transposed-internal path). + CALL_SUBTEST((test_svd_reconstruction_full_wide<Scalar>(64, 96))); + + // Device accessors. + CALL_SUBTEST(test_svd_device_accessors<Scalar>(64, 64)); + CALL_SUBTEST(test_svd_device_accessors<Scalar>(96, 64)); + CALL_SUBTEST(test_svd_device_accessors<Scalar>(64, 96)); + CALL_SUBTEST(test_svd_device_accessors_full_wide<Scalar>(64, 96)); + + // Chain device views into a downstream GEMM (orthogonality check). + CALL_SUBTEST(test_svd_chain_orthogonality<Scalar>(64, 64)); + CALL_SUBTEST(test_svd_chain_orthogonality<Scalar>(96, 64)); +} + +EIGEN_DECLARE_TEST(gpu_cusolver_svd) { + // Split by scalar so each part compiles in parallel. + CALL_SUBTEST_1(test_scalar<float>()); + CALL_SUBTEST_2(test_scalar<double>()); + CALL_SUBTEST_3(test_scalar<std::complex<float>>()); + CALL_SUBTEST_4(test_scalar<std::complex<double>>()); + CALL_SUBTEST_5(test_svd_empty()); +}
diff --git a/unsupported/test/GPU/device_matrix.cpp b/unsupported/test/GPU/device_matrix.cpp index 4070f49..ac69983 100644 --- a/unsupported/test/GPU/device_matrix.cpp +++ b/unsupported/test/GPU/device_matrix.cpp
@@ -36,7 +36,6 @@ VERIFY(!dm.empty()); VERIFY_IS_EQUAL(dm.rows(), rows); VERIFY_IS_EQUAL(dm.cols(), cols); - VERIFY_IS_EQUAL(dm.outerStride(), rows); VERIFY(dm.data() != nullptr); VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar)); } @@ -70,7 +69,7 @@ EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream)); // Async upload from raw pointer. - auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream); + auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, stream); VERIFY_IS_EQUAL(dm.rows(), rows); VERIFY_IS_EQUAL(dm.cols(), cols); @@ -88,7 +87,7 @@ EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&producer_stream)); EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&consumer_stream)); - auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, producer_stream); + auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, producer_stream); auto cross_stream_transfer = cross_stream_dm.toHostAsync(consumer_stream); MatrixType cross_stream_result = cross_stream_transfer.get(); VERIFY_IS_APPROX(cross_stream_result, host); @@ -199,7 +198,6 @@ dm.resize(50, 30); VERIFY_IS_EQUAL(dm.rows(), 50); VERIFY_IS_EQUAL(dm.cols(), 30); - VERIFY_IS_EQUAL(dm.outerStride(), 50); VERIFY(dm.data() != nullptr); // Resize to same dimensions is a no-op.