| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2026 Eigen Authors |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| // GPU Cholesky (LLT) decomposition using cuSOLVER. |
| // |
| // Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky |
| // factor in device memory for the lifetime of the object. Multiple solves |
| // against the same factor therefore only transfer the RHS and solution |
| // vectors, not the factor itself. |
| // |
// Requires CUDA 11.4+: the cusolverDnXpotrf / cusolverDnXpotrs generic
// (64-bit) API plus the stream-ordered allocator (cudaMallocAsync).
| // |
| // Usage: |
| // gpu::LLT<double> llt(A); // upload A, potrf, L stays on device |
| // if (llt.info() != Success) { ... } |
| // MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred |
| // MatrixXd x2 = llt.solve(b2); // L already on device |
| |
| #ifndef EIGEN_GPU_LLT_H |
| #define EIGEN_GPU_LLT_H |
| |
| // IWYU pragma: private |
| #include "./InternalHeaderCheck.h" |
| |
| #include "./CuSolverSupport.h" |
| #include <vector> |
| |
| namespace Eigen { |
| namespace gpu { |
| |
/** \ingroup GPU_Module
 * \class LLT
 * \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
 *
 * \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
 * \tparam UpLo_ Triangle used: Lower (default) or Upper
 *
 * Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
 * caches the factor L in device memory. Each subsequent solve(B) uploads only
 * B, calls cusolverDnXpotrs, and downloads the result — the factor is not
 * re-transferred.
 *
 * Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
 * enabling concurrent factorizations from multiple objects on the same host
 * thread.
 *
 * The object is move-only (it owns device memory and library handles) and has
 * no internal locking — do not call methods on the same instance from
 * multiple host threads concurrently.
 */
template <typename Scalar_, int UpLo_ = Lower>
class LLT {
 public:
  using Scalar = Scalar_;
  using RealScalar = typename NumTraits<Scalar>::Real;
  // Host-side staging matrix type for uploads/downloads. Column-major to match
  // cuSOLVER's dense-matrix convention.
  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

  static constexpr int UpLo = UpLo_;

  // ---- Construction / destruction ------------------------------------------

  /** Default constructor. Does not factorize; call compute() before solve(). */
  LLT() { init_context(); }

  /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
  template <typename InputType>
  explicit LLT(const EigenBase<InputType>& A) : LLT() {
    compute(A);
  }

  ~LLT() {
    // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
    // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
    // noexcept dtor terminates the program. The trailing cudaFree() of the
    // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
    //
    // NOTE(review): members are destroyed in reverse declaration order, so
    // pinned_info_ (declared after the device buffers) is freed *before*
    // d_scratch_/d_factor_. cudaStreamDestroy returns immediately even if the
    // async info download from factorize() is still in flight; freeing the
    // pinned buffer at that point relies on cudaFreeHost synchronizing with
    // the pending copy — TODO confirm against the CUDA runtime docs.
    if (handle_) (void)cusolverDnDestroy(handle_);
    if (stream_) (void)cudaStreamDestroy(stream_);
  }

  // Non-copyable (owns device memory and library handles).
  LLT(const LLT&) = delete;
  LLT& operator=(const LLT&) = delete;

  // Movable. The moved-from object is left empty but valid: info() reports
  // InvalidInput without touching the (moved-from) pinned buffer, because
  // info_synced_ is set to true so sync_info() is a no-op.
  LLT(LLT&& o) noexcept
      : stream_(o.stream_),
        handle_(o.handle_),
        params_(std::move(o.params_)),
        d_factor_(std::move(o.d_factor_)),
        factor_alloc_size_(o.factor_alloc_size_),
        d_scratch_(std::move(o.d_scratch_)),
        scratch_size_(o.scratch_size_),
        h_workspace_(std::move(o.h_workspace_)),
        n_(o.n_),
        lda_(o.lda_),
        info_(o.info_),
        pinned_info_(std::move(o.pinned_info_)),
        info_synced_(o.info_synced_) {
    o.stream_ = nullptr;
    o.handle_ = nullptr;
    o.factor_alloc_size_ = 0;
    o.scratch_size_ = 0;
    o.n_ = 0;
    o.lda_ = 0;
    o.info_ = InvalidInput;
    o.info_synced_ = true;
  }

  LLT& operator=(LLT&& o) noexcept {
    if (this != &o) {
      // Drain pending work on the old stream before tearing it down so the
      // upcoming move of d_factor_/d_scratch_ doesn't free buffers that an
      // in-flight kernel is still touching. The asymmetric move-ctor doesn't
      // need this — it constructs onto uninitialized storage.
      if (stream_) (void)cudaStreamSynchronize(stream_);
      if (handle_) (void)cusolverDnDestroy(handle_);
      if (stream_) (void)cudaStreamDestroy(stream_);
      stream_ = o.stream_;
      handle_ = o.handle_;
      params_ = std::move(o.params_);
      d_factor_ = std::move(o.d_factor_);
      factor_alloc_size_ = o.factor_alloc_size_;
      d_scratch_ = std::move(o.d_scratch_);
      scratch_size_ = o.scratch_size_;
      h_workspace_ = std::move(o.h_workspace_);
      n_ = o.n_;
      lda_ = o.lda_;
      info_ = o.info_;
      pinned_info_ = std::move(o.pinned_info_);
      info_synced_ = o.info_synced_;
      o.stream_ = nullptr;
      o.handle_ = nullptr;
      o.factor_alloc_size_ = 0;
      o.scratch_size_ = 0;
      o.n_ = 0;
      o.lda_ = 0;
      o.info_ = InvalidInput;
      o.info_synced_ = true;
    }
    return *this;
  }

  // ---- Factorization -------------------------------------------------------

  /** Compute the Cholesky factorization of A (host matrix).
   *
   * Uploads A to device memory, calls cusolverDnXpotrf, and retains the
   * factored matrix on device. Any previous factorization is overwritten.
   * All work is enqueued on the object's stream; no host sync happens here —
   * check info() (which syncs lazily) before trusting solve() results.
   */
  template <typename InputType>
  LLT& compute(const EigenBase<InputType>& A) {
    eigen_assert(A.rows() == A.cols());
    if (!begin_compute(A.rows())) return *this;

    // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
    // NOTE(review): `mat` is a pageable host temporary destroyed at return,
    // while the copy below is only enqueued. This presumes cudaMemcpyAsync
    // from pageable host memory returns only after the source is staged —
    // TODO confirm against the CUDA runtime docs for async copy semantics.
    const PlainMatrix mat(A.derived());
    lda_ = static_cast<int64_t>(mat.outerStride());
    allocate_factor_storage();
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));

    factorize();
    return *this;
  }

  /** Compute the Cholesky factorization from a device-resident matrix (D2D copy).
   *
   * The copy spans lda_ * n_ elements, so any row padding in d_A (outer
   * stride larger than rows) is copied along with the data.
   */
  LLT& compute(const DeviceMatrix<Scalar>& d_A) {
    eigen_assert(d_A.rows() == d_A.cols());
    if (!begin_compute(d_A.rows())) return *this;

    lda_ = static_cast<int64_t>(d_A.outerStride());
    // Make our stream wait for d_A's producer before reading it.
    d_A.waitReady(stream_);
    allocate_factor_storage();
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));

    factorize();
    return *this;
  }

  /** Compute the Cholesky factorization from a device matrix (move, no copy).
   *
   * Takes ownership of d_A's allocation and factorizes it in place; the
   * input matrix is consumed.
   */
  LLT& compute(DeviceMatrix<Scalar>&& d_A) {
    eigen_assert(d_A.rows() == d_A.cols());
    if (!begin_compute(d_A.rows())) return *this;

    lda_ = static_cast<int64_t>(d_A.outerStride());
    d_A.waitReady(stream_);
    // Adopt the raw device pointer; replacing the previous d_factor_ frees it
    // (see the dtor comment on DeviceBuffer's synchronous free).
    d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
    factor_alloc_size_ = factorBytes();

    factorize();
    return *this;
  }

  // ---- Solve ---------------------------------------------------------------

  /** Solve A * X = B using the cached Cholesky factor (host → host).
   *
   * Uploads B to device memory, calls cusolverDnXpotrs using the factor
   * retained from compute(), and downloads the solution X to the host.
   * The factor is not re-transferred; only B goes up and X comes down.
   *
   * Blocks: the stream is synchronized before returning so that X (pageable
   * host memory) is safe to read.
   *
   * \pre compute() must have been called and info() == Success.
   * \returns X such that A * X ≈ B
   */
  template <typename Rhs>
  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
    // solve() is logically const but may need to sync the stream to learn the
    // factorization status; sync_info() only mutates caching state.
    const_cast<LLT*>(this)->sync_info();
    eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
    eigen_assert(B.rows() == n_);

    // Evaluate B into contiguous ColMajor storage for the upload.
    const PlainMatrix rhs(B);
    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
    internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
    DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));

    PlainMatrix X(n_, B.cols());
    int solve_info = 0;
    // Both downloads target pageable host memory; the explicit stream sync
    // below is what makes reading X and solve_info safe.
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

    eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
    return X;
  }

  /** Solve A * X = B with device-resident RHS. Fully async.
   *
   * All work is enqueued on this solver's stream. Returns a Matrix
   * with a recorded ready event — no host synchronization occurs.
   * The caller should check info() after compute() to verify the
   * factorization succeeded; this method does not check, and the potrs
   * device info word (written into scratch) is not inspected either.
   */
  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
    eigen_assert(d_B.rows() == n_);
    d_B.waitReady(stream_);
    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
    // potrs works in place, so copy B into a fresh buffer that becomes X.
    internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
    return solve_impl(nrhs, ldb, std::move(d_x));
  }

  // ---- Accessors -----------------------------------------------------------

  /** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
   * Lazily synchronizes the stream on first call after compute(). */
  ComputationInfo info() const {
    const_cast<LLT*>(this)->sync_info();
    return info_;
  }

  Index rows() const { return n_; }
  Index cols() const { return n_; }

  /** Returns the CUDA stream owned by this object.
   * Advanced users may submit additional GPU work on this stream
   * to overlap with or chain after LLT operations. */
  cudaStream_t stream() const { return stream_; }

 private:
  cudaStream_t stream_ = nullptr;        // owned; all GPU work is enqueued here
  cusolverDnHandle_t handle_ = nullptr;  // owned cuSOLVER handle, bound to stream_
  internal::CusolverParams params_;      // cuSOLVER params (created once, reused)
  internal::DeviceBuffer d_factor_;      // factored L (or U) on device (grows, never shrinks)
  size_t factor_alloc_size_ = 0;         // current d_factor_ allocation size
  internal::DeviceBuffer d_scratch_;     // combined workspace + info word (grows, never shrinks)
  size_t scratch_size_ = 0;              // current scratch allocation size
  std::vector<char> h_workspace_;        // host workspace (kept alive until next compute)
  int64_t n_ = 0;                        // matrix dimension of the current factor
  int64_t lda_ = 0;                      // leading dimension of the device factor
  ComputationInfo info_ = InvalidInput;  // InvalidInput means "not (yet) factorized"
  internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
  bool info_synced_ = true;              // has the stream been synced for info?

  // Host-side view of the pinned info word written by factorize()'s async D2H.
  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }

  // Common prologue for all compute() overloads: records the size and handles
  // the trivial n == 0 case (immediate Success, nothing to factor).
  // Returns false when the caller should skip the factorization entirely.
  bool begin_compute(Index rows) {
    n_ = rows;
    if (n_ == 0) {
      info_ = Success;
      info_synced_ = true;
      return false;
    }
    info_ = InvalidInput;
    return true;
  }

  // Bytes occupied by the device factor: lda_ columns-stride times n_ columns.
  size_t factorBytes() const { return rhsBytes(n_, lda_); }

  // Bytes for a ColMajor block of `cols` columns with the given outer stride.
  static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
  }

  // Grow d_factor_ to hold the current factor; never shrinks. Unlike
  // ensure_scratch() this does not sync the stream before replacing the
  // buffer — it relies on the synchronous free noted in the dtor comment.
  void allocate_factor_storage() {
    size_t needed = factorBytes();
    if (needed > factor_alloc_size_) {
      d_factor_ = internal::DeviceBuffer(needed);
      factor_alloc_size_ = needed;
    }
  }

  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
  // Workspace size is rounded up to 16 bytes so the info word lands aligned.
  static constexpr size_t kInfoBytes = sizeof(int);
  static constexpr size_t kScratchAlign = 16;

  // Total scratch bytes needed for a device workspace of `workspace_bytes`,
  // including the trailing aligned info word.
  static size_t scratchBytesFor(size_t workspace_bytes) {
    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
    return workspace_bytes + kInfoBytes;
  }

  // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
  // Syncs the stream before reallocating to avoid freeing memory that
  // async kernels may still be using.
  void ensure_scratch(size_t bytes) {
    if (bytes > scratch_size_) {
      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      d_scratch_ = internal::DeviceBuffer(bytes);
      scratch_size_ = bytes;
    }
  }

  void* scratch_workspace() const { return d_scratch_.get(); }
  // Device pointer to the info word: always the last sizeof(int) bytes of the
  // current allocation, which may be larger than the last requested size
  // because the scratch buffer never shrinks.
  int* scratch_info() const {
    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
  }

  // Solve in place on `d_x` (which already holds B), then re-wrap as a
  // typed DeviceMatrix carrying shape and a ready event. The release/adopt
  // hop just hands ownership of the raw cudaMalloc pointer from the untyped
  // DeviceBuffer to the typed DeviceMatrix without copying.
  // potrs writes its status into scratch_info(); callers that care (the host
  // solve) download and check it, the async device solve does not.
  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype,
                                          d_x.get(), ldb, scratch_info()));

    DeviceMatrix<Scalar> result =
        DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
    result.recordReady(stream_);
    return result;
  }

  // Create the stream and cuSOLVER handle owned by this object, and
  // pre-allocate the scratch info word so scratch_info() is valid even
  // before the first factorization.
  void init_context() {
    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
    ensure_scratch(kInfoBytes);
  }

  // Synchronize stream and interpret the info word. No-op if already synced.
  // After the sync, the async D2H from factorize() has landed in pinned_info_.
  void sync_info() {
    if (!info_synced_) {
      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
      info_ = (info_word() == 0) ? Success : NumericalIssue;
      info_synced_ = true;
    }
  }

  // Run cusolverDnXpotrf on d_factor_ (already on device).
  // Enqueues factorization + async info download. Does NOT sync.
  // Workspaces are stored as members to ensure they outlive the async kernels.
  void factorize() {
    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

    info_synced_ = false;
    info_ = InvalidInput;

    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype,
                                                     &dev_ws_bytes, &host_ws_bytes));

    ensure_scratch(scratchBytesFor(dev_ws_bytes));
    h_workspace_.resize(host_ws_bytes);

    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
        handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes,
        host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));

    // Enqueue async download of info word — sync deferred to info() or solve().
    // The destination is pinned host memory, so this copy is genuinely async.
    EIGEN_CUDA_RUNTIME_CHECK(
        cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
  }
};
| |
| } // namespace gpu |
| } // namespace Eigen |
| |
| #endif // EIGEN_GPU_LLT_H |