blob: 8ca3dbd16bd5974f5b8a91e6021ef9c7aa35ee70 [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// GPU Cholesky (LLT) decomposition using cuSOLVER.
//
// Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky
// factor in device memory for the lifetime of the object. Multiple solves
// against the same factor therefore only transfer the RHS and solution
// vectors, not the factor itself.
//
// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
// Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
//
// Usage:
// gpu::LLT<double> llt(A); // upload A, potrf, L stays on device
// if (llt.info() != Success) { ... }
// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
// MatrixXd x2 = llt.solve(b2); // L already on device
#ifndef EIGEN_GPU_LLT_H
#define EIGEN_GPU_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSolverSupport.h"
#include <vector>
namespace Eigen {
namespace gpu {
/** \ingroup GPU_Module
* \class LLT
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
* \tparam UpLo_ Triangle used: Lower (default) or Upper
*
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
* caches the factor L in device memory. Each subsequent solve(B) uploads only
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
* re-transferred.
*
* Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
* enabling concurrent factorizations from multiple objects on the same host
* thread.
*/
template <typename Scalar_, int UpLo_ = Lower>
class LLT {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
static constexpr int UpLo = UpLo_;
// ---- Construction / destruction ------------------------------------------
/** Default constructor. Does not factorize; call compute() before solve(). */
LLT() { init_context(); }
/** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
template <typename InputType>
explicit LLT(const EigenBase<InputType>& A) : LLT() {
compute(A);
}
~LLT() {
// Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
// EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
// noexcept dtor terminates the program. The trailing cudaFree() of the
// device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable (owns device memory and library handles).
LLT(const LLT&) = delete;
LLT& operator=(const LLT&) = delete;
// Movable.
LLT(LLT&& o) noexcept
: stream_(o.stream_),
handle_(o.handle_),
params_(std::move(o.params_)),
d_factor_(std::move(o.d_factor_)),
factor_alloc_size_(o.factor_alloc_size_),
d_scratch_(std::move(o.d_scratch_)),
scratch_size_(o.scratch_size_),
h_workspace_(std::move(o.h_workspace_)),
n_(o.n_),
lda_(o.lda_),
info_(o.info_),
pinned_info_(std::move(o.pinned_info_)),
info_synced_(o.info_synced_) {
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_synced_ = true;
}
LLT& operator=(LLT&& o) noexcept {
if (this != &o) {
// Drain pending work on the old stream before tearing it down so the
// upcoming move of d_factor_/d_scratch_ doesn't free buffers that an
// in-flight kernel is still touching. The asymmetric move-ctor doesn't
// need this — it constructs onto uninitialized storage.
if (stream_) (void)cudaStreamSynchronize(stream_);
if (handle_) (void)cusolverDnDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
stream_ = o.stream_;
handle_ = o.handle_;
params_ = std::move(o.params_);
d_factor_ = std::move(o.d_factor_);
factor_alloc_size_ = o.factor_alloc_size_;
d_scratch_ = std::move(o.d_scratch_);
scratch_size_ = o.scratch_size_;
h_workspace_ = std::move(o.h_workspace_);
n_ = o.n_;
lda_ = o.lda_;
info_ = o.info_;
pinned_info_ = std::move(o.pinned_info_);
info_synced_ = o.info_synced_;
o.stream_ = nullptr;
o.handle_ = nullptr;
o.factor_alloc_size_ = 0;
o.scratch_size_ = 0;
o.n_ = 0;
o.lda_ = 0;
o.info_ = InvalidInput;
o.info_synced_ = true;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
/** Compute the Cholesky factorization of A (host matrix).
*
* Uploads A to device memory, calls cusolverDnXpotrf, and retains the
* factored matrix on device. Any previous factorization is overwritten.
*/
template <typename InputType>
LLT& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols());
if (!begin_compute(A.rows())) return *this;
// Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.outerStride());
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
LLT& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device matrix (move, no copy). */
LLT& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.outerStride());
d_A.waitReady(stream_);
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
factor_alloc_size_ = factorBytes();
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve A * X = B using the cached Cholesky factor (host → host).
*
* Uploads B to device memory, calls cusolverDnXpotrs using the factor
* retained from compute(), and returns the solution X on the host.
* The factor is not re-transferred; only B goes up and X comes down.
*
* \pre compute() must have been called and info() == Success.
* \returns X such that A * X ≈ B
*/
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
const_cast<LLT*>(this)->sync_info();
eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));
PlainMatrix X(n_, B.cols());
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
return X;
}
/** Solve A * X = B with device-resident RHS. Fully async.
*
* All work is enqueued on this solver's stream. Returns a Matrix
* with a recorded ready event — no host synchronization occurs.
* The caller should check info() after compute() to verify the
* factorization succeeded; this method does not check.
*/
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
eigen_assert(d_B.rows() == n_);
d_B.waitReady(stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
return solve_impl(nrhs, ldb, std::move(d_x));
}
// ---- Accessors -----------------------------------------------------------
/** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
* Lazily synchronizes the stream on first call after compute(). */
ComputationInfo info() const {
const_cast<LLT*>(this)->sync_info();
return info_;
}
Index rows() const { return n_; }
Index cols() const { return n_; }
/** Returns the CUDA stream owned by this object.
* Advanced users may submit additional GPU work on this stream
* to overlap with or chain after LLT operations. */
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusolverDnHandle_t handle_ = nullptr;
internal::CusolverParams params_; // cuSOLVER params (created once, reused)
internal::DeviceBuffer d_factor_; // factored L (or U) on device (grows, never shrinks)
size_t factor_alloc_size_ = 0; // current d_factor_ allocation size
internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
size_t scratch_size_ = 0; // current scratch allocation size
std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
int64_t n_ = 0;
int64_t lda_ = 0;
ComputationInfo info_ = InvalidInput;
internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H
bool info_synced_ = true; // has the stream been synced for info?
int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
bool begin_compute(Index rows) {
n_ = rows;
if (n_ == 0) {
info_ = Success;
info_synced_ = true;
return false;
}
info_ = InvalidInput;
return true;
}
size_t factorBytes() const { return rhsBytes(n_, lda_); }
static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
}
void allocate_factor_storage() {
size_t needed = factorBytes();
if (needed > factor_alloc_size_) {
d_factor_ = internal::DeviceBuffer(needed);
factor_alloc_size_ = needed;
}
}
// Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
// Workspace size is rounded up to 16 bytes so the info word lands aligned.
static constexpr size_t kInfoBytes = sizeof(int);
static constexpr size_t kScratchAlign = 16;
static size_t scratchBytesFor(size_t workspace_bytes) {
workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
return workspace_bytes + kInfoBytes;
}
// Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
// Syncs the stream before reallocating to avoid freeing memory that
// async kernels may still be using.
void ensure_scratch(size_t bytes) {
if (bytes > scratch_size_) {
if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
d_scratch_ = internal::DeviceBuffer(bytes);
scratch_size_ = bytes;
}
}
void* scratch_workspace() const { return d_scratch_.get(); }
int* scratch_info() const {
eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
}
// Solve in place on `d_x` (which already holds B), then re-wrap as a
// typed DeviceMatrix carrying shape and a ready event. The release/adopt
// hop just hands ownership of the raw cudaMalloc pointer from the untyped
// DeviceBuffer to the typed DeviceMatrix without copying.
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype,
d_x.get(), ldb, scratch_info()));
DeviceMatrix<Scalar> result =
DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
result.recordReady(stream_);
return result;
}
void init_context() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
ensure_scratch(kInfoBytes);
}
// Synchronize stream and interpret the info word. No-op if already synced.
void sync_info() {
if (!info_synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
info_ = (info_word() == 0) ? Success : NumericalIssue;
info_synced_ = true;
}
}
// Run cusolverDnXpotrf on d_factor_ (already on device).
// Enqueues factorization + async info download. Does NOT sync.
// Workspaces are stored as members to ensure they outlive the async kernels.
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
info_synced_ = false;
info_ = InvalidInput;
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype,
&dev_ws_bytes, &host_ws_bytes));
ensure_scratch(scratchBytesFor(dev_ws_bytes));
h_workspace_.resize(host_ws_bytes);
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes,
host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
// Enqueue async download of info word — sync deferred to info() or solve().
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
}
};
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_LLT_H