blob: b11254318df8a5b48b41d659edde7b9e87d62578 [file] [edit]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// GPU Cholesky (LLT) decomposition using cuSOLVER.
//
// Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky
// factor in device memory for the lifetime of the object. Multiple solves
// against the same factor therefore only transfer the RHS and solution
// vectors, not the factor itself.
//
// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
// Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
//
// Usage:
// gpu::LLT<double> llt(A); // upload A, potrf, L stays on device
// if (llt.info() != Success) { ... }
// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
// MatrixXd x2 = llt.solve(b2); // L already on device
#ifndef EIGEN_GPU_LLT_H
#define EIGEN_GPU_LLT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./GpuSolverContext.h"
namespace Eigen {
namespace gpu {
/** \ingroup GPU_Module
* \class LLT
* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
* \tparam UpLo_ Triangle used: Lower (default) or Upper
*
* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
* caches the factor L in device memory. Each subsequent solve(B) uploads only
* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
* re-transferred.
*
* Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
* enabling concurrent factorizations from multiple objects on the same host
* thread.
*/
template <typename Scalar_, int UpLo_ = Lower>
class LLT {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
static constexpr int UpLo = UpLo_;
// ---- Construction / destruction ------------------------------------------
/** Default constructor. Does not factorize; call compute() before solve(). */
LLT() = default;
/** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
template <typename InputType>
explicit LLT(const EigenBase<InputType>& A) {
compute(A);
}
~LLT() = default;
// Non-copyable (owns device memory and library handles).
LLT(const LLT&) = delete;
LLT& operator=(const LLT&) = delete;
// Movable.
LLT(LLT&& o) noexcept
: solver_ctx_(std::move(o.solver_ctx_)),
d_factor_(std::move(o.d_factor_)),
factor_alloc_size_(o.factor_alloc_size_),
n_(o.n_),
lda_(o.lda_) {
o.factor_alloc_size_ = 0;
o.n_ = 0;
o.lda_ = 0;
}
LLT& operator=(LLT&& o) noexcept {
if (this != &o) {
solver_ctx_ = std::move(o.solver_ctx_);
d_factor_ = std::move(o.d_factor_);
factor_alloc_size_ = o.factor_alloc_size_;
n_ = o.n_;
lda_ = o.lda_;
o.factor_alloc_size_ = 0;
o.n_ = 0;
o.lda_ = 0;
}
return *this;
}
// ---- Factorization -------------------------------------------------------
/** Compute the Cholesky factorization of A (host matrix). */
template <typename InputType>
LLT& compute(const EigenBase<InputType>& A) {
eigen_assert(A.rows() == A.cols());
if (!begin_compute(A.rows())) return *this;
const PlainMatrix mat(A.derived());
lda_ = static_cast<int64_t>(mat.rows());
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
LLT& compute(const DeviceMatrix<Scalar>& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(solver_ctx_.stream_);
allocate_factor_storage();
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
factorize();
return *this;
}
/** Compute the Cholesky factorization from a device matrix (move, no copy). */
LLT& compute(DeviceMatrix<Scalar>&& d_A) {
eigen_assert(d_A.rows() == d_A.cols());
if (!begin_compute(d_A.rows())) return *this;
lda_ = static_cast<int64_t>(d_A.rows());
d_A.waitReady(solver_ctx_.stream_);
d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
factor_alloc_size_ = factorBytes();
factorize();
return *this;
}
// ---- Solve ---------------------------------------------------------------
/** Solve A * X = B using the cached Cholesky factor (host → host). */
template <typename Rhs>
PlainMatrix solve(const MatrixBase<Rhs>& B) const {
solver_ctx_.sync_info();
eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
eigen_assert(B.rows() == n_);
const PlainMatrix rhs(B);
const int64_t nrhs = static_cast<int64_t>(rhs.cols());
const int64_t ldb = static_cast<int64_t>(rhs.rows());
internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_));
DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));
PlainMatrix X(n_, B.cols());
int solve_info = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int),
cudaMemcpyDeviceToHost, solver_ctx_.stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));
eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
return X;
}
/** Solve A * X = B with device-resident RHS. Returns immediately after enqueuing
* the solve; the sync_info()/info_ check at entry forces a host wait on the
* factorization status so a singular A causes a clean assert rather than a
* silently-bad result. */
DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
solver_ctx_.sync_info();
eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
eigen_assert(d_B.rows() == n_);
d_B.waitReady(solver_ctx_.stream_);
const int64_t nrhs = static_cast<int64_t>(d_B.cols());
const int64_t ldb = static_cast<int64_t>(d_B.rows());
internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
return solve_impl(nrhs, ldb, std::move(d_x));
}
// ---- Accessors -----------------------------------------------------------
ComputationInfo info() const { return solver_ctx_.info(); }
Index rows() const { return n_; }
Index cols() const { return n_; }
cudaStream_t stream() const { return solver_ctx_.stream_; }
private:
mutable internal::GpuSolverContext solver_ctx_;
internal::DeviceBuffer d_factor_;
size_t factor_alloc_size_ = 0;
int64_t n_ = 0;
int64_t lda_ = 0;
bool begin_compute(Index rows) {
n_ = rows;
solver_ctx_.info_ = InvalidInput;
if (n_ == 0) {
solver_ctx_.info_ = Success;
solver_ctx_.info_synced_ = true;
return false;
}
return true;
}
size_t factorBytes() const { return rhsBytes(n_, lda_); }
static size_t rhsBytes(int64_t cols, int64_t ld) {
return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar);
}
void allocate_factor_storage() {
size_t needed = factorBytes();
if (needed > factor_alloc_size_) {
d_factor_ = internal::DeviceBuffer(needed);
factor_alloc_size_ = needed;
}
}
// Solve in place on `d_x` (which already holds B), then re-wrap as a typed
// DeviceMatrix carrying shape and a ready event. The release/adopt hop hands
// ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to
// the typed DeviceMatrix without copying.
DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, nrhs, dtype,
d_factor_.get(), lda_, dtype, d_x.get(), ldb, solver_ctx_.scratch_info()));
DeviceMatrix<Scalar> result =
DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
result.recordReady(solver_ctx_.stream_);
return result;
}
void factorize() {
constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
solver_ctx_.mark_pending();
size_t dev_ws_bytes = 0, host_ws_bytes = 0;
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
d_factor_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes));
solver_ctx_.ensure_scratch(dev_ws_bytes);
solver_ctx_.h_workspace_.resize(host_ws_bytes);
EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
d_factor_.get(), lda_, dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes,
host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes,
solver_ctx_.scratch_info()));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
cudaMemcpyDeviceToHost, solver_ctx_.stream_));
}
};
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_LLT_H