blob: 2e15c4167dfe7689c87512cbe00a2446d0da865d [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Typed RAII wrapper for a dense matrix in GPU device memory.
//
// gpu::DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
// dimensions and leading dimension. It can be passed to GPU solvers
// (gpu::LLT, gpu::LU, cuBLAS expressions) without host round-trips.
//
// Cross-stream safety is automatic: an internal CUDA event tracks when the
// last write completed. Consumers on a different stream wait on that event
// before reading.
//
// Usage:
// auto d_A = gpu::DeviceMatrix<double>::fromHost(A); // upload (sync)
// gpu::LLT<double> llt;
// llt.compute(d_A); // factor on device
// auto d_X = llt.solve(d_B); // async, no sync
// MatrixXd X = d_X.toHost(); // download + block
//
// Async variants:
// auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
// auto transfer = d_X.toHostAsync(stream); // enqueue D2H
// // ... overlap with other work ...
// MatrixXd X = transfer.get(); // block + retrieve
#ifndef EIGEN_GPU_DEVICE_MATRIX_H
#define EIGEN_GPU_DEVICE_MATRIX_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include <cstring>
#include "./GpuSupport.h"
namespace Eigen {
namespace gpu {
// Forward declarations.
template <typename, int>
class LLT;
template <typename>
class LU;
template <typename>
class AdjointView;
template <typename>
class TransposeView;
template <typename>
class Assignment;
template <typename, typename>
class GemmExpr;
template <typename, int>
class LltSolveExpr;
template <typename>
class LuSolveExpr;
template <typename, int>
class LLTView;
template <typename>
class LUView;
template <typename, int>
class TriangularView;
template <typename, int>
class SelfAdjointView;
template <typename, int>
class ConstSelfAdjointView;
template <typename, int>
class TrsmExpr;
template <typename, int>
class SymmExpr;
template <typename, int>
class SyrkExpr;
class Context;
// --------------------------------------------------------------------------
// HostTransfer — future-like wrapper for an async device-to-host transfer.
// --------------------------------------------------------------------------
/** \ingroup GPU_Module
* \class HostTransfer
* \brief Future for an asynchronous device-to-host matrix transfer.
*
* Returned by gpu::DeviceMatrix::toHostAsync(). The transfer runs asynchronously
* on the given CUDA stream. Call get() to block until complete and retrieve
* the host matrix, or ready() to poll without blocking.
*/
template <typename Scalar_>
class HostTransfer {
public:
using Scalar = Scalar_;
using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
/** Block until the transfer completes and return the host matrix.
* Idempotent: subsequent calls return the same matrix without re-syncing.
* On first call, copies from pinned staging buffer into a regular matrix. */
PlainMatrix& get() {
if (!synced_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
// Copy from pinned staging buffer into the regular (pageable) host matrix.
if (pinned_buf_ && host_buf_.size() > 0) {
std::memcpy(host_buf_.data(), pinned_buf_.get(), static_cast<size_t>(host_buf_.size()) * sizeof(Scalar));
}
pinned_buf_ = internal::PinnedHostBuffer(); // free pinned memory early
synced_ = true;
}
return host_buf_;
}
/** Non-blocking check: has the transfer completed? */
bool ready() const {
if (synced_) return true;
cudaError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) return true;
eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
return false;
}
~HostTransfer() {
if (event_) (void)cudaEventDestroy(event_);
}
HostTransfer(HostTransfer&& o) noexcept
: host_buf_(std::move(o.host_buf_)), pinned_buf_(std::move(o.pinned_buf_)), event_(o.event_), synced_(o.synced_) {
o.event_ = nullptr;
o.synced_ = true;
}
HostTransfer& operator=(HostTransfer&& o) noexcept {
if (this != &o) {
if (event_) (void)cudaEventDestroy(event_);
host_buf_ = std::move(o.host_buf_);
pinned_buf_ = std::move(o.pinned_buf_);
event_ = o.event_;
synced_ = o.synced_;
o.event_ = nullptr;
o.synced_ = true;
}
return *this;
}
HostTransfer(const HostTransfer&) = delete;
HostTransfer& operator=(const HostTransfer&) = delete;
private:
template <typename>
friend class DeviceMatrix;
HostTransfer(PlainMatrix&& buf, internal::PinnedHostBuffer&& pinned, cudaEvent_t event)
: host_buf_(std::move(buf)), pinned_buf_(std::move(pinned)), event_(event), synced_(false) {}
PlainMatrix host_buf_; // final destination (pageable)
internal::PinnedHostBuffer pinned_buf_; // staging buffer for async DMA
cudaEvent_t event_ = nullptr;
bool synced_ = false;
};
// --------------------------------------------------------------------------
// Matrix — typed RAII wrapper for a dense matrix in device memory.
// --------------------------------------------------------------------------
/** \ingroup GPU_Module
* \class DeviceMatrix
* \brief RAII wrapper for a dense column-major matrix in GPU device memory.
*
* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
*
* Owns a device allocation with tracked dimensions and leading dimension.
* An internal CUDA event records when the data was last written, enabling
* safe cross-stream consumption without user-visible synchronization.
*
* Each method has a synchronous and an asynchronous variant:
* - fromHost() / fromHostAsync(): upload from host
* - toHost() / toHostAsync(): download to host
*/
template <typename Scalar_>
class DeviceMatrix {
public:
using Scalar = Scalar_;
using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// ---- Construction / destruction ------------------------------------------
/** Default: empty (0x0, no allocation). */
DeviceMatrix() = default;
/** Allocate uninitialized device memory for a rows x cols matrix. */
DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
eigen_assert(rows >= 0 && cols >= 0);
size_t bytes = sizeInBytes();
if (bytes > 0) {
void* p = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
data_.reset(static_cast<Scalar*>(p));
}
}
~DeviceMatrix() {
// cudaEventDestroy on a pending event is non-blocking: the runtime defers
// teardown until the event completes. The trailing cudaFree() (via
// data_.reset()) is itself synchronous, so the buffer outlives any
// in-flight kernel that may still be touching it.
if (ready_event_) (void)cudaEventDestroy(ready_event_);
}
// ---- Move-only -----------------------------------------------------------
DeviceMatrix(DeviceMatrix&& o) noexcept
: data_(std::move(o.data_)),
rows_(o.rows_),
cols_(o.cols_),
outerStride_(o.outerStride_),
ready_event_(o.ready_event_),
ready_stream_(o.ready_stream_),
retained_buffer_(std::move(o.retained_buffer_)) {
o.rows_ = 0;
o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr;
o.ready_stream_ = nullptr;
}
DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
if (this != &o) {
if (ready_event_) (void)cudaEventDestroy(ready_event_);
data_ = std::move(o.data_);
rows_ = o.rows_;
cols_ = o.cols_;
outerStride_ = o.outerStride_;
ready_event_ = o.ready_event_;
ready_stream_ = o.ready_stream_;
retained_buffer_ = std::move(o.retained_buffer_);
o.rows_ = 0;
o.cols_ = 0;
o.outerStride_ = 0;
o.ready_event_ = nullptr;
o.ready_stream_ = nullptr;
}
return *this;
}
DeviceMatrix(const DeviceMatrix&) = delete;
DeviceMatrix& operator=(const DeviceMatrix&) = delete;
// ---- Upload from host ----------------------------------------------------
/** Upload a host Eigen matrix to device memory (synchronous).
*
* Evaluates the expression into a contiguous ColMajor temporary, copies to
* device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
*
* \param host Any Eigen matrix expression.
* \param stream CUDA stream for the transfer (default: stream 0).
*/
template <typename Derived>
static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
const PlainMatrix mat(host.derived());
DeviceMatrix dm(mat.rows(), mat.cols());
if (dm.sizeInBytes() > 0) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(dm.data_.get(), mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
}
return dm;
}
/** Upload from a raw host pointer to device memory (asynchronous).
*
* Enqueues an async H2D copy on \p stream and records an internal event.
* The caller must keep \p host_data alive until the transfer completes
* (check via the internal event or synchronize the stream).
*
* \param host_data Pointer to contiguous column-major host data.
* \param rows Number of rows.
* \param cols Number of columns.
* \param outerStride Leading dimension (>= rows). Use rows for dense.
* \param stream CUDA stream for the transfer.
*/
static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
cudaStream_t stream) {
eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
DeviceMatrix dm(rows, cols);
if (dm.sizeInBytes() > 0) {
// If outerStride == rows (dense), single contiguous copy.
// Otherwise, copy column by column (strided layout).
if (outerStride == rows) {
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
} else {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_.get(), static_cast<size_t>(rows) * sizeof(Scalar),
host_data, static_cast<size_t>(outerStride) * sizeof(Scalar),
static_cast<size_t>(rows) * sizeof(Scalar),
static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
}
dm.recordReady(stream);
}
return dm;
}
// ---- Download to host ----------------------------------------------------
/** Download device matrix to host memory (synchronous).
*
* Waits on the internal ready event, enqueues a D2H copy on \p stream,
* synchronizes, and returns the host matrix directly.
*
* \param stream CUDA stream for the transfer (default: stream 0).
*/
PlainMatrix toHost(cudaStream_t stream = nullptr) const {
PlainMatrix host_buf(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(host_buf.data(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
}
return host_buf;
}
/** Enqueue an async device-to-host transfer and return a future.
*
* Waits on the internal ready event (if any) to ensure the device data is
* valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
* future; call .get() to block and retrieve the host matrix.
*
* \param stream CUDA stream for the transfer (default: stream 0).
*/
HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
PlainMatrix host_buf(rows_, cols_);
internal::PinnedHostBuffer pinned_buf(sizeInBytes());
if (sizeInBytes() > 0) {
waitReady(stream);
// DMA into pinned staging buffer for truly async transfer.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(pinned_buf.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream));
}
// Record a transfer-complete event.
cudaEvent_t transfer_event;
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
return HostTransfer<Scalar>(std::move(host_buf), std::move(pinned_buf), transfer_event);
}
// ---- Device-to-device copy -----------------------------------------------
/** Deep copy on device. Fully async — records event on the result, no sync.
*
* \param stream CUDA stream for the D2D copy (default: stream 0).
*/
DeviceMatrix clone(cudaStream_t stream = nullptr) const {
DeviceMatrix result(rows_, cols_);
if (sizeInBytes() > 0) {
waitReady(stream);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(result.data_.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
result.recordReady(stream);
}
return result;
}
// ---- Resize (destructive) ------------------------------------------------
/** Discard contents and reallocate to (rows x cols). Clears the ready event. */
void resize(Index rows, Index cols) {
eigen_assert(rows >= 0 && cols >= 0);
if (rows == rows_ && cols == cols_) return;
data_.reset();
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
retained_buffer_ = internal::DeviceBuffer();
rows_ = rows;
cols_ = cols;
outerStride_ = rows;
size_t bytes = sizeInBytes();
if (bytes > 0) {
void* p = nullptr;
EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
data_.reset(static_cast<Scalar*>(p));
}
}
// ---- Accessors -----------------------------------------------------------
Scalar* data() { return data_.get(); }
const Scalar* data() const { return data_.get(); }
Index rows() const { return rows_; }
Index cols() const { return cols_; }
Index outerStride() const { return outerStride_; }
bool empty() const { return rows_ == 0 || cols_ == 0; }
/** Size of the device allocation in bytes. */
size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
// ---- Event synchronization (public for library dispatch interop) ---------
/** Record that device data is ready after work on \p stream. */
void recordReady(cudaStream_t stream) {
ensureEvent();
EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
ready_stream_ = stream;
}
/** Make \p stream wait until the device data is ready.
* No-op if no event recorded, or if the consumer stream is the same as the
* producer stream (CUDA guarantees in-order execution within a stream). */
void waitReady(cudaStream_t stream) const {
if (ready_event_ && stream != ready_stream_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
}
}
// ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
/** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
AdjointView<Scalar> adjoint() const { return AdjointView<Scalar>(*this); }
/** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
TransposeView<Scalar> transpose() const { return TransposeView<Scalar>(*this); }
/** Bind this matrix to a Context for expression assignment.
* Returns an Assignment proxy: `d_C.device(ctx) = d_A * d_B;` */
Assignment<Scalar> device(Context& ctx) { return Assignment<Scalar>(*this, ctx); }
/** Assign from a GEMM expression using the thread-local default Context.
* Defined out-of-line after Context is fully declared (see DeviceDispatch.h). */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
/** Accumulate from a GEMM expression using the thread-local default Context. */
template <typename Lhs, typename Rhs>
DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
/** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
LLTView<Scalar, Lower> llt() const { return LLTView<Scalar, Lower>(*this); }
/** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
template <int UpLo>
LLTView<Scalar, UpLo> llt() const {
return LLTView<Scalar, UpLo>(*this);
}
/** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
LUView<Scalar> lu() const { return LUView<Scalar>(*this); }
/** Assign from an LLT solve expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
/** Assign from an LU solve expression (thread-local default context). */
DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
/** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
template <int UpLo>
TriangularView<Scalar, UpLo> triangularView() const {
return TriangularView<Scalar, UpLo>(*this);
}
/** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
template <int UpLo>
SelfAdjointView<Scalar, UpLo> selfadjointView() {
return SelfAdjointView<Scalar, UpLo>(*this);
}
/** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
template <int UpLo>
ConstSelfAdjointView<Scalar, UpLo> selfadjointView() const {
return ConstSelfAdjointView<Scalar, UpLo>(*this);
}
/** Assign from a TRSM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
/** Assign from a SYMM expression (thread-local default context). */
template <int UpLo>
DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
// ---- Ownership transfer ---------------------------------------------------
/** Adopt an existing device pointer. Caller relinquishes ownership. */
static DeviceMatrix adopt(Scalar* device_ptr, Index rows, Index cols) {
DeviceMatrix dm;
dm.data_.reset(device_ptr);
dm.rows_ = rows;
dm.cols_ = cols;
dm.outerStride_ = rows;
return dm;
}
/** Transfer ownership of the device pointer out. Zeros internal state. */
Scalar* release() {
Scalar* p = data_.release();
rows_ = 0;
cols_ = 0;
outerStride_ = 0;
if (ready_event_) {
(void)cudaEventDestroy(ready_event_);
ready_event_ = nullptr;
}
ready_stream_ = nullptr;
return p;
}
private:
// ---- Private helpers -------------------------------------------------------
void ensureEvent() {
if (!ready_event_) {
EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
}
}
void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }
// ---- Data members --------------------------------------------------------
std::unique_ptr<Scalar, internal::CudaFreeDeleter> data_;
Index rows_ = 0;
Index cols_ = 0;
Index outerStride_ = 0;
cudaEvent_t ready_event_ = nullptr; // internal: tracks last write completion
cudaStream_t ready_stream_ = nullptr; // stream that recorded ready_event_ (for same-stream skip)
internal::DeviceBuffer retained_buffer_; // internal: keeps async aux buffers alive
};
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_MATRIX_H