blob: 45926f838ad5ae190e41c7774bc0b6fd70c5aa39 [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// SparseContext manages a cuSPARSE handle and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
// GPU. RowMajor input is implicitly converted to ColMajor.
//
// Thread safety: not thread-safe. Concurrent multiply* calls on a single
// SparseContext race on the cuSPARSE handle, the bound stream, and the
// cached device buffers. Use one SparseContext per thread.
//
// Usage:
// SparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x); // y = A * x
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// ctx.multiply(A, x, y, 1.0, 0.0, gpu::GpuOp::ConjTrans); // y = A^H * x
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
// VectorXcd w = ctx.multiplyAdjoint(A, x); // w = A^H * x (complex)
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS)
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
namespace gpu {
template <typename Scalar_>
class SparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
// 32-bit indices only; dimensions/nnz are guarded by check_storage_index_bounds().
using StorageIndex = int;
// Canonical on-device input format: compressed ColMajor (CSC) with int indices.
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// Creates a private CUDA stream and a cuSPARSE handle bound to it. All
// uploads, compute, and downloads issued by this context run on that one
// stream, in order.
SparseContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
// Releases the cached descriptor, the handle, and the stream. Return codes
// are deliberately ignored: there is no recovery path in a destructor.
~SparseContext() {
destroy_descriptors_unchecked();
if (handle_) (void)cusparseDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable: the context owns CUDA resources (stream, handle, buffers).
SparseContext(const SparseContext&) = delete;
SparseContext& operator=(const SparseContext&) = delete;
// ---- SpMV: y = A * x -----------------------------------------------------
/** Compute y = A * x. Returns y as a new dense vector.
  * The input is converted to compressed ColMajor int-indexed storage first
  * (a no-op copy when it is already in that form). */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
DenseVector y(mat.rows());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place).
  * y must already be sized to op(A)'s output dimension; with beta != 0 its
  * current contents contribute to the result. */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0), GpuOp op = GpuOp::NoTrans) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, internal::to_cusparse_op_for_scalar<Scalar>(op));
}
// ---- SpMV transpose: y = A^T * x -----------------------------------------
/** Compute y = A^T * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
// Transposed output has A.cols() entries.
DenseVector y(mat.cols());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMV adjoint: y = A^H * x -------------------------------------------
/** Compute y = A^H * x (conjugate transpose). For real Scalar this is equivalent to multiplyT. */
template <typename InputType, typename Rhs>
DenseVector multiplyAdjoint(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
DenseVector y(mat.cols());
y.setZero();
// to_cusparse_op_for_scalar maps ConjTrans to plain TRANSPOSE for real
// Scalar, where conjugation is a no-op.
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0),
internal::to_cusparse_op_for_scalar<Scalar>(GpuOp::ConjTrans));
return y;
}
// ---- SpMM: Y = op(A) * X (multiple RHS) ----------------------------------
/** Compute Y = op(A) * X where X is a dense matrix (multiple RHS). Returns Y.
  * X must have op(A).cols() rows (checked by eigen_assert). */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X, GpuOp op = GpuOp::NoTrans) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
// Materialize X as contiguous ColMajor so its data() can be uploaded directly.
const DenseMatrix rhs(X.derived());
const cusparseOperation_t cu_op = internal::to_cusparse_op_for_scalar<Scalar>(op);
// Output/input dims of op(A): swap rows/cols for (conj)transpose.
const Index m = (op == GpuOp::NoTrans) ? mat.rows() : mat.cols();
const Index k = (op == GpuOp::NoTrans) ? mat.cols() : mat.rows();
eigen_assert(k == rhs.rows());
const Index n = rhs.cols();
// Empty operand: the product is exactly zero; skip the GPU round-trip.
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), cu_op);
return Y;
}
// ---- Accessors ------------------------------------------------------------
/** The CUDA stream all of this context's work is issued on. */
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
// Cached device buffers (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
internal::DeviceBuffer d_workspace_;
// Current capacity in bytes of each buffer above (tracked separately
// because DeviceBuffer reallocation is driven by ensure_buffer()).
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
size_t d_workspace_size_ = 0;
// Cached cuSPARSE descriptors.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
// Shape key for the cached spmat descriptor; -1 means "no descriptor".
// A matrix with matching (rows, cols, nnz) reuses the descriptor and only
// refreshes its device pointers (see upload_compressed_arrays).
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV implementation --------------------------------------------------
// Shared core for all SpMV entry points: y = alpha * op(A) * x + beta * y.
// A must be compressed; x and y must already match op(A)'s input/output
// sizes. Synchronizes the stream before returning, so y is valid on return
// and the host staging temporaries may safely go out of scope.
template <typename RhsDerived, typename DestDerived>
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
// Empty matrix: op(A)*x == 0, so the result is just beta * y. Handle on
// the host and skip the GPU entirely.
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
// Upload sparse matrix to device.
upload_sparse(A);
// Upload x to device. Stage into a contiguous DenseVector first so any
// expression/strided rhs becomes plain memory; cudaMemcpyAsync from
// pageable host memory returns only after the source has been staged, so
// the temporary's lifetime is sufficient.
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.get(), x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
// Upload y to device (for beta != 0). With beta == 0 the device-side y is
// write-only, so its prior contents don't need to be transferred.
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) {
const DenseVector y_tmp(y);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.get(), y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
// Create dense vector descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.get(), dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.get(), dtype));
// Query workspace size.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMV.
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.get()));
// Download result. The stream sync makes the copy visible to the host and
// also ensures no pending work still references x_tmp / y_tmp.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.get(), y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(x_desc));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(y_desc));
}
// ---- SpMM implementation --------------------------------------------------
// Core for multiplyMat: Y = alpha * op(A) * X + beta * Y, with X and Y
// dense column-major. A must be compressed; X must have op(A).cols() rows
// and Y op(A).rows() rows. Synchronizes the stream before returning.
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
// For op != NON_TRANSPOSE, Y = op(A) * X. The dense X / Y descriptors must
// describe the *post-op* shapes: X has k_op rows (= input dim of op(A)),
// Y has m_op rows (= output dim of op(A)).
const bool transposed = (op != CUSPARSE_OPERATION_NON_TRANSPOSE);
const Index m_op = transposed ? A.cols() : A.rows();
const Index k_op = transposed ? A.rows() : A.cols();
const Index n = X.cols();
const Index nnz = A.nonZeros();
// Empty operand: op(A)*X == 0, so Y reduces to beta * Y; handle on host.
if (m_op == 0 || n == 0 || k_op == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
// Upload X to device. X is k_op x n, Y is m_op x n (column-major).
const size_t x_bytes = static_cast<size_t>(k_op) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m_op) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.get(), X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
// With beta == 0 the device-side Y is write-only; skip the upload.
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.get(), Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
// Create dense matrix descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
// Eigen is column-major, so ld = rows.
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k_op, n, k_op, d_x_.get(), dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m_op, n, m_op, d_y_.get(), dtype, CUSPARSE_ORDER_COL));
// Query workspace. The second operation argument applies to the dense X,
// which we never transpose (op only applies to A).
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMM.
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.get()));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.get(), y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(x_desc));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(y_desc));
}
// ---- Helpers --------------------------------------------------------------
// Debug-only guard that the matrix fits the 32-bit StorageIndex. The
// eigen_assert may compile to nothing in release builds, hence the
// EIGEN_UNUSED_VARIABLE markers to keep the build warning-free.
static void check_storage_index_bounds(Index rows, Index cols, Index nnz) {
const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)());
eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index &&
"gpu::SparseContext currently uses int StorageIndex; matrix dimensions or nonzeros exceed int range");
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
EIGEN_UNUSED_VARIABLE(nnz);
EIGEN_UNUSED_VARIABLE(max_storage_index);
}
// Uploads A's compressed arrays and refreshes the cached spmat descriptor,
// choosing CSC or CSR depending on the cuSPARSE version (see below).
void upload_sparse(const SpMat& A) {
// cuSPARSE 12.0+ accepts CSC directly for both SpMV and SpMM. cuSPARSE
// 11.x SpMM rejects CSC ("unsupported matrix format for matA (CSC)") and
// SpMV with CONJUGATE_TRANSPOSE on CSC+complex silently demotes to
// TRANSPOSE. CSR works on every version, so on 11.x we transpose-copy
// the user's ColMajor input into a RowMajor (CSR) representation.
#if CUSPARSE_VERSION >= 12000
upload_compressed_arrays(A.rows(), A.cols(), A.nonZeros(),
/*outer_count=*/A.cols() + 1, A.outerIndexPtr(), A.innerIndexPtr(), A.valuePtr(),
/*is_csr=*/false);
#else
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
const CsrMat csr(A);
upload_compressed_arrays(csr.rows(), csr.cols(), csr.nonZeros(),
/*outer_count=*/csr.rows() + 1, csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr(),
/*is_csr=*/true);
// cudaMemcpyAsync from pageable host memory blocks the host until the
// source is consumed, so the CsrMat temporary's lifetime is sufficient.
#endif
}
// Copies the outer-pointer / inner-index / value arrays to the device and
// (re)builds the cuSPARSE sparse-matrix descriptor. The descriptor is
// cached keyed on (m, n, nnz): when the shape matches, the existing
// descriptor is kept and only its device pointers are refreshed (pointers
// may change because ensure_buffer can reallocate); otherwise the old
// descriptor is destroyed and a new CSR or CSC one is created.
void upload_compressed_arrays(Index m, Index n, Index nnz, Index outer_count, const StorageIndex* host_outer,
const StorageIndex* host_inner, const Scalar* host_values, bool is_csr) {
const size_t outer_bytes = static_cast<size_t>(outer_count) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.get(), host_outer, outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.get(), host_inner, inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), host_values, val_bytes, cudaMemcpyHostToDevice, stream_));
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors_checked();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
if (is_csr) {
EIGEN_CUSPARSE_CHECK(cusparseCreateCsr(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
val_type));
} else {
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
val_type));
}
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else if (is_csr) {
EIGEN_CUSPARSE_CHECK(cusparseCsrSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
} else {
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
}
}
// Destructor-only cleanup: there is no useful recovery path for failures.
void destroy_descriptors_unchecked() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
// Hot-path cleanup: destruction failures are surfaced through
// EIGEN_CUSPARSE_CHECK. Also invalidates the descriptor cache key.
void destroy_descriptors_checked() {
if (spmat_desc_) {
EIGEN_CUSPARSE_CHECK(cusparseDestroySpMat(spmat_desc_));
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
// Grow-only allocation policy: reallocates buf only when `needed` exceeds
// its tracked capacity. Synchronizes the stream before replacing a live
// buffer so no in-flight async operation can still reference the old
// allocation when it is freed.
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
if (needed > current_size) {
if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H