blob: 45926f838ad5ae190e41c7774bc0b6fd70c5aa39 [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// GPU sparse matrix-vector multiply (SpMV) and sparse matrix-dense matrix
// multiply (SpMM) via cuSPARSE.
//
// SparseContext manages a cuSPARSE handle and device buffers. It accepts
// Eigen SparseMatrix<Scalar, ColMajor> (CSC) and performs SpMV/SpMM on the
// GPU. RowMajor input is implicitly converted to ColMajor.
//
// Thread safety: not thread-safe. Concurrent multiply* calls on a single
// SparseContext race on the cuSPARSE handle, the bound stream, and the
// cached device buffers. Use one SparseContext per thread.
//
// Usage:
// SparseContext<double> ctx;
// VectorXd y = ctx.multiply(A, x); // y = A * x
// ctx.multiply(A, x, y, 2.0, 1.0); // y = 2*A*x + y
// ctx.multiply(A, x, y, 1.0, 0.0, gpu::GpuOp::ConjTrans); // y = A^H * x
// VectorXd z = ctx.multiplyT(A, x); // z = A^T * x
// VectorXcd w = ctx.multiplyAdjoint(A, x); // w = A^H * x (complex)
// MatrixXd Y = ctx.multiplyMat(A, X); // Y = A * X (multiple RHS)
#ifndef EIGEN_GPU_SPARSE_CONTEXT_H
#define EIGEN_GPU_SPARSE_CONTEXT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include "./CuSparseSupport.h"
namespace Eigen {
namespace gpu {
template <typename Scalar_>
class SparseContext {
public:
using Scalar = Scalar_;
using RealScalar = typename NumTraits<Scalar>::Real;
// 32-bit indices only; dimensions/nnz are guarded by check_storage_index_bounds().
using StorageIndex = int;
// Canonical on-device input format: compressed ColMajor (CSC) with int indices.
using SpMat = SparseMatrix<Scalar, ColMajor, StorageIndex>;
using DenseVector = Matrix<Scalar, Dynamic, 1>;
using DenseMatrix = Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
// Creates a private CUDA stream and a cuSPARSE handle bound to it. All
// uploads, compute, and downloads issued by this context run on that one
// stream, in order.
SparseContext() {
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
EIGEN_CUSPARSE_CHECK(cusparseCreate(&handle_));
EIGEN_CUSPARSE_CHECK(cusparseSetStream(handle_, stream_));
}
// Releases the cached descriptor, the handle, and the stream. Return codes
// are deliberately ignored: there is no recovery path in a destructor.
~SparseContext() {
destroy_descriptors_unchecked();
if (handle_) (void)cusparseDestroy(handle_);
if (stream_) (void)cudaStreamDestroy(stream_);
}
// Non-copyable: the context owns CUDA resources (stream, handle, buffers).
SparseContext(const SparseContext&) = delete;
SparseContext& operator=(const SparseContext&) = delete;
// ---- SpMV: y = A * x -----------------------------------------------------
/** Compute y = A * x. Returns y as a new dense vector.
  * The input is converted to compressed ColMajor int-indexed storage first
  * (a no-op copy when it is already in that form). */
template <typename InputType, typename Rhs>
DenseVector multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
DenseVector y(mat.rows());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_NON_TRANSPOSE);
return y;
}
/** Compute y = alpha * op(A) * x + beta * y (in-place).
  * y must already be sized to op(A)'s output dimension; with beta != 0 its
  * current contents contribute to the result. */
template <typename InputType, typename Rhs, typename Dest>
void multiply(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x, MatrixBase<Dest>& y,
Scalar alpha = Scalar(1), Scalar beta = Scalar(0), GpuOp op = GpuOp::NoTrans) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
multiply_impl(mat, x.derived(), y.derived(), alpha, beta, internal::to_cusparse_op_for_scalar<Scalar>(op));
}
// ---- SpMV transpose: y = A^T * x -----------------------------------------
/** Compute y = A^T * x. Returns y as a new dense vector. */
template <typename InputType, typename Rhs>
DenseVector multiplyT(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
// Transposed output has A.cols() entries.
DenseVector y(mat.cols());
y.setZero();
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0), CUSPARSE_OPERATION_TRANSPOSE);
return y;
}
// ---- SpMV adjoint: y = A^H * x -------------------------------------------
/** Compute y = A^H * x (conjugate transpose). For real Scalar this is equivalent to multiplyT. */
template <typename InputType, typename Rhs>
DenseVector multiplyAdjoint(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& x) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
DenseVector y(mat.cols());
y.setZero();
// to_cusparse_op_for_scalar maps ConjTrans to plain TRANSPOSE for real
// Scalar, where conjugation is a no-op.
multiply_impl(mat, x.derived(), y, Scalar(1), Scalar(0),
internal::to_cusparse_op_for_scalar<Scalar>(GpuOp::ConjTrans));
return y;
}
// ---- SpMM: Y = op(A) * X (multiple RHS) ----------------------------------
/** Compute Y = op(A) * X where X is a dense matrix (multiple RHS). Returns Y.
  * X must have op(A).cols() rows (checked by eigen_assert). */
template <typename InputType, typename Rhs>
DenseMatrix multiplyMat(const SparseMatrixBase<InputType>& A, const MatrixBase<Rhs>& X, GpuOp op = GpuOp::NoTrans) {
const InputType& input = A.derived();
check_storage_index_bounds(input.rows(), input.cols(), input.nonZeros());
const SpMat mat(input);
// Materialize X as contiguous ColMajor so its data() can be uploaded directly.
const DenseMatrix rhs(X.derived());
const cusparseOperation_t cu_op = internal::to_cusparse_op_for_scalar<Scalar>(op);
// Output/input dims of op(A): swap rows/cols for (conj)transpose.
const Index m = (op == GpuOp::NoTrans) ? mat.rows() : mat.cols();
const Index k = (op == GpuOp::NoTrans) ? mat.cols() : mat.rows();
eigen_assert(k == rhs.rows());
const Index n = rhs.cols();
// Empty operand: the product is exactly zero; skip the GPU round-trip.
if (m == 0 || n == 0 || mat.nonZeros() == 0) return DenseMatrix::Zero(m, n);
DenseMatrix Y = DenseMatrix::Zero(m, n);
spmm_impl(mat, rhs, Y, Scalar(1), Scalar(0), cu_op);
return Y;
}
// ---- Accessors ------------------------------------------------------------
/** The CUDA stream all of this context's work is issued on. */
cudaStream_t stream() const { return stream_; }
private:
cudaStream_t stream_ = nullptr;
cusparseHandle_t handle_ = nullptr;
// Cached device buffers (grow-only).
internal::DeviceBuffer d_outerPtr_;
internal::DeviceBuffer d_innerIdx_;
internal::DeviceBuffer d_values_;
internal::DeviceBuffer d_x_;
internal::DeviceBuffer d_y_;
internal::DeviceBuffer d_workspace_;
// Current capacity in bytes of each buffer above (tracked separately
// because DeviceBuffer reallocation is driven by ensure_buffer()).
size_t d_outerPtr_size_ = 0;
size_t d_innerIdx_size_ = 0;
size_t d_values_size_ = 0;
size_t d_x_size_ = 0;
size_t d_y_size_ = 0;
size_t d_workspace_size_ = 0;
// Cached cuSPARSE descriptors.
cusparseSpMatDescr_t spmat_desc_ = nullptr;
// Shape key for the cached spmat descriptor; -1 means "no descriptor".
// A matrix with matching (rows, cols, nnz) reuses the descriptor and only
// refreshes its device pointers (see upload_compressed_arrays).
Index cached_rows_ = -1;
Index cached_cols_ = -1;
Index cached_nnz_ = -1;
// ---- SpMV implementation --------------------------------------------------
// Shared core for all SpMV entry points: y = alpha * op(A) * x + beta * y.
// A must be compressed; x and y must already match op(A)'s input/output
// sizes. Synchronizes the stream before returning, so y is valid on return
// and the host staging temporaries may safely go out of scope.
template <typename RhsDerived, typename DestDerived>
void multiply_impl(const SpMat& A, const RhsDerived& x, DestDerived& y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
const Index m = A.rows();
const Index n = A.cols();
const Index nnz = A.nonZeros();
const Index x_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? n : m;
const Index y_size = (op == CUSPARSE_OPERATION_NON_TRANSPOSE) ? m : n;
eigen_assert(x.size() == x_size);
eigen_assert(y.size() == y_size);
// Empty matrix: op(A)*x == 0, so the result is just beta * y. Handle on
// the host and skip the GPU entirely.
if (m == 0 || n == 0 || nnz == 0) {
if (beta == Scalar(0))
y.setZero();
else
y *= beta;
return;
}
// Upload sparse matrix to device.
upload_sparse(A);
// Upload x to device. Stage into a contiguous DenseVector first so any
// expression/strided rhs becomes plain memory; cudaMemcpyAsync from
// pageable host memory returns only after the source has been staged, so
// the temporary's lifetime is sufficient.
ensure_buffer(d_x_, d_x_size_, static_cast<size_t>(x_size) * sizeof(Scalar));
const DenseVector x_tmp(x);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_x_.get(), x_tmp.data(), x_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
// Upload y to device (for beta != 0). With beta == 0 the device-side y is
// write-only, so its prior contents don't need to be transferred.
ensure_buffer(d_y_, d_y_size_, static_cast<size_t>(y_size) * sizeof(Scalar));
if (beta != Scalar(0)) {
const DenseVector y_tmp(y);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_y_.get(), y_tmp.data(), y_size * sizeof(Scalar), cudaMemcpyHostToDevice, stream_));
}
// Create dense vector descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnVecDescr_t x_desc = nullptr, y_desc = nullptr;
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&x_desc, x_size, d_x_.get(), dtype));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnVec(&y_desc, y_size, d_y_.get(), dtype));
// Query workspace size.
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMV_bufferSize(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMV.
EIGEN_CUSPARSE_CHECK(cusparseSpMV(handle_, op, &alpha, spmat_desc_, x_desc, &beta, y_desc, dtype,
CUSPARSE_SPMV_ALG_DEFAULT, d_workspace_.get()));
// Download result. The stream sync makes the copy visible to the host and
// also ensures no pending work still references x_tmp / y_tmp.
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(y.data(), d_y_.get(), y_size * sizeof(Scalar), cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(x_desc));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnVec(y_desc));
}
// ---- SpMM implementation --------------------------------------------------
// Core for multiplyMat: Y = alpha * op(A) * X + beta * Y, with X and Y
// dense column-major. A must be compressed; X must have op(A).cols() rows
// and Y op(A).rows() rows. Synchronizes the stream before returning.
void spmm_impl(const SpMat& A, const DenseMatrix& X, DenseMatrix& Y, Scalar alpha, Scalar beta,
cusparseOperation_t op) {
eigen_assert(A.isCompressed());
// For op != NON_TRANSPOSE, Y = op(A) * X. The dense X / Y descriptors must
// describe the *post-op* shapes: X has k_op rows (= input dim of op(A)),
// Y has m_op rows (= output dim of op(A)).
const bool transposed = (op != CUSPARSE_OPERATION_NON_TRANSPOSE);
const Index m_op = transposed ? A.cols() : A.rows();
const Index k_op = transposed ? A.rows() : A.cols();
const Index n = X.cols();
const Index nnz = A.nonZeros();
// Empty operand: op(A)*X == 0, so Y reduces to beta * Y; handle on host.
if (m_op == 0 || n == 0 || k_op == 0 || nnz == 0) {
if (beta == Scalar(0))
Y.setZero();
else
Y *= beta;
return;
}
upload_sparse(A);
// Upload X to device. X is k_op x n, Y is m_op x n (column-major).
const size_t x_bytes = static_cast<size_t>(k_op) * static_cast<size_t>(n) * sizeof(Scalar);
const size_t y_bytes = static_cast<size_t>(m_op) * static_cast<size_t>(n) * sizeof(Scalar);
ensure_buffer(d_x_, d_x_size_, x_bytes);
ensure_buffer(d_y_, d_y_size_, y_bytes);
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_x_.get(), X.data(), x_bytes, cudaMemcpyHostToDevice, stream_));
// With beta == 0 the device-side Y is write-only; skip the upload.
if (beta != Scalar(0)) {
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_y_.get(), Y.data(), y_bytes, cudaMemcpyHostToDevice, stream_));
}
// Create dense matrix descriptors.
constexpr cudaDataType_t dtype = internal::cuda_data_type<Scalar>::value;
cusparseDnMatDescr_t x_desc = nullptr, y_desc = nullptr;
// Eigen is column-major, so ld = rows.
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&x_desc, k_op, n, k_op, d_x_.get(), dtype, CUSPARSE_ORDER_COL));
EIGEN_CUSPARSE_CHECK(cusparseCreateDnMat(&y_desc, m_op, n, m_op, d_y_.get(), dtype, CUSPARSE_ORDER_COL));
// Query workspace. The second operation argument applies to the dense X,
// which we never transpose (op only applies to A).
size_t ws_size = 0;
EIGEN_CUSPARSE_CHECK(cusparseSpMM_bufferSize(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_,
x_desc, &beta, y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size));
ensure_buffer(d_workspace_, d_workspace_size_, ws_size);
// Execute SpMM.
EIGEN_CUSPARSE_CHECK(cusparseSpMM(handle_, op, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, spmat_desc_, x_desc, &beta,
y_desc, dtype, CUSPARSE_SPMM_ALG_DEFAULT, d_workspace_.get()));
// Download result.
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(Y.data(), d_y_.get(), y_bytes, cudaMemcpyDeviceToHost, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(x_desc));
EIGEN_CUSPARSE_CHECK(cusparseDestroyDnMat(y_desc));
}
// ---- Helpers --------------------------------------------------------------
// Debug-only guard that the matrix fits the 32-bit StorageIndex. The
// eigen_assert may compile to nothing in release builds, hence the
// EIGEN_UNUSED_VARIABLE markers to keep the build warning-free.
static void check_storage_index_bounds(Index rows, Index cols, Index nnz) {
const Index max_storage_index = static_cast<Index>((std::numeric_limits<StorageIndex>::max)());
eigen_assert(rows <= max_storage_index && cols <= max_storage_index && nnz <= max_storage_index &&
"gpu::SparseContext currently uses int StorageIndex; matrix dimensions or nonzeros exceed int range");
EIGEN_UNUSED_VARIABLE(rows);
EIGEN_UNUSED_VARIABLE(cols);
EIGEN_UNUSED_VARIABLE(nnz);
EIGEN_UNUSED_VARIABLE(max_storage_index);
}
// Uploads A's compressed arrays and refreshes the cached spmat descriptor,
// choosing CSC or CSR depending on the cuSPARSE version (see below).
void upload_sparse(const SpMat& A) {
// cuSPARSE 12.0+ accepts CSC directly for both SpMV and SpMM. cuSPARSE
// 11.x SpMM rejects CSC ("unsupported matrix format for matA (CSC)") and
// SpMV with CONJUGATE_TRANSPOSE on CSC+complex silently demotes to
// TRANSPOSE. CSR works on every version, so on 11.x we transpose-copy
// the user's ColMajor input into a RowMajor (CSR) representation.
#if CUSPARSE_VERSION >= 12000
upload_compressed_arrays(A.rows(), A.cols(), A.nonZeros(),
/*outer_count=*/A.cols() + 1, A.outerIndexPtr(), A.innerIndexPtr(), A.valuePtr(),
/*is_csr=*/false);
#else
using CsrMat = SparseMatrix<Scalar, RowMajor, StorageIndex>;
const CsrMat csr(A);
upload_compressed_arrays(csr.rows(), csr.cols(), csr.nonZeros(),
/*outer_count=*/csr.rows() + 1, csr.outerIndexPtr(), csr.innerIndexPtr(), csr.valuePtr(),
/*is_csr=*/true);
// cudaMemcpyAsync from pageable host memory blocks the host until the
// source is consumed, so the CsrMat temporary's lifetime is sufficient.
#endif
}
// Copies the outer-pointer / inner-index / value arrays to the device and
// (re)builds the cuSPARSE sparse-matrix descriptor. The descriptor is
// cached keyed on (m, n, nnz): when the shape matches, the existing
// descriptor is kept and only its device pointers are refreshed (pointers
// may change because ensure_buffer can reallocate); otherwise the old
// descriptor is destroyed and a new CSR or CSC one is created.
void upload_compressed_arrays(Index m, Index n, Index nnz, Index outer_count, const StorageIndex* host_outer,
const StorageIndex* host_inner, const Scalar* host_values, bool is_csr) {
const size_t outer_bytes = static_cast<size_t>(outer_count) * sizeof(StorageIndex);
const size_t inner_bytes = static_cast<size_t>(nnz) * sizeof(StorageIndex);
const size_t val_bytes = static_cast<size_t>(nnz) * sizeof(Scalar);
ensure_buffer(d_outerPtr_, d_outerPtr_size_, outer_bytes);
ensure_buffer(d_innerIdx_, d_innerIdx_size_, inner_bytes);
ensure_buffer(d_values_, d_values_size_, val_bytes);
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_outerPtr_.get(), host_outer, outer_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(
cudaMemcpyAsync(d_innerIdx_.get(), host_inner, inner_bytes, cudaMemcpyHostToDevice, stream_));
EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_values_.get(), host_values, val_bytes, cudaMemcpyHostToDevice, stream_));
if (m != cached_rows_ || n != cached_cols_ || nnz != cached_nnz_) {
destroy_descriptors_checked();
constexpr cusparseIndexType_t idx_type = (sizeof(StorageIndex) == 4) ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
constexpr cudaDataType_t val_type = internal::cuda_data_type<Scalar>::value;
if (is_csr) {
EIGEN_CUSPARSE_CHECK(cusparseCreateCsr(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
val_type));
} else {
EIGEN_CUSPARSE_CHECK(cusparseCreateCsc(&spmat_desc_, m, n, nnz, d_outerPtr_.get(), d_innerIdx_.get(),
d_values_.get(), idx_type, idx_type, CUSPARSE_INDEX_BASE_ZERO,
val_type));
}
cached_rows_ = m;
cached_cols_ = n;
cached_nnz_ = nnz;
} else if (is_csr) {
EIGEN_CUSPARSE_CHECK(cusparseCsrSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
} else {
EIGEN_CUSPARSE_CHECK(cusparseCscSetPointers(spmat_desc_, d_outerPtr_.get(), d_innerIdx_.get(), d_values_.get()));
}
}
// Destructor-only cleanup: there is no useful recovery path for failures.
void destroy_descriptors_unchecked() {
if (spmat_desc_) {
(void)cusparseDestroySpMat(spmat_desc_);
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
// Hot-path cleanup: destruction failures are surfaced through
// EIGEN_CUSPARSE_CHECK. Also invalidates the descriptor cache key.
void destroy_descriptors_checked() {
if (spmat_desc_) {
EIGEN_CUSPARSE_CHECK(cusparseDestroySpMat(spmat_desc_));
spmat_desc_ = nullptr;
}
cached_rows_ = -1;
cached_cols_ = -1;
cached_nnz_ = -1;
}
// Grow-only allocation policy: reallocates buf only when `needed` exceeds
// its tracked capacity. Synchronizes the stream before replacing a live
// buffer so no in-flight async operation can still reference the old
// allocation when it is freed.
void ensure_buffer(internal::DeviceBuffer& buf, size_t& current_size, size_t needed) {
if (needed > current_size) {
if (buf) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
buf = internal::DeviceBuffer(needed);
current_size = needed;
}
}
};
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_SPARSE_CONTEXT_H