unsupported/Eigen/src/GPU/GpuLU.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2026 Eigen Authors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0

 // GPU partial-pivoting LU decomposition using cuSOLVER.
 //
 // Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
 // The factored LU matrix and pivot array are kept in device memory for the
 // lifetime of the object, so repeated solves only transfer the RHS/solution.
 //
 // Requires CUDA 11.0+ (cusolverDnX generic API).
 //
 // Usage:
 //   gpu::LU<double> lu(A);            // upload A, getrf, LU+ipiv on device
 //   if (lu.info() != Success) { ... }
 //   MatrixXd x = lu.solve(b);         // getrs NoTrans, only b transferred
 //   MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans);             // A^T x = b

 #ifndef EIGEN_GPU_LU_H
 #define EIGEN_GPU_LU_H

 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 #include "./CuBlasSupport.h"
 #include "./CuSolverSupport.h"
 #include <vector>

 namespace Eigen {
 namespace gpu {

 /** \ingroup GPU_Module
  * \class LU
  * \brief GPU LU decomposition with partial pivoting via cuSOLVER
  *
  * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
  *
  * Decomposes a square matrix A = P L U on the GPU and retains the factored
  * matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
  * A^H*X=B by passing the appropriate gpu::GpuOp.
  *
  * Each LU object owns a dedicated CUDA stream and cuSOLVER handle.
  */
 template <typename Scalar_>
 class LU {
  public:
   using Scalar = Scalar_;
   using RealScalar = typename NumTraits<Scalar>::Real;
   using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

   // ---- Construction / destruction ------------------------------------------

   LU() { init_context(); }

   template <typename InputType>
   explicit LU(const EigenBase<InputType>& A) : LU() {
     compute(A);
   }

   ~LU() {
     // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
     // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
     // noexcept dtor terminates the program. The trailing cudaFree() of the
     // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
     if (handle_) (void)cusolverDnDestroy(handle_);
     if (stream_) (void)cudaStreamDestroy(stream_);
   }

   LU(const LU&) = delete;
   LU& operator=(const LU&) = delete;

   LU(LU&& o) noexcept
       : stream_(o.stream_),
         handle_(o.handle_),
         params_(std::move(o.params_)),
         d_lu_(std::move(o.d_lu_)),
         lu_alloc_size_(o.lu_alloc_size_),
         d_ipiv_(std::move(o.d_ipiv_)),
         d_scratch_(std::move(o.d_scratch_)),
         scratch_size_(o.scratch_size_),
         h_workspace_(std::move(o.h_workspace_)),
         n_(o.n_),
         lda_(o.lda_),
         info_(o.info_),
         pinned_info_(std::move(o.pinned_info_)),
         info_synced_(o.info_synced_) {
     o.stream_ = nullptr;
     o.handle_ = nullptr;
     o.lu_alloc_size_ = 0;
     o.scratch_size_ = 0;
     o.n_ = 0;
     o.lda_ = 0;
     o.info_ = InvalidInput;
     o.info_synced_ = true;
   }

   LU& operator=(LU&& o) noexcept {
     if (this != &o) {
       // Drain pending work on the old stream before tearing it down so the
       // upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an
       // in-flight kernel is still touching. The asymmetric move-ctor doesn't
       // need this — it constructs onto uninitialized storage.
       if (stream_) (void)cudaStreamSynchronize(stream_);
       if (handle_) (void)cusolverDnDestroy(handle_);
       if (stream_) (void)cudaStreamDestroy(stream_);
       stream_ = o.stream_;
       handle_ = o.handle_;
       params_ = std::move(o.params_);
       d_lu_ = std::move(o.d_lu_);
       lu_alloc_size_ = o.lu_alloc_size_;
       d_ipiv_ = std::move(o.d_ipiv_);
       d_scratch_ = std::move(o.d_scratch_);
       scratch_size_ = o.scratch_size_;
       h_workspace_ = std::move(o.h_workspace_);
       n_ = o.n_;
       lda_ = o.lda_;
       info_ = o.info_;
       pinned_info_ = std::move(o.pinned_info_);
       info_synced_ = o.info_synced_;
       o.stream_ = nullptr;
       o.handle_ = nullptr;
       o.lu_alloc_size_ = 0;
       o.scratch_size_ = 0;
       o.n_ = 0;
       o.lda_ = 0;
       o.info_ = InvalidInput;
       o.info_synced_ = true;
     }
     return *this;
   }

   // ---- Factorization -------------------------------------------------------

   /** Compute the LU factorization of A (host matrix, must be square). */
   template <typename InputType>
   LU& compute(const EigenBase<InputType>& A) {
     eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
     if (!begin_compute(A.rows())) return *this;

     const PlainMatrix mat(A.derived());
     lda_ = static_cast<int64_t>(mat.outerStride());
     allocate_lu_storage();
     EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));

     factorize();
     return *this;
   }

   /** Compute the LU factorization from a device-resident matrix (D2D copy). */
   LU& compute(const DeviceMatrix<Scalar>& d_A) {
     eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;

     lda_ = static_cast<int64_t>(d_A.outerStride());
     d_A.waitReady(stream_);
     allocate_lu_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));

     factorize();
     return *this;
   }

   /** Compute the LU factorization from a device matrix (move, no copy). */
   LU& compute(DeviceMatrix<Scalar>&& d_A) {
     eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
     if (!begin_compute(d_A.rows())) return *this;

     lda_ = static_cast<int64_t>(d_A.outerStride());
     d_A.waitReady(stream_);
     d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
     lu_alloc_size_ = matrixBytes();

     factorize();
     return *this;
   }

   // ---- Solve ---------------------------------------------------------------

   /** Solve op(A) * X = B using the cached LU factorization (host → host).
    *
    * \param B  Right-hand side (n x nrhs host matrix).
    * \param op gpu::GpuOp::NoTrans (default), Trans, or ConjTrans.
    */
   template <typename Rhs>
   PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const {
     const_cast<LU*>(this)->sync_info();
     eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization");
     eigen_assert(B.rows() == n_);

     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
     const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
     internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x));

     PlainMatrix X(n_, B.cols());
     int solve_info = 0;
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
     EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

     eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
     return X;
   }

   /** Solve op(A) * X = B with device-resident RHS. Fully async. */
   DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const {
     eigen_assert(d_B.rows() == n_);
     d_B.waitReady(stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
     const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
     internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
     return solve_impl(nrhs, ldb, op, std::move(d_x));
   }

   // ---- Accessors -----------------------------------------------------------

   /** Lazily synchronizes the stream on first call after compute(). */
   ComputationInfo info() const {
     const_cast<LU*>(this)->sync_info();
     return info_;
   }
   Index rows() const { return n_; }
   Index cols() const { return n_; }
   cudaStream_t stream() const { return stream_; }

  private:
   cudaStream_t stream_ = nullptr;
   cusolverDnHandle_t handle_ = nullptr;
   internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
   internal::DeviceBuffer d_lu_;       // LU factors on device (grows, never shrinks)
   size_t lu_alloc_size_ = 0;          // current d_lu_ allocation size
   internal::DeviceBuffer d_ipiv_;     // pivot indices (int64_t) on device
   internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
   size_t scratch_size_ = 0;           // current scratch allocation size
   std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
   int64_t n_ = 0;
   int64_t lda_ = 0;
   ComputationInfo info_ = InvalidInput;
   internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
   bool info_synced_ = true;                              // has the stream been synced for info?

   int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
   int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }

   bool begin_compute(Index rows) {
     n_ = rows;
     info_ = InvalidInput;
     if (n_ == 0) {
       info_ = Success;
       info_synced_ = true;
       return false;
     }
     return true;
   }

   size_t matrixBytes() const { return matrixBytes(n_, lda_); }

   static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
     return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
   }

   void allocate_lu_storage() {
     size_t needed = matrixBytes();
     if (needed > lu_alloc_size_) {
       d_lu_ = internal::DeviceBuffer(needed);
       lu_alloc_size_ = needed;
     }
   }

   // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
   // Workspace size is rounded up to 16 bytes so the info word lands aligned.
   static constexpr size_t kInfoBytes = sizeof(int);
   static constexpr size_t kScratchAlign = 16;

   static size_t scratchBytesFor(size_t workspace_bytes) {
     workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
     return workspace_bytes + kInfoBytes;
   }

   // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
   // Syncs the stream before reallocating to avoid freeing memory that
   // async kernels may still be using.
   void ensure_scratch(size_t bytes) {
     if (bytes > scratch_size_) {
       if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
       d_scratch_ = internal::DeviceBuffer(bytes);
       scratch_size_ = bytes;
     }
   }

   void* scratch_workspace() const { return d_scratch_.get(); }
   int* scratch_info() const {
     eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
     return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
   }

   // Solve in place on `d_x` (which already holds B), then re-wrap as a
   // typed DeviceMatrix carrying shape and a ready event. The release/adopt
   // hop just hands ownership of the raw cudaMalloc pointer from the untyped
   // DeviceBuffer to the typed DeviceMatrix without copying.
   DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     const cublasOperation_t trans = internal::to_cublas_op(op);

     EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_,
                                           static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb,
                                           scratch_info()));

     DeviceMatrix<Scalar> result =
         DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
     result.recordReady(stream_);
     return result;
   }

   void init_context() {
     EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
     EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
     EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
     ensure_scratch(kInfoBytes);
   }

   void sync_info() {
     if (!info_synced_) {
       EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
       info_ = (info_word() == 0) ? Success : NumericalIssue;
       info_synced_ = true;
     }
   }

   // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
   // Enqueues factorization + async info download. Does NOT sync.
   // Workspaces are stored as members to ensure they outlive the async kernels.
   void factorize() {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);

     info_synced_ = false;
     info_ = InvalidInput;

     d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);

     size_t dev_ws_bytes = 0, host_ws_bytes = 0;
     EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype,
                                                      &dev_ws_bytes, &host_ws_bytes));

     ensure_scratch(scratchBytesFor(dev_ws_bytes));
     h_workspace_.resize(host_ws_bytes);

     EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_,
                                           static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(),
                                           dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr,
                                           host_ws_bytes, scratch_info()));

     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
   }
 };

 }  // namespace gpu
 }  // namespace Eigen

 #endif  // EIGEN_GPU_LU_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2026 Eigen Authors
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-License-Identifier: MPL-2.0

	// GPU partial-pivoting LU decomposition using cuSOLVER.
	//
	// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
	// The factored LU matrix and pivot array are kept in device memory for the
	// lifetime of the object, so repeated solves only transfer the RHS/solution.
	//
	// Requires CUDA 11.0+ (cusolverDnX generic API).
	//
	// Usage:
	// gpu::LU<double> lu(A); // upload A, getrf, LU+ipiv on device
	// if (lu.info() != Success) { ... }
	// MatrixXd x = lu.solve(b); // getrs NoTrans, only b transferred
	// MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans); // A^T x = b

	#ifndef EIGEN_GPU_LU_H
	#define EIGEN_GPU_LU_H

	// IWYU pragma: private
	#include "./InternalHeaderCheck.h"

	#include "./CuBlasSupport.h"
	#include "./CuSolverSupport.h"
	#include <vector>

	namespace Eigen {
	namespace gpu {

	/** \ingroup GPU_Module
	* \class LU
	* \brief GPU LU decomposition with partial pivoting via cuSOLVER
	*
	* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
	*
	* Decomposes a square matrix A = P L U on the GPU and retains the factored
	* matrix and pivot array in device memory. Solves AX=B, A^TX=B, or
	* A^H*X=B by passing the appropriate gpu::GpuOp.
	*
	* Each LU object owns a dedicated CUDA stream and cuSOLVER handle.
	*/
	template <typename Scalar_>
	class LU {
	public:
	using Scalar = Scalar_;
	using RealScalar = typename NumTraits<Scalar>::Real;
	using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

	// ---- Construction / destruction ------------------------------------------

	LU() { init_context(); }

	template <typename InputType>
	explicit LU(const EigenBase<InputType>& A) : LU() {
	compute(A);
	}

	~LU() {
	// Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
	// EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
	// noexcept dtor terminates the program. The trailing cudaFree() of the
	// device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
	if (handle_) (void)cusolverDnDestroy(handle_);
	if (stream_) (void)cudaStreamDestroy(stream_);
	}

	LU(const LU&) = delete;
	LU& operator=(const LU&) = delete;

	LU(LU&& o) noexcept
	: stream_(o.stream_),
	handle_(o.handle_),
	params_(std::move(o.params_)),
	d_lu_(std::move(o.d_lu_)),
	lu_alloc_size_(o.lu_alloc_size_),
	d_ipiv_(std::move(o.d_ipiv_)),
	d_scratch_(std::move(o.d_scratch_)),
	scratch_size_(o.scratch_size_),
	h_workspace_(std::move(o.h_workspace_)),
	n_(o.n_),
	lda_(o.lda_),
	info_(o.info_),
	pinned_info_(std::move(o.pinned_info_)),
	info_synced_(o.info_synced_) {
	o.stream_ = nullptr;
	o.handle_ = nullptr;
	o.lu_alloc_size_ = 0;
	o.scratch_size_ = 0;
	o.n_ = 0;
	o.lda_ = 0;
	o.info_ = InvalidInput;
	o.info_synced_ = true;
	}

	LU& operator=(LU&& o) noexcept {
	if (this != &o) {
	// Drain pending work on the old stream before tearing it down so the
	// upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an
	// in-flight kernel is still touching. The asymmetric move-ctor doesn't
	// need this — it constructs onto uninitialized storage.
	if (stream_) (void)cudaStreamSynchronize(stream_);
	if (handle_) (void)cusolverDnDestroy(handle_);
	if (stream_) (void)cudaStreamDestroy(stream_);
	stream_ = o.stream_;
	handle_ = o.handle_;
	params_ = std::move(o.params_);
	d_lu_ = std::move(o.d_lu_);
	lu_alloc_size_ = o.lu_alloc_size_;
	d_ipiv_ = std::move(o.d_ipiv_);
	d_scratch_ = std::move(o.d_scratch_);
	scratch_size_ = o.scratch_size_;
	h_workspace_ = std::move(o.h_workspace_);
	n_ = o.n_;
	lda_ = o.lda_;
	info_ = o.info_;
	pinned_info_ = std::move(o.pinned_info_);
	info_synced_ = o.info_synced_;
	o.stream_ = nullptr;
	o.handle_ = nullptr;
	o.lu_alloc_size_ = 0;
	o.scratch_size_ = 0;
	o.n_ = 0;
	o.lda_ = 0;
	o.info_ = InvalidInput;
	o.info_synced_ = true;
	}
	return *this;
	}

	// ---- Factorization -------------------------------------------------------

	/** Compute the LU factorization of A (host matrix, must be square). */
	template <typename InputType>
	LU& compute(const EigenBase<InputType>& A) {
	eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
	if (!begin_compute(A.rows())) return *this;

	const PlainMatrix mat(A.derived());
	lda_ = static_cast<int64_t>(mat.outerStride());
	allocate_lu_storage();
	EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));

	factorize();
	return *this;
	}

	/** Compute the LU factorization from a device-resident matrix (D2D copy). */
	LU& compute(const DeviceMatrix<Scalar>& d_A) {
	eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
	if (!begin_compute(d_A.rows())) return *this;

	lda_ = static_cast<int64_t>(d_A.outerStride());
	d_A.waitReady(stream_);
	allocate_lu_storage();
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));

	factorize();
	return *this;
	}

	/** Compute the LU factorization from a device matrix (move, no copy). */
	LU& compute(DeviceMatrix<Scalar>&& d_A) {
	eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
	if (!begin_compute(d_A.rows())) return *this;

	lda_ = static_cast<int64_t>(d_A.outerStride());
	d_A.waitReady(stream_);
	d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
	lu_alloc_size_ = matrixBytes();

	factorize();
	return *this;
	}

	// ---- Solve ---------------------------------------------------------------

	/** Solve op(A) * X = B using the cached LU factorization (host → host).
	*
	* \param B Right-hand side (n x nrhs host matrix).
	* \param op gpu::GpuOp::NoTrans (default), Trans, or ConjTrans.
	*/
	template <typename Rhs>
	PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const {
	const_cast<LU*>(this)->sync_info();
	eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization");
	eigen_assert(B.rows() == n_);

	const PlainMatrix rhs(B);
	const int64_t nrhs = static_cast<int64_t>(rhs.cols());
	const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
	internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
	DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x));

	PlainMatrix X(n_, B.cols());
	int solve_info = 0;
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
	EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));

	eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
	return X;
	}

	/** Solve op(A) * X = B with device-resident RHS. Fully async. */
	DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const {
	eigen_assert(d_B.rows() == n_);
	d_B.waitReady(stream_);
	const int64_t nrhs = static_cast<int64_t>(d_B.cols());
	const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
	internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
	return solve_impl(nrhs, ldb, op, std::move(d_x));
	}

	// ---- Accessors -----------------------------------------------------------

	/** Lazily synchronizes the stream on first call after compute(). */
	ComputationInfo info() const {
	const_cast<LU*>(this)->sync_info();
	return info_;
	}
	Index rows() const { return n_; }
	Index cols() const { return n_; }
	cudaStream_t stream() const { return stream_; }

	private:
	cudaStream_t stream_ = nullptr;
	cusolverDnHandle_t handle_ = nullptr;
	internal::CusolverParams params_; // cuSOLVER params (created once, reused)
	internal::DeviceBuffer d_lu_; // LU factors on device (grows, never shrinks)
	size_t lu_alloc_size_ = 0; // current d_lu_ allocation size
	internal::DeviceBuffer d_ipiv_; // pivot indices (int64_t) on device
	internal::DeviceBuffer d_scratch_; // combined workspace + info word (grows, never shrinks)
	size_t scratch_size_ = 0; // current scratch allocation size
	std::vector<char> h_workspace_; // host workspace (kept alive until next compute)
	int64_t n_ = 0;
	int64_t lda_ = 0;
	ComputationInfo info_ = InvalidInput;
	internal::PinnedHostBuffer pinned_info_{sizeof(int)}; // pinned host memory for async D2H
	bool info_synced_ = true; // has the stream been synced for info?

	int& info_word() { return static_cast<int>(pinned_info_.get()); }
	int info_word() const { return static_cast<const int>(pinned_info_.get()); }

	bool begin_compute(Index rows) {
	n_ = rows;
	info_ = InvalidInput;
	if (n_ == 0) {
	info_ = Success;
	info_synced_ = true;
	return false;
	}
	return true;
	}

	size_t matrixBytes() const { return matrixBytes(n_, lda_); }

	static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
	return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
	}

	void allocate_lu_storage() {
	size_t needed = matrixBytes();
	if (needed > lu_alloc_size_) {
	d_lu_ = internal::DeviceBuffer(needed);
	lu_alloc_size_ = needed;
	}
	}

	// Scratch layout: [ workspace (aligned) \| info_word (sizeof(int)) ].
	// Workspace size is rounded up to 16 bytes so the info word lands aligned.
	static constexpr size_t kInfoBytes = sizeof(int);
	static constexpr size_t kScratchAlign = 16;

	static size_t scratchBytesFor(size_t workspace_bytes) {
	workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
	return workspace_bytes + kInfoBytes;
	}

	// Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
	// Syncs the stream before reallocating to avoid freeing memory that
	// async kernels may still be using.
	void ensure_scratch(size_t bytes) {
	if (bytes > scratch_size_) {
	if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
	d_scratch_ = internal::DeviceBuffer(bytes);
	scratch_size_ = bytes;
	}
	}

	void* scratch_workspace() const { return d_scratch_.get(); }
	int* scratch_info() const {
	eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
	return reinterpret_cast<int>(static_cast<char>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
	}

	// Solve in place on `d_x` (which already holds B), then re-wrap as a
	// typed DeviceMatrix carrying shape and a ready event. The release/adopt
	// hop just hands ownership of the raw cudaMalloc pointer from the untyped
	// DeviceBuffer to the typed DeviceMatrix without copying.
	DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const {
	constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
	const cublasOperation_t trans = internal::to_cublas_op(op);

	EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_,
	static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb,
	scratch_info()));

	DeviceMatrix<Scalar> result =
	DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
	result.recordReady(stream_);
	return result;
	}

	void init_context() {
	EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
	EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
	EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
	ensure_scratch(kInfoBytes);
	}

	void sync_info() {
	if (!info_synced_) {
	EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
	info_ = (info_word() == 0) ? Success : NumericalIssue;
	info_synced_ = true;
	}
	}

	// Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
	// Enqueues factorization + async info download. Does NOT sync.
	// Workspaces are stored as members to ensure they outlive the async kernels.
	void factorize() {
	constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
	const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);

	info_synced_ = false;
	info_ = InvalidInput;

	d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);

	size_t dev_ws_bytes = 0, host_ws_bytes = 0;
	EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype,
	&dev_ws_bytes, &host_ws_bytes));

	ensure_scratch(scratchBytesFor(dev_ws_bytes));
	h_workspace_.resize(host_ws_bytes);

	EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_,
	static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(),
	dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr,
	host_ws_bytes, scratch_info()));

	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
	}
	};

	} // namespace gpu
	} // namespace Eigen

	#endif // EIGEN_GPU_LU_H