unsupported/Eigen/src/GPU/GpuLLT.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2026 Eigen Authors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0

 // GPU Cholesky (LLT) decomposition using cuSOLVER.
 //
 // Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky
 // factor in device memory for the lifetime of the object. Multiple solves
 // against the same factor therefore only transfer the RHS and solution
 // vectors, not the factor itself.
 //
 // Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
 // Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
 //
 // Usage:
 //   gpu::LLT<double> llt(A);            // upload A, potrf, L stays on device
 //   if (llt.info() != Success) { ... }
 //   MatrixXd x1 = llt.solve(b1);        // potrs, only b1 transferred
 //   MatrixXd x2 = llt.solve(b2);        // L already on device

 #ifndef EIGEN_GPU_LLT_H
 #define EIGEN_GPU_LLT_H

 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 #include "./GpuSolverContext.h"

 namespace Eigen {
 namespace gpu {

 /** \ingroup GPU_Module
  * \class LLT
  * \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
  *
  * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
  * \tparam UpLo_    Triangle used: Lower (default) or Upper
  *
  * Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
  * caches the factor L in device memory. Each subsequent solve(B) uploads only
  * B, calls cusolverDnXpotrs, and downloads the result — the factor is not
  * re-transferred.
  *
  * Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
  * enabling concurrent factorizations from multiple objects on the same host
  * thread.
  */
 template <typename Scalar_, int UpLo_ = Lower>
 class LLT {
  public:
   using Scalar = Scalar_;
   using RealScalar = typename NumTraits<Scalar>::Real;
   using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

   static constexpr int UpLo = UpLo_;

   // ---- Construction / destruction ------------------------------------------

   /** Default constructor. Does not factorize; call compute() before solve(). */
   LLT() = default;

   /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
   template <typename InputType>
   explicit LLT(const EigenBase<InputType>& A) {
     compute(A);
   }

   ~LLT() = default;

   // Non-copyable (owns device memory and library handles).
   LLT(const LLT&) = delete;
   LLT& operator=(const LLT&) = delete;

   // Movable.
   LLT(LLT&& o) noexcept
       : solver_ctx_(std::move(o.solver_ctx_)),
         d_factor_(std::move(o.d_factor_)),
         factor_alloc_size_(o.factor_alloc_size_),
         n_(o.n_),
         lda_(o.lda_) {
     o.factor_alloc_size_ = 0;
     o.n_ = 0;
     o.lda_ = 0;
   }

   LLT& operator=(LLT&& o) noexcept {
     if (this != &o) {
       solver_ctx_ = std::move(o.solver_ctx_);
       d_factor_ = std::move(o.d_factor_);
       factor_alloc_size_ = o.factor_alloc_size_;
       n_ = o.n_;
       lda_ = o.lda_;
       o.factor_alloc_size_ = 0;
       o.n_ = 0;
       o.lda_ = 0;
     }
     return *this;
   }

   // ---- Factorization -------------------------------------------------------

   /** Compute the Cholesky factorization of A (host matrix). */
   template <typename InputType>
   LLT& compute(const EigenBase<InputType>& A) {
     eigen_assert(A.rows() == A.cols());
     if (!begin_compute(A.rows())) return *this;

     const PlainMatrix mat(A.derived());
     lda_ = static_cast<int64_t>(mat.rows());
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_));

     factorize();
     return *this;
   }

   /** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
   LLT& compute(const DeviceMatrix<Scalar>& d_A) {
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;

     lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(solver_ctx_.stream_);
     allocate_factor_storage();
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));

     factorize();
     return *this;
   }

   /** Compute the Cholesky factorization from a device matrix (move, no copy). */
   LLT& compute(DeviceMatrix<Scalar>&& d_A) {
     eigen_assert(d_A.rows() == d_A.cols());
     if (!begin_compute(d_A.rows())) return *this;

     lda_ = static_cast<int64_t>(d_A.rows());
     d_A.waitReady(solver_ctx_.stream_);
     d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
     factor_alloc_size_ = factorBytes();

     factorize();
     return *this;
   }

   // ---- Solve ---------------------------------------------------------------

   /** Solve A * X = B using the cached Cholesky factor (host → host). */
   template <typename Rhs>
   PlainMatrix solve(const MatrixBase<Rhs>& B) const {
     solver_ctx_.sync_info();
     eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
     eigen_assert(B.rows() == n_);

     const PlainMatrix rhs(B);
     const int64_t nrhs = static_cast<int64_t>(rhs.cols());
     const int64_t ldb = static_cast<int64_t>(rhs.rows());
     internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_));
     DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));

     PlainMatrix X(n_, B.cols());
     int solve_info = 0;
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_));
     EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int),
                                              cudaMemcpyDeviceToHost, solver_ctx_.stream_));
     EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));

     eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
     return X;
   }

   /** Solve A * X = B with device-resident RHS. Returns immediately after enqueuing
    * the solve; the sync_info()/info_ check at entry forces a host wait on the
    * factorization status so a singular A causes a clean assert rather than a
    * silently-bad result. */
   DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
     solver_ctx_.sync_info();
     eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
     eigen_assert(d_B.rows() == n_);
     d_B.waitReady(solver_ctx_.stream_);
     const int64_t nrhs = static_cast<int64_t>(d_B.cols());
     const int64_t ldb = static_cast<int64_t>(d_B.rows());
     internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
     EIGEN_CUDA_RUNTIME_CHECK(
         cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
     return solve_impl(nrhs, ldb, std::move(d_x));
   }

   // ---- Accessors -----------------------------------------------------------

   ComputationInfo info() const { return solver_ctx_.info(); }
   Index rows() const { return n_; }
   Index cols() const { return n_; }
   cudaStream_t stream() const { return solver_ctx_.stream_; }

  private:
   mutable internal::GpuSolverContext solver_ctx_;
   internal::DeviceBuffer d_factor_;
   size_t factor_alloc_size_ = 0;
   int64_t n_ = 0;
   int64_t lda_ = 0;

   bool begin_compute(Index rows) {
     n_ = rows;
     solver_ctx_.info_ = InvalidInput;
     if (n_ == 0) {
       solver_ctx_.info_ = Success;
       solver_ctx_.info_synced_ = true;
       return false;
     }
     return true;
   }

   size_t factorBytes() const { return rhsBytes(n_, lda_); }

   static size_t rhsBytes(int64_t cols, int64_t ld) {
     return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar);
   }

   void allocate_factor_storage() {
     size_t needed = factorBytes();
     if (needed > factor_alloc_size_) {
       d_factor_ = internal::DeviceBuffer(needed);
       factor_alloc_size_ = needed;
     }
   }

   // Solve in place on `d_x` (which already holds B), then re-wrap as a typed
   // DeviceMatrix carrying shape and a ready event. The release/adopt hop hands
   // ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to
   // the typed DeviceMatrix without copying.
   DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

     EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, nrhs, dtype,
                                           d_factor_.get(), lda_, dtype, d_x.get(), ldb, solver_ctx_.scratch_info()));

     DeviceMatrix<Scalar> result =
         DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
     result.recordReady(solver_ctx_.stream_);
     return result;
   }

   void factorize() {
     constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
     constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

     solver_ctx_.mark_pending();

     size_t dev_ws_bytes = 0, host_ws_bytes = 0;
     EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
                                                      d_factor_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes));

     solver_ctx_.ensure_scratch(dev_ws_bytes);
     solver_ctx_.h_workspace_.resize(host_ws_bytes);

     EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
                                           d_factor_.get(), lda_, dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes,
                                           host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes,
                                           solver_ctx_.scratch_info()));

     EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
                                              cudaMemcpyDeviceToHost, solver_ctx_.stream_));
   }
 };

 }  // namespace gpu
 }  // namespace Eigen

 #endif  // EIGEN_GPU_LLT_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2026 Eigen Authors
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-License-Identifier: MPL-2.0

	// GPU Cholesky (LLT) decomposition using cuSOLVER.
	//
	// Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky
	// factor in device memory for the lifetime of the object. Multiple solves
	// against the same factor therefore only transfer the RHS and solution
	// vectors, not the factor itself.
	//
	// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
	// Requires CUDA 11.4+ (cusolverDnX generic API + cudaMallocAsync).
	//
	// Usage:
	// gpu::LLT<double> llt(A); // upload A, potrf, L stays on device
	// if (llt.info() != Success) { ... }
	// MatrixXd x1 = llt.solve(b1); // potrs, only b1 transferred
	// MatrixXd x2 = llt.solve(b2); // L already on device

	#ifndef EIGEN_GPU_LLT_H
	#define EIGEN_GPU_LLT_H

	// IWYU pragma: private
	#include "./InternalHeaderCheck.h"

	#include "./GpuSolverContext.h"

	namespace Eigen {
	namespace gpu {

	/** \ingroup GPU_Module
	* \class LLT
	* \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
	*
	* \tparam Scalar_ Element type: float, double, complex<float>, complex<double>
	* \tparam UpLo_ Triangle used: Lower (default) or Upper
	*
	* Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
	* caches the factor L in device memory. Each subsequent solve(B) uploads only
	* B, calls cusolverDnXpotrs, and downloads the result — the factor is not
	* re-transferred.
	*
	* Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
	* enabling concurrent factorizations from multiple objects on the same host
	* thread.
	*/
	template <typename Scalar_, int UpLo_ = Lower>
	class LLT {
	public:
	using Scalar = Scalar_;
	using RealScalar = typename NumTraits<Scalar>::Real;
	using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;

	static constexpr int UpLo = UpLo_;

	// ---- Construction / destruction ------------------------------------------

	/** Default constructor. Does not factorize; call compute() before solve(). */
	LLT() = default;

	/** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
	template <typename InputType>
	explicit LLT(const EigenBase<InputType>& A) {
	compute(A);
	}

	~LLT() = default;

	// Non-copyable (owns device memory and library handles).
	LLT(const LLT&) = delete;
	LLT& operator=(const LLT&) = delete;

	// Movable.
	LLT(LLT&& o) noexcept
	: solver_ctx_(std::move(o.solver_ctx_)),
	d_factor_(std::move(o.d_factor_)),
	factor_alloc_size_(o.factor_alloc_size_),
	n_(o.n_),
	lda_(o.lda_) {
	o.factor_alloc_size_ = 0;
	o.n_ = 0;
	o.lda_ = 0;
	}

	LLT& operator=(LLT&& o) noexcept {
	if (this != &o) {
	solver_ctx_ = std::move(o.solver_ctx_);
	d_factor_ = std::move(o.d_factor_);
	factor_alloc_size_ = o.factor_alloc_size_;
	n_ = o.n_;
	lda_ = o.lda_;
	o.factor_alloc_size_ = 0;
	o.n_ = 0;
	o.lda_ = 0;
	}
	return *this;
	}

	// ---- Factorization -------------------------------------------------------

	/** Compute the Cholesky factorization of A (host matrix). */
	template <typename InputType>
	LLT& compute(const EigenBase<InputType>& A) {
	eigen_assert(A.rows() == A.cols());
	if (!begin_compute(A.rows())) return *this;

	const PlainMatrix mat(A.derived());
	lda_ = static_cast<int64_t>(mat.rows());
	allocate_factor_storage();
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, solver_ctx_.stream_));

	factorize();
	return *this;
	}

	/** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
	LLT& compute(const DeviceMatrix<Scalar>& d_A) {
	eigen_assert(d_A.rows() == d_A.cols());
	if (!begin_compute(d_A.rows())) return *this;

	lda_ = static_cast<int64_t>(d_A.rows());
	d_A.waitReady(solver_ctx_.stream_);
	allocate_factor_storage();
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));

	factorize();
	return *this;
	}

	/** Compute the Cholesky factorization from a device matrix (move, no copy). */
	LLT& compute(DeviceMatrix<Scalar>&& d_A) {
	eigen_assert(d_A.rows() == d_A.cols());
	if (!begin_compute(d_A.rows())) return *this;

	lda_ = static_cast<int64_t>(d_A.rows());
	d_A.waitReady(solver_ctx_.stream_);
	d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
	factor_alloc_size_ = factorBytes();

	factorize();
	return *this;
	}

	// ---- Solve ---------------------------------------------------------------

	/** Solve A * X = B using the cached Cholesky factor (host → host). */
	template <typename Rhs>
	PlainMatrix solve(const MatrixBase<Rhs>& B) const {
	solver_ctx_.sync_info();
	eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
	eigen_assert(B.rows() == n_);

	const PlainMatrix rhs(B);
	const int64_t nrhs = static_cast<int64_t>(rhs.cols());
	const int64_t ldb = static_cast<int64_t>(rhs.rows());
	internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, solver_ctx_.stream_));
	DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));

	PlainMatrix X(n_, B.cols());
	int solve_info = 0;
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, solver_ctx_.stream_));
	EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solve_info, solver_ctx_.scratch_info(), sizeof(int),
	cudaMemcpyDeviceToHost, solver_ctx_.stream_));
	EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(solver_ctx_.stream_));

	eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
	return X;
	}

	/** Solve A * X = B with device-resident RHS. Returns immediately after enqueuing
	* the solve; the sync_info()/info_ check at entry forces a host wait on the
	* factorization status so a singular A causes a clean assert rather than a
	* silently-bad result. */
	DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
	solver_ctx_.sync_info();
	eigen_assert(solver_ctx_.info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
	eigen_assert(d_B.rows() == n_);
	d_B.waitReady(solver_ctx_.stream_);
	const int64_t nrhs = static_cast<int64_t>(d_B.cols());
	const int64_t ldb = static_cast<int64_t>(d_B.rows());
	internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
	EIGEN_CUDA_RUNTIME_CHECK(
	cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, solver_ctx_.stream_));
	return solve_impl(nrhs, ldb, std::move(d_x));
	}

	// ---- Accessors -----------------------------------------------------------

	ComputationInfo info() const { return solver_ctx_.info(); }
	Index rows() const { return n_; }
	Index cols() const { return n_; }
	cudaStream_t stream() const { return solver_ctx_.stream_; }

	private:
	mutable internal::GpuSolverContext solver_ctx_;
	internal::DeviceBuffer d_factor_;
	size_t factor_alloc_size_ = 0;
	int64_t n_ = 0;
	int64_t lda_ = 0;

	bool begin_compute(Index rows) {
	n_ = rows;
	solver_ctx_.info_ = InvalidInput;
	if (n_ == 0) {
	solver_ctx_.info_ = Success;
	solver_ctx_.info_synced_ = true;
	return false;
	}
	return true;
	}

	size_t factorBytes() const { return rhsBytes(n_, lda_); }

	static size_t rhsBytes(int64_t cols, int64_t ld) {
	return static_cast<size_t>(ld) * static_cast<size_t>(cols) * sizeof(Scalar);
	}

	void allocate_factor_storage() {
	size_t needed = factorBytes();
	if (needed > factor_alloc_size_) {
	d_factor_ = internal::DeviceBuffer(needed);
	factor_alloc_size_ = needed;
	}
	}

	// Solve in place on `d_x` (which already holds B), then re-wrap as a typed
	// DeviceMatrix carrying shape and a ready event. The release/adopt hop hands
	// ownership of the raw cudaMalloc pointer from the untyped DeviceBuffer to
	// the typed DeviceMatrix without copying.
	DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
	constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
	constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

	EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, nrhs, dtype,
	d_factor_.get(), lda_, dtype, d_x.get(), ldb, solver_ctx_.scratch_info()));

	DeviceMatrix<Scalar> result =
	DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
	result.recordReady(solver_ctx_.stream_);
	return result;
	}

	void factorize() {
	constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
	constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;

	solver_ctx_.mark_pending();

	size_t dev_ws_bytes = 0, host_ws_bytes = 0;
	EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
	d_factor_.get(), lda_, dtype, &dev_ws_bytes, &host_ws_bytes));

	solver_ctx_.ensure_scratch(dev_ws_bytes);
	solver_ctx_.h_workspace_.resize(host_ws_bytes);

	EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(solver_ctx_.cusolver_, solver_ctx_.params_.p, uplo, n_, dtype,
	d_factor_.get(), lda_, dtype, solver_ctx_.scratch_workspace(), dev_ws_bytes,
	host_ws_bytes > 0 ? solver_ctx_.h_workspace_.data() : nullptr, host_ws_bytes,
	solver_ctx_.scratch_info()));

	EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(&solver_ctx_.info_word(), solver_ctx_.scratch_info(), sizeof(int),
	cudaMemcpyDeviceToHost, solver_ctx_.stream_));
	}
	};

	} // namespace gpu
	} // namespace Eigen

	#endif // EIGEN_GPU_LLT_H