// unsupported/Eigen/src/GPU/GpuSupport.h - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0

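 // Generic CUDA runtime support shared across all GPU library integrations
 // (cuSOLVER and cuBLAS):
 //   - Generic transpose/adjoint operation flag (GpuOp)
 //   - Error-checking macros
 //   - Bounds-checked narrowing to int for BLAS-style dimension arguments
 //   - RAII device buffer and RAII pinned host buffer
 //   - Scalar -> cudaDataType_t mapping
 //
 // The only CUDA header required is <cuda_runtime.h>. No NVIDIA library headers.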

 #ifndef EIGEN_GPU_SUPPORT_H
 #define EIGEN_GPU_SUPPORT_H

 // IWYU pragma: private
 #include "./InternalHeaderCheck.h"

 #include <cuda_runtime.h>

 #include <limits>
 #include <memory>

 namespace Eigen {
 namespace gpu {

 // ---- Generic operation flag -------------------------------------------------
 // Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls.
 // Each library's support header maps this to its own enum (cublasOperation_t,
 // cusparseOperation_t, etc.) via a small to_<lib>_op() helper.

 enum class GpuOp {
   NoTrans,   // use the operand as stored
   Trans,     // use the transpose of the operand
   ConjTrans  // use the conjugate transpose (adjoint) of the operand
 };

 namespace internal {

 // ---- Error-checking macros --------------------------------------------------
 // These abort (via eigen_assert) on failure. Not for use in destructors.

 // Evaluates `expr` exactly once and asserts (via eigen_assert) that the result
 // is cudaSuccess. If eigen_assert is compiled out (e.g. EIGEN_NO_DEBUG / NDEBUG
 // builds) the call is still made but its result is not checked; the (void)
 // cast keeps `_e` from triggering unused-variable warnings in that case.
 #define EIGEN_CUDA_RUNTIME_CHECK(expr)                             \
   do {                                                             \
     cudaError_t _e = (expr);                                       \
     (void)_e;                                                      \
     eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
   } while (0)

 // ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------
 // cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as
 // `int` (32-bit signed). Modern GPUs can host allocations whose dimensions
 // exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at
 // every narrowing call site so an out-of-range value triggers an assert
 // instead of silently overflowing the BLAS argument.

 // Narrows a 64-bit dimension to `int`, asserting that it lies in
 // [0, INT_MAX]. The conversion itself is an unconditional static_cast, so the
 // range is only enforced where eigen_assert is active.
 inline int to_blas_int(int64_t v) {
   // (max)() is parenthesised so a function-like `max` macro cannot expand here.
   eigen_assert(0 <= v && v <= int64_t{(std::numeric_limits<int>::max)()} &&
                "dimension exceeds the int range supported by cuBLAS / cuSOLVER");
   return static_cast<int>(v);
 }

 // ---- Custom deleters for CUDA-allocated memory ------------------------------
 // Used with std::unique_ptr to give CUDA allocations RAII semantics with no
 // hand-rolled move/dtor boilerplate.

 // Deleter for std::unique_ptr that releases device memory with cudaFree.
 struct CudaFreeDeleter {
   // Set to true to make the owning unique_ptr non-owning: the pointer is then
   // left untouched on destruction. DeviceMatrix::view() relies on this to wrap
   // a borrowed device pointer in the same smart-pointer type used for owned
   // storage.
   bool borrow = false;

   // Frees `p` unless it is null or merely borrowed. The cudaFree status is
   // intentionally discarded because this runs on the (noexcept) destruction path.
   void operator()(void* p) const noexcept {
     if (borrow || p == nullptr) return;
     (void)cudaFree(p);
   }
 };

 // Deleter for std::unique_ptr that releases host memory with cudaFreeHost.
 struct CudaFreeHostDeleter {
   // Frees `p` when it is non-null; the cudaFreeHost status is intentionally
   // discarded because this runs on the (noexcept) destruction path.
   void operator()(void* p) const noexcept {
     if (p == nullptr) return;
     (void)cudaFreeHost(p);
   }
 };

 // ---- RAII: device buffer ----------------------------------------------------

 class DeviceBuffer {
  public:
   DeviceBuffer() = default;

   explicit DeviceBuffer(size_t bytes) {
     if (bytes > 0) {
       void* p = nullptr;
       EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
       ptr_.reset(p);
     }
   }

   void* get() const noexcept { return ptr_.get(); }
   void* release() noexcept { return ptr_.release(); }
   explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }

   // Adopt an existing device pointer. Caller relinquishes ownership.
   static DeviceBuffer adopt(void* p) noexcept {
     DeviceBuffer b;
     b.ptr_.reset(p);
     return b;
   }

  private:
   std::unique_ptr<void, CudaFreeDeleter> ptr_;
 };

 // ---- RAII: pinned host buffer -----------------------------------------------
 // For async D2H copies (cudaMemcpyAsync requires pinned host memory for true
 // asynchrony and to avoid compute-sanitizer warnings).

 class PinnedHostBuffer {
  public:
   PinnedHostBuffer() = default;

   explicit PinnedHostBuffer(size_t bytes) {
     if (bytes > 0) {
       void* p = nullptr;
       EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes));
       ptr_.reset(p);
     }
   }

   void* get() const noexcept { return ptr_.get(); }
   explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }

  private:
   std::unique_ptr<void, CudaFreeHostDeleter> ptr_;
 };

 // ---- Scalar → cudaDataType_t ------------------------------------------------
 // Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
 // which is included transitively by cuda_runtime.h.

 template <typename Scalar>
 struct cuda_data_type;

 template <>
 struct cuda_data_type<float> {
   static constexpr cudaDataType_t value = CUDA_R_32F;
 };
 template <>
 struct cuda_data_type<double> {
   static constexpr cudaDataType_t value = CUDA_R_64F;
 };
 template <>
 struct cuda_data_type<std::complex<float>> {
   static constexpr cudaDataType_t value = CUDA_C_32F;
 };
 template <>
 struct cuda_data_type<std::complex<double>> {
   static constexpr cudaDataType_t value = CUDA_C_64F;
 };

 }  // namespace internal
 }  // namespace gpu
 }  // namespace Eigen

 #endif  // EIGEN_GPU_SUPPORT_H
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-License-Identifier: MPL-2.0

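	// Generic CUDA runtime support shared across all GPU library integrations
	// (cuSOLVER and cuBLAS):
	// - Generic transpose/adjoint operation flag (GpuOp)
	// - Error-checking macros
	// - Bounds-checked narrowing to int for BLAS-style dimension arguments
	// - RAII device buffer and RAII pinned host buffer
	// - Scalar -> cudaDataType_t mapping
	//
	// The only CUDA header required is <cuda_runtime.h>. No NVIDIA library headers.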

	#ifndef EIGEN_GPU_SUPPORT_H
	#define EIGEN_GPU_SUPPORT_H

	// IWYU pragma: private
	#include "./InternalHeaderCheck.h"

	#include <cuda_runtime.h>

	#include <limits>
	#include <memory>

	namespace Eigen {
	namespace gpu {

	// ---- Generic operation flag -------------------------------------------------
	// Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls.
	// Each library's support header maps this to its own enum (cublasOperation_t,
	// cusparseOperation_t, etc.) via a small to_<lib>_op() helper.

	enum class GpuOp {
	  NoTrans,   // use the operand as stored
	  Trans,     // use the transpose of the operand
	  ConjTrans  // use the conjugate transpose (adjoint) of the operand
	};

	namespace internal {

	// ---- Error-checking macros --------------------------------------------------
	// These abort (via eigen_assert) on failure. Not for use in destructors.

	// Evaluates `expr` exactly once and asserts (via eigen_assert) that the result
	// is cudaSuccess. If eigen_assert is compiled out (e.g. EIGEN_NO_DEBUG / NDEBUG
	// builds) the call is still made but its result is not checked; the (void)
	// cast keeps `_e` from triggering unused-variable warnings in that case.
	#define EIGEN_CUDA_RUNTIME_CHECK(expr) \
	  do { \
	    cudaError_t _e = (expr); \
	    (void)_e; \
	    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed"); \
	  } while (0)

	// ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------
	// cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as
	// `int` (32-bit signed). Modern GPUs can host allocations whose dimensions
	// exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at
	// every narrowing call site so an out-of-range value triggers an assert
	// instead of silently overflowing the BLAS argument.

	// Narrows a 64-bit dimension to `int`, asserting that it lies in
	// [0, INT_MAX]. The conversion itself is an unconditional static_cast, so the
	// range is only enforced where eigen_assert is active.
	inline int to_blas_int(int64_t v) {
	  // (max)() is parenthesised so a function-like `max` macro cannot expand here.
	  eigen_assert(0 <= v && v <= int64_t{(std::numeric_limits<int>::max)()} &&
	               "dimension exceeds the int range supported by cuBLAS / cuSOLVER");
	  return static_cast<int>(v);
	}

	// ---- Custom deleters for CUDA-allocated memory ------------------------------
	// Used with std::unique_ptr to give CUDA allocations RAII semantics with no
	// hand-rolled move/dtor boilerplate.

	// Deleter for std::unique_ptr that releases device memory with cudaFree.
	struct CudaFreeDeleter {
	  // Set to true to make the owning unique_ptr non-owning: the pointer is then
	  // left untouched on destruction. DeviceMatrix::view() relies on this to wrap
	  // a borrowed device pointer in the same smart-pointer type used for owned
	  // storage.
	  bool borrow = false;

	  // Frees `p` unless it is null or merely borrowed. The cudaFree status is
	  // intentionally discarded because this runs on the (noexcept) destruction path.
	  void operator()(void* p) const noexcept {
	    if (borrow || p == nullptr) return;
	    (void)cudaFree(p);
	  }
	};

	// Deleter for std::unique_ptr that releases host memory with cudaFreeHost.
	struct CudaFreeHostDeleter {
	  // Frees `p` when it is non-null; the cudaFreeHost status is intentionally
	  // discarded because this runs on the (noexcept) destruction path.
	  void operator()(void* p) const noexcept {
	    if (p == nullptr) return;
	    (void)cudaFreeHost(p);
	  }
	};

	// ---- RAII: device buffer ----------------------------------------------------

	class DeviceBuffer {
	public:
	DeviceBuffer() = default;

	explicit DeviceBuffer(size_t bytes) {
	if (bytes > 0) {
	void* p = nullptr;
	EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
	ptr_.reset(p);
	}
	}

	void* get() const noexcept { return ptr_.get(); }
	void* release() noexcept { return ptr_.release(); }
	explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }

	// Adopt an existing device pointer. Caller relinquishes ownership.
	static DeviceBuffer adopt(void* p) noexcept {
	DeviceBuffer b;
	b.ptr_.reset(p);
	return b;
	}

	private:
	std::unique_ptr<void, CudaFreeDeleter> ptr_;
	};

	// ---- RAII: pinned host buffer -----------------------------------------------
	// For async D2H copies (cudaMemcpyAsync requires pinned host memory for true
	// asynchrony and to avoid compute-sanitizer warnings).

	class PinnedHostBuffer {
	public:
	PinnedHostBuffer() = default;

	explicit PinnedHostBuffer(size_t bytes) {
	if (bytes > 0) {
	void* p = nullptr;
	EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes));
	ptr_.reset(p);
	}
	}

	void* get() const noexcept { return ptr_.get(); }
	explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }

	private:
	std::unique_ptr<void, CudaFreeHostDeleter> ptr_;
	};

	// ---- Scalar → cudaDataType_t ------------------------------------------------
	// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
	// which is included transitively by cuda_runtime.h.

	template <typename Scalar>
	struct cuda_data_type;

	template <>
	struct cuda_data_type<float> {
	static constexpr cudaDataType_t value = CUDA_R_32F;
	};
	template <>
	struct cuda_data_type<double> {
	static constexpr cudaDataType_t value = CUDA_R_64F;
	};
	template <>
	struct cuda_data_type<std::complex<float>> {
	static constexpr cudaDataType_t value = CUDA_C_32F;
	};
	template <>
	struct cuda_data_type<std::complex<double>> {
	static constexpr cudaDataType_t value = CUDA_C_64F;
	};

	} // namespace internal
	} // namespace gpu
	} // namespace Eigen

	#endif // EIGEN_GPU_SUPPORT_H