| // This file is part of Eigen, a lightweight C++ template library |
| // for linear algebra. |
| // |
| // Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com> |
| // |
| // This Source Code Form is subject to the terms of the Mozilla |
| // Public License v. 2.0. If a copy of the MPL was not distributed |
| // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| // SPDX-License-Identifier: MPL-2.0 |
| |
| // Generic CUDA runtime support shared across all GPU library integrations |
| // (cuSOLVER and cuBLAS): |
| // - Error-checking macros |
| // - RAII device buffer |
| // |
// Only depends on CUDA toolkit headers (<cuda_runtime.h> and <library_types.h>);
// no cuBLAS / cuSOLVER library headers.
| |
| #ifndef EIGEN_GPU_SUPPORT_H |
| #define EIGEN_GPU_SUPPORT_H |
| |
| // IWYU pragma: private |
| #include "./InternalHeaderCheck.h" |
| |
#include <cuda_runtime.h>
#include <library_types.h>  // cudaDataType_t
#include <complex>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
| |
| namespace Eigen { |
| namespace gpu { |
| |
| // ---- Generic operation flag ------------------------------------------------- |
| // Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls. |
| // Each library's support header maps this to its own enum (cublasOperation_t, |
| // cusparseOperation_t, etc.) via a small to_<lib>_op() helper. |
| |
| enum class GpuOp { NoTrans, Trans, ConjTrans }; |
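
// A minimal sketch of such a to_<lib>_op() helper, using the cuBLAS names from
// <cublas_v2.h>. It is kept in a comment because this header deliberately does
// not include any cuBLAS headers:
//
//   inline cublasOperation_t to_cublas_op(GpuOp op) {
//     switch (op) {
//       case GpuOp::NoTrans:   return CUBLAS_OP_N;
//       case GpuOp::Trans:     return CUBLAS_OP_T;
//       case GpuOp::ConjTrans: return CUBLAS_OP_C;
//     }
//     return CUBLAS_OP_N;  // unreachable; keeps -Wreturn-type quiet
//   }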
| |
| namespace internal { |
| |
| // ---- Error-checking macros -------------------------------------------------- |
// These assert (via eigen_assert) on failure; with assertions disabled they
// still evaluate the expression but ignore the error. Not for use in destructors.

#define EIGEN_CUDA_RUNTIME_CHECK(expr)                                        \
  do {                                                                        \
    cudaError_t _e = (expr);                                                  \
    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed");            \
    (void)_e; /* silence unused-variable warnings when asserts compile out */ \
  } while (0)
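
// Example use (dst, src, and bytes are placeholders):
//   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));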
| |
| // ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------ |
| // cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as |
| // `int` (32-bit signed). Modern GPUs can host allocations whose dimensions |
| // exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at |
| // every narrowing call site so an out-of-range value triggers an assert |
| // instead of silently overflowing the BLAS argument. |
| |
| inline int to_blas_int(int64_t v) { |
| eigen_assert(v >= 0 && v <= static_cast<int64_t>((std::numeric_limits<int>::max)()) && |
| "dimension exceeds the int range supported by cuBLAS / cuSOLVER"); |
| return static_cast<int>(v); |
| } |
| |
| // ---- Custom deleters for CUDA-allocated memory ------------------------------ |
| // Used with std::unique_ptr to give CUDA allocations RAII semantics with no |
| // hand-rolled move/dtor boilerplate. |
| |
| struct CudaFreeDeleter { |
| // When `borrow == true`, the unique_ptr does not free the pointer. Used by |
| // DeviceMatrix::view() to wrap a non-owning device pointer with the same |
| // smart-pointer machinery as owning storage, without changing the type. |
| bool borrow = false; |
| void operator()(void* p) const noexcept { |
| if (p && !borrow) (void)cudaFree(p); |
| } |
| }; |
| |
| struct CudaFreeHostDeleter { |
| void operator()(void* p) const noexcept { |
| if (p) (void)cudaFreeHost(p); |
| } |
| }; |
| |
| // ---- Thread-local pool of small device buffers ------------------------------ |
// Recycles allocations up to kSmallBufferThreshold bytes (e.g., DeviceScalar)
// to avoid cudaMalloc/cudaFree overhead. Larger allocations bypass the pool.
// The pool is per-thread: a buffer released on a different thread than the one
// that allocated it is simply returned to the releasing thread's pool.
| |
| template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64> |
| struct DeviceBufferPool { |
| static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold; |
| static constexpr size_t kMaxPoolSize = MaxPoolSize; |
| |
| struct Entry { |
| void* ptr; |
| size_t bytes; |
| }; |
| |
| ~DeviceBufferPool() { |
| for (auto& e : free_list_) (void)cudaFree(e.ptr); |
| } |
| |
| void* allocate(size_t bytes) { |
| for (size_t i = 0; i < free_list_.size(); ++i) { |
| if (free_list_[i].bytes >= bytes) { |
| void* p = free_list_[i].ptr; |
| free_list_[i] = free_list_.back(); |
| free_list_.pop_back(); |
| return p; |
| } |
| } |
| void* p = nullptr; |
| EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); |
| return p; |
| } |
| |
| void deallocate(void* p, size_t bytes) { |
| if (free_list_.size() < kMaxPoolSize) { |
| free_list_.push_back({p, bytes}); |
| } else { |
| (void)cudaFree(p); |
| } |
| } |
| |
| static DeviceBufferPool& threadLocal() { |
| thread_local DeviceBufferPool pool; |
| return pool; |
| } |
| |
| private: |
| std::vector<Entry> free_list_; |
| }; |
| |
| // Stateful deleter that returns small buffers to the thread-local pool and |
| // cudaFree's larger ones. size==0 means "always cudaFree" (for adopted ptrs). |
| struct PooledCudaFreeDeleter { |
| size_t size = 0; |
| |
| void operator()(void* p) const noexcept { |
| if (!p) return; |
| if (size > 0 && size <= DeviceBufferPool<>::kSmallBufferThreshold) { |
| DeviceBufferPool<>::threadLocal().deallocate(p, size); |
| } else { |
| (void)cudaFree(p); |
| } |
| } |
| }; |
| |
| // ---- RAII: device buffer ---------------------------------------------------- |
| |
| class DeviceBuffer { |
| public: |
| DeviceBuffer() = default; |
| |
| explicit DeviceBuffer(size_t bytes) { |
| if (bytes > 0) { |
| void* p = nullptr; |
| if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) { |
| p = DeviceBufferPool<>::threadLocal().allocate(bytes); |
| } else { |
| EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes)); |
| } |
| ptr_ = std::unique_ptr<void, PooledCudaFreeDeleter>(p, PooledCudaFreeDeleter{bytes}); |
| } |
| } |
| |
| void* get() const noexcept { return ptr_.get(); } |
| void* release() noexcept { return ptr_.release(); } |
| explicit operator bool() const noexcept { return static_cast<bool>(ptr_); } |
| |
| size_t size() const noexcept { return ptr_.get_deleter().size; } |
| |
| // Adopt an existing device pointer. Caller relinquishes ownership. |
| // Adopted buffers bypass the pool on destruction (deleter size == 0). |
| static DeviceBuffer adopt(void* p) noexcept { |
| DeviceBuffer b; |
| b.ptr_ = std::unique_ptr<void, PooledCudaFreeDeleter>(p, PooledCudaFreeDeleter{}); |
| return b; |
| } |
| |
| private: |
| std::unique_ptr<void, PooledCudaFreeDeleter> ptr_; |
| }; |
| |
| // ---- RAII: pinned host buffer ----------------------------------------------- |
| // For async D2H copies (cudaMemcpyAsync requires pinned host memory for true |
| // asynchrony and to avoid compute-sanitizer warnings). |
| |
| class PinnedHostBuffer { |
| public: |
| PinnedHostBuffer() = default; |
| |
| explicit PinnedHostBuffer(size_t bytes) { |
| if (bytes > 0) { |
| void* p = nullptr; |
| EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes)); |
| ptr_.reset(p); |
| } |
| } |
| |
| void* get() const noexcept { return ptr_.get(); } |
| explicit operator bool() const noexcept { return static_cast<bool>(ptr_); } |
| |
| private: |
| std::unique_ptr<void, CudaFreeHostDeleter> ptr_; |
| }; |
| |
| // ---- Scalar → cudaDataType_t ------------------------------------------------ |
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in <library_types.h>,
// which is included explicitly at the top of this file.
| |
| template <typename Scalar> |
| struct cuda_data_type; |
| |
| template <> |
| struct cuda_data_type<float> { |
| static constexpr cudaDataType_t value = CUDA_R_32F; |
| }; |
| template <> |
| struct cuda_data_type<double> { |
| static constexpr cudaDataType_t value = CUDA_R_64F; |
| }; |
| template <> |
| struct cuda_data_type<std::complex<float>> { |
| static constexpr cudaDataType_t value = CUDA_C_32F; |
| }; |
| template <> |
| struct cuda_data_type<std::complex<double>> { |
| static constexpr cudaDataType_t value = CUDA_C_64F; |
| }; |
| |
| } // namespace internal |
| } // namespace gpu |
| } // namespace Eigen |
| |
| #endif // EIGEN_GPU_SUPPORT_H |