// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// Generic CUDA runtime support shared across all GPU library integrations
// (cuSOLVER and cuBLAS):
// - Error-checking macros
// - RAII device buffer
//
// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
#ifndef EIGEN_GPU_SUPPORT_H
#define EIGEN_GPU_SUPPORT_H
// IWYU pragma: private
#include "./InternalHeaderCheck.h"
#include <cuda_runtime.h>

#include <complex>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <vector>
namespace Eigen {
namespace gpu {
// ---- Generic operation flag -------------------------------------------------
// Public flag for transpose/adjoint in BLAS-, solver-, and sparse-style calls.
// Each library's support header maps this to its own enum (cublasOperation_t,
// cusparseOperation_t, etc.) via a small to_<lib>_op() helper.
enum class GpuOp { NoTrans, Trans, ConjTrans };
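// Example (sketch only, not part of this header): a cuBLAS-side mapping helper
// of the kind described above. The name `to_cublas_op` is illustrative:
//
//   inline cublasOperation_t to_cublas_op(GpuOp op) {
//     switch (op) {
//       case GpuOp::NoTrans:   return CUBLAS_OP_N;
//       case GpuOp::Trans:     return CUBLAS_OP_T;
//       case GpuOp::ConjTrans: return CUBLAS_OP_C;
//     }
//     return CUBLAS_OP_N;  // unreachable; silences -Wreturn-type
//   }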
namespace internal {
// ---- Error-checking macros --------------------------------------------------
// These check the returned status with eigen_assert, which aborts on failure
// when assertions are enabled; the wrapped call still executes when they are
// not. Not for use in destructors.
#define EIGEN_CUDA_RUNTIME_CHECK(expr)                               \
  do {                                                               \
    cudaError_t _e = (expr);                                         \
    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed");   \
  } while (0)
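// Example (sketch): typical call sites. These are ordinary CUDA runtime calls;
// the macro only adds the status check (`dst`, `src`, and `bytes` are assumed
// to exist):
//
//   EIGEN_CUDA_RUNTIME_CHECK(cudaSetDevice(0));
//   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
//   EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceSynchronize());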
// ---- Bounds-checked narrowing for cuBLAS/cuSOLVER int parameters ------------
// cuBLAS and the legacy cuSOLVER APIs take dimensions and leading dimensions as
// `int` (32-bit signed). Modern GPUs can host allocations whose dimensions
// exceed INT_MAX, and Eigen's Index is 64-bit by default. Use this helper at
// every narrowing call site so an out-of-range value triggers an assert
// instead of silently overflowing the BLAS argument.
inline int to_blas_int(int64_t v) {
  eigen_assert(v >= 0 && v <= static_cast<int64_t>((std::numeric_limits<int>::max)()) &&
               "dimension exceeds the int range supported by cuBLAS / cuSOLVER");
  return static_cast<int>(v);
}
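// Example (sketch): narrowing Eigen::Index values at a cuBLAS call site.
// `handle`, `A`, `alpha`, `beta`, and the device pointers are assumptions for
// illustration, not part of this header:
//
//   const int m   = to_blas_int(A.rows());
//   const int n   = to_blas_int(A.cols());
//   const int lda = to_blas_int(A.outerStride());
//   cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, dA, lda, dx, 1, &beta, dy, 1);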
// ---- Custom deleters for CUDA-allocated memory ------------------------------
// Used with std::unique_ptr to give CUDA allocations RAII semantics with no
// hand-rolled move/dtor boilerplate.
struct CudaFreeDeleter {
  // When `borrow == true`, the unique_ptr does not free the pointer. Used by
  // DeviceMatrix::view() to wrap a non-owning device pointer with the same
  // smart-pointer machinery as owning storage, without changing the type.
  bool borrow = false;
  void operator()(void* p) const noexcept {
    if (p && !borrow) (void)cudaFree(p);
  }
};

struct CudaFreeHostDeleter {
  void operator()(void* p) const noexcept {
    if (p) (void)cudaFreeHost(p);
  }
};
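// Example (sketch): wrapping a raw device allocation, plus a non-owning view
// of it, with the deleter above (`bytes` is assumed to exist):
//
//   void* p = nullptr;
//   EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
//   std::unique_ptr<void, CudaFreeDeleter> owned(p);                                  // cudaFree'd
//   std::unique_ptr<void, CudaFreeDeleter> view(owned.get(), CudaFreeDeleter{true});  // never freed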
// ---- Thread-local pool of small device buffers ------------------------------
// Recycles allocations up to kSmallBufferThreshold bytes (e.g., DeviceScalar)
// to avoid cudaMalloc/cudaFree overhead. Larger allocations bypass the pool.
template <size_t SmallBufferThreshold = 256, size_t MaxPoolSize = 64>
struct DeviceBufferPool {
  static constexpr size_t kSmallBufferThreshold = SmallBufferThreshold;
  static constexpr size_t kMaxPoolSize = MaxPoolSize;

  struct Entry {
    void* ptr;
    size_t bytes;
  };

  ~DeviceBufferPool() {
    for (auto& e : free_list_) (void)cudaFree(e.ptr);
  }

  void* allocate(size_t bytes) {
    for (size_t i = 0; i < free_list_.size(); ++i) {
      if (free_list_[i].bytes >= bytes) {
        void* p = free_list_[i].ptr;
        free_list_[i] = free_list_.back();
        free_list_.pop_back();
        return p;
      }
    }
    void* p = nullptr;
    EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
    return p;
  }

  void deallocate(void* p, size_t bytes) {
    if (free_list_.size() < kMaxPoolSize) {
      free_list_.push_back({p, bytes});
    } else {
      (void)cudaFree(p);
    }
  }

  static DeviceBufferPool& threadLocal() {
    thread_local DeviceBufferPool pool;
    return pool;
  }

 private:
  std::vector<Entry> free_list_;
};
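// Example (sketch): direct use of the thread-local pool. Most code should go
// through DeviceBuffer below rather than calling the pool itself:
//
//   auto& pool = DeviceBufferPool<>::threadLocal();
//   void* p = pool.allocate(sizeof(double));   // reuses a pooled block when one fits
//   pool.deallocate(p, sizeof(double));        // returned to the free list, not cudaFree'd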
// Stateful deleter that returns small buffers to the thread-local pool and
// cudaFree's larger ones. size==0 means "always cudaFree" (for adopted ptrs).
struct PooledCudaFreeDeleter {
  size_t size = 0;
  void operator()(void* p) const noexcept {
    if (!p) return;
    if (size > 0 && size <= DeviceBufferPool<>::kSmallBufferThreshold) {
      DeviceBufferPool<>::threadLocal().deallocate(p, size);
    } else {
      (void)cudaFree(p);
    }
  }
};
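// Example (sketch): the deleter's size field decides what happens on reset.
// `p` and `q` stand in for device pointers obtained elsewhere:
//
//   std::unique_ptr<void, PooledCudaFreeDeleter> small(p, PooledCudaFreeDeleter{64});  // pooled
//   std::unique_ptr<void, PooledCudaFreeDeleter> adopted(q, PooledCudaFreeDeleter{});  // size==0: cudaFree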
// ---- RAII: device buffer ----------------------------------------------------
class DeviceBuffer {
 public:
  DeviceBuffer() = default;
  explicit DeviceBuffer(size_t bytes) {
    if (bytes > 0) {
      void* p = nullptr;
      if (bytes <= DeviceBufferPool<>::kSmallBufferThreshold) {
        p = DeviceBufferPool<>::threadLocal().allocate(bytes);
      } else {
        EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
      }
      ptr_ = std::unique_ptr<void, PooledCudaFreeDeleter>(p, PooledCudaFreeDeleter{bytes});
    }
  }

  void* get() const noexcept { return ptr_.get(); }
  void* release() noexcept { return ptr_.release(); }
  explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }
  size_t size() const noexcept { return ptr_.get_deleter().size; }

  // Adopt an existing device pointer. Caller relinquishes ownership.
  // Adopted buffers bypass the pool on destruction (deleter size == 0).
  static DeviceBuffer adopt(void* p) noexcept {
    DeviceBuffer b;
    b.ptr_ = std::unique_ptr<void, PooledCudaFreeDeleter>(p, PooledCudaFreeDeleter{});
    return b;
  }

 private:
  std::unique_ptr<void, PooledCudaFreeDeleter> ptr_;
};
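// Example (sketch): owning scratch space for a kernel or library call. `h_src`
// (a host array of `n` doubles) is an assumption for illustration:
//
//   DeviceBuffer scratch(n * sizeof(double));   // pooled if small enough
//   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy(scratch.get(), h_src,
//                                       n * sizeof(double), cudaMemcpyHostToDevice));
//   // scratch.get() is now a device pointer; storage is released at scope exit.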
// ---- RAII: pinned host buffer -----------------------------------------------
// For async D2H copies (cudaMemcpyAsync requires pinned host memory for true
// asynchrony and to avoid compute-sanitizer warnings).
class PinnedHostBuffer {
 public:
  PinnedHostBuffer() = default;
  explicit PinnedHostBuffer(size_t bytes) {
    if (bytes > 0) {
      void* p = nullptr;
      EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes));
      ptr_.reset(p);
    }
  }

  void* get() const noexcept { return ptr_.get(); }
  explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }

 private:
  std::unique_ptr<void, CudaFreeHostDeleter> ptr_;
};
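// Example (sketch): asynchronously fetching a small status value (e.g. a
// cuSOLVER devInfo flag). `d_info` and `stream` are assumptions, not part of
// this header:
//
//   PinnedHostBuffer h_info(sizeof(int));
//   EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(h_info.get(), d_info, sizeof(int),
//                                            cudaMemcpyDeviceToHost, stream));
//   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
//   const int info = *static_cast<const int*>(h_info.get());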
// ---- Scalar → cudaDataType_t ------------------------------------------------
// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
// which is included transitively by cuda_runtime.h.
template <typename Scalar>
struct cuda_data_type;

template <>
struct cuda_data_type<float> {
  static constexpr cudaDataType_t value = CUDA_R_32F;
};
template <>
struct cuda_data_type<double> {
  static constexpr cudaDataType_t value = CUDA_R_64F;
};
template <>
struct cuda_data_type<std::complex<float>> {
  static constexpr cudaDataType_t value = CUDA_C_32F;
};
template <>
struct cuda_data_type<std::complex<double>> {
  static constexpr cudaDataType_t value = CUDA_C_64F;
};
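// Example (sketch): selecting the runtime type tag from a Scalar template
// parameter, e.g. when forwarding to an *Ex-style cuBLAS entry point:
//
//   static_assert(cuda_data_type<float>::value == CUDA_R_32F, "float maps to CUDA_R_32F");
//   template <typename Scalar>
//   constexpr cudaDataType_t dtype_for() { return cuda_data_type<Scalar>::value; }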
} // namespace internal
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_SUPPORT_H