blob: 15ce898a6ad4465afa6b6fd3482c1e34da0acc0c [file] [edit]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// Device-resident scalar and element-wise operations via NPP signals.
// Header-only — no custom CUDA kernels needed. Uses nppsDiv, nppsMul,
// nppsMulC from the NPP library (CUDA::npps, part of the CUDA toolkit).
#ifndef EIGEN_GPU_DEVICE_SCALAR_OPS_H
#define EIGEN_GPU_DEVICE_SCALAR_OPS_H
#include <cuda_runtime.h>
#include <npps_arithmetic_and_logical_operations.h>
#include "./GpuSupport.h"
namespace Eigen {
namespace gpu {
namespace internal {
// ---- NppStreamContext helper ------------------------------------------------
inline NppStreamContext make_npp_stream_ctx(cudaStream_t stream) {
// NPP requires nCudaDeviceId / device attributes to match the device that
// owns 'stream' at this call. We query each time (cheap relative to the NPP
// launch itself) so multi-device or borrowed-stream callers stay correct.
NppStreamContext ctx = {};
ctx.hStream = stream;
#if CUDART_VERSION >= 12050
// cudaStreamGetDevice returns the device that owns the stream regardless of
// the calling thread's current device — safe for borrowed streams.
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamGetDevice(stream, &ctx.nCudaDeviceId));
#else
// Older CUDA runtimes lack cudaStreamGetDevice. Callers using borrowed
// streams from a different device must cudaSetDevice() first.
EIGEN_CUDA_RUNTIME_CHECK(cudaGetDevice(&ctx.nCudaDeviceId));
#endif
EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&ctx.nCudaDevAttrComputeCapabilityMajor,
cudaDevAttrComputeCapabilityMajor, ctx.nCudaDeviceId));
EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&ctx.nCudaDevAttrComputeCapabilityMinor,
cudaDevAttrComputeCapabilityMinor, ctx.nCudaDeviceId));
EIGEN_CUDA_RUNTIME_CHECK(
cudaDeviceGetAttribute(&ctx.nMultiProcessorCount, cudaDevAttrMultiProcessorCount, ctx.nCudaDeviceId));
EIGEN_CUDA_RUNTIME_CHECK(cudaDeviceGetAttribute(&ctx.nMaxThreadsPerMultiProcessor,
cudaDevAttrMaxThreadsPerMultiProcessor, ctx.nCudaDeviceId));
EIGEN_CUDA_RUNTIME_CHECK(
cudaDeviceGetAttribute(&ctx.nMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, ctx.nCudaDeviceId));
int shared_mem_per_block = 0;
EIGEN_CUDA_RUNTIME_CHECK(
cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, ctx.nCudaDeviceId));
ctx.nSharedMemPerBlock = static_cast<size_t>(shared_mem_per_block);
EIGEN_CUDA_RUNTIME_CHECK(cudaStreamGetFlags(stream, &ctx.nStreamFlags));
return ctx;
}
// ---- Scalar division: c = a / b (device-resident, async) --------------------
inline void device_scalar_div(const float* a, const float* b, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
inline void device_scalar_div(const double* a, const double* b, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, 1, npp_ctx); // NPP: pDst[i] = pSrc2[i] / pSrc1[i]
}
// ---- Scalar negation: c = -a (device-resident, async) -----------------------
inline void device_scalar_neg(const float* a, float* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_32f_Ctx(a, -1.0f, c, 1, npp_ctx);
}
inline void device_scalar_neg(const double* a, double* c, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMulC_64f_Ctx(a, -1.0, c, 1, npp_ctx);
}
// ---- Element-wise vector multiply: c[i] = a[i] * b[i] ----------------------
inline void device_cwiseProduct(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_32f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
inline void device_cwiseProduct(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsMul_64f_Ctx(a, b, c, static_cast<size_t>(n), npp_ctx);
}
// ---- Element-wise vector division: c[i] = a[i] / b[i] ----------------------
inline void device_cwiseQuotient(const float* a, const float* b, float* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_32f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx); // NPP: dst = src2 / src1
}
inline void device_cwiseQuotient(const double* a, const double* b, double* c, int n, cudaStream_t stream) {
NppStreamContext npp_ctx = make_npp_stream_ctx(stream);
nppsDiv_64f_Ctx(b, a, c, static_cast<size_t>(n), npp_ctx);
}
} // namespace internal
} // namespace gpu
} // namespace Eigen
#endif // EIGEN_GPU_DEVICE_SCALAR_OPS_H