blob: 772333bdef5249a4372c742f59f91b57ae0fc201 [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Eigen Authors
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
// for float, double, complex<float>, complex<double>, Lower and Upper.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Cholesky>
#include <unsupported/Eigen/GPU>
// Identifier convention throughout this file:
// h_ prefix for host-resident Eigen::Matrix values
// d_ prefix for device-resident Eigen::gpu::DeviceMatrix values
// We also keep namespaces explicit (Eigen::, Eigen::gpu::) so the CPU vs GPU
// path is obvious at every call site.
// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
template <typename MatrixType>
MatrixType make_spd(Eigen::Index n) {
using Scalar = typename MatrixType::Scalar;
MatrixType M = MatrixType::Random(n, n);
return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
}
// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
template <typename Scalar, int UpLo>
void test_potrf(Eigen::Index n) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
// GPU factorization under test: factor stays on device.
Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
// CPU LLT is the oracle. GpuLLT does not expose the device-resident factor,
// so we validate correctness via the CPU-reconstructed matrix.
Eigen::LLT<MatrixType, UpLo> cpu_llt(h_A);
VERIFY_IS_EQUAL(cpu_llt.info(), Eigen::Success);
MatrixType h_A_reconstructed = cpu_llt.reconstructedMatrix();
RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_A.norm();
VERIFY((h_A_reconstructed - h_A).norm() < tol);
// Cross-check: GPU and CPU solves must agree on the same RHS. `solve(Matrix)`
// uploads/downloads internally, so both outputs end up on the host.
MatrixType h_b = MatrixType::Random(n, 1);
MatrixType h_x_gpu = gpu_llt.solve(h_b);
MatrixType h_x_cpu = cpu_llt.solve(h_b);
VERIFY((h_x_gpu - h_x_cpu).norm() < tol);
}
// Test solve: residual ||A*X - B|| / ||B|| must be small.
template <typename Scalar, int UpLo>
void test_potrs(Eigen::Index n, Eigen::Index nrhs) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
MatrixType h_B = MatrixType::Random(n, nrhs);
Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
MatrixType h_X = gpu_llt.solve(h_B);
RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon();
VERIFY(residual < tol);
}
// Test that multiple solves against the same factor all produce correct results.
// This exercises the key design property: L stays on device across calls.
template <typename Scalar>
void test_multiple_solves(Eigen::Index n) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt(h_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
MatrixType h_B = MatrixType::Random(n, 3);
MatrixType h_X = gpu_llt.solve(h_B);
RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
VERIFY(residual < tol);
}
}
// Test that GpuLLT correctly detects a non-SPD matrix.
void test_not_spd() {
Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8); // negative definite
Eigen::gpu::LLT<double> gpu_llt(h_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue);
}
// solve(DeviceMatrix) must not silently return garbage when the factorization
// failed: it must sync the info word and assert just like solve(MatrixBase).
void test_not_spd_device_solve_asserts() {
Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8);
Eigen::MatrixXd h_B = Eigen::MatrixXd::Random(8, 4);
Eigen::gpu::LLT<double> gpu_llt(h_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue);
auto d_B = Eigen::gpu::DeviceMatrix<double>::fromHost(h_B);
VERIFY_RAISES_ASSERT(gpu_llt.solve(d_B));
}
// ---- DeviceMatrix-native API --------------------------------------------
// These tests exercise the device-resident path: compute(DeviceMatrix) +
// solve(DeviceMatrix) -> DeviceMatrix, with the user explicitly managing
// upload/download. The tests above use the host-Matrix overloads which do
// the transfers internally; this section covers the no-implicit-transfer
// surface that keeps data on device across a chain of calls.
// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
template <typename Scalar, int UpLo>
void test_device_matrix_solve(Eigen::Index n, Eigen::Index nrhs) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
MatrixType h_B = MatrixType::Random(n, nrhs);
auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B);
Eigen::gpu::LLT<Scalar, UpLo> gpu_llt;
gpu_llt.compute(d_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B);
MatrixType h_X = d_X.toHost();
RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon());
}
// compute(DeviceMatrix&&) — move path
template <typename Scalar>
void test_device_matrix_move_compute(Eigen::Index n) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
MatrixType h_B = MatrixType::Random(n, 1);
auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt;
gpu_llt.compute(std::move(d_A));
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
// d_A should be empty after move.
VERIFY(d_A.empty());
MatrixType h_X = gpu_llt.solve(h_B);
RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon());
}
// Full async chain: compute → solve → solve again with result as RHS → toHost
template <typename Scalar>
void test_chaining(Eigen::Index n) {
using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
MatrixType h_A = make_spd<MatrixType>(n);
MatrixType h_B = MatrixType::Random(n, 3);
auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B);
Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt;
gpu_llt.compute(d_A);
VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
// Chain: solve → use result as RHS for another solve. Everything stays on
// device until the final toHost() below; that's the only sync point.
Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B);
Eigen::gpu::DeviceMatrix<Scalar> d_Y = gpu_llt.solve(d_X);
MatrixType h_Y = d_Y.toHost();
// Verify: Y = A^{-2} * B using CPU oracle.
MatrixType h_X_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_B);
MatrixType h_Y_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_X_ref);
RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_Y_ref.norm();
VERIFY((h_Y - h_Y_ref).norm() < tol);
}
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(1)));
CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(64)));
CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(256)));
CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(64)));
CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(256)));
CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 1)));
CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 4)));
CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(256, 8)));
CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(64, 1)));
CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(256, 4)));
CALL_SUBTEST(test_multiple_solves<Scalar>(128));
CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Lower>(64, 4)));
CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Upper>(128, 1)));
CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
CALL_SUBTEST(test_chaining<Scalar>(64));
}
EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
// Split by scalar so each part compiles in parallel.
CALL_SUBTEST_1(test_scalar<float>());
CALL_SUBTEST_2(test_scalar<double>());
CALL_SUBTEST_3(test_scalar<std::complex<float>>());
CALL_SUBTEST_4(test_scalar<std::complex<double>>());
CALL_SUBTEST_5(test_not_spd());
CALL_SUBTEST_5(test_not_spd_device_solve_asserts());
}