blob: c1f7f126618507dcdab5eca95fed38eed9d96d95 [file]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// Tests for GpuQR: GPU QR decomposition via cuSOLVER.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/QR>
#include <Eigen/SVD>
#include <unsupported/Eigen/GPU>
using namespace Eigen;
// ---- Solve square system: A * X = B -----------------------------------------
template <typename Scalar>
void test_qr_solve_square(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
gpu::QR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined system: m > n (least-squares) ---------------------
template <typename Scalar>
void test_qr_solve_overdetermined(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
gpu::QR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// For an overdetermined system the residual r = A X - B is generally large
// (O(||B||) when B is not in col(A)). What backward-stable least-squares
// makes small is the gradient of the LS objective, A^H r, which is zero at
// the optimum. Higham's bound for QR-based LS gives ||A^H r|| <= O(m * eps)
// * ||A|| * (||A|| ||X|| + ||B||), regardless of kappa(A).
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon();
RealScalar A_norm = A.norm();
RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm());
RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm());
VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol);
VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol);
}
// ---- Solve with DeviceMatrix input ------------------------------------------
template <typename Scalar>
void test_qr_solve_device(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
gpu::QR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
Mat X = d_X.toHost();
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
// ---- Solve overdetermined via device path -----------------------------------
template <typename Scalar>
void test_qr_solve_overdetermined_device(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
gpu::QR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
Mat X = d_X.toHost();
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
// See test_qr_solve_overdetermined: backward error for LS is bounded on
// the gradient A^H r, not on r itself.
RealScalar tol = RealScalar(10) * RealScalar(m) * NumTraits<Scalar>::epsilon();
RealScalar A_norm = A.norm();
RealScalar denom_gpu = A_norm * (A_norm * X.norm() + B.norm());
RealScalar denom_cpu = A_norm * (A_norm * X_cpu.norm() + B.norm());
VERIFY((A.adjoint() * (A * X - B)).norm() / denom_gpu < tol);
VERIFY((A.adjoint() * (A * X_cpu - B)).norm() / denom_cpu < tol);
}
// ---- Solve underdetermined system: m < n (minimum norm) ---------------------
template <typename Scalar>
void test_qr_solve_underdetermined(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m < n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
gpu::QR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat X = qr.solve(B);
VERIFY_IS_EQUAL(X.rows(), n);
VERIFY_IS_EQUAL(X.cols(), nrhs);
// The minimum-norm solution exactly satisfies A X = B (m equations, n > m
// unknowns). Backward error is O(m * eps) * ||A|| * ||X||.
RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol);
// Min-norm property: X ⟂ null(A), so X ∈ row space(A) = col space(A^H).
// ||X|| <= ||X_any|| for any other solution. Compare to the SVD min-norm reference.
Mat X_svd = BDCSVD<Mat>(A, ComputeThinU | ComputeThinV).solve(B);
// Both should have the same norm (up to rounding).
VERIFY(numext::abs(X.norm() - X_svd.norm()) / X_svd.norm() <
RealScalar(50) * RealScalar(n) * NumTraits<Scalar>::epsilon());
}
template <typename Scalar>
void test_qr_solve_underdetermined_device(Index m, Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m < n);
Mat A = Mat::Random(m, n);
Mat B = Mat::Random(m, nrhs);
auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
gpu::QR<Scalar> qr;
qr.compute(d_A);
VERIFY_IS_EQUAL(qr.info(), Success);
gpu::DeviceMatrix<Scalar> d_X = qr.solve(d_B);
VERIFY_IS_EQUAL(d_X.rows(), n);
VERIFY_IS_EQUAL(d_X.cols(), nrhs);
Mat X = d_X.toHost();
RealScalar tol = RealScalar(20) * RealScalar(n) * NumTraits<Scalar>::epsilon();
VERIFY((A * X - B).norm() / (A.norm() * X.norm() + B.norm()) < tol);
}
// ---- matrixR() returns the upper-triangular factor --------------------------
template <typename Scalar>
void test_qr_matrixR(Index m, Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
eigen_assert(m >= n);
Mat A = Mat::Random(m, n);
gpu::QR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
Mat R = qr.matrixR();
VERIFY_IS_EQUAL(R.rows(), n);
VERIFY_IS_EQUAL(R.cols(), n);
// Verify R is upper-triangular (lower triangle exactly zero).
for (Index j = 0; j < n; ++j) {
for (Index i = j + 1; i < n; ++i) {
VERIFY_IS_EQUAL(R(i, j), Scalar(0));
}
}
// Verify that the diagonal of R matches the absolute values of CPU R diagonal
// up to sign (Householder QR has free diagonal sign).
Mat R_cpu = HouseholderQR<Mat>(A).matrixQR().topRows(n).template triangularView<Upper>();
RealScalar tol = RealScalar(20) * RealScalar(m) * NumTraits<Scalar>::epsilon() * A.norm();
for (Index i = 0; i < n; ++i) {
VERIFY(numext::abs(numext::abs(R(i, i)) - numext::abs(R_cpu(i, i))) < tol);
}
}
// ---- Multiple solves reuse the factorization --------------------------------
template <typename Scalar>
void test_qr_multiple_solves(Index n) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
gpu::QR<Scalar> qr(A);
VERIFY_IS_EQUAL(qr.info(), Success);
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
for (int k = 0; k < 5; ++k) {
Mat B = Mat::Random(n, 3);
Mat X = qr.solve(B);
RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
VERIFY(residual < tol);
}
}
// ---- Agreement with CPU HouseholderQR ---------------------------------------
template <typename Scalar>
void test_qr_vs_cpu(Index n, Index nrhs) {
using Mat = Matrix<Scalar, Dynamic, Dynamic>;
using RealScalar = typename NumTraits<Scalar>::Real;
Mat A = Mat::Random(n, n);
Mat B = Mat::Random(n, nrhs);
gpu::QR<Scalar> gpu_qr(A);
VERIFY_IS_EQUAL(gpu_qr.info(), Success);
Mat X_gpu = gpu_qr.solve(B);
Mat X_cpu = HouseholderQR<Mat>(A).solve(B);
// Compare via residual rather than directly between X_gpu and X_cpu: for an
// ill-conditioned A (a random N(0,1) n*n matrix easily reaches kappa(A) ~ n),
// the forward error of each correct solve is bounded by O(kappa * eps), so
// ||X_gpu - X_cpu|| / ||X_cpu|| can legitimately exceed n*eps even though
// both are correct. The relative residual ||A X - B|| / (||A||*||X|| + ||B||)
// is bounded by Higham's backward-stable solve result at O(n * eps),
// independent of kappa.
RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
RealScalar A_norm = A.norm();
RealScalar B_norm = B.norm();
RealScalar denom_gpu = A_norm * X_gpu.norm() + B_norm;
RealScalar denom_cpu = A_norm * X_cpu.norm() + B_norm;
VERIFY((A * X_gpu - B).norm() / denom_gpu < tol);
VERIFY((A * X_cpu - B).norm() / denom_cpu < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
template <typename Scalar>
void test_scalar() {
CALL_SUBTEST(test_qr_solve_square<Scalar>(1, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 1));
CALL_SUBTEST(test_qr_solve_square<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_square<Scalar>(256, 8));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined<Scalar>(256, 128, 1));
CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 4));
CALL_SUBTEST(test_qr_solve_underdetermined<Scalar>(64, 128, 1));
CALL_SUBTEST(test_qr_solve_underdetermined_device<Scalar>(64, 128, 4));
CALL_SUBTEST(test_qr_matrixR<Scalar>(64, 64));
CALL_SUBTEST(test_qr_matrixR<Scalar>(128, 64));
CALL_SUBTEST(test_qr_solve_device<Scalar>(64, 4));
CALL_SUBTEST(test_qr_solve_overdetermined_device<Scalar>(128, 64, 4));
CALL_SUBTEST(test_qr_multiple_solves<Scalar>(64));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(64, 4));
CALL_SUBTEST(test_qr_vs_cpu<Scalar>(256, 8));
}
void test_qr_empty() {
gpu::QR<double> qr(MatrixXd(0, 0));
VERIFY_IS_EQUAL(qr.info(), Success);
VERIFY_IS_EQUAL(qr.rows(), 0);
VERIFY_IS_EQUAL(qr.cols(), 0);
}
EIGEN_DECLARE_TEST(gpu_cusolver_qr) {
// Split by scalar so each part compiles in parallel.
CALL_SUBTEST_1(test_scalar<float>());
CALL_SUBTEST_2(test_scalar<double>());
CALL_SUBTEST_3(test_scalar<std::complex<float>>());
CALL_SUBTEST_4(test_scalar<std::complex<double>>());
CALL_SUBTEST_5(test_qr_empty());
}