unsupported/test/GPU/cg.cpp - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-License-Identifier: MPL-2.0

 // End-to-end test: CG algorithm running on GPU via gpu::DeviceMatrix.
 //
 // Uses DeviceSparseView for SpMV, gpu::DeviceMatrix for vectors, DeviceScalar
 // for deferred reductions. Verifies correctness against CPU ConjugateGradient.

 #define EIGEN_USE_GPU
 #include "main.h"
 #include <Eigen/Sparse>
 #include <Eigen/IterativeLinearSolvers>
 #include <unsupported/Eigen/GPU>

 using namespace Eigen;

 // ---- Helper: build a sparse SPD matrix --------------------------------------

 template <typename Scalar>
 SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
   using SpMat = SparseMatrix<Scalar, ColMajor, int>;
   using RealScalar = typename NumTraits<Scalar>::Real;

   SpMat R(n, n);
   R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
   for (Index j = 0; j < n; ++j) {
     for (Index i = 0; i < n; ++i) {
       if (i == j || (std::rand() / double(RAND_MAX)) < density) {
         R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
       }
     }
   }
   R.makeCompressed();
   SpMat A = R.adjoint() * R;
   for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
   A.makeCompressed();
   return A;
 }

 // ---- GPU CG without preconditioner ------------------------------------------

 template <typename Scalar>
 void test_gpu_cg(Index n) {
   using SpMat = SparseMatrix<Scalar, ColMajor, int>;
   using Vec = Matrix<Scalar, Dynamic, 1>;
   using RealScalar = typename NumTraits<Scalar>::Real;

   SpMat A = make_spd<Scalar>(n);
   Vec b = Vec::Random(n);

   // CPU reference (identity preconditioner to match GPU).
   ConjugateGradient<SpMat, Lower | Upper, IdentityPreconditioner> cpu_cg;
   cpu_cg.setMaxIterations(1000);
   cpu_cg.setTolerance(RealScalar(1e-8));
   cpu_cg.compute(A);
   Vec x_cpu = cpu_cg.solve(b);
   VERIFY_IS_EQUAL(cpu_cg.info(), Success);

   // GPU CG: mirrors Eigen's conjugate_gradient() using gpu::DeviceMatrix ops.
   gpu::Context ctx;
   gpu::Context::setThreadLocal(&ctx);
   gpu::SparseContext<Scalar> spmv_ctx(ctx);
   auto mat = spmv_ctx.deviceView(A);

   auto d_b = gpu::DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
   gpu::DeviceMatrix<Scalar> d_x(n, 1);
   d_x.setZero(ctx);

   // r = b (since x=0)
   gpu::DeviceMatrix<Scalar> residual(n, 1);
   residual.copyFrom(ctx, d_b);

   RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
   RealScalar tol = RealScalar(1e-8);
   RealScalar threshold = tol * tol * rhsNorm2;
   RealScalar residualNorm2 = residual.squaredNorm(ctx);

   // p = r (no preconditioner)
   gpu::DeviceMatrix<Scalar> p(n, 1);
   p.copyFrom(ctx, residual);
   gpu::DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

   auto absNew = residual.dot(ctx, p);
   Index maxIters = 1000;
   Index i = 0;
   while (i < maxIters) {
     tmp.noalias() = mat * p;

     auto alpha = absNew / p.dot(ctx, tmp);
     d_x += alpha * p;
     residual -= alpha * tmp;

     residualNorm2 = residual.squaredNorm(ctx);
     if (residualNorm2 < threshold) break;

     // z = r (no preconditioner)
     z.copyFrom(ctx, residual);

     auto absOld = std::move(absNew);
     absNew = residual.dot(ctx, z);
     auto beta = absNew / absOld;

     p *= beta;
     p += z;
     i++;
   }

   gpu::Context::setThreadLocal(nullptr);

   Vec x_gpu = d_x.toHost(ctx.stream());

   // Verify residual.
   Vec r = A * x_gpu - b;
   RealScalar relres = r.norm() / b.norm();
   VERIFY(relres < RealScalar(1e-6));

   // Compare with CPU.
   RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
   VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
 }

 // ---- GPU CG with Jacobi preconditioner --------------------------------------

 template <typename Scalar>
 void test_gpu_cg_jacobi(Index n) {
   using SpMat = SparseMatrix<Scalar, ColMajor, int>;
   using Vec = Matrix<Scalar, Dynamic, 1>;
   using RealScalar = typename NumTraits<Scalar>::Real;

   SpMat A = make_spd<Scalar>(n);
   Vec b = Vec::Random(n);

   // CPU reference.
   ConjugateGradient<SpMat, Lower | Upper> cpu_cg;
   cpu_cg.setMaxIterations(1000);
   cpu_cg.setTolerance(RealScalar(1e-8));
   cpu_cg.compute(A);
   Vec x_cpu = cpu_cg.solve(b);

   // Extract inverse diagonal.
   Vec invdiag(n);
   for (Index j = 0; j < A.outerSize(); ++j) {
     typename SpMat::InnerIterator it(A, j);
     while (it && it.index() != j) ++it;
     if (it && it.index() == j && it.value() != Scalar(0))
       invdiag(j) = Scalar(1) / it.value();
     else
       invdiag(j) = Scalar(1);
   }

   // GPU CG with Jacobi preconditioner.
   gpu::Context ctx;
   gpu::Context::setThreadLocal(&ctx);
   gpu::SparseContext<Scalar> spmv_ctx(ctx);
   auto mat = spmv_ctx.deviceView(A);
   auto d_invdiag = gpu::DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());

   auto d_b = gpu::DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
   gpu::DeviceMatrix<Scalar> d_x(n, 1);
   d_x.setZero(ctx);

   gpu::DeviceMatrix<Scalar> residual(n, 1);
   residual.copyFrom(ctx, d_b);

   RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
   RealScalar tol = RealScalar(1e-8);
   RealScalar threshold = tol * tol * rhsNorm2;
   RealScalar residualNorm2 = residual.squaredNorm(ctx);

   // p = precond.solve(r) = invdiag .* r
   gpu::DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
   gpu::DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

   auto absNew = residual.dot(ctx, p);
   Index maxIters = 1000;
   Index i = 0;
   while (i < maxIters) {
     tmp.noalias() = mat * p;

     auto alpha = absNew / p.dot(ctx, tmp);
     d_x += alpha * p;
     residual -= alpha * tmp;

     residualNorm2 = residual.squaredNorm(ctx);
     if (residualNorm2 < threshold) break;

     // z = precond.solve(r) = invdiag .* r
     z.cwiseProduct(ctx, d_invdiag, residual);

     auto absOld = std::move(absNew);
     absNew = residual.dot(ctx, z);
     auto beta = absNew / absOld;

     p *= beta;
     p += z;
     i++;
   }

   gpu::Context::setThreadLocal(nullptr);

   Vec x_gpu = d_x.toHost(ctx.stream());

   Vec r = A * x_gpu - b;
   RealScalar relres = r.norm() / b.norm();
   VERIFY(relres < RealScalar(1e-6));

   RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
   VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
 }

 EIGEN_DECLARE_TEST(gpu_cg) {
   // Split by scalar so each part compiles in parallel.
   CALL_SUBTEST_1(test_gpu_cg<double>(64));
   CALL_SUBTEST_1(test_gpu_cg<double>(256));
   CALL_SUBTEST_1(test_gpu_cg_jacobi<double>(64));
   CALL_SUBTEST_1(test_gpu_cg_jacobi<double>(256));
   CALL_SUBTEST_2(test_gpu_cg<float>(64));
   CALL_SUBTEST_2(test_gpu_cg<float>(256));
   CALL_SUBTEST_2(test_gpu_cg_jacobi<float>(64));
   CALL_SUBTEST_2(test_gpu_cg_jacobi<float>(256));
 }
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-License-Identifier: MPL-2.0

	// End-to-end test: CG algorithm running on GPU via gpu::DeviceMatrix.
	//
	// Uses DeviceSparseView for SpMV, gpu::DeviceMatrix for vectors, DeviceScalar
	// for deferred reductions. Verifies correctness against CPU ConjugateGradient.

	#define EIGEN_USE_GPU
	#include "main.h"
	#include <Eigen/Sparse>
	#include <Eigen/IterativeLinearSolvers>
	#include <unsupported/Eigen/GPU>

	using namespace Eigen;

	// ---- Helper: build a sparse SPD matrix --------------------------------------

	template <typename Scalar>
	SparseMatrix<Scalar, ColMajor, int> make_spd(Index n, double density = 0.1) {
	using SpMat = SparseMatrix<Scalar, ColMajor, int>;
	using RealScalar = typename NumTraits<Scalar>::Real;

	SpMat R(n, n);
	R.reserve(VectorXi::Constant(n, static_cast<int>(n * density) + 1));
	for (Index j = 0; j < n; ++j) {
	for (Index i = 0; i < n; ++i) {
	if (i == j \|\| (std::rand() / double(RAND_MAX)) < density) {
	R.insert(i, j) = Scalar(std::rand() / double(RAND_MAX) - 0.5);
	}
	}
	}
	R.makeCompressed();
	SpMat A = R.adjoint() * R;
	for (Index i = 0; i < n; ++i) A.coeffRef(i, i) += Scalar(RealScalar(n));
	A.makeCompressed();
	return A;
	}

	// ---- GPU CG without preconditioner ------------------------------------------

	template <typename Scalar>
	void test_gpu_cg(Index n) {
	using SpMat = SparseMatrix<Scalar, ColMajor, int>;
	using Vec = Matrix<Scalar, Dynamic, 1>;
	using RealScalar = typename NumTraits<Scalar>::Real;

	SpMat A = make_spd<Scalar>(n);
	Vec b = Vec::Random(n);

	// CPU reference (identity preconditioner to match GPU).
	ConjugateGradient<SpMat, Lower \| Upper, IdentityPreconditioner> cpu_cg;
	cpu_cg.setMaxIterations(1000);
	cpu_cg.setTolerance(RealScalar(1e-8));
	cpu_cg.compute(A);
	Vec x_cpu = cpu_cg.solve(b);
	VERIFY_IS_EQUAL(cpu_cg.info(), Success);

	// GPU CG: mirrors Eigen's conjugate_gradient() using gpu::DeviceMatrix ops.
	gpu::Context ctx;
	gpu::Context::setThreadLocal(&ctx);
	gpu::SparseContext<Scalar> spmv_ctx(ctx);
	auto mat = spmv_ctx.deviceView(A);

	auto d_b = gpu::DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_x(n, 1);
	d_x.setZero(ctx);

	// r = b (since x=0)
	gpu::DeviceMatrix<Scalar> residual(n, 1);
	residual.copyFrom(ctx, d_b);

	RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
	RealScalar tol = RealScalar(1e-8);
	RealScalar threshold = tol * tol * rhsNorm2;
	RealScalar residualNorm2 = residual.squaredNorm(ctx);

	// p = r (no preconditioner)
	gpu::DeviceMatrix<Scalar> p(n, 1);
	p.copyFrom(ctx, residual);
	gpu::DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

	auto absNew = residual.dot(ctx, p);
	Index maxIters = 1000;
	Index i = 0;
	while (i < maxIters) {
	tmp.noalias() = mat * p;

	auto alpha = absNew / p.dot(ctx, tmp);
	d_x += alpha * p;
	residual -= alpha * tmp;

	residualNorm2 = residual.squaredNorm(ctx);
	if (residualNorm2 < threshold) break;

	// z = r (no preconditioner)
	z.copyFrom(ctx, residual);

	auto absOld = std::move(absNew);
	absNew = residual.dot(ctx, z);
	auto beta = absNew / absOld;

	p *= beta;
	p += z;
	i++;
	}

	gpu::Context::setThreadLocal(nullptr);

	Vec x_gpu = d_x.toHost(ctx.stream());

	// Verify residual.
	Vec r = A * x_gpu - b;
	RealScalar relres = r.norm() / b.norm();
	VERIFY(relres < RealScalar(1e-6));

	// Compare with CPU.
	RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
	VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
	}

	// ---- GPU CG with Jacobi preconditioner --------------------------------------

	template <typename Scalar>
	void test_gpu_cg_jacobi(Index n) {
	using SpMat = SparseMatrix<Scalar, ColMajor, int>;
	using Vec = Matrix<Scalar, Dynamic, 1>;
	using RealScalar = typename NumTraits<Scalar>::Real;

	SpMat A = make_spd<Scalar>(n);
	Vec b = Vec::Random(n);

	// CPU reference.
	ConjugateGradient<SpMat, Lower \| Upper> cpu_cg;
	cpu_cg.setMaxIterations(1000);
	cpu_cg.setTolerance(RealScalar(1e-8));
	cpu_cg.compute(A);
	Vec x_cpu = cpu_cg.solve(b);

	// Extract inverse diagonal.
	Vec invdiag(n);
	for (Index j = 0; j < A.outerSize(); ++j) {
	typename SpMat::InnerIterator it(A, j);
	while (it && it.index() != j) ++it;
	if (it && it.index() == j && it.value() != Scalar(0))
	invdiag(j) = Scalar(1) / it.value();
	else
	invdiag(j) = Scalar(1);
	}

	// GPU CG with Jacobi preconditioner.
	gpu::Context ctx;
	gpu::Context::setThreadLocal(&ctx);
	gpu::SparseContext<Scalar> spmv_ctx(ctx);
	auto mat = spmv_ctx.deviceView(A);
	auto d_invdiag = gpu::DeviceMatrix<Scalar>::fromHost(invdiag, ctx.stream());

	auto d_b = gpu::DeviceMatrix<Scalar>::fromHost(b, ctx.stream());
	gpu::DeviceMatrix<Scalar> d_x(n, 1);
	d_x.setZero(ctx);

	gpu::DeviceMatrix<Scalar> residual(n, 1);
	residual.copyFrom(ctx, d_b);

	RealScalar rhsNorm2 = d_b.squaredNorm(ctx);
	RealScalar tol = RealScalar(1e-8);
	RealScalar threshold = tol * tol * rhsNorm2;
	RealScalar residualNorm2 = residual.squaredNorm(ctx);

	// p = precond.solve(r) = invdiag .* r
	gpu::DeviceMatrix<Scalar> p = d_invdiag.cwiseProduct(ctx, residual);
	gpu::DeviceMatrix<Scalar> z(n, 1), tmp(n, 1);

	auto absNew = residual.dot(ctx, p);
	Index maxIters = 1000;
	Index i = 0;
	while (i < maxIters) {
	tmp.noalias() = mat * p;

	auto alpha = absNew / p.dot(ctx, tmp);
	d_x += alpha * p;
	residual -= alpha * tmp;

	residualNorm2 = residual.squaredNorm(ctx);
	if (residualNorm2 < threshold) break;

	// z = precond.solve(r) = invdiag .* r
	z.cwiseProduct(ctx, d_invdiag, residual);

	auto absOld = std::move(absNew);
	absNew = residual.dot(ctx, z);
	auto beta = absNew / absOld;

	p *= beta;
	p += z;
	i++;
	}

	gpu::Context::setThreadLocal(nullptr);

	Vec x_gpu = d_x.toHost(ctx.stream());

	Vec r = A * x_gpu - b;
	RealScalar relres = r.norm() / b.norm();
	VERIFY(relres < RealScalar(1e-6));

	RealScalar sol_tol = RealScalar(100) * RealScalar(n) * NumTraits<Scalar>::epsilon();
	VERIFY((x_gpu - x_cpu).norm() / (x_cpu.norm() + RealScalar(1)) < sol_tol);
	}

	EIGEN_DECLARE_TEST(gpu_cg) {
	// Split by scalar so each part compiles in parallel.
	CALL_SUBTEST_1(test_gpu_cg<double>(64));
	CALL_SUBTEST_1(test_gpu_cg<double>(256));
	CALL_SUBTEST_1(test_gpu_cg_jacobi<double>(64));
	CALL_SUBTEST_1(test_gpu_cg_jacobi<double>(256));
	CALL_SUBTEST_2(test_gpu_cg<float>(64));
	CALL_SUBTEST_2(test_gpu_cg<float>(256));
	CALL_SUBTEST_2(test_gpu_cg_jacobi<float>(64));
	CALL_SUBTEST_2(test_gpu_cg_jacobi<float>(256));
	}