blob: 3d44408451c9dd626779defc8d891af89a04599b [file] [edit]
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// SPDX-License-Identifier: MPL-2.0
// Tests for gpu::SparseContext: GPU SpMV/SpMM via cuSPARSE.
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/Sparse>
#include <unsupported/Eigen/GPU>
#include "gpu_test_helpers.h"
using namespace Eigen;
// ---- Helper: build a random sparse matrix -----------------------------------
template <typename Scalar>
SparseMatrix<Scalar, ColMajor, int> make_sparse(Index rows, Index cols, double density = 0.1) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  // Uniform sample in [0, 1] drawn from the C library generator.
  auto uniform = []() { return std::rand() / double(RAND_MAX); };
  SpMat result(rows, cols);
  // Budget roughly density*rows entries per column, plus one for slack.
  result.reserve(VectorXi::Constant(cols, static_cast<int>(rows * density) + 1));
  for (Index col = 0; col < cols; ++col) {
    for (Index row = 0; row < rows; ++row) {
      if (uniform() >= density) continue;  // keep each entry with probability `density`
      const RealScalar real_part = RealScalar(uniform() - 0.5);
      const RealScalar imag_part = RealScalar(uniform() - 0.5);
      result.insert(row, col) = gpu_test::make_test_value<Scalar>(real_part, imag_part);
    }
  }
  result.makeCompressed();
  return result;
}
// ---- SpMV: y = A * x -------------------------------------------------------
// Checks y = A * x on the GPU against the host product for a random A.
template <typename Scalar>
void test_spmv(Index rows, Index cols) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(rows, cols);
  Vec x = Vec::Random(cols);
  gpu::SparseContext<Scalar> ctx;
  Vec actual = ctx.multiply(A, x);
  Vec expected = A * x;
  // Relative tolerance scaled by the larger problem dimension.
  const Index dim = (std::max)(rows, cols);
  const RealScalar tol = RealScalar(10) * RealScalar(dim) * NumTraits<Scalar>::epsilon();
  VERIFY_IS_EQUAL(actual.size(), rows);
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- SpMV with alpha/beta: y = alpha*A*x + beta*y ---------------------------
// Checks the scaled, accumulating form y = alpha*A*x + beta*y.
template <typename Scalar>
void test_spmv_alpha_beta(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(n, n);
  Vec x = Vec::Random(n);
  Vec y0 = Vec::Random(n);
  Scalar alpha(2);
  Scalar beta(3);
  // Host reference computed from the initial y.
  Vec expected = alpha * (A * x) + beta * y0;
  // The GPU routine updates its output argument in place.
  gpu::SparseContext<Scalar> ctx;
  Vec actual = y0;
  ctx.multiply(A, x, actual, alpha, beta);
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- Transpose: y = A^T * x ------------------------------------------------
// Checks the transposed product y = A^T * x.
template <typename Scalar>
void test_spmv_transpose(Index rows, Index cols) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(rows, cols);
  // x multiplies A^T, so its length is `rows`.
  Vec x = Vec::Random(rows);
  gpu::SparseContext<Scalar> ctx;
  Vec actual = ctx.multiplyT(A, x);
  Vec expected = A.transpose() * x;
  const Index dim = (std::max)(rows, cols);
  const RealScalar tol = RealScalar(10) * RealScalar(dim) * NumTraits<Scalar>::epsilon();
  // A is rows x cols, so A^T * x has length `cols`.
  VERIFY_IS_EQUAL(actual.size(), cols);
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- SpMV adjoint: y = A^H * x ----------------------------------------------
// Checks the adjoint product y = A^H * x (conjugate transpose for complex).
template <typename Scalar>
void test_spmv_adjoint(Index rows, Index cols) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(rows, cols);
  // x multiplies A^H, so its length is `rows`.
  Vec x = Vec::Random(rows);
  gpu::SparseContext<Scalar> ctx;
  Vec actual = ctx.multiplyAdjoint(A, x);
  Vec expected = A.adjoint() * x;
  const Index dim = (std::max)(rows, cols);
  const RealScalar tol = RealScalar(10) * RealScalar(dim) * NumTraits<Scalar>::epsilon();
  // A is rows x cols, so A^H * x has length `cols`.
  VERIFY_IS_EQUAL(actual.size(), cols);
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- SpMM: Y = A * X (multiple RHS) ----------------------------------------
// Checks sparse-times-dense-matrix Y = A * X with `nrhs` right-hand sides.
template <typename Scalar>
void test_spmm(Index rows, Index cols, Index nrhs) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(rows, cols);
  Mat X = Mat::Random(cols, nrhs);
  gpu::SparseContext<Scalar> ctx;
  Mat actual = ctx.multiplyMat(A, X);
  Mat expected = A * X;
  const Index dim = (std::max)(rows, cols);
  const RealScalar tol = RealScalar(10) * RealScalar(dim) * NumTraits<Scalar>::epsilon();
  VERIFY_IS_EQUAL(actual.rows(), rows);
  VERIFY_IS_EQUAL(actual.cols(), nrhs);
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- SpMM transpose: Y = A^T * X --------------------------------------------
// Checks the transposed multi-RHS product Y = A^T * X.
template <typename Scalar>
void test_spmm_transpose(Index rows, Index cols, Index nrhs) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Mat = Matrix<Scalar, Dynamic, Dynamic>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(rows, cols);
  // X multiplies A^T, so it has `rows` rows.
  Mat X = Mat::Random(rows, nrhs);
  gpu::SparseContext<Scalar> ctx;
  Mat actual = ctx.multiplyMat(A, X, gpu::GpuOp::Trans);
  Mat expected = A.transpose() * X;
  const Index dim = (std::max)(rows, cols);
  const RealScalar tol = RealScalar(10) * RealScalar(dim) * NumTraits<Scalar>::epsilon();
  // Result of A^T * X is cols x nrhs.
  VERIFY_IS_EQUAL(actual.rows(), cols);
  VERIFY_IS_EQUAL(actual.cols(), nrhs);
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- Identity matrix: I * x = x --------------------------------------------
// A sparse identity matrix must reproduce x to machine precision.
template <typename Scalar>
void test_identity(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  // Build a compressed n x n sparse identity.
  SpMat eye(n, n);
  eye.setIdentity();
  eye.makeCompressed();
  Vec x = Vec::Random(n);
  gpu::SparseContext<Scalar> ctx;
  Vec result = ctx.multiply(eye, x);
  // I * x should be exact, so a single epsilon suffices as tolerance.
  const RealScalar tol = NumTraits<Scalar>::epsilon();
  VERIFY((result - x).norm() < tol);
}
// ---- Context reuse ----------------------------------------------------------
// A single SparseContext must survive several independent multiplies.
template <typename Scalar>
void test_reuse(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  gpu::SparseContext<Scalar> ctx;
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  for (int rep = 0; rep < 3; ++rep) {
    // Fresh random matrix and vector on every iteration.
    SpMat A = make_sparse<Scalar>(n, n);
    Vec x = Vec::Random(n);
    Vec actual = ctx.multiply(A, x);
    Vec expected = A * x;
    VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
  }
}
// ---- Empty ------------------------------------------------------------------
template <typename Scalar>
void test_empty() {
using SpMat = SparseMatrix<Scalar, ColMajor, int>;
using Vec = Matrix<Scalar, Dynamic, 1>;
SpMat A(0, 0);
A.makeCompressed();
Vec x(0);
gpu::SparseContext<Scalar> ctx;
Vec y = ctx.multiply(A, x);
VERIFY_IS_EQUAL(y.size(), 0);
}
// ---- gpu::DeviceMatrix SpMV (no host roundtrip) ----------------------------------
// SpMV with device-resident operands: upload x, multiply, download y —
// no host roundtrip for the multiply itself.
template <typename Scalar>
void test_spmv_device(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(n, n);
  Vec x = Vec::Random(n);
  // A shared gpu::Context keeps all work on the same stream.
  gpu::Context gpu_ctx;
  gpu::SparseContext<Scalar> ctx(gpu_ctx);
  auto d_x = gpu::DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
  gpu::DeviceMatrix<Scalar> d_y;
  ctx.multiply(A, d_x, d_y);
  Vec actual = d_y.toHost(gpu_ctx.stream());
  Vec expected = A * x;
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((actual - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- Expression syntax: d_y = d_A * d_x ------------------------------------
// Expression syntax on device operands: d_y = d_A * d_x, with and
// without noalias().
template <typename Scalar>
void test_spmv_expr(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A = make_sparse<Scalar>(n, n);
  Vec x = Vec::Random(n);
  gpu::Context gpu_ctx;
  gpu::SparseContext<Scalar> ctx(gpu_ctx);
  // Device view of the sparse matrix and an uploaded dense vector.
  auto d_A = ctx.deviceView(A);
  auto d_x = gpu::DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
  // Plain assignment from the product expression.
  gpu::DeviceMatrix<Scalar> d_y;
  d_y = d_A * d_x;
  // noalias() assignment must produce the same result.
  gpu::DeviceMatrix<Scalar> d_tmp;
  d_tmp.noalias() = d_A * d_x;
  Vec plain_result = d_y.toHost(gpu_ctx.stream());
  Vec noalias_result = d_tmp.toHost(gpu_ctx.stream());
  Vec expected = A * x;
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  VERIFY((plain_result - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
  VERIFY((noalias_result - expected).norm() / (expected.norm() + RealScalar(1)) < tol);
}
// ---- deviceView overwrite: second view replaces first -----------------------
// Creating a second deviceView on the same context replaces the first:
// products through the new view must reflect the new matrix.
template <typename Scalar>
void test_deviceview_overwrite(Index n) {
  using SpMat = SparseMatrix<Scalar, ColMajor, int>;
  using Vec = Matrix<Scalar, Dynamic, 1>;
  using RealScalar = typename NumTraits<Scalar>::Real;
  SpMat A1 = make_sparse<Scalar>(n, n);
  SpMat A2 = make_sparse<Scalar>(n, n);  // a second, independent random matrix
  Vec x = Vec::Random(n);
  gpu::Context gpu_ctx;
  gpu::SparseContext<Scalar> ctx(gpu_ctx);
  const RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
  // First view: results must match A1.
  auto d_A1 = ctx.deviceView(A1);
  auto d_x = gpu::DeviceMatrix<Scalar>::fromHost(x, gpu_ctx.stream());
  gpu::DeviceMatrix<Scalar> d_y1;
  d_y1 = d_A1 * d_x;
  Vec first_result = d_y1.toHost(gpu_ctx.stream());
  Vec first_expected = A1 * x;
  VERIFY((first_result - first_expected).norm() / (first_expected.norm() + RealScalar(1)) < tol);
  // Second view overwrites the first: results must now match A2.
  auto d_A2 = ctx.deviceView(A2);
  gpu::DeviceMatrix<Scalar> d_y2;
  d_y2 = d_A2 * d_x;
  Vec second_result = d_y2.toHost(gpu_ctx.stream());
  Vec second_expected = A2 * x;
  VERIFY((second_result - second_expected).norm() / (second_expected.norm() + RealScalar(1)) < tol);
}
// ---- Per-scalar driver ------------------------------------------------------
// Runs the full sub-test suite for one scalar type. Square, tall, and wide
// shapes exercise the rows==cols, rows>cols, and rows<cols code paths.
template <typename Scalar>
void test_scalar() {
// Basic SpMV in three shapes.
CALL_SUBTEST(test_spmv<Scalar>(64, 64));
CALL_SUBTEST(test_spmv<Scalar>(128, 64)); // non-square
CALL_SUBTEST(test_spmv<Scalar>(64, 128)); // wide
// Scaled/accumulating, transposed, and adjoint SpMV variants.
CALL_SUBTEST(test_spmv_alpha_beta<Scalar>(64));
CALL_SUBTEST(test_spmv_transpose<Scalar>(128, 64));
CALL_SUBTEST(test_spmv_adjoint<Scalar>(128, 64));
// Multi-RHS (SpMM) variants.
CALL_SUBTEST(test_spmm<Scalar>(64, 64, 4));
CALL_SUBTEST(test_spmm_transpose<Scalar>(128, 64, 4));
// Special cases: identity, context reuse, empty operands.
CALL_SUBTEST(test_identity<Scalar>(64));
CALL_SUBTEST(test_reuse<Scalar>(64));
CALL_SUBTEST(test_empty<Scalar>());
// Device-resident operands and expression syntax.
CALL_SUBTEST(test_spmv_device<Scalar>(64));
CALL_SUBTEST(test_spmv_expr<Scalar>(64));
CALL_SUBTEST(test_deviceview_overwrite<Scalar>(64));
}
// Test entry point. require_cusparse_context() gates the run on cuSPARSE
// availability (presumably skipping or aborting when no usable device
// context exists — see gpu_test_helpers.h).
EIGEN_DECLARE_TEST(gpu_cusparse_spmv) {
gpu_test::require_cusparse_context();
// Split by scalar so each part compiles in parallel.
CALL_SUBTEST_1(test_scalar<float>());
CALL_SUBTEST_2(test_scalar<double>());
CALL_SUBTEST_3(test_scalar<std::complex<float>>());
CALL_SUBTEST_4(test_scalar<std::complex<double>>());
}