Eigen/GPU [2/5]: Add library dispatch module (DeviceMatrix, cuBLAS, cuSOLVER)

libeigen/eigen!2412

Co-authored-by: Rasmus Munk Larsen <rlarsen@nvidia.com>
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 601e45f..b38be67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.10.0)
+cmake_minimum_required(VERSION 3.17)
 
 #==============================================================================
 # CMake Policy issues.
@@ -9,7 +9,7 @@
 endif (POLICY CMP0077)
 
 # NOTE Remove setting the policy once the minimum required CMake version is
-# increased to at least 3.15. Retain enabling the export to package registry.
+# increased to at least 3.21. Retain enabling the export to package registry.
 if (POLICY CMP0090)
   # The export command does not populate package registry by default
   cmake_policy (SET CMP0090 NEW)
diff --git a/ci/build.linux.gitlab-ci.yml b/ci/build.linux.gitlab-ci.yml
index 7dc5ca4..1907986 100644
--- a/ci/build.linux.gitlab-ci.yml
+++ b/ci/build.linux.gitlab-ci.yml
@@ -214,11 +214,15 @@
 
 .build:linux:cuda:
   extends: .build:linux:cross:x86-64
+  # CUDA images and compiler packages vary enough that a cached CMake build
+  # directory can retain stale compiler-feature detection across pipeline runs.
+  cache: []
   variables:
     # Additional flags passed to the cuda compiler.
     EIGEN_CI_CUDA_CXX_FLAGS: ""
     # Compute architectures present in the GitLab CI runners.
-    EIGEN_CI_CUDA_COMPUTE_ARCH: "70;80"
+    # SaaS GPU runners have Tesla T4 (sm_75).
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "75"
     EIGEN_CI_BUILD_TARGET: buildtests_gpu
     EIGEN_CI_TEST_CUDA_CLANG: "off"
     EIGEN_CI_TEST_CUDA_NVC: "off"
@@ -237,22 +241,80 @@
     # Build on regular linux to limit GPU cost.
     - saas-linux-2xlarge-amd64
 
-# GCC-10, CUDA-12.2
-build:linux:cuda-12.2:gcc-10:
+# ---- CUDA 11.5 + oldest supported compilers ---------------------------------
+# clang-14 fully supports up to CUDA 11.5.  CUDA 11.5 images are Ubuntu 20.04
+# only, which lacks clang-14 and cmake >= 3.17.  The per-job before_script adds
+# the LLVM apt repo (for clang-14) and Kitware's apt repo (for cmake).
+
+build:linux:cuda-11.5:gcc-10:
   extends: .build:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
+  image: nvidia/cuda:11.5.2-devel-ubuntu20.04
+  before_script:
+    # Ubuntu 20.04 has cmake 3.16; Eigen requires 3.17+. Pull a newer
+    # cmake from Kitware's official apt repo for focal.
+    - apt-get update -y > /dev/null
+    - apt-get install -y --no-install-recommends wget gnupg ca-certificates > /dev/null
+    - wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc
+        | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg
+    - echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main'
+        > /etc/apt/sources.list.d/kitware.list
+    - apt-get update -y > /dev/null
+    - . ci/scripts/common.linux.before_script.sh
   variables:
+    EIGEN_CI_INSTALL: gcc-10 g++-10
     EIGEN_CI_C_COMPILER: gcc-10
     EIGEN_CI_CXX_COMPILER: g++-10
 
-# Clang-12, CUDA-12.2
-build:linux:cuda-12.2:clang-12:
-  extends: build:linux:cuda-12.2:gcc-10
+build:linux:cuda-11.5:clang-14:
+  extends: build:linux:cuda-11.5:gcc-10
+  before_script:
+    # Ubuntu 20.04 lacks clang-14 and has cmake 3.16. Add both the LLVM
+    # apt repo (for clang-14) and Kitware's apt repo (for cmake >= 3.17)
+    # before running common.linux.before_script.sh, which apt-installs
+    # cmake + the requested compiler.
+    - apt-get update -y > /dev/null
+    - apt-get install -y --no-install-recommends wget gnupg ca-certificates > /dev/null
+    - wget -qO - https://apt.llvm.org/llvm-snapshot.gpg.key
+        | gpg --dearmor -o /usr/share/keyrings/llvm-archive-keyring.gpg
+    - echo 'deb [signed-by=/usr/share/keyrings/llvm-archive-keyring.gpg] http://apt.llvm.org/focal/ llvm-toolchain-focal-14 main'
+        > /etc/apt/sources.list.d/llvm-14.list
+    - wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc
+        | gpg --dearmor -o /usr/share/keyrings/kitware-archive-keyring.gpg
+    - echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main'
+        > /etc/apt/sources.list.d/kitware.list
+    - apt-get update -y > /dev/null
+    - . ci/scripts/common.linux.before_script.sh
   variables:
-    EIGEN_CI_C_COMPILER: clang-12
-    EIGEN_CI_CXX_COMPILER: clang++-12
+    EIGEN_CI_INSTALL: clang-14
+    EIGEN_CI_C_COMPILER: clang-14
+    EIGEN_CI_CXX_COMPILER: clang++-14
     EIGEN_CI_TEST_CUDA_CLANG: "on"
 
+# ---- New: latest CUDA + newest compilers, sm_75;sm_90 ----------------------
+# sm_75 (T4) is covered by every job; sm_90 is built only by the CUDA 12.6 jobs.
+
+# GCC-13, CUDA-12.6 (nvcc)
+# CUDA 12.6 nvcc only supports gcc up to 13.
+build:linux:cuda-12.6:gcc-13:
+  extends: .build:linux:cuda
+  image: nvidia/cuda:12.6.3-devel-ubuntu24.04
+  variables:
+    EIGEN_CI_C_COMPILER: gcc-13
+    EIGEN_CI_CXX_COMPILER: g++-13
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "75;90"
+
+# Clang-19, CUDA-12.6 (clang as CUDA compiler)
+# Clang CUDA support lags behind nvcc; clang-19 supports up to CUDA 12.x.
+build:linux:cuda-12.6:clang-19:
+  extends: .build:linux:cuda
+  image: nvidia/cuda:12.6.3-devel-ubuntu24.04
+  variables:
+    EIGEN_CI_INSTALL: clang-19
+    EIGEN_CI_C_COMPILER: clang-19
+    EIGEN_CI_CXX_COMPILER: clang++-19
+    EIGEN_CI_TEST_CUDA_CLANG: "on"
+    EIGEN_CI_CUDA_COMPUTE_ARCH: "75;90"
+
 
 ######### HIP ##################################################################
 # Note: these are currently build-only, until we get an AMD-supported runner.
@@ -402,4 +464,3 @@
     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
   tags:
     - saas-linux-medium-arm64
-
diff --git a/ci/test.linux.gitlab-ci.yml b/ci/test.linux.gitlab-ci.yml
index b4ca0b3..f19a5bf 100644
--- a/ci/test.linux.gitlab-ci.yml
+++ b/ci/test.linux.gitlab-ci.yml
@@ -271,23 +271,59 @@
   tags:
     - saas-linux-medium-amd64-gpu-standard
 
-# GCC-10, CUDA-12.2
-test:linux:cuda-12.2:gcc-10:
-  extends: .test:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
-  needs: [ build:linux:cuda-12.2:gcc-10 ]
-  variables:
-    EIGEN_CI_CXX_COMPILER: g++-10
-    EIGEN_CI_CC_COMPILER: gcc-10
+# ---- CUDA 11.5 + oldest supported compilers ---------------------------------
 
-# Clang-12, CUDA-12.2
-test:linux:cuda-12.2:clang-12:
+# GCC-10, CUDA-11.5
+test:linux:cuda-11.5:gcc-10:
   extends: .test:linux:cuda
-  image: nvidia/cuda:12.2.0-devel-ubuntu20.04
-  needs: [ build:linux:cuda-12.2:clang-12 ]
+  image: nvidia/cuda:11.5.2-devel-ubuntu20.04
+  needs: [ build:linux:cuda-11.5:gcc-10 ]
   variables:
-    EIGEN_CI_CXX_COMPILER: clang++-12
-    EIGEN_CI_CC_COMPILER: clang-12
+    EIGEN_CI_INSTALL: gcc-10 g++-10
+    EIGEN_CI_C_COMPILER: gcc-10
+    EIGEN_CI_CXX_COMPILER: g++-10
+
+# Clang-14, CUDA-11.5
+test:linux:cuda-11.5:clang-14:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:11.5.2-devel-ubuntu20.04
+  needs: [ build:linux:cuda-11.5:clang-14 ]
+  before_script:
+    # Ubuntu 20.04 lacks clang-14.  Add the LLVM apt repo before
+    # the common before_script installs EIGEN_CI_INSTALL packages.
+    - apt-get update -y > /dev/null
+    - apt-get install -y --no-install-recommends wget gnupg > /dev/null
+    - wget -qO - https://apt.llvm.org/llvm-snapshot.gpg.key
+        | gpg --dearmor -o /usr/share/keyrings/llvm-archive-keyring.gpg
+    - echo 'deb [signed-by=/usr/share/keyrings/llvm-archive-keyring.gpg] http://apt.llvm.org/focal/ llvm-toolchain-focal-14 main'
+        > /etc/apt/sources.list.d/llvm-14.list
+    - apt-get update -y > /dev/null
+    - . ci/scripts/common.linux.before_script.sh
+  variables:
+    EIGEN_CI_INSTALL: clang-14
+    EIGEN_CI_CXX_COMPILER: clang++-14
+    EIGEN_CI_C_COMPILER: clang-14
+
+# ---- New: latest CUDA -------------------------------------------------------
+
+# GCC-13, CUDA-12.6
+test:linux:cuda-12.6:gcc-13:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:12.6.3-devel-ubuntu24.04
+  needs: [ build:linux:cuda-12.6:gcc-13 ]
+  variables:
+    EIGEN_CI_CXX_COMPILER: g++-13
+    EIGEN_CI_C_COMPILER: gcc-13
+
+# Clang-19, CUDA-12.6
+test:linux:cuda-12.6:clang-19:
+  extends: .test:linux:cuda
+  image: nvidia/cuda:12.6.3-devel-ubuntu24.04
+  needs: [ build:linux:cuda-12.6:clang-19 ]
+  variables:
+    EIGEN_CI_INSTALL: clang-19
+    EIGEN_CI_CXX_COMPILER: clang++-19
+    EIGEN_CI_C_COMPILER: clang-19
 
 
 ##### arm ######################################################################
@@ -485,4 +521,3 @@
     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
   tags:
     - saas-linux-medium-arm64
-
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 6d3c57e..d15bfac 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -6,6 +6,7 @@
   BVH
   EulerAngles
   FFT
+  GPU
   IterativeSolvers
   KroneckerProduct
   LevenbergMarquardt
diff --git a/unsupported/Eigen/GPU b/unsupported/Eigen/GPU
new file mode 100644
index 0000000..ca32cd6
--- /dev/null
+++ b/unsupported/Eigen/GPU
@@ -0,0 +1,55 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_MODULE_H
+#define EIGEN_GPU_MODULE_H
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup GPU_Module GPU module
+ *
+ * GPU-accelerated dense linear algebra using NVIDIA CUDA libraries
+ * (cuSOLVER and cuBLAS).
+ *
+ * This module provides explicit GPU solver classes that coexist with Eigen's
+ * CPU solvers. Unlike the LAPACKE dispatch (which replaces the CPU
+ * implementation globally), GPU classes are separate types the user
+ * instantiates by choice:
+ *
+ * \code
+ * #define EIGEN_USE_GPU
+ * #include <unsupported/Eigen/GPU>
+ *
+ * // CPU path (unchanged)
+ * Eigen::LLT<Eigen::MatrixXd> llt_cpu(A);
+ *
+ * // GPU path (explicit)
+ * Eigen::gpu::LLT<double> llt_gpu(A);   // L stays on device
+ * auto X = llt_gpu.solve(B);            // only B transferred per solve
+ * \endcode
+ *
+ * Requires CUDA 11.4+.
+ */
+
+#ifdef EIGEN_USE_GPU
+// IWYU pragma: begin_exports
+#include "src/GPU/DeviceMatrix.h"
+#include "src/GPU/GpuContext.h"
+#include "src/GPU/DeviceExpr.h"
+#include "src/GPU/DeviceBlasExpr.h"
+#include "src/GPU/DeviceSolverExpr.h"
+#include "src/GPU/DeviceDispatch.h"
+#include "src/GPU/GpuLLT.h"
+#include "src/GPU/GpuLU.h"
+// IWYU pragma: end_exports
+#endif
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_GPU_MODULE_H
diff --git a/unsupported/Eigen/src/GPU/CuBlasSupport.h b/unsupported/Eigen/src/GPU/CuBlasSupport.h
new file mode 100644
index 0000000..6201e39
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/CuBlasSupport.h
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuBLAS-specific support types:
+//   - Error-checking macro
+//   - Operation enum and mapping to cublasOperation_t
+//
+// Generic CUDA runtime utilities (DeviceBuffer, cuda_data_type) are in GpuSupport.h.
+
+#ifndef EIGEN_GPU_CUBLAS_SUPPORT_H
+#define EIGEN_GPU_CUBLAS_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cublas_v2.h>
+#include <cstring>
+
+namespace Eigen {
+namespace gpu {
+
+// ---- Operation enum ---------------------------------------------------------
+// Public flag for transpose/adjoint in BLAS- and solver-style calls.
+
+enum class GpuOp { NoTrans, Trans, ConjTrans };
+
+namespace internal {
+
+// ---- Error-checking macro ---------------------------------------------------
+
+#define EIGEN_CUBLAS_CHECK(expr)                                       \
+  do {                                                                 \
+    cublasStatus_t _s = (expr);                                        \
+    eigen_assert(_s == CUBLAS_STATUS_SUCCESS && "cuBLAS call failed"); \
+    (void)_s; /* unused when asserts are compiled out */               \
+  } while (0)
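+
+// Usage (illustration): wrap every cuBLAS call so a failure trips eigen_assert
+// in debug builds, e.g. EIGEN_CUBLAS_CHECK(cublasSetStream(handle, stream));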
+
+constexpr cublasOperation_t to_cublas_op(GpuOp op) {
+  switch (op) {
+    case GpuOp::Trans:
+      return CUBLAS_OP_T;
+    case GpuOp::ConjTrans:
+      return CUBLAS_OP_C;
+    default:
+      return CUBLAS_OP_N;
+  }
+}
+
+// ---- Type-specific cuBLAS wrappers ------------------------------------------
+// cuBLAS uses separate functions per type (Sgemm, Dgemm, etc.).
+// These overloaded wrappers allow calling cublasXgemm/cublasXtrsm/etc.
+// with any supported scalar type.
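+//
+// For example (illustration), a call site
+//   cublasXgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc)
+// resolves to cublasSgemm for float operands and to cublasZgemm for
+// std::complex<double> operands.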
+
+// GEMM wrappers
+inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  int k, const float* alpha, const float* A, int lda, const float* B, int ldb,
+                                  const float* beta, float* C, int ldc) {
+  return cublasSgemm(h, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  int k, const double* alpha, const double* A, int lda, const double* B, int ldb,
+                                  const double* beta, double* C, int ldc) {
+  return cublasDgemm(h, transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+static_assert(sizeof(cuComplex) == sizeof(std::complex<float>), "cuComplex and std::complex<float> layout mismatch");
+static_assert(sizeof(cuDoubleComplex) == sizeof(std::complex<double>),
+              "cuDoubleComplex and std::complex<double> layout mismatch");
+
+// Complex scalar args (alpha, beta) are type-punned from std::complex<T>*
+// to cuComplex*/cuDoubleComplex*.  A reinterpret_cast violates strict
+// aliasing: when inlined, clang/MSVC can elide the caller's store (the
+// compiler no longer sees a read through the original type), causing
+// segfaults.  We use memcpy — the standard-blessed type-pun — for scalars.
+// Device array pointers (A, B, C) are opaque to the host compiler, so
+// reinterpret_cast is safe there.
+inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  int k, const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  const std::complex<float>* B, int ldb, const std::complex<float>* beta,
+                                  std::complex<float>* C, int ldc) {
+  cuComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasCgemm(h, transA, transB, m, n, k, &a, reinterpret_cast<const cuComplex*>(A), lda,
+                     reinterpret_cast<const cuComplex*>(B), ldb, &b, reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXgemm(cublasHandle_t h, cublasOperation_t transA, cublasOperation_t transB, int m, int n,
+                                  int k, const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  const std::complex<double>* B, int ldb, const std::complex<double>* beta,
+                                  std::complex<double>* C, int ldc) {
+  cuDoubleComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasZgemm(h, transA, transB, m, n, k, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+// TRSM wrappers
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const float* alpha,
+                                  const float* A, int lda, float* B, int ldb) {
+  return cublasStrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n, const double* alpha,
+                                  const double* A, int lda, double* B, int ldb) {
+  return cublasDtrsm(h, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  std::complex<float>* B, int ldb) {
+  cuComplex a;
+  std::memcpy(&a, alpha, sizeof(a));
+  return cublasCtrsm(h, side, uplo, trans, diag, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda,
+                     reinterpret_cast<cuComplex*>(B), ldb);
+}
+inline cublasStatus_t cublasXtrsm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo,
+                                  cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  std::complex<double>* B, int ldb) {
+  cuDoubleComplex a;
+  std::memcpy(&a, alpha, sizeof(a));
+  return cublasZtrsm(h, side, uplo, trans, diag, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                     reinterpret_cast<cuDoubleComplex*>(B), ldb);
+}
+
+// SYMM wrappers (real → symm, complex → hemm)
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const float* alpha, const float* A, int lda, const float* B, int ldb,
+                                  const float* beta, float* C, int ldc) {
+  return cublasSsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const double* alpha, const double* A, int lda, const double* B, int ldb,
+                                  const double* beta, double* C, int ldc) {
+  return cublasDsymm(h, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const std::complex<float>* alpha, const std::complex<float>* A, int lda,
+                                  const std::complex<float>* B, int ldb, const std::complex<float>* beta,
+                                  std::complex<float>* C, int ldc) {
+  cuComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasChemm(h, side, uplo, m, n, &a, reinterpret_cast<const cuComplex*>(A), lda,
+                     reinterpret_cast<const cuComplex*>(B), ldb, &b, reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXsymm(cublasHandle_t h, cublasSideMode_t side, cublasFillMode_t uplo, int m, int n,
+                                  const std::complex<double>* alpha, const std::complex<double>* A, int lda,
+                                  const std::complex<double>* B, int ldb, const std::complex<double>* beta,
+                                  std::complex<double>* C, int ldc) {
+  cuDoubleComplex a, b;
+  std::memcpy(&a, alpha, sizeof(a));
+  std::memcpy(&b, beta, sizeof(b));
+  return cublasZhemm(h, side, uplo, m, n, &a, reinterpret_cast<const cuDoubleComplex*>(A), lda,
+                     reinterpret_cast<const cuDoubleComplex*>(B), ldb, &b, reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+// SYRK wrappers (real → syrk, complex → herk)
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const float* alpha, const float* A, int lda, const float* beta, float* C, int ldc) {
+  return cublasSsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const double* alpha, const double* A, int lda, const double* beta, double* C,
+                                  int ldc) {
+  return cublasDsyrk(h, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const float* alpha, const std::complex<float>* A, int lda, const float* beta,
+                                  std::complex<float>* C, int ldc) {
+  return cublasCherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuComplex*>(A), lda, beta,
+                     reinterpret_cast<cuComplex*>(C), ldc);
+}
+inline cublasStatus_t cublasXsyrk(cublasHandle_t h, cublasFillMode_t uplo, cublasOperation_t trans, int n, int k,
+                                  const double* alpha, const std::complex<double>* A, int lda, const double* beta,
+                                  std::complex<double>* C, int ldc) {
+  return cublasZherk(h, uplo, trans, n, k, alpha, reinterpret_cast<const cuDoubleComplex*>(A), lda, beta,
+                     reinterpret_cast<cuDoubleComplex*>(C), ldc);
+}
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUBLAS_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/CuSolverSupport.h b/unsupported/Eigen/src/GPU/CuSolverSupport.h
new file mode 100644
index 0000000..2de889d
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/CuSolverSupport.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// cuSOLVER-specific support types:
+//   - cuSOLVER error-checking macro
+//   - RAII wrapper for cusolverDnParams
+//   - Scalar → cudaDataType_t mapping
+//   - (UpLo, StorageOrder) → cublasFillMode_t mapping
+//
+// Generic CUDA runtime utilities (DeviceBuffer, EIGEN_CUDA_RUNTIME_CHECK)
+// are in GpuSupport.h.
+
+#ifndef EIGEN_GPU_CUSOLVER_SUPPORT_H
+#define EIGEN_GPU_CUSOLVER_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./GpuSupport.h"
+#include <cusolverDn.h>
+#include <cstdio>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+// ---- Error-checking macros --------------------------------------------------
+
+// cuSOLVER does not ship a cusolverGetErrorString() in the public API, so we
+// stringify the codes ourselves. Keeps failed asserts actionable.
+inline const char* cusolver_status_name(cusolverStatus_t s) {
+  switch (s) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    default:
+      return "CUSOLVER_STATUS_UNKNOWN";
+  }
+}
+
+inline bool report_cusolver_failure(cusolverStatus_t s, const char* expr, const char* file, int line) {
+  std::fprintf(stderr,
+               "cuSOLVER call failed\n"
+               "  expr:   %s\n"
+               "  status: %s (%d)\n"
+               "  at:     %s:%d\n",
+               expr, cusolver_status_name(s), static_cast<int>(s), file, line);
+  return false;
+}
+
+#define EIGEN_CUSOLVER_CHECK(expr)                                                                \
+  do {                                                                                            \
+    cusolverStatus_t _s = (expr);                                                                 \
+    eigen_assert(_s == CUSOLVER_STATUS_SUCCESS ||                                                 \
+                 ::Eigen::gpu::internal::report_cusolver_failure(_s, #expr, __FILE__, __LINE__)); \
+    (void)_s; /* unused when asserts are compiled out */                                          \
+  } while (0)
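+
+// Usage (illustration): EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle));
+// On failure, report_cusolver_failure() prints the expression, status name,
+// and location to stderr before eigen_assert fires.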
+
+// ---- RAII: cusolverDnParams -------------------------------------------------
+
+struct CusolverParams {
+  cusolverDnParams_t p = nullptr;
+
+  CusolverParams() { EIGEN_CUSOLVER_CHECK(cusolverDnCreateParams(&p)); }
+
+  ~CusolverParams() {
+    if (p) (void)cusolverDnDestroyParams(p);  // destructor: can't propagate
+  }
+
+  // Move-only.
+  CusolverParams(CusolverParams&& o) noexcept : p(o.p) { o.p = nullptr; }
+  CusolverParams& operator=(CusolverParams&& o) noexcept {
+    if (this != &o) {
+      if (p) (void)cusolverDnDestroyParams(p);
+      p = o.p;
+      o.p = nullptr;
+    }
+    return *this;
+  }
+
+  CusolverParams(const CusolverParams&) = delete;
+  CusolverParams& operator=(const CusolverParams&) = delete;
+};
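+
+// Typical use (sketch, mirroring the dispatchers in DeviceDispatch.h): create a
+// CusolverParams per call chain and pass params.p to the 64-bit generic API,
+// e.g. cusolverDnXpotrf(handle, params.p, uplo, n, ...). The underlying
+// cusolverDnParams_t is destroyed automatically when the object goes out of scope.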
+
+// ---- Scalar → cudaDataType_t ------------------------------------------------
+// Alias for backward compatibility. The canonical trait is cuda_data_type<> in GpuSupport.h.
+template <typename Scalar>
+using cusolver_data_type = cuda_data_type<Scalar>;
+
+// ---- UpLo → cublasFillMode_t ------------------------------------------------
+// cuSOLVER always interprets the matrix as column-major. Callers pass the
+// triangle that holds the data in column-major layout.
+
+template <int UpLo>
+struct cusolver_fill_mode;
+
+template <>
+struct cusolver_fill_mode<Lower> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_LOWER;
+};
+template <>
+struct cusolver_fill_mode<Upper> {
+  static constexpr cublasFillMode_t value = CUBLAS_FILL_MODE_UPPER;
+};
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CUSOLVER_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/DeviceBlasExpr.h b/unsupported/Eigen/src/GPU/DeviceBlasExpr.h
new file mode 100644
index 0000000..7597320
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/DeviceBlasExpr.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// BLAS Level 3 expression types for gpu::DeviceMatrix (beyond GEMM):
+//   TrsmExpr           -> cublasXtrsm   (triangular solve)
+//   SymmExpr           -> cublasXsymm   (symmetric multiply for real scalars,
+//                                        Hermitian multiply for complex)
+//   SyrkExpr           -> cublasXsyrk   (symmetric rank-k update for real,
+//                                        Hermitian rank-k update for complex)
+
+#ifndef EIGEN_GPU_DEVICE_BLAS_EXPR_H
+#define EIGEN_GPU_DEVICE_BLAS_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <functional>
+
+namespace Eigen {
+namespace gpu {
+
+template <typename Scalar_>
+class DeviceMatrix;
+template <typename Scalar_, int UpLo_>
+class TrsmExpr;
+
+// ---- TriangularView --------------------------------------------------------
+// d_A.triangularView<Lower>() -> view with .solve(d_B)
+
+template <typename Scalar_, int UpLo_>
+class TriangularView {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  explicit TriangularView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+  /** Build a TRSM solve expression. */
+  TrsmExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> mat_;
+};
+
+// ---- TrsmExpr: triangularView<UpLo>().solve(B) -> cublasXtrsm --------------
+
+template <typename Scalar_, int UpLo_>
+class TrsmExpr {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  TrsmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> A_;
+  std::reference_wrapper<const DeviceMatrix<Scalar>> B_;
+};
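+
+// Usage (sketch; d_L holds a triangular factor on the device):
+//   DeviceMatrix<float> d_X;
+//   d_X = d_L.triangularView<Lower>().solve(d_B);  // one cublasStrsm call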
+
+// ---- SelfAdjointView -------------------------------------------------------
+// d_A.selfadjointView<Lower>() -> view that can multiply: view * d_B
+
+template <typename Scalar_, int UpLo_>
+class SelfAdjointView {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  static constexpr int UpLo = UpLo_;
+
+  explicit SelfAdjointView(DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+  DeviceMatrix<Scalar>& matrix() { return mat_; }
+
+  /** Rank-k update: C.selfadjointView<Lower>().rankUpdate(A, alpha)
+   * computes C = alpha * A * A^H + C (lower triangle only).
+   * Maps to cublasXsyrk (real) or cublasXherk (complex). */
+  void rankUpdate(const DeviceMatrix<Scalar>& A, RealScalar alpha = RealScalar(1));
+
+ private:
+  std::reference_wrapper<DeviceMatrix<Scalar>> mat_;
+};
+
+// Const variant for multiplication only (no rankUpdate).
+template <typename Scalar_, int UpLo_>
+class ConstSelfAdjointView {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  explicit ConstSelfAdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> mat_;
+};
+
+// ---- SymmExpr: selfadjointView<UpLo>() * B -> cublasXsymm/Xhemm -----------
+
+template <typename Scalar_, int UpLo_>
+class SymmExpr {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  SymmExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> A_;
+  std::reference_wrapper<const DeviceMatrix<Scalar>> B_;
+};
+
+// operator*: SelfAdjointView * Matrix -> SymmExpr (mutable and const variants)
+template <typename S, int UpLo>
+SymmExpr<S, UpLo> operator*(const SelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
+  return {a.matrix(), b};
+}
+template <typename S, int UpLo>
+SymmExpr<S, UpLo> operator*(const ConstSelfAdjointView<S, UpLo>& a, const DeviceMatrix<S>& b) {
+  return {a.matrix(), b};
+}
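+
+// Usage (sketch): d_C = d_A.selfadjointView<Lower>() * d_B; reads only the
+// lower triangle of A and dispatches to cublasXsymm, which maps to
+// cublasSsymm/Dsymm for real scalars and cublasChemm/Zhemm for complex ones.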
+
+// ---- SyrkExpr: rankUpdate(A) -> cublasXsyrk --------------------------------
+// C.selfadjointView<UpLo>().rankUpdate(A, alpha) computes
+// C = alpha * A * A^H + C in the stored triangle (see dispatch_syrk).
+
+template <typename Scalar_, int UpLo_>
+class SyrkExpr {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  SyrkExpr(const DeviceMatrix<Scalar>& A) : A_(A) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> A_;
+};
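+
+// Usage (sketch): d_C.selfadjointView<Lower>().rankUpdate(d_A, alpha); updates
+// the lower triangle with C = alpha * A * A^H + C (beta = 0 when C is empty)
+// via cublasXsyrk.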
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_BLAS_EXPR_H
diff --git a/unsupported/Eigen/src/GPU/DeviceDispatch.h b/unsupported/Eigen/src/GPU/DeviceDispatch.h
new file mode 100644
index 0000000..26ffc8f
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/DeviceDispatch.h
@@ -0,0 +1,489 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Dispatch functions that map DeviceMatrix expressions to NVIDIA library calls.
+//
+// dispatch_gemm()      — GemmExpr     → cublasXgemm
+// dispatch_llt_solve() — LltSolveExpr → cusolverDnXpotrf + cusolverDnXpotrs
+// dispatch_lu_solve()  — LuSolveExpr  → cusolverDnXgetrf + cusolverDnXgetrs
+// dispatch_trsm()      — TrsmExpr     → cublasXtrsm
+// dispatch_symm()      — SymmExpr     → cublasXsymm
+// dispatch_syrk()      — SyrkExpr     → cublasXsyrk
+//
+// Each function documents the exact library call and parameters.
+
+#ifndef EIGEN_GPU_DEVICE_DISPATCH_H
+#define EIGEN_GPU_DEVICE_DISPATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <climits>
+#include <cstdint>
+#include <vector>
+
+#include "./DeviceExpr.h"
+#include "./DeviceBlasExpr.h"
+#include "./DeviceSolverExpr.h"
+#include "./GpuContext.h"
+#include "./CuSolverSupport.h"
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+template <typename Scalar>
+bool aliases_device_memory(const DeviceMatrix<Scalar>& a, const DeviceMatrix<Scalar>& b) {
+  return a.data() != nullptr && a.data() == b.data();
+}
+
+// ---- GEMM dispatch ----------------------------------------------------------
+// GemmExpr<Lhs, Rhs> → cublasXgemm (type-specific Sgemm/Dgemm/Cgemm/Zgemm).
+
+template <typename Lhs, typename Rhs>
+void dispatch_gemm(
+    Context& ctx, DeviceMatrix<typename device_expr_traits<Lhs>::scalar_type>& dst, const GemmExpr<Lhs, Rhs>& expr,
+    typename device_expr_traits<Lhs>::scalar_type beta_val,
+    typename device_expr_traits<Lhs>::scalar_type alpha_scale = typename device_expr_traits<Lhs>::scalar_type(1)) {
+  using Scalar = typename device_expr_traits<Lhs>::scalar_type;
+  using traits_lhs = device_expr_traits<Lhs>;
+  using traits_rhs = device_expr_traits<Rhs>;
+
+  const DeviceMatrix<Scalar>& A = traits_lhs::matrix(expr.lhs());
+  const DeviceMatrix<Scalar>& B = traits_rhs::matrix(expr.rhs());
+
+  constexpr cublasOperation_t transA = to_cublas_op(traits_lhs::op);
+  constexpr cublasOperation_t transB = to_cublas_op(traits_rhs::op);
+
+  // GEMM dimensions: C(m,n) = op(A)(m,k) * op(B)(k,n)
+  const int64_t m = (traits_lhs::op == GpuOp::NoTrans) ? A.rows() : A.cols();
+  const int64_t k = (traits_lhs::op == GpuOp::NoTrans) ? A.cols() : A.rows();
+  const int64_t n = (traits_rhs::op == GpuOp::NoTrans) ? B.cols() : B.rows();
+  const int64_t rhs_k = (traits_rhs::op == GpuOp::NoTrans) ? B.rows() : B.cols();
+
+  eigen_assert(k == rhs_k && "DeviceMatrix GEMM dimension mismatch");
+
+  const int64_t lda = A.outerStride();
+  const int64_t ldb = B.outerStride();
+
+  eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix GEMM destination aliases lhs operand");
+  eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix GEMM destination aliases rhs operand");
+
+  if (!dst.empty()) {
+    dst.waitReady(ctx.stream());
+  }
+
+  const bool resized = dst.empty() || dst.rows() != m || dst.cols() != n;
+  if (resized) {
+    dst.resize(m, n);
+  }
+  const int64_t ldc = dst.outerStride();
+
+  Scalar alpha_local = alpha_scale * traits_lhs::alpha(expr.lhs()) * traits_rhs::alpha(expr.rhs());
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+
+  if (resized && beta_val != Scalar(0) && dst.sizeInBytes() > 0) {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
+  }
+
+  eigen_assert(m <= INT_MAX && n <= INT_MAX && k <= INT_MAX && lda <= INT_MAX && ldb <= INT_MAX && ldc <= INT_MAX &&
+               "cublasXgemm dimensions exceed int range");
+
+  Scalar scalars[2] = {alpha_local, beta_val};
+  EIGEN_CUBLAS_CHECK(cublasXgemm(ctx.cublasHandle(), transA, transB, static_cast<int>(m), static_cast<int>(n),
+                                 static_cast<int>(k), &scalars[0], A.data(), static_cast<int>(lda), B.data(),
+                                 static_cast<int>(ldb), &scalars[1], dst.data(), static_cast<int>(ldc)));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- LLT solve dispatch -----------------------------------------------------
+
+template <typename Scalar, int UpLo>
+void dispatch_llt_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LltSolveExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "LLT requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "LLT solve: RHS rows must match matrix size");
+
+  const int64_t n = static_cast<int64_t>(A.rows());
+  const int64_t nrhs = static_cast<int64_t>(B.cols());
+
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  constexpr cublasFillMode_t uplo = cusolver_fill_mode<UpLo>::value;
+  const int64_t lda = static_cast<int64_t>(A.outerStride());
+  const int64_t ldb = static_cast<int64_t>(B.outerStride());
+  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
+  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+
+  DeviceBuffer d_factor(mat_bytes);
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(d_factor.get(), A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  // Two info slots (potrf, potrs) so we can queue both kernels back-to-back
+  // and host-sync once at the end. If potrf fails, potrs runs on garbage but
+  // the assert fires after the single sync — saving a round trip.
+  PinnedHostBuffer h_info(2 * sizeof(int));
+  int* info_words = static_cast<int*>(h_info.get());
+
+  CusolverParams params;
+  DeviceBuffer d_info(2 * sizeof(int));
+  int* d_info_potrf = static_cast<int*>(d_info.get());
+  int* d_info_potrs = d_info_potrf + 1;
+  size_t dev_ws = 0, host_ws = 0;
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(ctx.cusolverHandle(), params.p, uplo, n, dtype, d_factor.get(), lda,
+                                                   dtype, &dev_ws, &host_ws));
+
+  DeviceBuffer d_workspace(dev_ws);
+  std::vector<char> h_workspace(host_ws);
+
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(ctx.cusolverHandle(), params.p, uplo, n, dtype, d_factor.get(), lda, dtype,
+                                        d_workspace.get(), dev_ws, host_ws > 0 ? h_workspace.data() : nullptr, host_ws,
+                                        d_info_potrf));
+
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&info_words[0], d_info_potrf, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+
+  dst.resize(n, B.cols());
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(ctx.cusolverHandle(), params.p, uplo, n, nrhs, dtype, d_factor.get(), lda,
+                                        dtype, dst.data(), static_cast<int64_t>(dst.outerStride()), d_info_potrs));
+
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&info_words[1], d_info_potrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(info_words[0] == 0 && "cuSOLVER LLT factorization failed (matrix not positive definite)");
+  eigen_assert(info_words[1] == 0 && "cuSOLVER LLT solve failed");
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- LU solve dispatch ------------------------------------------------------
+
+template <typename Scalar>
+void dispatch_lu_solve(Context& ctx, DeviceMatrix<Scalar>& dst, const LuSolveExpr<Scalar>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "LU solve: RHS rows must match matrix size");
+
+  const int64_t n = static_cast<int64_t>(A.rows());
+  const int64_t nrhs = static_cast<int64_t>(B.cols());
+
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  constexpr cudaDataType_t dtype = cuda_data_type<Scalar>::value;
+  const int64_t lda = static_cast<int64_t>(A.outerStride());
+  const int64_t ldb = static_cast<int64_t>(B.outerStride());
+  const size_t mat_bytes = static_cast<size_t>(lda) * static_cast<size_t>(n) * sizeof(Scalar);
+  const size_t rhs_bytes = static_cast<size_t>(ldb) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+  const size_t ipiv_bytes = static_cast<size_t>(n) * sizeof(int64_t);
+
+  DeviceBuffer d_lu(mat_bytes);
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu.get(), A.data(), mat_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  DeviceBuffer d_ipiv(ipiv_bytes);
+
+  // See dispatch_llt_solve: two info slots + single end-of-chain sync.
+  PinnedHostBuffer h_info(2 * sizeof(int));
+  int* info_words = static_cast<int*>(h_info.get());
+
+  CusolverParams params;
+  DeviceBuffer d_info(2 * sizeof(int));
+  int* d_info_getrf = static_cast<int*>(d_info.get());
+  int* d_info_getrs = d_info_getrf + 1;
+  size_t dev_ws = 0, host_ws = 0;
+  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(ctx.cusolverHandle(), params.p, n, n, dtype, d_lu.get(), lda, dtype,
+                                                   &dev_ws, &host_ws));
+
+  DeviceBuffer d_workspace(dev_ws);
+  std::vector<char> h_workspace(host_ws);
+
+  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(ctx.cusolverHandle(), params.p, n, n, dtype, d_lu.get(), lda,
+                                        static_cast<int64_t*>(d_ipiv.get()), dtype, d_workspace.get(), dev_ws,
+                                        host_ws > 0 ? h_workspace.data() : nullptr, host_ws, d_info_getrf));
+
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&info_words[0], d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+
+  dst.resize(n, B.cols());
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(ctx.cusolverHandle(), params.p, CUBLAS_OP_N, n, nrhs, dtype, d_lu.get(), lda,
+                                        static_cast<const int64_t*>(d_ipiv.get()), dtype, dst.data(),
+                                        static_cast<int64_t>(dst.outerStride()), d_info_getrs));
+
+  EIGEN_CUDA_RUNTIME_CHECK(
+      cudaMemcpyAsync(&info_words[1], d_info_getrs, sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(ctx.stream()));
+  eigen_assert(info_words[0] == 0 && "cuSOLVER LU factorization failed (singular matrix)");
+  eigen_assert(info_words[1] == 0 && "cuSOLVER LU solve failed");
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- TRSM dispatch ----------------------------------------------------------
+
+template <typename Scalar, int UpLo>
+void dispatch_trsm(Context& ctx, DeviceMatrix<Scalar>& dst, const TrsmExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "TRSM requires a square triangular matrix");
+  eigen_assert(B.rows() == A.rows() && "TRSM: RHS rows must match matrix size");
+
+  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX &&
+               "cublasXtrsm dimensions exceed int range");
+
+  const int n = static_cast<int>(A.rows());
+  const int nrhs = static_cast<int>(B.cols());
+
+  if (n == 0 || nrhs == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(n, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix TRSM destination aliases triangular operand");
+  eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix TRSM destination aliases RHS operand");
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  dst.resize(n, B.cols());
+  const size_t rhs_bytes = static_cast<size_t>(dst.outerStride()) * static_cast<size_t>(nrhs) * sizeof(Scalar);
+  EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(dst.data(), B.data(), rhs_bytes, cudaMemcpyDeviceToDevice, ctx.stream()));
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  Scalar alpha(1);
+
+  EIGEN_CUBLAS_CHECK(cublasXtrsm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, nrhs,
+                                 &alpha, A.data(), static_cast<int>(A.outerStride()), dst.data(),
+                                 static_cast<int>(dst.outerStride())));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- SYMM/HEMM dispatch -----------------------------------------------------
+
+template <typename Scalar, int UpLo>
+void dispatch_symm(Context& ctx, DeviceMatrix<Scalar>& dst, const SymmExpr<Scalar, UpLo>& expr) {
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+  const DeviceMatrix<Scalar>& B = expr.rhs();
+
+  eigen_assert(A.rows() == A.cols() && "SYMM requires a square matrix");
+  eigen_assert(B.rows() == A.rows() && "SYMM: RHS rows must match matrix size");
+  eigen_assert(A.rows() <= INT_MAX && B.cols() <= INT_MAX && A.outerStride() <= INT_MAX && B.outerStride() <= INT_MAX &&
+               "cublasXsymm dimensions exceed int range");
+
+  const int m = static_cast<int>(A.rows());
+  const int n = static_cast<int>(B.cols());
+
+  if (m == 0 || n == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(m, B.cols());
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  B.waitReady(ctx.stream());
+  eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix SYMM destination aliases self-adjoint operand");
+  eigen_assert(!aliases_device_memory(dst, B) && "DeviceMatrix SYMM destination aliases RHS operand");
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  dst.resize(m, n);
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  Scalar scalars[2] = {Scalar(1), Scalar(0)};
+
+  EIGEN_CUBLAS_CHECK(cublasXsymm(ctx.cublasHandle(), CUBLAS_SIDE_LEFT, uplo, m, n, &scalars[0], A.data(),
+                                 static_cast<int>(A.outerStride()), B.data(), static_cast<int>(B.outerStride()),
+                                 &scalars[1], dst.data(), static_cast<int>(dst.outerStride())));
+
+  dst.recordReady(ctx.stream());
+}
+
+// ---- SYRK/HERK dispatch -----------------------------------------------------
+
+template <typename Scalar, int UpLo>
+void dispatch_syrk(Context& ctx, DeviceMatrix<Scalar>& dst, const SyrkExpr<Scalar, UpLo>& expr,
+                   typename NumTraits<Scalar>::Real alpha_val, typename NumTraits<Scalar>::Real beta_val) {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  const DeviceMatrix<Scalar>& A = expr.matrix();
+
+  eigen_assert(A.rows() <= INT_MAX && A.cols() <= INT_MAX && A.outerStride() <= INT_MAX &&
+               "cublasXsyrk dimensions exceed int range");
+
+  const int n = static_cast<int>(A.rows());
+  const int k = static_cast<int>(A.cols());
+
+  if (n == 0) {
+    if (!dst.empty()) dst.waitReady(ctx.stream());
+    dst.resize(0, 0);
+    return;
+  }
+
+  A.waitReady(ctx.stream());
+  eigen_assert(!aliases_device_memory(dst, A) && "DeviceMatrix SYRK destination aliases input operand");
+  if (!dst.empty()) dst.waitReady(ctx.stream());
+
+  if (dst.empty() || dst.rows() != n || dst.cols() != n) {
+    dst.resize(n, n);
+    if (beta_val != RealScalar(0)) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMemsetAsync(dst.data(), 0, dst.sizeInBytes(), ctx.stream()));
+    }
+  }
+
+  constexpr cublasFillMode_t uplo = (UpLo == Lower) ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+
+  EIGEN_CUBLAS_CHECK(cublasXsyrk(ctx.cublasHandle(), uplo, CUBLAS_OP_N, n, k, &alpha_val, A.data(),
+                                 static_cast<int>(A.outerStride()), &beta_val, dst.data(),
+                                 static_cast<int>(dst.outerStride())));
+
+  dst.recordReady(ctx.stream());
+}
+
+}  // namespace internal
+
+// ---- Assignment: d_C.device(ctx) = expr ------------------------------------
+
+template <typename Scalar_>
+class Assignment {
+ public:
+  using Scalar = Scalar_;
+
+  Assignment(DeviceMatrix<Scalar>& dst, Context& ctx) : dst_(dst), ctx_(ctx) {}
+
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(0));
+    return dst_;
+  }
+
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator+=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1));
+    return dst_;
+  }
+
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix<Scalar>& operator-=(const GemmExpr<Lhs, Rhs>& expr) {
+    internal::dispatch_gemm(ctx_, dst_, expr, Scalar(1), Scalar(-1));
+    return dst_;
+  }
+
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const LltSolveExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_llt_solve(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  DeviceMatrix<Scalar>& operator=(const LuSolveExpr<Scalar>& expr) {
+    internal::dispatch_lu_solve(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const TrsmExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_trsm(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  template <int UpLo>
+  DeviceMatrix<Scalar>& operator=(const SymmExpr<Scalar, UpLo>& expr) {
+    internal::dispatch_symm(ctx_, dst_, expr);
+    return dst_;
+  }
+
+  template <typename Expr>
+  DeviceMatrix<Scalar>& operator=(const Expr&) {
+    static_assert(sizeof(Expr) == 0,
+                  "DeviceMatrix expression not supported: no cuBLAS/cuSOLVER mapping. "
+                  "Supported: GEMM (A*B), TRSM (.triangularView().solve()), "
+                  "SYMM (.selfadjointView()*B), LLT (.llt().solve()), LU (.lu().solve()).");
+    return dst_;
+  }
+
+ private:
+  DeviceMatrix<Scalar>& dst_;
+  Context& ctx_;
+};
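+
+// Usage (sketch):
+//   Context& ctx = Context::threadLocal();
+//   d_C.device(ctx) = d_A * d_B;   // GEMM: C = A * B on ctx's stream
+//   d_C.device(ctx) += d_A * d_B;  // accumulate: C += A * B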
+
+// ---- Out-of-line Matrix expression operator= definitions ------------------
+// Declared in DeviceMatrix.h, defined here because they need Context::threadLocal().
+
+template <typename Scalar_>
+template <typename Lhs, typename Rhs>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const GemmExpr<Lhs, Rhs>& expr) {
+  device(Context::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <typename Lhs, typename Rhs>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator+=(const GemmExpr<Lhs, Rhs>& expr) {
+  device(Context::threadLocal()) += expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LltSolveExpr<Scalar_, UpLo>& expr) {
+  device(Context::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const LuSolveExpr<Scalar_>& expr) {
+  device(Context::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const TrsmExpr<Scalar_, UpLo>& expr) {
+  device(Context::threadLocal()) = expr;
+  return *this;
+}
+
+template <typename Scalar_>
+template <int UpLo>
+DeviceMatrix<Scalar_>& DeviceMatrix<Scalar_>::operator=(const SymmExpr<Scalar_, UpLo>& expr) {
+  device(Context::threadLocal()) = expr;
+  return *this;
+}
+
+// SelfAdjointView::rankUpdate — defined here because it needs Context.
+template <typename Scalar_, int UpLo_>
+void SelfAdjointView<Scalar_, UpLo_>::rankUpdate(const DeviceMatrix<Scalar_>& A, RealScalar alpha) {
+  SyrkExpr<Scalar_, UpLo_> expr(A);
+  RealScalar beta = matrix().empty() ? RealScalar(0) : RealScalar(1);
+  internal::dispatch_syrk(Context::threadLocal(), matrix(), expr, alpha, beta);
+}
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_DISPATCH_H
diff --git a/unsupported/Eigen/src/GPU/DeviceExpr.h b/unsupported/Eigen/src/GPU/DeviceExpr.h
new file mode 100644
index 0000000..bcf842a
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/DeviceExpr.h
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Lightweight expression types for DeviceMatrix operations.
+//
+// These are NOT Eigen expression templates. Each type maps 1:1 to a single
+// NVIDIA library call (cuBLAS or cuSOLVER). There is no coefficient-level
+// evaluation, no lazy fusion, no packet operations.
+//
+// Expression types:
+//   AdjointView<S>   — d_A.adjoint()  → marks ConjTrans for GEMM
+//   TransposeView<S> — d_A.transpose() → marks Trans for GEMM
+//   Scaled<Expr>     — alpha * expr    → carries scalar factor
+//   GemmExpr<Lhs, Rhs> — lhs * rhs    → dispatches to cublasXgemm
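+//
+// For illustration, a single assignment
+//   d_C = Scalar(2) * d_A.adjoint() * d_B;
+// builds GemmExpr<Scaled<AdjointView<S>>, DeviceMatrix<S>> and lowers to one
+// cublasXgemm call with transA = CUBLAS_OP_C and alpha = 2.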
+
+#ifndef EIGEN_GPU_DEVICE_EXPR_H
+#define EIGEN_GPU_DEVICE_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuBlasSupport.h"
+
+namespace Eigen {
+namespace gpu {
+
+namespace internal {
+// Forward declaration — specializations follow below, after the class definitions.
+template <typename Expr>
+struct device_expr_traits;
+}  // namespace internal
+
+// Forward declaration.
+template <typename Scalar_>
+class DeviceMatrix;
+
+// ---- AdjointView: marks ConjTrans -------------------------------------------
+// Returned by DeviceMatrix::adjoint(). Maps to cublasXgemm with CUBLAS_OP_C.
+
+template <typename Scalar_>
+class AdjointView {
+ public:
+  using Scalar = Scalar_;
+  explicit AdjointView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// ---- TransposeView: marks Trans ---------------------------------------------
+// Returned by DeviceMatrix::transpose(). Maps to cublasXgemm with CUBLAS_OP_T.
+
+template <typename Scalar_>
+class TransposeView {
+ public:
+  using Scalar = Scalar_;
+  explicit TransposeView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+  const DeviceMatrix<Scalar>& matrix() const { return mat_; }
+
+ private:
+  const DeviceMatrix<Scalar>& mat_;
+};
+
+// ---- Scaled: alpha * expr ---------------------------------------------------
+// Returned by operator*(Scalar, DeviceMatrix/View). Carries the scalar factor.
+
+template <typename Inner>
+class Scaled {
+ public:
+  using Scalar = typename internal::device_expr_traits<Inner>::scalar_type;
+  Scaled(Scalar alpha, const Inner& inner) : alpha_(alpha), inner_(inner) {}
+  Scalar scalar() const { return alpha_; }
+  const Inner& inner() const { return inner_; }
+
+ private:
+  Scalar alpha_;
+  const Inner& inner_;
+};
+
+// ---- GemmExpr: lhs * rhs -> cublasXgemm ------------------------------------
+// Returned by operator*(lhs_expr, rhs_expr). Dispatches to cuBLAS GEMM.
+
+template <typename Lhs, typename Rhs>
+class GemmExpr {
+ public:
+  using Scalar = typename internal::device_expr_traits<Lhs>::scalar_type;
+  static_assert(std::is_same<Scalar, typename internal::device_expr_traits<Rhs>::scalar_type>::value,
+                "DeviceMatrix GEMM: LHS and RHS must have the same scalar type");
+
+  GemmExpr(const Lhs& lhs, const Rhs& rhs) : lhs_(lhs), rhs_(rhs) {}
+  const Lhs& lhs() const { return lhs_; }
+  const Rhs& rhs() const { return rhs_; }
+
+ private:
+  // Stored by reference — like Eigen's CPU expression templates, these must
+  // not be captured with auto (the references will dangle). Use .eval() or
+  // assign to a DeviceMatrix immediately.
+  const Lhs& lhs_;
+  const Rhs& rhs_;
+};
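+
+// Lifetime hazard (illustration):
+//   auto e = (Scalar(2) * d_A) * d_B;  // BAD: e keeps a reference to the
+//                                      // destroyed Scaled temporary
+//   d_C = (Scalar(2) * d_A) * d_B;     // OK: consumed within the expression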
+
+// ---- Free operator* overloads that produce GemmExpr -------------------------
+// Defined after device_expr_traits so it can accept any supported view pair.
+
+// ---- Scalar * Matrix / View -> Scaled ---------------------------------------
+
+template <typename S>
+Scaled<DeviceMatrix<S>> operator*(S alpha, const DeviceMatrix<S>& m) {
+  return {alpha, m};
+}
+
+template <typename S>
+Scaled<AdjointView<S>> operator*(S alpha, const AdjointView<S>& m) {
+  return {alpha, m};
+}
+
+template <typename S>
+Scaled<TransposeView<S>> operator*(S alpha, const TransposeView<S>& m) {
+  return {alpha, m};
+}
+
+namespace internal {
+
+// ---- Traits: extract operation info from expression types -------------------
+
+// Fallback: an arbitrary type is not a device expression.
+template <typename T>
+struct device_expr_traits {
+  static constexpr bool is_device_expr = false;
+};
+
+template <typename Scalar>
+struct device_expr_traits<DeviceMatrix<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::NoTrans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const DeviceMatrix<Scalar>& x) { return x; }
+  static Scalar alpha(const DeviceMatrix<Scalar>&) { return Scalar(1); }
+};
+
+template <typename Scalar>
+struct device_expr_traits<AdjointView<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::ConjTrans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const AdjointView<Scalar>& x) { return x.matrix(); }
+  static Scalar alpha(const AdjointView<Scalar>&) { return Scalar(1); }
+};
+
+template <typename Scalar>
+struct device_expr_traits<TransposeView<Scalar>> {
+  using scalar_type = Scalar;
+  static constexpr GpuOp op = GpuOp::Trans;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<Scalar>& matrix(const TransposeView<Scalar>& x) { return x.matrix(); }
+  static Scalar alpha(const TransposeView<Scalar>&) { return Scalar(1); }
+};
+
+template <typename Inner>
+struct device_expr_traits<Scaled<Inner>> {
+  using scalar_type = typename device_expr_traits<Inner>::scalar_type;
+  static constexpr GpuOp op = device_expr_traits<Inner>::op;
+  static constexpr bool is_device_expr = true;
+  static const DeviceMatrix<scalar_type>& matrix(const Scaled<Inner>& x) {
+    return device_expr_traits<Inner>::matrix(x.inner());
+  }
+  static scalar_type alpha(const Scaled<Inner>& x) { return x.scalar() * device_expr_traits<Inner>::alpha(x.inner()); }
+};
+
+}  // namespace internal
+
+template <typename Lhs, typename Rhs,
+          typename std::enable_if<internal::device_expr_traits<Lhs>::is_device_expr &&
+                                      internal::device_expr_traits<Rhs>::is_device_expr,
+                                  int>::type = 0>
+GemmExpr<Lhs, Rhs> operator*(const Lhs& a, const Rhs& b) {
+  return {a, b};
+}
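+
+// Illustrative sketch: how assignment-side code (DeviceDispatch.h) can unpack
+// an expression through device_expr_traits. For expr = 2.0 * d_A.adjoint():
+//   using T = internal::device_expr_traits<Scaled<AdjointView<double>>>;
+//   // T::op           == GpuOp::ConjTrans
+//   // T::alpha(expr)  == 2.0
+//   // T::matrix(expr) returns the underlying DeviceMatrix d_A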
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_EXPR_H
diff --git a/unsupported/Eigen/src/GPU/DeviceMatrix.h b/unsupported/Eigen/src/GPU/DeviceMatrix.h
new file mode 100644
index 0000000..2e15c41
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/DeviceMatrix.h
@@ -0,0 +1,538 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Typed RAII wrapper for a dense matrix in GPU device memory.
+//
+// gpu::DeviceMatrix<Scalar> holds a column-major matrix on the GPU with tracked
+// dimensions and leading dimension. It can be passed to GPU solvers
+// (gpu::LLT, gpu::LU, cuBLAS expressions) without host round-trips.
+//
+// Cross-stream safety is automatic: an internal CUDA event tracks when the
+// last write completed. Consumers on a different stream wait on that event
+// before reading.
+//
+// Usage:
+//   auto d_A = gpu::DeviceMatrix<double>::fromHost(A);     // upload (sync)
+//   gpu::LLT<double> llt;
+//   llt.compute(d_A);                                // factor on device
+//   auto d_X = llt.solve(d_B);                       // async, no sync
+//   MatrixXd X = d_X.toHost();                       // download + block
+//
+// Async variants:
+//   auto d_A = gpu::DeviceMatrix<double>::fromHostAsync(A.data(), n, n, n, stream);
+//   auto transfer = d_X.toHostAsync(stream);         // enqueue D2H
+//   // ... overlap with other work ...
+//   MatrixXd X = transfer.get();                     // block + retrieve
+
+#ifndef EIGEN_GPU_DEVICE_MATRIX_H
+#define EIGEN_GPU_DEVICE_MATRIX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <cstring>
+
+#include "./GpuSupport.h"
+
+namespace Eigen {
+namespace gpu {
+
+// Forward declarations.
+template <typename, int>
+class LLT;
+template <typename>
+class LU;
+template <typename>
+class AdjointView;
+template <typename>
+class TransposeView;
+template <typename>
+class Assignment;
+template <typename, typename>
+class GemmExpr;
+template <typename, int>
+class LltSolveExpr;
+template <typename>
+class LuSolveExpr;
+template <typename, int>
+class LLTView;
+template <typename>
+class LUView;
+template <typename, int>
+class TriangularView;
+template <typename, int>
+class SelfAdjointView;
+template <typename, int>
+class ConstSelfAdjointView;
+template <typename, int>
+class TrsmExpr;
+template <typename, int>
+class SymmExpr;
+template <typename, int>
+class SyrkExpr;
+class Context;
+
+// --------------------------------------------------------------------------
+// HostTransfer — future-like wrapper for an async device-to-host transfer.
+// --------------------------------------------------------------------------
+
+/** \ingroup GPU_Module
+ * \class HostTransfer
+ * \brief Future for an asynchronous device-to-host matrix transfer.
+ *
+ * Returned by gpu::DeviceMatrix::toHostAsync(). The transfer runs asynchronously
+ * on the given CUDA stream. Call get() to block until complete and retrieve
+ * the host matrix, or ready() to poll without blocking.
+ */
+template <typename Scalar_>
+class HostTransfer {
+ public:
+  using Scalar = Scalar_;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  /** Block until the transfer completes and return the host matrix.
+   * Idempotent: subsequent calls return the same matrix without re-syncing.
+   * On first call, copies from pinned staging buffer into a regular matrix. */
+  PlainMatrix& get() {
+    if (!synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventSynchronize(event_));
+      // Copy from pinned staging buffer into the regular (pageable) host matrix.
+      if (pinned_buf_ && host_buf_.size() > 0) {
+        std::memcpy(host_buf_.data(), pinned_buf_.get(), static_cast<size_t>(host_buf_.size()) * sizeof(Scalar));
+      }
+      pinned_buf_ = internal::PinnedHostBuffer();  // free pinned memory early
+      synced_ = true;
+    }
+    return host_buf_;
+  }
+
+  /** Non-blocking check: has the transfer completed? */
+  bool ready() const {
+    if (synced_) return true;
+    cudaError_t err = cudaEventQuery(event_);
+    if (err == cudaSuccess) return true;
+    eigen_assert(err == cudaErrorNotReady && "cudaEventQuery failed");
+    return false;
+  }
+
+  ~HostTransfer() {
+    if (event_) (void)cudaEventDestroy(event_);
+  }
+
+  HostTransfer(HostTransfer&& o) noexcept
+      : host_buf_(std::move(o.host_buf_)), pinned_buf_(std::move(o.pinned_buf_)), event_(o.event_), synced_(o.synced_) {
+    o.event_ = nullptr;
+    o.synced_ = true;
+  }
+
+  HostTransfer& operator=(HostTransfer&& o) noexcept {
+    if (this != &o) {
+      if (event_) (void)cudaEventDestroy(event_);
+      host_buf_ = std::move(o.host_buf_);
+      pinned_buf_ = std::move(o.pinned_buf_);
+      event_ = o.event_;
+      synced_ = o.synced_;
+      o.event_ = nullptr;
+      o.synced_ = true;
+    }
+    return *this;
+  }
+
+  HostTransfer(const HostTransfer&) = delete;
+  HostTransfer& operator=(const HostTransfer&) = delete;
+
+ private:
+  template <typename>
+  friend class DeviceMatrix;
+
+  HostTransfer(PlainMatrix&& buf, internal::PinnedHostBuffer&& pinned, cudaEvent_t event)
+      : host_buf_(std::move(buf)), pinned_buf_(std::move(pinned)), event_(event), synced_(false) {}
+
+  PlainMatrix host_buf_;                   // final destination (pageable)
+  internal::PinnedHostBuffer pinned_buf_;  // staging buffer for async DMA
+  cudaEvent_t event_ = nullptr;
+  bool synced_ = false;
+};
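+
+// Illustrative usage sketch (do_other_host_work is a hypothetical helper):
+//   gpu::HostTransfer<double> t = d_X.toHostAsync(stream);
+//   while (!t.ready()) do_other_host_work();   // overlap with the transfer
+//   Eigen::MatrixXd X = t.get();               // no further blocking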
+
+// --------------------------------------------------------------------------
+// Matrix — typed RAII wrapper for a dense matrix in device memory.
+// --------------------------------------------------------------------------
+
+/** \ingroup GPU_Module
+ * \class DeviceMatrix
+ * \brief RAII wrapper for a dense column-major matrix in GPU device memory.
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ *
+ * Owns a device allocation with tracked dimensions and leading dimension.
+ * An internal CUDA event records when the data was last written, enabling
+ * safe cross-stream consumption without user-visible synchronization.
+ *
+ * Each method has a synchronous and an asynchronous variant:
+ *  - fromHost() / fromHostAsync(): upload from host
+ *  - toHost() / toHostAsync(): download to host
+ */
+template <typename Scalar_>
+class DeviceMatrix {
+ public:
+  using Scalar = Scalar_;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  // ---- Construction / destruction ------------------------------------------
+
+  /** Default: empty (0x0, no allocation). */
+  DeviceMatrix() = default;
+
+  /** Allocate uninitialized device memory for a rows x cols matrix. */
+  DeviceMatrix(Index rows, Index cols) : rows_(rows), cols_(cols), outerStride_(rows) {
+    eigen_assert(rows >= 0 && cols >= 0);
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      void* p = nullptr;
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
+      data_.reset(static_cast<Scalar*>(p));
+    }
+  }
+
+  ~DeviceMatrix() {
+    // cudaEventDestroy on a pending event is non-blocking: the runtime defers
+    // teardown until the event completes. The trailing cudaFree() (run by
+    // data_'s deleter during member destruction) is itself synchronous, so the
+    // buffer outlives any in-flight kernel that may still be touching it.
+    if (ready_event_) (void)cudaEventDestroy(ready_event_);
+  }
+
+  // ---- Move-only -----------------------------------------------------------
+
+  DeviceMatrix(DeviceMatrix&& o) noexcept
+      : data_(std::move(o.data_)),
+        rows_(o.rows_),
+        cols_(o.cols_),
+        outerStride_(o.outerStride_),
+        ready_event_(o.ready_event_),
+        ready_stream_(o.ready_stream_),
+        retained_buffer_(std::move(o.retained_buffer_)) {
+    o.rows_ = 0;
+    o.cols_ = 0;
+    o.outerStride_ = 0;
+    o.ready_event_ = nullptr;
+    o.ready_stream_ = nullptr;
+  }
+
+  DeviceMatrix& operator=(DeviceMatrix&& o) noexcept {
+    if (this != &o) {
+      if (ready_event_) (void)cudaEventDestroy(ready_event_);
+      data_ = std::move(o.data_);
+      rows_ = o.rows_;
+      cols_ = o.cols_;
+      outerStride_ = o.outerStride_;
+      ready_event_ = o.ready_event_;
+      ready_stream_ = o.ready_stream_;
+      retained_buffer_ = std::move(o.retained_buffer_);
+      o.rows_ = 0;
+      o.cols_ = 0;
+      o.outerStride_ = 0;
+      o.ready_event_ = nullptr;
+      o.ready_stream_ = nullptr;
+    }
+    return *this;
+  }
+
+  DeviceMatrix(const DeviceMatrix&) = delete;
+  DeviceMatrix& operator=(const DeviceMatrix&) = delete;
+
+  // ---- Upload from host ----------------------------------------------------
+
+  /** Upload a host Eigen matrix to device memory (synchronous).
+   *
+   * Evaluates the expression into a contiguous ColMajor temporary, copies to
+   * device via cudaMemcpyAsync on \p stream, and synchronizes before returning.
+   *
+   * \param host   Any Eigen matrix expression.
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  template <typename Derived>
+  static DeviceMatrix fromHost(const MatrixBase<Derived>& host, cudaStream_t stream = nullptr) {
+    const PlainMatrix mat(host.derived());
+    DeviceMatrix dm(mat.rows(), mat.cols());
+    if (dm.sizeInBytes() > 0) {
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(dm.data_.get(), mat.data(), dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
+    }
+    return dm;
+  }
+
+  /** Upload from a raw host pointer to device memory (asynchronous).
+   *
+   * Enqueues an async H2D copy on \p stream and records an internal event.
+   * The caller must keep \p host_data alive until the transfer completes
+   * (check via the internal event or synchronize the stream).
+   *
+   * \param host_data    Pointer to contiguous column-major host data.
+   * \param rows         Number of rows.
+   * \param cols         Number of columns.
+   * \param outerStride  Leading dimension (>= rows). Use rows for dense.
+   * \param stream       CUDA stream for the transfer.
+   */
+  static DeviceMatrix fromHostAsync(const Scalar* host_data, Index rows, Index cols, Index outerStride,
+                                    cudaStream_t stream) {
+    eigen_assert(rows >= 0 && cols >= 0 && outerStride >= rows);
+    eigen_assert(host_data != nullptr || (rows == 0 || cols == 0));
+    DeviceMatrix dm(rows, cols);
+    if (dm.sizeInBytes() > 0) {
+      // If outerStride == rows (dense), single contiguous copy.
+      // Otherwise, copy column by column (strided layout).
+      if (outerStride == rows) {
+        EIGEN_CUDA_RUNTIME_CHECK(
+            cudaMemcpyAsync(dm.data_.get(), host_data, dm.sizeInBytes(), cudaMemcpyHostToDevice, stream));
+      } else {
+        EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpy2DAsync(dm.data_.get(), static_cast<size_t>(rows) * sizeof(Scalar),
+                                                   host_data, static_cast<size_t>(outerStride) * sizeof(Scalar),
+                                                   static_cast<size_t>(rows) * sizeof(Scalar),
+                                                   static_cast<size_t>(cols), cudaMemcpyHostToDevice, stream));
+      }
+      dm.recordReady(stream);
+    }
+    return dm;
+  }
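+
+  // Illustrative lifetime sketch (`host` is a hypothetical pageable matrix):
+  //   auto d_A = DeviceMatrix<double>::fromHostAsync(host.data(), n, n,
+  //                                                  host.outerStride(), s);
+  //   ...                                    // `host` must stay alive here
+  //   EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(s));
+  //   // only now is it safe to free or modify `host`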
+
+  // ---- Download to host ----------------------------------------------------
+
+  /** Download device matrix to host memory (synchronous).
+   *
+   * Waits on the internal ready event, enqueues a D2H copy on \p stream,
+   * synchronizes, and returns the host matrix directly.
+   *
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  PlainMatrix toHost(cudaStream_t stream = nullptr) const {
+    PlainMatrix host_buf(rows_, cols_);
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(host_buf.data(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream));
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream));
+    }
+    return host_buf;
+  }
+
+  /** Enqueue an async device-to-host transfer and return a future.
+   *
+   * Waits on the internal ready event (if any) to ensure the device data is
+   * valid, then enqueues the D2H copy on \p stream. Returns a HostTransfer
+   * future; call .get() to block and retrieve the host matrix.
+   *
+   * \param stream CUDA stream for the transfer (default: stream 0).
+   */
+  HostTransfer<Scalar> toHostAsync(cudaStream_t stream = nullptr) const {
+    PlainMatrix host_buf(rows_, cols_);
+    internal::PinnedHostBuffer pinned_buf(sizeInBytes());
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      // DMA into pinned staging buffer for truly async transfer.
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(pinned_buf.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToHost, stream));
+    }
+    // Record a transfer-complete event.
+    cudaEvent_t transfer_event;
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&transfer_event, cudaEventDisableTiming));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(transfer_event, stream));
+    return HostTransfer<Scalar>(std::move(host_buf), std::move(pinned_buf), transfer_event);
+  }
+
+  // ---- Device-to-device copy -----------------------------------------------
+
+  /** Deep copy on device. Fully async — records event on the result, no sync.
+   *
+   * \param stream CUDA stream for the D2D copy (default: stream 0).
+   */
+  DeviceMatrix clone(cudaStream_t stream = nullptr) const {
+    DeviceMatrix result(rows_, cols_);
+    if (sizeInBytes() > 0) {
+      waitReady(stream);
+      EIGEN_CUDA_RUNTIME_CHECK(
+          cudaMemcpyAsync(result.data_.get(), data_.get(), sizeInBytes(), cudaMemcpyDeviceToDevice, stream));
+      result.recordReady(stream);
+    }
+    return result;
+  }
+
+  // ---- Resize (destructive) ------------------------------------------------
+
+  /** Discard contents and reallocate to (rows x cols). No-op when the size is
+   * unchanged; on reallocation the ready event is cleared. */
+  void resize(Index rows, Index cols) {
+    eigen_assert(rows >= 0 && cols >= 0);
+    if (rows == rows_ && cols == cols_) return;
+    data_.reset();
+    if (ready_event_) {
+      (void)cudaEventDestroy(ready_event_);
+      ready_event_ = nullptr;
+    }
+    ready_stream_ = nullptr;
+    retained_buffer_ = internal::DeviceBuffer();
+    rows_ = rows;
+    cols_ = cols;
+    outerStride_ = rows;
+    size_t bytes = sizeInBytes();
+    if (bytes > 0) {
+      void* p = nullptr;
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
+      data_.reset(static_cast<Scalar*>(p));
+    }
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  Scalar* data() { return data_.get(); }
+  const Scalar* data() const { return data_.get(); }
+  Index rows() const { return rows_; }
+  Index cols() const { return cols_; }
+  Index outerStride() const { return outerStride_; }
+  bool empty() const { return rows_ == 0 || cols_ == 0; }
+
+  /** Size of the device allocation in bytes. */
+  size_t sizeInBytes() const { return static_cast<size_t>(outerStride_) * static_cast<size_t>(cols_) * sizeof(Scalar); }
+
+  // ---- Event synchronization (public for library dispatch interop) ---------
+
+  /** Record that device data is ready after work on \p stream. */
+  void recordReady(cudaStream_t stream) {
+    ensureEvent();
+    EIGEN_CUDA_RUNTIME_CHECK(cudaEventRecord(ready_event_, stream));
+    ready_stream_ = stream;
+  }
+
+  /** Make \p stream wait until the device data is ready.
+   * No-op if no event recorded, or if the consumer stream is the same as the
+   * producer stream (CUDA guarantees in-order execution within a stream). */
+  void waitReady(cudaStream_t stream) const {
+    if (ready_event_ && stream != ready_stream_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamWaitEvent(stream, ready_event_, 0));
+    }
+  }
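+
+  // Illustrative interop sketch (my_kernel is a hypothetical user kernel
+  // launched on stream s):
+  //   d_A.waitReady(s);    // order after d_A's last writer
+  //   my_kernel<<<grid, block, 0, s>>>(d_A.data(), d_A.rows(), d_A.cols());
+  //   d_A.recordReady(s);  // publish the write for later consumers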
+
+  // ---- Expression methods (dispatch to cuBLAS/cuSOLVER) --------------------
+
+  /** Adjoint view for GEMM dispatch. Maps to cublasXgemm with ConjTrans. */
+  AdjointView<Scalar> adjoint() const { return AdjointView<Scalar>(*this); }
+
+  /** Transpose view for GEMM dispatch. Maps to cublasXgemm with Trans. */
+  TransposeView<Scalar> transpose() const { return TransposeView<Scalar>(*this); }
+
+  /** Bind this matrix to a Context for expression assignment.
+   * Returns an Assignment proxy: `d_C.device(ctx) = d_A * d_B;` */
+  Assignment<Scalar> device(Context& ctx) { return Assignment<Scalar>(*this, ctx); }
+
+  /** Assign from a GEMM expression using the thread-local default Context.
+   * Defined out-of-line after Context is fully declared (see DeviceDispatch.h). */
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix& operator=(const GemmExpr<Lhs, Rhs>& expr);
+
+  /** Accumulate from a GEMM expression using the thread-local default Context. */
+  template <typename Lhs, typename Rhs>
+  DeviceMatrix& operator+=(const GemmExpr<Lhs, Rhs>& expr);
+
+  /** Cholesky view: d_A.llt().solve(d_B) → LltSolveExpr. */
+  LLTView<Scalar, Lower> llt() const { return LLTView<Scalar, Lower>(*this); }
+
+  /** Cholesky view with explicit triangle: d_A.llt<Upper>().solve(d_B). */
+  template <int UpLo>
+  LLTView<Scalar, UpLo> llt() const {
+    return LLTView<Scalar, UpLo>(*this);
+  }
+
+  /** LU view: d_A.lu().solve(d_B) → LuSolveExpr. */
+  LUView<Scalar> lu() const { return LUView<Scalar>(*this); }
+
+  /** Assign from an LLT solve expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const LltSolveExpr<Scalar, UpLo>& expr);
+
+  /** Assign from an LU solve expression (thread-local default context). */
+  DeviceMatrix& operator=(const LuSolveExpr<Scalar>& expr);
+
+  /** Triangular view: d_A.triangularView<Lower>().solve(d_B) → TrsmExpr. */
+  template <int UpLo>
+  TriangularView<Scalar, UpLo> triangularView() const {
+    return TriangularView<Scalar, UpLo>(*this);
+  }
+
+  /** Self-adjoint view (mutable): d_C.selfadjointView<Lower>().rankUpdate(d_A). */
+  template <int UpLo>
+  SelfAdjointView<Scalar, UpLo> selfadjointView() {
+    return SelfAdjointView<Scalar, UpLo>(*this);
+  }
+
+  /** Self-adjoint view (const): d_A.selfadjointView<Lower>() * d_B → SymmExpr. */
+  template <int UpLo>
+  ConstSelfAdjointView<Scalar, UpLo> selfadjointView() const {
+    return ConstSelfAdjointView<Scalar, UpLo>(*this);
+  }
+
+  /** Assign from a TRSM expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const TrsmExpr<Scalar, UpLo>& expr);
+
+  /** Assign from a SYMM expression (thread-local default context). */
+  template <int UpLo>
+  DeviceMatrix& operator=(const SymmExpr<Scalar, UpLo>& expr);
+
+  // ---- Ownership transfer ---------------------------------------------------
+
+  /** Adopt an existing device pointer. Caller relinquishes ownership; the
+   * pointer must come from cudaMalloc, as it will be freed with cudaFree. */
+  static DeviceMatrix adopt(Scalar* device_ptr, Index rows, Index cols) {
+    DeviceMatrix dm;
+    dm.data_.reset(device_ptr);
+    dm.rows_ = rows;
+    dm.cols_ = cols;
+    dm.outerStride_ = rows;
+    return dm;
+  }
+
+  /** Transfer ownership of the device pointer out. Zeros internal state. */
+  Scalar* release() {
+    Scalar* p = data_.release();
+    rows_ = 0;
+    cols_ = 0;
+    outerStride_ = 0;
+    if (ready_event_) {
+      (void)cudaEventDestroy(ready_event_);
+      ready_event_ = nullptr;
+    }
+    ready_stream_ = nullptr;
+    return p;
+  }
+
+ private:
+  // ---- Private helpers -------------------------------------------------------
+
+  void ensureEvent() {
+    if (!ready_event_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaEventCreateWithFlags(&ready_event_, cudaEventDisableTiming));
+    }
+  }
+
+  void retainBuffer(internal::DeviceBuffer&& buffer) { retained_buffer_ = std::move(buffer); }
+
+  // ---- Data members --------------------------------------------------------
+
+  std::unique_ptr<Scalar, internal::CudaFreeDeleter> data_;
+  Index rows_ = 0;
+  Index cols_ = 0;
+  Index outerStride_ = 0;
+  cudaEvent_t ready_event_ = nullptr;       // internal: tracks last write completion
+  cudaStream_t ready_stream_ = nullptr;     // stream that recorded ready_event_ (for same-stream skip)
+  internal::DeviceBuffer retained_buffer_;  // internal: keeps async aux buffers alive
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_MATRIX_H
diff --git a/unsupported/Eigen/src/GPU/DeviceSolverExpr.h b/unsupported/Eigen/src/GPU/DeviceSolverExpr.h
new file mode 100644
index 0000000..a71a998
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/DeviceSolverExpr.h
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Solver expression types for gpu::DeviceMatrix.
+//
+// Each expression maps 1:1 to cuSOLVER library calls:
+//   LltSolveExpr  -> cusolverDnXpotrf + cusolverDnXpotrs
+//   LuSolveExpr   -> cusolverDnXgetrf + cusolverDnXgetrs
+//
+// Usage:
+//   d_X = d_A.llt().solve(d_B);              // Cholesky solve
+//   d_X.device(ctx) = d_A.lu().solve(d_B);   // LU solve on explicit stream
+
+#ifndef EIGEN_GPU_DEVICE_SOLVER_EXPR_H
+#define EIGEN_GPU_DEVICE_SOLVER_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <functional>
+
+namespace Eigen {
+namespace gpu {
+
+// Forward declarations.
+template <typename Scalar_>
+class DeviceMatrix;
+class Context;
+
+// ---- LLT solve expression ---------------------------------------------------
+// d_A.llt().solve(d_B) -> LltSolveExpr -> cusolverDnXpotrf + cusolverDnXpotrs
+
+template <typename Scalar_, int UpLo_ = Lower>
+class LltSolveExpr {
+ public:
+  using Scalar = Scalar_;
+  static constexpr int UpLo = UpLo_;
+
+  LltSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> A_;
+  std::reference_wrapper<const DeviceMatrix<Scalar>> B_;
+};
+
+// ---- LU solve expression ----------------------------------------------------
+// d_A.lu().solve(d_B) -> LuSolveExpr -> cusolverDnXgetrf + cusolverDnXgetrs
+
+template <typename Scalar_>
+class LuSolveExpr {
+ public:
+  using Scalar = Scalar_;
+
+  LuSolveExpr(const DeviceMatrix<Scalar>& A, const DeviceMatrix<Scalar>& B) : A_(A), B_(B) {}
+  const DeviceMatrix<Scalar>& matrix() const { return A_; }
+  const DeviceMatrix<Scalar>& rhs() const { return B_; }
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> A_;
+  std::reference_wrapper<const DeviceMatrix<Scalar>> B_;
+};
+
+// ---- LLTView: d_A.llt() -> one-shot view exposing .solve() ------------------
+
+template <typename Scalar_, int UpLo_ = Lower>
+class LLTView {
+ public:
+  using Scalar = Scalar_;
+
+  explicit LLTView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+
+  /** Build a solve expression: d_A.llt().solve(d_B).
+   * The expression is evaluated when assigned to a gpu::DeviceMatrix. */
+  LltSolveExpr<Scalar, UpLo_> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+  // For cached factorizations, use the explicit gpu::LLT API directly:
+  //   gpu::LLT<double> llt;
+  //   llt.compute(d_A);
+  //   auto d_X1 = llt.solve(d_B1);
+  //   auto d_X2 = llt.solve(d_B2);
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> mat_;
+};
+
+// ---- LUView: d_A.lu() -> one-shot view exposing .solve() --------------------
+
+template <typename Scalar_>
+class LUView {
+ public:
+  using Scalar = Scalar_;
+
+  explicit LUView(const DeviceMatrix<Scalar>& m) : mat_(m) {}
+
+  /** Build a solve expression: d_A.lu().solve(d_B). */
+  LuSolveExpr<Scalar> solve(const DeviceMatrix<Scalar>& rhs) const { return {mat_, rhs}; }
+
+  // For cached factorizations, use the explicit gpu::LU API directly:
+  //   gpu::LU<double> lu;
+  //   lu.compute(d_A);
+  //   auto d_X1 = lu.solve(d_B1);
+  //   auto d_X2 = lu.solve(d_B2);
+
+ private:
+  std::reference_wrapper<const DeviceMatrix<Scalar>> mat_;
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_DEVICE_SOLVER_EXPR_H
diff --git a/unsupported/Eigen/src/GPU/GpuContext.h b/unsupported/Eigen/src/GPU/GpuContext.h
new file mode 100644
index 0000000..bff4053
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuContext.h
@@ -0,0 +1,95 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Unified GPU execution context.
+//
+// gpu::Context owns a CUDA stream and NVIDIA library handles (cuBLAS,
+// cuSOLVER). It is the entry point for all GPU
+// operations on gpu::DeviceMatrix.
+//
+// Usage:
+//   gpu::Context ctx;                        // explicit context
+//   d_C.device(ctx) = d_A * d_B;            // GEMM on ctx's stream
+//
+//   d_C = d_A * d_B;                        // thread-local default context
+//   gpu::Context& ctx = gpu::Context::threadLocal();
+
+#ifndef EIGEN_GPU_CONTEXT_H
+#define EIGEN_GPU_CONTEXT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuBlasSupport.h"
+#include "./CuSolverSupport.h"
+
+namespace Eigen {
+namespace gpu {
+
+/** \ingroup GPU_Module
+ * \class Context
+ * \brief Unified GPU execution context owning a CUDA stream and library handles.
+ *
+ * Each Context instance creates a dedicated CUDA stream, a cuBLAS handle,
+ * and a cuSOLVER handle, all bound to that stream. Multiple contexts enable
+ * concurrent execution on independent streams.
+ *
+ * A lazily-created thread-local default is available via threadLocal() for
+ * simple single-stream usage.
+ */
+class Context {
+ public:
+  Context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUBLAS_CHECK(cublasCreate(&cublas_));
+    EIGEN_CUBLAS_CHECK(cublasSetStream(cublas_, stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&cusolver_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(cusolver_, stream_));
+  }
+
+  ~Context() {
+    if (cusolver_) (void)cusolverDnDestroy(cusolver_);
+    if (cublas_) (void)cublasDestroy(cublas_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  // Non-copyable, non-movable (owns library handles).
+  Context(const Context&) = delete;
+  Context& operator=(const Context&) = delete;
+  Context(Context&&) = delete;
+  Context& operator=(Context&&) = delete;
+
+  /** Lazily-created thread-local default context.
+   *
+   * \note The thread-local instance is destroyed when the thread exits (or at
+   * static destruction time for the main thread). On some CUDA driver
+   * configurations this may print "CUDA_ERROR_DEINITIALIZED" to stderr if the
+   * CUDA context has already been torn down. These errors are harmless and are
+   * suppressed in the destructor, but they can produce noise in test output.
+   * To avoid this, call cudaDeviceReset() only after all Context instances
+   * (including thread-local ones) have been destroyed. */
+  static Context& threadLocal() {
+    thread_local Context ctx;
+    return ctx;
+  }
+
+  cudaStream_t stream() const { return stream_; }
+  cublasHandle_t cublasHandle() const { return cublas_; }
+  cusolverDnHandle_t cusolverHandle() const { return cusolver_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cublasHandle_t cublas_ = nullptr;
+  cusolverDnHandle_t cusolver_ = nullptr;
+};
+
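+// Illustrative concurrency sketch: independent contexts own independent
+// streams, so unrelated operations can overlap (operands assumed preloaded):
+//   gpu::Context ctx1, ctx2;
+//   d_C1.device(ctx1) = d_A1 * d_B1;   // enqueued on ctx1's stream
+//   d_C2.device(ctx2) = d_A2 * d_B2;   // enqueued on ctx2's stream
+//   Eigen::MatrixXd C1 = d_C1.toHost(ctx1.stream());
+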
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_CONTEXT_H
diff --git a/unsupported/Eigen/src/GPU/GpuLLT.h b/unsupported/Eigen/src/GPU/GpuLLT.h
new file mode 100644
index 0000000..8ca3dbd
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuLLT.h
@@ -0,0 +1,403 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU Cholesky (LLT) decomposition using cuSOLVER.
+//
+// Unlike Eigen's CPU LLT<MatrixType>, gpu::LLT keeps the factored Cholesky
+// factor in device memory for the lifetime of the object. Multiple solves
+// against the same factor therefore only transfer the RHS and solution
+// vectors, not the factor itself.
+//
+// Requires CUDA 11.0+ (cusolverDnXpotrf / cusolverDnXpotrs generic API).
+//
+// Usage:
+//   gpu::LLT<double> llt(A);            // upload A, potrf, L stays on device
+//   if (llt.info() != Success) { ... }
+//   MatrixXd x1 = llt.solve(b1);        // potrs, only b1 transferred
+//   MatrixXd x2 = llt.solve(b2);        // L already on device
+
+#ifndef EIGEN_GPU_LLT_H
+#define EIGEN_GPU_LLT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuSolverSupport.h"
+#include <vector>
+
+namespace Eigen {
+namespace gpu {
+
+/** \ingroup GPU_Module
+ * \class LLT
+ * \brief GPU Cholesky (LL^T) decomposition via cuSOLVER
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ * \tparam UpLo_    Triangle used: Lower (default) or Upper
+ *
+ * Factorizes a symmetric positive-definite matrix A = LL^H on the GPU and
+ * caches the factor L in device memory. Each subsequent solve(B) uploads only
+ * B, calls cusolverDnXpotrs, and downloads the result — the factor is not
+ * re-transferred.
+ *
+ * Each LLT object owns a dedicated CUDA stream and cuSOLVER handle,
+ * enabling concurrent factorizations from multiple objects on the same host
+ * thread.
+ */
+template <typename Scalar_, int UpLo_ = Lower>
+class LLT {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  static constexpr int UpLo = UpLo_;
+
+  // ---- Construction / destruction ------------------------------------------
+
+  /** Default constructor. Does not factorize; call compute() before solve(). */
+  LLT() { init_context(); }
+
+  /** Factor A immediately. Equivalent to LLT llt; llt.compute(A). */
+  template <typename InputType>
+  explicit LLT(const EigenBase<InputType>& A) : LLT() {
+    compute(A);
+  }
+
+  ~LLT() {
+    // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
+    // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
+    // noexcept dtor terminates the program. The trailing cudaFree() of the
+    // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  // Non-copyable (owns device memory and library handles).
+  LLT(const LLT&) = delete;
+  LLT& operator=(const LLT&) = delete;
+
+  // Movable.
+  LLT(LLT&& o) noexcept
+      : stream_(o.stream_),
+        handle_(o.handle_),
+        params_(std::move(o.params_)),
+        d_factor_(std::move(o.d_factor_)),
+        factor_alloc_size_(o.factor_alloc_size_),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        n_(o.n_),
+        lda_(o.lda_),
+        info_(o.info_),
+        pinned_info_(std::move(o.pinned_info_)),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.handle_ = nullptr;
+    o.factor_alloc_size_ = 0;
+    o.scratch_size_ = 0;
+    o.n_ = 0;
+    o.lda_ = 0;
+    o.info_ = InvalidInput;
+    o.info_synced_ = true;
+  }
+
+  LLT& operator=(LLT&& o) noexcept {
+    if (this != &o) {
+      // Drain pending work on the old stream before tearing it down so the
+      // upcoming move of d_factor_/d_scratch_ doesn't free buffers that an
+      // in-flight kernel is still touching. The asymmetric move-ctor doesn't
+      // need this — it constructs onto uninitialized storage.
+      if (stream_) (void)cudaStreamSynchronize(stream_);
+      if (handle_) (void)cusolverDnDestroy(handle_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      handle_ = o.handle_;
+      params_ = std::move(o.params_);
+      d_factor_ = std::move(o.d_factor_);
+      factor_alloc_size_ = o.factor_alloc_size_;
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      n_ = o.n_;
+      lda_ = o.lda_;
+      info_ = o.info_;
+      pinned_info_ = std::move(o.pinned_info_);
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.handle_ = nullptr;
+      o.factor_alloc_size_ = 0;
+      o.scratch_size_ = 0;
+      o.n_ = 0;
+      o.lda_ = 0;
+      o.info_ = InvalidInput;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  /** Compute the Cholesky factorization of A (host matrix).
+   *
+   * Uploads A to device memory, calls cusolverDnXpotrf, and retains the
+   * factored matrix on device. Any previous factorization is overwritten.
+   */
+  template <typename InputType>
+  LLT& compute(const EigenBase<InputType>& A) {
+    eigen_assert(A.rows() == A.cols());
+    if (!begin_compute(A.rows())) return *this;
+
+    // Evaluate A into a contiguous ColMajor matrix (handles arbitrary expressions).
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(mat.outerStride());
+    allocate_factor_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_factor_.get(), mat.data(), factorBytes(), cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the Cholesky factorization from a device-resident matrix (D2D copy). */
+  LLT& compute(const DeviceMatrix<Scalar>& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols());
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.outerStride());
+    d_A.waitReady(stream_);
+    allocate_factor_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_factor_.get(), d_A.data(), factorBytes(), cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the Cholesky factorization from a device matrix (move, no copy). */
+  LLT& compute(DeviceMatrix<Scalar>&& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols());
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.outerStride());
+    d_A.waitReady(stream_);
+    d_factor_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
+    factor_alloc_size_ = factorBytes();
+
+    factorize();
+    return *this;
+  }
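+
+  // Illustrative sketch: moving a device matrix into compute() adopts its
+  // buffer and factors in place, skipping the D2D copy:
+  //   auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
+  //   llt.compute(std::move(d_A));   // d_A is empty afterwards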
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve A * X = B using the cached Cholesky factor (host → host).
+   *
+   * Uploads B to device memory, calls cusolverDnXpotrs using the factor
+   * retained from compute(), and returns the solution X on the host.
+   * The factor is not re-transferred; only B goes up and X comes down.
+   *
+   * \pre compute() must have been called and info() == Success.
+   * \returns X such that A * X ≈ B
+   */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B) const {
+    const_cast<LLT*>(this)->sync_info();
+    eigen_assert(info_ == Success && "LLT::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == n_);
+
+    const PlainMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x.get(), rhs.data(), rhsBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+    DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, std::move(d_x));
+
+    PlainMatrix X(n_, B.cols());
+    int solve_info = 0;
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(X.data(), d_X.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    eigen_assert(solve_info == 0 && "cusolverDnXpotrs reported an error");
+    return X;
+  }
+
+  /** Solve A * X = B with device-resident RHS. Fully async.
+   *
+   * All work is enqueued on this solver's stream. Returns a Matrix
+   * with a recorded ready event — no host synchronization occurs.
+   * The caller should check info() after compute() to verify the
+   * factorization succeeded; this method does not check.
+   */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B) const {
+    eigen_assert(d_B.rows() == n_);
+    d_B.waitReady(stream_);
+    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
+    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    internal::DeviceBuffer d_x(rhsBytes(nrhs, ldb));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x.get(), d_B.data(), rhsBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+    return solve_impl(nrhs, ldb, std::move(d_x));
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  /** Returns Success if the last compute() succeeded, NumericalIssue otherwise.
+   * Lazily synchronizes the stream on first call after compute(). */
+  ComputationInfo info() const {
+    const_cast<LLT*>(this)->sync_info();
+    return info_;
+  }
+
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+
+  /** Returns the CUDA stream owned by this object.
+   *  Advanced users may submit additional GPU work on this stream
+   *  to overlap with or chain after LLT operations. */
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
+  internal::DeviceBuffer d_factor_;   // factored L (or U) on device (grows, never shrinks)
+  size_t factor_alloc_size_ = 0;      // current d_factor_ allocation size
+  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
+  size_t scratch_size_ = 0;           // current scratch allocation size
+  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  int64_t n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
+  bool info_synced_ = true;                              // has the stream been synced for info?
+
+  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
+  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
+
+  bool begin_compute(Index rows) {
+    n_ = rows;
+    if (n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return false;
+    }
+    info_ = InvalidInput;
+    return true;
+  }
+
+  size_t factorBytes() const { return rhsBytes(n_, lda_); }
+
+  static size_t rhsBytes(int64_t cols, int64_t outer_stride) {
+    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  }
+
+  void allocate_factor_storage() {
+    size_t needed = factorBytes();
+    if (needed > factor_alloc_size_) {
+      d_factor_ = internal::DeviceBuffer(needed);
+      factor_alloc_size_ = needed;
+    }
+  }
+
+  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
+  // Workspace size is rounded up to 16 bytes so the info word lands aligned.
+  static constexpr size_t kInfoBytes = sizeof(int);
+  static constexpr size_t kScratchAlign = 16;
+
+  static size_t scratchBytesFor(size_t workspace_bytes) {
+    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
+    return workspace_bytes + kInfoBytes;
+  }
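+  // e.g. for a fresh allocation with dev_ws_bytes = 100: the workspace is
+  // rounded up to 112, scratch_size_ becomes 116, and scratch_info() then
+  // points at byte offset 112.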
+
+  // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
+  // Syncs the stream before reallocating to avoid freeing memory that
+  // async kernels may still be using.
+  void ensure_scratch(size_t bytes) {
+    if (bytes > scratch_size_) {
+      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(bytes);
+      scratch_size_ = bytes;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.get(); }
+  int* scratch_info() const {
+    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
+  }
+
+  // Solve in place on `d_x` (which already holds B), then re-wrap as a
+  // typed DeviceMatrix carrying shape and a ready event. The release/adopt
+  // hop just hands ownership of the raw cudaMalloc pointer from the untyped
+  // DeviceBuffer to the typed DeviceMatrix without copying.
+  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, internal::DeviceBuffer&& d_x) const {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrs(handle_, params_.p, uplo, n_, nrhs, dtype, d_factor_.get(), lda_, dtype,
+                                          d_x.get(), ldb, scratch_info()));
+
+    DeviceMatrix<Scalar> result =
+        DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
+    result.recordReady(stream_);
+    return result;
+  }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    ensure_scratch(kInfoBytes);
+  }
+
+  // Synchronize stream and interpret the info word. No-op if already synced.
+  void sync_info() {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      info_ = (info_word() == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  // Run cusolverDnXpotrf on d_factor_ (already on device).
+  // Enqueues factorization + async info download. Does NOT sync.
+  // Workspaces are stored as members to ensure they outlive the async kernels.
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    constexpr cublasFillMode_t uplo = internal::cusolver_fill_mode<UpLo_>::value;
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf_bufferSize(handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype,
+                                                     &dev_ws_bytes, &host_ws_bytes));
+
+    ensure_scratch(scratchBytesFor(dev_ws_bytes));
+    h_workspace_.resize(host_ws_bytes);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXpotrf(
+        handle_, params_.p, uplo, n_, dtype, d_factor_.get(), lda_, dtype, scratch_workspace(), dev_ws_bytes,
+        host_ws_bytes > 0 ? h_workspace_.data() : nullptr, host_ws_bytes, scratch_info()));
+
+    // Enqueue async download of info word — sync deferred to info() or solve().
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_LLT_H
diff --git a/unsupported/Eigen/src/GPU/GpuLU.h b/unsupported/Eigen/src/GPU/GpuLU.h
new file mode 100644
index 0000000..e4392bc
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuLU.h
@@ -0,0 +1,375 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// GPU partial-pivoting LU decomposition using cuSOLVER.
+//
+// Wraps cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve).
+// The factored LU matrix and pivot array are kept in device memory for the
+// lifetime of the object, so repeated solves only transfer the RHS/solution.
+//
+// Requires CUDA 11.0+ (cusolverDnX generic API).
+//
+// Usage:
+//   gpu::LU<double> lu(A);            // upload A, getrf, LU+ipiv on device
+//   if (lu.info() != Success) { ... }
+//   MatrixXd x = lu.solve(b);         // getrs NoTrans, only b transferred
+//   MatrixXd xt = lu.solve(b, gpu::GpuOp::Trans);             // A^T x = b
+
+#ifndef EIGEN_GPU_LU_H
+#define EIGEN_GPU_LU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "./CuBlasSupport.h"
+#include "./CuSolverSupport.h"
+#include <vector>
+
+namespace Eigen {
+namespace gpu {
+
+/** \ingroup GPU_Module
+ * \class LU
+ * \brief GPU LU decomposition with partial pivoting via cuSOLVER
+ *
+ * \tparam Scalar_  Element type: float, double, complex<float>, complex<double>
+ *
+ * Decomposes a square matrix A = P L U on the GPU and retains the factored
+ * matrix and pivot array in device memory. Solves A*X=B, A^T*X=B, or
+ * A^H*X=B by passing the appropriate gpu::GpuOp.
+ *
+ * Each LU object owns a dedicated CUDA stream and cuSOLVER handle.
+ */
+template <typename Scalar_>
+class LU {
+ public:
+  using Scalar = Scalar_;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  using PlainMatrix = Eigen::Matrix<Scalar, Dynamic, Dynamic, ColMajor>;
+
+  // ---- Construction / destruction ------------------------------------------
+
+  LU() { init_context(); }
+
+  template <typename InputType>
+  explicit LU(const EigenBase<InputType>& A) : LU() {
+    compute(A);
+  }
+
+  ~LU() {
+    // Ignore errors here: dtors are noexcept, and EIGEN_CUSOLVER_CHECK /
+    // EIGEN_CUDA_RUNTIME_CHECK are eigen_assert-based — firing one from a
+    // noexcept dtor terminates the program. The trailing cudaFree() of the
+    // device buffers (via internal::DeviceBuffer::~DeviceBuffer) is sync.
+    if (handle_) (void)cusolverDnDestroy(handle_);
+    if (stream_) (void)cudaStreamDestroy(stream_);
+  }
+
+  LU(const LU&) = delete;
+  LU& operator=(const LU&) = delete;
+
+  LU(LU&& o) noexcept
+      : stream_(o.stream_),
+        handle_(o.handle_),
+        params_(std::move(o.params_)),
+        d_lu_(std::move(o.d_lu_)),
+        lu_alloc_size_(o.lu_alloc_size_),
+        d_ipiv_(std::move(o.d_ipiv_)),
+        d_scratch_(std::move(o.d_scratch_)),
+        scratch_size_(o.scratch_size_),
+        h_workspace_(std::move(o.h_workspace_)),
+        n_(o.n_),
+        lda_(o.lda_),
+        info_(o.info_),
+        pinned_info_(std::move(o.pinned_info_)),
+        info_synced_(o.info_synced_) {
+    o.stream_ = nullptr;
+    o.handle_ = nullptr;
+    o.lu_alloc_size_ = 0;
+    o.scratch_size_ = 0;
+    o.n_ = 0;
+    o.lda_ = 0;
+    o.info_ = InvalidInput;
+    o.info_synced_ = true;
+  }
+
+  LU& operator=(LU&& o) noexcept {
+    if (this != &o) {
+      // Drain pending work on the old stream before tearing it down so the
+      // upcoming move of d_lu_/d_ipiv_/d_scratch_ doesn't free buffers that an
+      // in-flight kernel is still touching. The asymmetric move-ctor doesn't
+      // need this — it constructs onto uninitialized storage.
+      if (stream_) (void)cudaStreamSynchronize(stream_);
+      if (handle_) (void)cusolverDnDestroy(handle_);
+      if (stream_) (void)cudaStreamDestroy(stream_);
+      stream_ = o.stream_;
+      handle_ = o.handle_;
+      params_ = std::move(o.params_);
+      d_lu_ = std::move(o.d_lu_);
+      lu_alloc_size_ = o.lu_alloc_size_;
+      d_ipiv_ = std::move(o.d_ipiv_);
+      d_scratch_ = std::move(o.d_scratch_);
+      scratch_size_ = o.scratch_size_;
+      h_workspace_ = std::move(o.h_workspace_);
+      n_ = o.n_;
+      lda_ = o.lda_;
+      info_ = o.info_;
+      pinned_info_ = std::move(o.pinned_info_);
+      info_synced_ = o.info_synced_;
+      o.stream_ = nullptr;
+      o.handle_ = nullptr;
+      o.lu_alloc_size_ = 0;
+      o.scratch_size_ = 0;
+      o.n_ = 0;
+      o.lda_ = 0;
+      o.info_ = InvalidInput;
+      o.info_synced_ = true;
+    }
+    return *this;
+  }
+
+  // ---- Factorization -------------------------------------------------------
+
+  /** Compute the LU factorization of A (host matrix, must be square). */
+  template <typename InputType>
+  LU& compute(const EigenBase<InputType>& A) {
+    eigen_assert(A.rows() == A.cols() && "LU requires a square matrix");
+    if (!begin_compute(A.rows())) return *this;
+
+    const PlainMatrix mat(A.derived());
+    lda_ = static_cast<int64_t>(mat.outerStride());
+    allocate_lu_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(cudaMemcpyAsync(d_lu_.get(), mat.data(), matrixBytes(), cudaMemcpyHostToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the LU factorization from a device-resident matrix (D2D copy). */
+  LU& compute(const DeviceMatrix<Scalar>& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.outerStride());
+    d_A.waitReady(stream_);
+    allocate_lu_storage();
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_lu_.get(), d_A.data(), matrixBytes(), cudaMemcpyDeviceToDevice, stream_));
+
+    factorize();
+    return *this;
+  }
+
+  /** Compute the LU factorization from a device matrix (move, no copy). */
+  LU& compute(DeviceMatrix<Scalar>&& d_A) {
+    eigen_assert(d_A.rows() == d_A.cols() && "LU requires a square matrix");
+    if (!begin_compute(d_A.rows())) return *this;
+
+    lda_ = static_cast<int64_t>(d_A.outerStride());
+    d_A.waitReady(stream_);
+    d_lu_ = internal::DeviceBuffer::adopt(static_cast<void*>(d_A.release()));
+    lu_alloc_size_ = matrixBytes();
+
+    factorize();
+    return *this;
+  }
+
+  // ---- Solve ---------------------------------------------------------------
+
+  /** Solve op(A) * X = B using the cached LU factorization (host → host).
+   *
+   * \param B  Right-hand side (n x nrhs host matrix).
+   * \param op gpu::GpuOp::NoTrans (default), Trans, or ConjTrans.
+   */
+  template <typename Rhs>
+  PlainMatrix solve(const MatrixBase<Rhs>& B, GpuOp op = GpuOp::NoTrans) const {
+    const_cast<LU*>(this)->sync_info();
+    eigen_assert(info_ == Success && "LU::solve called on a failed or uninitialized factorization");
+    eigen_assert(B.rows() == n_);
+
+    const PlainMatrix rhs(B);
+    const int64_t nrhs = static_cast<int64_t>(rhs.cols());
+    const int64_t ldb = static_cast<int64_t>(rhs.outerStride());
+    internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x.get(), rhs.data(), matrixBytes(nrhs, ldb), cudaMemcpyHostToDevice, stream_));
+    DeviceMatrix<Scalar> d_X = solve_impl(nrhs, ldb, op, std::move(d_x));
+
+    PlainMatrix X(n_, B.cols());
+    int solve_info = 0;
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(X.data(), d_X.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&solve_info, scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+
+    eigen_assert(solve_info == 0 && "cusolverDnXgetrs reported an error");
+    return X;
+  }
+
+  /** Solve op(A) * X = B with device-resident RHS. Fully async. */
+  DeviceMatrix<Scalar> solve(const DeviceMatrix<Scalar>& d_B, GpuOp op = GpuOp::NoTrans) const {
+    eigen_assert(d_B.rows() == n_);
+    d_B.waitReady(stream_);
+    const int64_t nrhs = static_cast<int64_t>(d_B.cols());
+    const int64_t ldb = static_cast<int64_t>(d_B.outerStride());
+    internal::DeviceBuffer d_x(matrixBytes(nrhs, ldb));
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(d_x.get(), d_B.data(), matrixBytes(nrhs, ldb), cudaMemcpyDeviceToDevice, stream_));
+    return solve_impl(nrhs, ldb, op, std::move(d_x));
+  }
+
+  // ---- Accessors -----------------------------------------------------------
+
+  /** Lazily synchronizes the stream on first call after compute(). */
+  ComputationInfo info() const {
+    const_cast<LU*>(this)->sync_info();
+    return info_;
+  }
+  Index rows() const { return n_; }
+  Index cols() const { return n_; }
+  cudaStream_t stream() const { return stream_; }
+
+ private:
+  cudaStream_t stream_ = nullptr;
+  cusolverDnHandle_t handle_ = nullptr;
+  internal::CusolverParams params_;   // cuSOLVER params (created once, reused)
+  internal::DeviceBuffer d_lu_;       // LU factors on device (grows, never shrinks)
+  size_t lu_alloc_size_ = 0;          // current d_lu_ allocation size
+  internal::DeviceBuffer d_ipiv_;     // pivot indices (int64_t) on device
+  internal::DeviceBuffer d_scratch_;  // combined workspace + info word (grows, never shrinks)
+  size_t scratch_size_ = 0;           // current scratch allocation size
+  std::vector<char> h_workspace_;     // host workspace (kept alive until next compute)
+  int64_t n_ = 0;
+  int64_t lda_ = 0;
+  ComputationInfo info_ = InvalidInput;
+  internal::PinnedHostBuffer pinned_info_{sizeof(int)};  // pinned host memory for async D2H
+  bool info_synced_ = true;                              // has the stream been synced for info?
+
+  int& info_word() { return *static_cast<int*>(pinned_info_.get()); }
+  int info_word() const { return *static_cast<const int*>(pinned_info_.get()); }
+
+  bool begin_compute(Index rows) {
+    n_ = rows;
+    info_ = InvalidInput;
+    if (n_ == 0) {
+      info_ = Success;
+      info_synced_ = true;
+      return false;
+    }
+    return true;
+  }
+
+  size_t matrixBytes() const { return matrixBytes(n_, lda_); }
+
+  static size_t matrixBytes(int64_t cols, int64_t outer_stride) {
+    return static_cast<size_t>(outer_stride) * static_cast<size_t>(cols) * sizeof(Scalar);
+  }
+
+  void allocate_lu_storage() {
+    size_t needed = matrixBytes();
+    if (needed > lu_alloc_size_) {
+      d_lu_ = internal::DeviceBuffer(needed);
+      lu_alloc_size_ = needed;
+    }
+  }
+
+  // Scratch layout: [ workspace (aligned) | info_word (sizeof(int)) ].
+  // Workspace size is rounded up to a multiple of 16 bytes so the info word lands aligned.
+  static constexpr size_t kInfoBytes = sizeof(int);
+  static constexpr size_t kScratchAlign = 16;
+
+  static size_t scratchBytesFor(size_t workspace_bytes) {
+    workspace_bytes = (workspace_bytes + kScratchAlign - 1) & ~(kScratchAlign - 1);
+    return workspace_bytes + kInfoBytes;
+  }
+
+  // Ensure d_scratch_ is at least `bytes`. Grows but never shrinks.
+  // Syncs the stream before reallocating to avoid freeing memory that
+  // async kernels may still be using.
+  void ensure_scratch(size_t bytes) {
+    if (bytes > scratch_size_) {
+      if (d_scratch_) EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      d_scratch_ = internal::DeviceBuffer(bytes);
+      scratch_size_ = bytes;
+    }
+  }
+
+  void* scratch_workspace() const { return d_scratch_.get(); }
+  int* scratch_info() const {
+    eigen_assert(d_scratch_ && scratch_size_ >= kInfoBytes);
+    return reinterpret_cast<int*>(static_cast<char*>(d_scratch_.get()) + scratch_size_ - kInfoBytes);
+  }
+
+  // Solve in place on `d_x` (which already holds B), then re-wrap as a
+  // typed DeviceMatrix carrying shape and a ready event. The release/adopt
+  // hop just hands ownership of the raw cudaMalloc pointer from the untyped
+  // DeviceBuffer to the typed DeviceMatrix without copying.
+  DeviceMatrix<Scalar> solve_impl(int64_t nrhs, int64_t ldb, GpuOp op, internal::DeviceBuffer&& d_x) const {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    const cublasOperation_t trans = internal::to_cublas_op(op);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrs(handle_, params_.p, trans, n_, nrhs, dtype, d_lu_.get(), lda_,
+                                          static_cast<const int64_t*>(d_ipiv_.get()), dtype, d_x.get(), ldb,
+                                          scratch_info()));
+
+    DeviceMatrix<Scalar> result =
+        DeviceMatrix<Scalar>::adopt(static_cast<Scalar*>(d_x.release()), n_, static_cast<Index>(nrhs));
+    result.recordReady(stream_);
+    return result;
+  }
+
+  void init_context() {
+    EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnCreate(&handle_));
+    EIGEN_CUSOLVER_CHECK(cusolverDnSetStream(handle_, stream_));
+    ensure_scratch(kInfoBytes);
+  }
+
+  void sync_info() {
+    if (!info_synced_) {
+      EIGEN_CUDA_RUNTIME_CHECK(cudaStreamSynchronize(stream_));
+      info_ = (info_word() == 0) ? Success : NumericalIssue;
+      info_synced_ = true;
+    }
+  }
+
+  // Run cusolverDnXgetrf on d_lu_ (already on device). Allocates d_ipiv_.
+  // Enqueues factorization + async info download. Does NOT sync.
+  // Workspaces are stored as members to ensure they outlive the async kernels.
+  void factorize() {
+    constexpr cudaDataType_t dtype = internal::cusolver_data_type<Scalar>::value;
+    const size_t ipiv_bytes = static_cast<size_t>(n_) * sizeof(int64_t);
+
+    info_synced_ = false;
+    info_ = InvalidInput;
+
+    d_ipiv_ = internal::DeviceBuffer(ipiv_bytes);
+
+    size_t dev_ws_bytes = 0, host_ws_bytes = 0;
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_, dtype,
+                                                     &dev_ws_bytes, &host_ws_bytes));
+
+    ensure_scratch(scratchBytesFor(dev_ws_bytes));
+    h_workspace_.resize(host_ws_bytes);
+
+    EIGEN_CUSOLVER_CHECK(cusolverDnXgetrf(handle_, params_.p, n_, n_, dtype, d_lu_.get(), lda_,
+                                          static_cast<int64_t*>(d_ipiv_.get()), dtype, scratch_workspace(),
+                                          dev_ws_bytes, host_ws_bytes > 0 ? h_workspace_.data() : nullptr,
+                                          host_ws_bytes, scratch_info()));
+
+    EIGEN_CUDA_RUNTIME_CHECK(
+        cudaMemcpyAsync(&info_word(), scratch_info(), sizeof(int), cudaMemcpyDeviceToHost, stream_));
+  }
+};
+
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_LU_H
diff --git a/unsupported/Eigen/src/GPU/GpuSupport.h b/unsupported/Eigen/src/GPU/GpuSupport.h
new file mode 100644
index 0000000..7d840bb
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/GpuSupport.h
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Generic CUDA runtime support shared across all GPU library integrations
+// (cuSOLVER and cuBLAS):
+//   - Error-checking macros
+//   - RAII device buffer
+//
+// Only depends on <cuda_runtime.h>. No NVIDIA library headers.
+
+#ifndef EIGEN_GPU_SUPPORT_H
+#define EIGEN_GPU_SUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <cuda_runtime.h>
+
+#include <complex>
+#include <memory>
+
+namespace Eigen {
+namespace gpu {
+namespace internal {
+
+// ---- Error-checking macros --------------------------------------------------
+// These abort (via eigen_assert) on failure. Not for use in destructors.
+
+#define EIGEN_CUDA_RUNTIME_CHECK(expr)                               \
+  do {                                                               \
+    cudaError_t _e = (expr);                                         \
+    EIGEN_UNUSED_VARIABLE(_e); /* silence unused-var in NDEBUG */    \
+    eigen_assert(_e == cudaSuccess && "CUDA runtime call failed");   \
+  } while (0)
+
+// ---- Custom deleters for CUDA-allocated memory ------------------------------
+// Used with std::unique_ptr to give CUDA allocations RAII semantics with no
+// hand-rolled move/dtor boilerplate.
+
+struct CudaFreeDeleter {
+  void operator()(void* p) const noexcept {
+    if (p) (void)cudaFree(p);
+  }
+};
+
+struct CudaFreeHostDeleter {
+  void operator()(void* p) const noexcept {
+    if (p) (void)cudaFreeHost(p);
+  }
+};
+
+// ---- RAII: device buffer ----------------------------------------------------
+
+class DeviceBuffer {
+ public:
+  DeviceBuffer() = default;
+
+  explicit DeviceBuffer(size_t bytes) {
+    if (bytes > 0) {
+      void* p = nullptr;
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMalloc(&p, bytes));
+      ptr_.reset(p);
+    }
+  }
+
+  void* get() const noexcept { return ptr_.get(); }
+  void* release() noexcept { return ptr_.release(); }
+  explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }
+
+  // Adopt an existing device pointer. Caller relinquishes ownership.
+  static DeviceBuffer adopt(void* p) noexcept {
+    DeviceBuffer b;
+    b.ptr_.reset(p);
+    return b;
+  }
+
+ private:
+  std::unique_ptr<void, CudaFreeDeleter> ptr_;
+};
+
+// ---- RAII: pinned host buffer -----------------------------------------------
+// For async D2H copies (cudaMemcpyAsync requires pinned host memory for true
+// asynchrony and to avoid compute-sanitizer warnings).
+
+class PinnedHostBuffer {
+ public:
+  PinnedHostBuffer() = default;
+
+  explicit PinnedHostBuffer(size_t bytes) {
+    if (bytes > 0) {
+      void* p = nullptr;
+      EIGEN_CUDA_RUNTIME_CHECK(cudaMallocHost(&p, bytes));
+      ptr_.reset(p);
+    }
+  }
+
+  void* get() const noexcept { return ptr_.get(); }
+  explicit operator bool() const noexcept { return static_cast<bool>(ptr_); }
+
+ private:
+  std::unique_ptr<void, CudaFreeHostDeleter> ptr_;
+};
+
+// ---- Scalar → cudaDataType_t ------------------------------------------------
+// Shared by cuBLAS and cuSOLVER. cudaDataType_t is defined in library_types.h
+// which is included transitively by cuda_runtime.h.
+
+template <typename Scalar>
+struct cuda_data_type;
+
+template <>
+struct cuda_data_type<float> {
+  static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct cuda_data_type<double> {
+  static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct cuda_data_type<std::complex<float>> {
+  static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct cuda_data_type<std::complex<double>> {
+  static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+}  // namespace internal
+}  // namespace gpu
+}  // namespace Eigen
+
+#endif  // EIGEN_GPU_SUPPORT_H
diff --git a/unsupported/Eigen/src/GPU/InternalHeaderCheck.h b/unsupported/Eigen/src/GPU/InternalHeaderCheck.h
new file mode 100644
index 0000000..cb2163c
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_GPU_MODULE_H
+#error "Please include unsupported/Eigen/GPU instead of including headers inside the src/GPU directory directly."
+#endif
diff --git a/unsupported/Eigen/src/GPU/README.md b/unsupported/Eigen/src/GPU/README.md
new file mode 100644
index 0000000..2928a7c
--- /dev/null
+++ b/unsupported/Eigen/src/GPU/README.md
@@ -0,0 +1,344 @@
+# Eigen GPU Module (`unsupported/Eigen/GPU`)
+
+GPU-accelerated dense linear algebra for Eigen users, dispatching to NVIDIA
+CUDA Math Libraries (cuBLAS, cuSOLVER). Requires CUDA 11.4+. Header-only (link
+against CUDA runtime, cuBLAS, and cuSOLVER).
+
+## Why this module
+
+Eigen is the linear algebra foundation for a large ecosystem of C++ projects
+in robotics (ROS, Drake, MoveIt, Pinocchio), computer vision (OpenCV, COLMAP,
+Open3D), scientific computing (Ceres, Stan), and beyond. Many of these
+projects run on GPU-equipped hardware but cannot use GPUs for Eigen operations
+without dropping down to raw CUDA library APIs. Third-party projects like
+[EigenCuda](https://github.com/NLESC-JCER/EigenCuda) and
+[cholespy](https://github.com/rgl-epfl/cholespy) exist specifically to fill
+this gap, and downstream projects like
+[Ceres](https://github.com/ceres-solver/ceres-solver/issues/1151) and
+[COLMAP](https://github.com/colmap/colmap/issues/4018) have open requests for
+GPU-accelerated solvers through Eigen.
+
+The `unsupported/Eigen/GPU` module aims to close this gap: existing Eigen users
+should be able to move performance-critical dense linear algebra to the GPU
+with minimal code changes and without learning CUDA library APIs directly.
+
+## Design philosophy
+
+**CPU and GPU coexist.** There is no global compile-time switch that replaces
+CPU implementations (unlike `EIGEN_USE_LAPACKE`). Users choose GPU solvers
+explicitly -- `gpu::LLT<double>` vs `Eigen::LLT<MatrixXd>` -- and both coexist in
+the same binary. This also lets users keep the factored matrix on device across
+multiple solves, something impossible with compile-time replacement.
+
+**Familiar syntax.** GPU operations use the same expression patterns as CPU
+Eigen. Here is a side-by-side comparison:
+
+```cpp
+// ---- CPU (Eigen) ----               // ---- GPU (unsupported/Eigen/GPU) ----
+#include <Eigen/Dense>                  #define EIGEN_USE_GPU
+                                        #include <unsupported/Eigen/GPU>
+
+MatrixXd A = ...;                       auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
+MatrixXd B = ...;                       auto d_B = gpu::DeviceMatrix<double>::fromHost(B);
+
+MatrixXd C = A * B;                     gpu::DeviceMatrix<double> d_C = d_A * d_B;
+MatrixXd X = A.llt().solve(B);          gpu::DeviceMatrix<double> d_X = d_A.llt().solve(d_B);
+
+                                        MatrixXd X = d_X.toHost();
+```
+
+The GPU version reads like CPU Eigen with explicit upload/download.
+`operator*` dispatches to cuBLAS GEMM, `.llt().solve()` dispatches to
+cuSOLVER potrf + potrs. Unsupported expressions are compile errors.
+
+**Standalone module.** `unsupported/Eigen/GPU` does not modify or depend on Eigen's Core
+expression template system (`MatrixBase`, `CwiseBinaryOp`, etc.).
+`DeviceMatrix` is not an Eigen expression type and does not inherit from
+`MatrixBase`. The expression layer is a thin compile-time dispatch where every
+supported expression maps to a single NVIDIA library call. There is no
+coefficient-level evaluation, lazy fusion, or packet operations.
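+
+For instance, only the expression forms listed in the mapping table below are
+accepted; anything outside that set fails at compile time instead of silently
+falling back to the CPU. A hypothetical illustration (`cwiseProduct` is not
+part of the dispatch layer, so the second line does not compile):
+
+```cpp
+gpu::DeviceMatrix<double> d_C = d_A * d_B;                 // OK: cublasGemmEx
+// gpu::DeviceMatrix<double> d_E = d_A.cwiseProduct(d_B);  // compile error
+```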
+
+**Explicit over implicit.** Host-device transfers, stream management, and
+library handle lifetimes are visible in the API. There are no hidden
+allocations or synchronizations except where documented (e.g., `toHost()` must
+synchronize to deliver data to the host).
+
+## Key concepts
+
+### `DeviceMatrix<Scalar>`
+
+A typed RAII wrapper for a dense column-major matrix in GPU device memory.
+This is the GPU counterpart of Eigen's `MatrixX<Scalar>`. A vector is simply
+a `DeviceMatrix` with one column.
+
+```cpp
+// Upload from host
+auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
+
+// Allocate uninitialized
+gpu::DeviceMatrix<double> d_C(m, n);
+
+// Download to host
+MatrixXd C = d_C.toHost();
+
+// Async download (returns a future)
+auto transfer = d_C.toHostAsync();
+// ... do other work ...
+MatrixXd C = transfer.get();
+```
+
+`DeviceMatrix` supports expression methods that mirror Eigen's API:
+`adjoint()`, `transpose()`, `triangularView<UpLo>()`,
+`selfadjointView<UpLo>()`, `llt()`, `lu()`. These return lightweight
+expression objects that are evaluated when assigned.
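+
+The expression objects launch no kernels themselves; work happens on
+assignment. A sketch using only the methods listed above:
+
+```cpp
+auto view = d_A.adjoint();                   // lightweight view, no GPU work
+gpu::DeviceMatrix<double> d_C = view * d_B;  // one cublasGemmEx (transA=C)
+```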
+
+### `gpu::Context`
+
+Every GPU operation needs a CUDA stream and library handles (cuBLAS,
+cuSOLVER). `gpu::Context` bundles these together.
+
+For simple usage, you don't need to create one -- a per-thread default context
+is created lazily on first use:
+
+```cpp
+// These use the thread-local default context automatically
+d_C = d_A * d_B;
+d_X = d_A.llt().solve(d_B);
+```
+
+For concurrent multi-stream execution, create explicit contexts:
+
+```cpp
+gpu::Context ctx1, ctx2;
+d_C1.device(ctx1) = d_A1 * d_B1;   // runs on stream 1
+d_C2.device(ctx2) = d_A2 * d_B2;   // runs on stream 2 (concurrently)
+```
+
+## Usage
+
+### Matrix operations (cuBLAS)
+
+```cpp
+auto d_A = gpu::DeviceMatrix<double>::fromHost(A);
+auto d_B = gpu::DeviceMatrix<double>::fromHost(B);
+
+// GEMM: C = A * B, C = A^H * B, C = A * B^T, ...
+gpu::DeviceMatrix<double> d_C = d_A * d_B;
+d_C = d_A.adjoint() * d_B;
+d_C = d_A * d_B.transpose();
+
+// Scaled and accumulated
+d_C += 2.0 * d_A * d_B;             // alpha=2, beta=1
+d_C.device(ctx) -= d_A * d_B;       // alpha=-1, beta=1 (requires explicit context)
+
+// Triangular solve (TRSM)
+d_X = d_A.triangularView<Lower>().solve(d_B);
+
+// Symmetric/Hermitian multiply (SYMM/HEMM)
+d_C = d_A.selfadjointView<Lower>() * d_B;
+
+// Rank-k update (SYRK/HERK)
+d_C.selfadjointView<Lower>().rankUpdate(d_A);  // C += A * A^H
+```
+
+### Dense solvers (cuSOLVER)
+
+**One-shot expression syntax** -- Convenient, re-factorizes each time:
+
+```cpp
+// Cholesky solve (potrf + potrs)
+d_X = d_A.llt().solve(d_B);
+
+// LU solve (getrf + getrs)
+d_Y = d_A.lu().solve(d_B);
+```
+
+**Cached factorization** -- Factor once, solve many times:
+
+```cpp
+gpu::LLT<double> llt;
+llt.compute(d_A);                    // factorize (async)
+if (llt.info() != Success) { ... }   // lazy sync on first info() call
+auto d_X1 = llt.solve(d_B1);        // reuses factor (async)
+auto d_X2 = llt.solve(d_B2);        // reuses factor (async)
+MatrixXd X2 = d_X2.toHost();
+
+// LU with transpose solve
+gpu::LU<double> lu;
+lu.compute(d_A);
+auto d_Y = lu.solve(d_B, gpu::GpuOp::Trans);           // A^T Y = B
+```
+
+The cached API keeps the factored matrix on device, avoiding redundant
+host-device transfers and re-factorizations.
+
+### Stream control and async execution
+
+Operations are asynchronous by default. The compute-solve chain runs without
+host synchronization until you need a result on the host:
+
+```text
+fromHost(A) --sync-->  compute() --async-->  solve() --async-->  toHost()
+   H2D                  potrf                 potrs                D2H
+                                                                   sync
+```
+
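+In code, the pipeline above looks like this (only the `fromHost()` and
+`toHost()` steps block the host):
+
+```cpp
+auto d_A = gpu::DeviceMatrix<double>::fromHost(A);  // sync: H2D upload
+auto d_B = gpu::DeviceMatrix<double>::fromHost(B);  // sync: H2D upload
+gpu::LLT<double> llt;
+llt.compute(d_A);            // async: potrf enqueued
+auto d_X = llt.solve(d_B);   // async: potrs enqueued
+MatrixXd X = d_X.toHost();   // sync: D2H download
+```
+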
+Mandatory sync points:
+- `fromHost()` -- Synchronizes to complete the upload before returning
+- `toHost()` / `HostTransfer::get()` -- Must deliver data to host
+- `info()` -- Must read the factorization status
+
+**Cross-stream safety** is automatic. `DeviceMatrix` tracks write completion
+via CUDA events. When a matrix written on stream A is read on stream B, the
+module automatically inserts `cudaStreamWaitEvent`. Same-stream operations
+skip the wait (CUDA guarantees in-order execution within a stream).
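+
+A minimal sketch of what this means in practice (same `device(ctx)` syntax as
+the multi-stream example above; `d_D`/`d_E` are placeholder names):
+
+```cpp
+gpu::Context ctx1, ctx2;
+d_C.device(ctx1) = d_A * d_B;  // writes d_C on ctx1's stream, records event
+d_E.device(ctx2) = d_C * d_D;  // reads d_C on ctx2's stream; the module
+                               // inserts cudaStreamWaitEvent automatically
+```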
+
+**Lifetime of cached factorizations.** A `gpu::LLT` / `gpu::LU` object owns
+its CUDA stream, library handle, and the cached factor on device. Destroying
+the factorization object while one of its `solve()` calls is still in flight
+is *correct* but not actually async: `cudaStreamDestroy` returns immediately,
+but the destructor of the cached factor calls `cudaFree`, which synchronizes
+the whole device and stalls until the in-flight `potrs`/`getrs` retires.
+For genuine async pipelining, keep the factorization object alive until you
+have drained its results (e.g. via `toHost()` or by binding consumption to
+an explicit `gpu::Context` that outlives both producer and consumer):
+
+```cpp
+gpu::LLT<double> llt(d_A);             // factor stays on device
+auto d_X = llt.solve(d_B);
+auto h_x = d_X.toHostAsync(llt.stream());
+h_x.get();                             // sync: factor + result complete
+// llt may now be destroyed without stalling the device
+```
+
+## Reference
+
+### Supported scalar types
+
+`float`, `double`, `std::complex<float>`, `std::complex<double>`.
+
+### Expression -> library call mapping
+
+| DeviceMatrix expression | Library call | Parameters |
+|---|---|---|
+| `C = A * B` | `cublasGemmEx` | transA=N, transB=N, alpha=1, beta=0 |
+| `C = A.adjoint() * B` | `cublasGemmEx` | transA=C, transB=N |
+| `C = A.transpose() * B` | `cublasGemmEx` | transA=T, transB=N |
+| `C = A * B.adjoint()` | `cublasGemmEx` | transA=N, transB=C |
+| `C = A * B.transpose()` | `cublasGemmEx` | transA=N, transB=T |
+| `C = alpha * A * B` | `cublasGemmEx` | alpha from LHS |
+| `C = A * (alpha * B)` | `cublasGemmEx` | alpha from RHS |
+| `C += A * B` | `cublasGemmEx` | alpha=1, beta=1 |
+| `C.device(ctx) -= A * B` | `cublasGemmEx` | alpha=-1, beta=1 |
+| `X = A.llt().solve(B)` | `cusolverDnXpotrf` + `Xpotrs` | uplo, n, nrhs |
+| `X = A.llt<Upper>().solve(B)` | same | uplo=Upper |
+| `X = A.lu().solve(B)` | `cusolverDnXgetrf` + `Xgetrs` | n, nrhs |
+| `X = A.triangularView<L>().solve(B)` | `cublasXtrsm` | side=L, uplo, diag=NonUnit |
+| `C = A.selfadjointView<L>() * B` | `cublasXsymm` / `cublasXhemm` | side=L, uplo |
+| `C.selfadjointView<L>().rankUpdate(A)` | `cublasXsyrk` / `cublasXherk` | uplo, trans=N |
+
+### `DeviceMatrix<Scalar>` API
+
+| Method | Sync? | Description |
+|--------|-------|-------------|
+| `DeviceMatrix()` | -- | Empty (0x0) |
+| `DeviceMatrix(rows, cols)` | -- | Allocate uninitialized |
+| `fromHost(matrix, stream)` | yes | Upload from Eigen matrix |
+| `fromHostAsync(ptr, rows, cols, outerStride, stream)` | no | Async upload (caller manages lifetime) |
+| `toHost(stream)` | yes | Synchronous download |
+| `toHostAsync(stream)` | no | Returns `HostTransfer` future |
+| `clone(stream)` | no | Device-to-device deep copy |
+| `resize(rows, cols)` | -- | Discard contents, reallocate |
+| `data()` | -- | Raw device pointer |
+| `rows()`, `cols()` | -- | Dimensions |
+| `sizeInBytes()` | -- | Total device allocation size in bytes |
+| `empty()` | -- | True if 0x0 |
+| `adjoint()` | -- | Adjoint view (GEMM ConjTrans) |
+| `transpose()` | -- | Transpose view (GEMM Trans) |
+| `llt()` / `llt<UpLo>()` | -- | Cholesky expression builder |
+| `lu()` | -- | LU expression builder |
+| `triangularView<UpLo>()` | -- | Triangular view (TRSM) |
+| `selfadjointView<UpLo>()` | -- | Self-adjoint view (SYMM, rankUpdate) |
+| `device(ctx)` | -- | Assignment proxy bound to context |
+
+### `gpu::Context`
+
+Unified GPU execution context owning a CUDA stream and library handles.
+
+```cpp
+gpu::Context()                                             // Creates dedicated stream + handles
+static gpu::Context& threadLocal()                         // Per-thread default (lazy-created)
+
+cudaStream_t       stream()
+cublasHandle_t     cublasHandle()
+cusolverDnHandle_t cusolverHandle()
+```
+
+Non-copyable, non-movable (owns library handles).
+
+### `gpu::LLT<Scalar, UpLo>` API
+
+GPU dense Cholesky (LL^T) via cuSOLVER. Caches factor on device.
+
+| Method | Sync? | Description |
+|--------|-------|-------------|
+| `gpu::LLT(A)` | deferred | Construct and factorize from host matrix |
+| `compute(host_matrix)` | deferred | Upload and factorize |
+| `compute(DeviceMatrix)` | deferred | D2D copy and factorize |
+| `compute(DeviceMatrix&&)` | deferred | Move-adopt and factorize (no copy) |
+| `solve(host_matrix)` | yes | Solve, return host matrix |
+| `solve(DeviceMatrix)` | no | Solve, return `DeviceMatrix` (async) |
+| `info()` | lazy | Syncs stream on first call, returns `Success` or `NumericalIssue` |
+
+### `gpu::LU<Scalar>` API
+
+GPU dense partial-pivoting LU via cuSOLVER. Same pattern as `gpu::LLT`, plus a
+`gpu::GpuOp` parameter on `solve()` (`NoTrans`, `Trans`, `ConjTrans`).
+
+### `HostTransfer<Scalar>` API
+
+Future for async device-to-host transfer.
+
+| Method | Description |
+|--------|-------------|
+| `get()` | Block until transfer completes, return host matrix reference. Idempotent. |
+| `ready()` | Non-blocking poll |
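+
+For example (a sketch; `get()` returns a reference to the downloaded matrix
+and is safe to call repeatedly):
+
+```cpp
+gpu::HostTransfer<double> t = d_X.toHostAsync();
+// ... overlap other host work here ...
+const MatrixXd& X = t.get();  // blocks only if the transfer is still in flight
+```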
+
+### Aliasing
+
+Unlike Eigen's `Matrix`, where omitting `.noalias()` triggers a copy to a
+temporary, `DeviceMatrix` dispatches directly to NVIDIA library calls, which
+have no built-in aliasing protection. All operations are implicitly noalias.
+The caller must ensure operands don't alias the destination for GEMM, TRSM,
+SYMM/HEMM, and SYRK/HERK. Debug builds assert on these violations before
+dispatching to cuBLAS.
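+
+Concretely (a sketch; the commented-out line is the violation):
+
+```cpp
+// d_A = d_A * d_B;                         // UNSAFE: destination aliases an operand
+gpu::DeviceMatrix<double> d_T = d_A * d_B;  // safe: evaluate into a fresh matrix
+```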
+
+## File layout
+
+| File | Depends on | Contents |
+|------|-----------|----------|
+| `GpuSupport.h` | `<cuda_runtime.h>` | Error macro, `DeviceBuffer`, `PinnedHostBuffer`, `cuda_data_type<>` |
+| `DeviceMatrix.h` | `GpuSupport.h` | `DeviceMatrix<>`, `HostTransfer<>` |
+| `DeviceExpr.h` | `DeviceMatrix.h` | GEMM expression wrappers |
+| `DeviceBlasExpr.h` | `DeviceMatrix.h` | TRSM, SYMM, SYRK expression wrappers |
+| `DeviceSolverExpr.h` | `DeviceMatrix.h` | Solver expression wrappers (LLT, LU) |
+| `DeviceDispatch.h` | all above | All dispatch functions + `Assignment` |
+| `GpuContext.h` | `CuBlasSupport.h`, `CuSolverSupport.h` | `gpu::Context` |
+| `CuBlasSupport.h` | `GpuSupport.h`, `<cublas_v2.h>` | cuBLAS error macro, op/compute type maps |
+| `CuSolverSupport.h` | `GpuSupport.h`, `<cusolverDn.h>` | cuSOLVER params, fill-mode mapping |
+| `GpuLLT.h` | `CuSolverSupport.h` | Cached dense Cholesky factorization |
+| `GpuLU.h` | `CuSolverSupport.h` | Cached dense LU factorization |
+
+## Building and testing
+
+```bash
+cmake -G Ninja -B build -S . \
+  -DEIGEN_TEST_CUDA=ON \
+  -DEIGEN_CUDA_COMPUTE_ARCH="70"
+
+cmake --build build --target cublas cusolver_llt cusolver_lu device_matrix
+ctest --test-dir build -L gpu --output-on-failure
+```
+
+`EIGEN_TEST_CUBLAS` and `EIGEN_TEST_CUSOLVER` default to ON when CUDA is enabled
+(cuBLAS and cuSOLVER are part of the CUDA toolkit).
diff --git a/unsupported/benchmarks/CMakeLists.txt b/unsupported/benchmarks/CMakeLists.txt
index ed8c88e..d8b9fcd 100644
--- a/unsupported/benchmarks/CMakeLists.txt
+++ b/unsupported/benchmarks/CMakeLists.txt
@@ -33,3 +33,10 @@
 add_subdirectory(Splines)
 add_subdirectory(IterativeSolvers)
 add_subdirectory(KroneckerProduct)
+
+# GPU benchmarks have their own CMake project (needs CUDAToolkit).
+# They can also be built standalone: cmake -B build -S unsupported/benchmarks/GPU
+find_package(CUDAToolkit QUIET)
+if(CUDAToolkit_FOUND)
+  add_subdirectory(GPU)
+endif()
diff --git a/unsupported/benchmarks/GPU/CMakeLists.txt b/unsupported/benchmarks/GPU/CMakeLists.txt
new file mode 100644
index 0000000..2c3511f
--- /dev/null
+++ b/unsupported/benchmarks/GPU/CMakeLists.txt
@@ -0,0 +1,54 @@
+# GPU benchmarks require CUDA runtime + cuSOLVER.
+# Build separately from the main benchmark tree since they need CUDA toolchain.
+#
+# Usage:
+#   cmake -G Ninja -B build-bench-gpu -S unsupported/benchmarks/GPU \
+#         -DCMAKE_CUDA_ARCHITECTURES=89
+#   cmake --build build-bench-gpu
+#
+# Profiling:
+#   nsys profile --trace=cuda ./build-bench-gpu/bench_solvers
+#   ncu --set full -o profile ./build-bench-gpu/bench_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
+
+cmake_minimum_required(VERSION 3.17)
+project(EigenGpuBenchmarks CXX)
+
+find_package(benchmark REQUIRED)
+find_package(CUDAToolkit REQUIRED)
+
+set(EIGEN_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
+
+function(eigen_add_gpu_benchmark name source)
+  cmake_parse_arguments(BENCH "" "" "LIBRARIES;DEFINITIONS" ${ARGN})
+  if(NOT IS_ABSOLUTE "${source}")
+    set(source "${CMAKE_CURRENT_SOURCE_DIR}/${source}")
+  endif()
+  add_executable(${name} ${source})
+  target_compile_features(${name} PRIVATE cxx_std_14)
+  target_include_directories(${name} PRIVATE
+    ${EIGEN_SOURCE_DIR}
+    ${CUDAToolkit_INCLUDE_DIRS})
+  target_link_libraries(${name} PRIVATE
+    benchmark::benchmark benchmark::benchmark_main
+    CUDA::cudart CUDA::cusolver CUDA::cublas)
+  if(BENCH_LIBRARIES)
+    target_link_libraries(${name} PRIVATE ${BENCH_LIBRARIES})
+  endif()
+  target_compile_options(${name} PRIVATE -O3)
+  target_compile_definitions(${name} PRIVATE NDEBUG EIGEN_USE_GPU)
+  if(BENCH_DEFINITIONS)
+    target_compile_definitions(${name} PRIVATE ${BENCH_DEFINITIONS})
+  endif()
+endfunction()
+
+# Solver benchmarks: LLT/LU compute + solve, host vs device paths, CPU baselines.
+eigen_add_gpu_benchmark(bench_solvers bench_solvers.cpp)
+eigen_add_gpu_benchmark(bench_solvers_float bench_solvers.cpp DEFINITIONS SCALAR=float)
+
+# Chaining benchmarks: async pipeline efficiency, host-roundtrip vs device chain.
+eigen_add_gpu_benchmark(bench_chaining bench_chaining.cpp)
+eigen_add_gpu_benchmark(bench_chaining_float bench_chaining.cpp DEFINITIONS SCALAR=float)
+
+# Batching benchmarks: multi-stream concurrency for many small systems.
+eigen_add_gpu_benchmark(bench_batching bench_batching.cpp)
+eigen_add_gpu_benchmark(bench_batching_float bench_batching.cpp DEFINITIONS SCALAR=float)
diff --git a/unsupported/benchmarks/GPU/bench_batching.cpp b/unsupported/benchmarks/GPU/bench_batching.cpp
new file mode 100644
index 0000000..f9cabc5
--- /dev/null
+++ b/unsupported/benchmarks/GPU/bench_batching.cpp
@@ -0,0 +1,268 @@
+// GPU batching benchmarks: multi-stream concurrency for many small solves.
+//
+// Each gpu::LLT/gpu::LU owns its own CUDA stream. This benchmark measures how
+// well multiple solver instances overlap on the GPU, which is critical for
+// workloads like robotics (many small systems) and SLAM (batched poses).
+//
+// Compares:
+//   1. Sequential: one solver handles all systems one by one
+//   2. Batched: N solvers on N streams, all launched before any sync
+//   3. CPU baseline: Eigen LLT on host
+//
+// For Nsight Systems: batched mode should show overlapping kernels on
+// different streams in the timeline view.
+//
+//   nsys profile --trace=cuda ./bench_batching
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <unsupported/Eigen/GPU>
+
+#include <memory>
+#include <vector>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// Sequential: one solver, N systems solved one after another
+// --------------------------------------------------------------------------
+
+static void BM_Batch_Sequential(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  // Pre-generate all SPD matrices and RHS vectors.
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+  }
+
+  gpu::LLT<Scalar> llt;
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      llt.compute(As[i]);
+      results[i] = llt.solve(Bs[i]);
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Sequential with DeviceMatrix (avoid re-upload of A each iteration)
+// --------------------------------------------------------------------------
+
+static void BM_Batch_Sequential_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  gpu::LLT<Scalar> llt;
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      llt.compute(d_As[i]);
+      gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_Bs[i]);
+      results[i] = d_X.toHost();
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Batched: N solvers on N streams, overlapping execution
+// --------------------------------------------------------------------------
+
+static void BM_Batch_MultiStream(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  // N solvers = N independent CUDA streams.
+  std::vector<std::unique_ptr<gpu::LLT<Scalar>>> solvers(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    solvers[i] = std::make_unique<gpu::LLT<Scalar>>();
+  }
+
+  for (auto _ : state) {
+    // Phase 1: launch all factorizations (async, different streams).
+    for (int i = 0; i < batch_size; ++i) {
+      solvers[i]->compute(d_As[i]);
+    }
+
+    // Phase 2: launch all solves (async, different streams).
+    std::vector<gpu::DeviceMatrix<Scalar>> d_Xs(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
+    }
+
+    // Phase 3: download all results.
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      results[i] = d_Xs[i].toHost();
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["streams"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Batched with async download (overlap D2H with computation)
+// --------------------------------------------------------------------------
+
+static void BM_Batch_MultiStream_AsyncDownload(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_As(batch_size);
+  std::vector<gpu::DeviceMatrix<Scalar>> d_Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+    d_As[i] = gpu::DeviceMatrix<Scalar>::fromHost(As[i]);
+    d_Bs[i] = gpu::DeviceMatrix<Scalar>::fromHost(Bs[i]);
+  }
+
+  std::vector<std::unique_ptr<gpu::LLT<Scalar>>> solvers(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    solvers[i] = std::make_unique<gpu::LLT<Scalar>>();
+  }
+
+  for (auto _ : state) {
+    // Launch all compute + solve.
+    std::vector<gpu::DeviceMatrix<Scalar>> d_Xs(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      solvers[i]->compute(d_As[i]);
+      d_Xs[i] = solvers[i]->solve(d_Bs[i]);
+    }
+
+    // Enqueue all async downloads.
+    std::vector<gpu::HostTransfer<Scalar>> transfers;
+    transfers.reserve(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      transfers.push_back(d_Xs[i].toHostAsync());
+    }
+
+    // Collect all results.
+    for (int i = 0; i < batch_size; ++i) {
+      benchmark::DoNotOptimize(transfers[i].get().data());
+    }
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["streams"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// CPU baseline: Eigen LLT on host, sequential
+// --------------------------------------------------------------------------
+
+static void BM_Batch_CPU(benchmark::State& state) {
+  const Index n = state.range(0);
+  const int batch_size = static_cast<int>(state.range(1));
+
+  std::vector<Mat> As(batch_size);
+  std::vector<Mat> Bs(batch_size);
+  for (int i = 0; i < batch_size; ++i) {
+    As[i] = make_spd(n);
+    Bs[i] = Mat::Random(n, 1);
+  }
+
+  for (auto _ : state) {
+    std::vector<Mat> results(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      LLT<Mat> llt(As[i]);
+      results[i] = llt.solve(Bs[i]);
+    }
+    benchmark::DoNotOptimize(results.back().data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["batch"] = batch_size;
+  state.counters["total_solves"] = batch_size;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+// Args: {matrix_size, batch_size}
+// Small matrices with large batches are the interesting case for multi-stream.
+BENCHMARK(BM_Batch_Sequential)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_Sequential_Device)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_CPU)->ArgsProduct({{16, 32, 64, 128, 256, 512}, {1, 4, 16, 64}})->Unit(benchmark::kMicrosecond);
+
+// Also run larger sizes with moderate batching.
+BENCHMARK(BM_Batch_MultiStream)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Batch_MultiStream_AsyncDownload)->ArgsProduct({{512, 1024, 2048}, {1, 4, 8}})->Unit(benchmark::kMicrosecond);
+// clang-format on
diff --git a/unsupported/benchmarks/GPU/bench_chaining.cpp b/unsupported/benchmarks/GPU/bench_chaining.cpp
new file mode 100644
index 0000000..323d8b2
--- /dev/null
+++ b/unsupported/benchmarks/GPU/bench_chaining.cpp
@@ -0,0 +1,232 @@
+// GPU chaining benchmarks: measure async pipeline efficiency.
+//
+// Compares:
+//   1. Host round-trip per solve (baseline)
+//   2. DeviceMatrix chaining (no host round-trip between solves)
+//   3. Varying chain lengths (1, 2, 4, 8 consecutive solves)
+//
+// For Nsight Systems: look for gaps between kernel launches in the timeline.
+// Host round-trip creates visible idle gaps; chaining should show back-to-back kernels.
+//
+//   nsys profile --trace=cuda,nvtx ./bench_chaining
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// Baseline: host round-trip between every solve
+// --------------------------------------------------------------------------
+
+static void BM_Chain_HostRoundtrip(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  gpu::LLT<Scalar> llt(A);
+
+  for (auto _ : state) {
+    Mat X = B;
+    for (int i = 0; i < chain_len; ++i) {
+      X = llt.solve(X);  // host → device → host each time
+    }
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// DeviceMatrix chaining: no host round-trip between solves
+// --------------------------------------------------------------------------
+
+static void BM_Chain_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  gpu::LLT<Scalar> llt(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);  // device → device, fully async
+    }
+    Mat X = d_X.toHost();  // single sync at end
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// DeviceMatrix chaining with async download (overlap D2H with next iteration)
+//
+// Double-buffered: each loop body issues iteration N+1's chain *before*
+// draining iteration N's D2H, so the host wait overlaps with on-device
+// compute instead of stalling the pipeline.
+// --------------------------------------------------------------------------
+
+static void BM_Chain_DeviceAsync(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  gpu::LLT<Scalar> llt(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  auto run_chain = [&]() {
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    return d_X.toHostAsync();
+  };
+
+  // Prime the pipeline so each timed iteration overlaps a fresh chain
+  // with the previous iteration's D2H.
+  auto prev = run_chain();
+
+  for (auto _ : state) {
+    auto next = run_chain();  // kick off N+1 while D2H of N is in flight
+    Mat X = prev.get();       // drain N (overlaps with `next` compute)
+    benchmark::DoNotOptimize(X.data());
+    prev = std::move(next);
+  }
+
+  // Drain the trailing transfer outside the timed region.
+  prev.get();
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Pure GPU chain (no download — measures kernel-only throughput)
+// --------------------------------------------------------------------------
+
+static void BM_Chain_DeviceNoDownload(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+  gpu::LLT<Scalar> llt(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    cudaStreamSynchronize(llt.stream());
+    benchmark::DoNotOptimize(d_X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+  state.counters["solves/iter"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Compute + solve chain (full pipeline: factorize, then chain solves)
+// --------------------------------------------------------------------------
+
+static void BM_FullPipeline_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+
+  for (auto _ : state) {
+    gpu::LLT<Scalar> llt(A);
+    Mat X = B;
+    for (int i = 0; i < chain_len; ++i) {
+      X = llt.solve(X);
+    }
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+}
+
+static void BM_FullPipeline_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const int chain_len = static_cast<int>(state.range(1));
+
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, 1);
+
+  for (auto _ : state) {
+    auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+    auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+    gpu::LLT<Scalar> llt;
+    llt.compute(d_A);
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    for (int i = 1; i < chain_len; ++i) {
+      d_X = llt.solve(d_X);
+    }
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["chain"] = chain_len;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+// Args: {matrix_size, chain_length}
+BENCHMARK(BM_Chain_HostRoundtrip)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_DeviceAsync)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_Chain_DeviceNoDownload)->ArgsProduct({{64, 256, 1024, 4096}, {1, 2, 4, 8}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_FullPipeline_Host)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_FullPipeline_Device)->ArgsProduct({{256, 1024, 4096}, {1, 4}})->Unit(benchmark::kMicrosecond);
+// clang-format on
diff --git a/unsupported/benchmarks/GPU/bench_solvers.cpp b/unsupported/benchmarks/GPU/bench_solvers.cpp
new file mode 100644
index 0000000..61d86ce
--- /dev/null
+++ b/unsupported/benchmarks/GPU/bench_solvers.cpp
@@ -0,0 +1,296 @@
+// GPU solver benchmarks: gpu::LLT and gpu::LU compute + solve throughput.
+//
+// Measures factorization and solve performance for the host-matrix and
+// DeviceMatrix code paths across a range of matrix sizes.
+//
+// For Nsight Systems profiling:
+//   nsys profile --trace=cuda,nvtx ./bench_solvers
+//
+// For Nsight Compute kernel analysis:
+//   ncu --set full -o profile ./bench_solvers --benchmark_filter=BM_GpuLLT_Compute/4096
+
+#include <benchmark/benchmark.h>
+
+#include <Eigen/Cholesky>
+#include <Eigen/LU>
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+#ifndef SCALAR
+#define SCALAR double
+#endif
+
+using Scalar = SCALAR;
+using Mat = Matrix<Scalar, Dynamic, Dynamic>;
+
+// --------------------------------------------------------------------------
+// Helpers
+// --------------------------------------------------------------------------
+
+static Mat make_spd(Index n) {
+  Mat M = Mat::Random(n, n);
+  return M.adjoint() * M + Mat::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+// CUDA warm-up: ensure the GPU is initialized before timing.
+static void cuda_warmup() {
+  static bool done = false;
+  if (!done) {
+    void* p;
+    cudaMalloc(&p, 1);
+    cudaFree(p);
+    done = true;
+  }
+}
+
+// --------------------------------------------------------------------------
+// GpuLLT benchmarks
+// --------------------------------------------------------------------------
+
+// Factorize from host matrix (includes H2D upload).
+static void BM_GpuLLT_Compute_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  gpu::LLT<Scalar> llt;
+
+  for (auto _ : state) {
+    llt.compute(A);
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Factorize from DeviceMatrix (D2D copy path).
+static void BM_GpuLLT_Compute_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  gpu::LLT<Scalar> llt;
+
+  for (auto _ : state) {
+    llt.compute(d_A);
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Factorize from DeviceMatrix (move path, no copy).
+static void BM_GpuLLT_Compute_DeviceMove(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  gpu::LLT<Scalar> llt;
+
+  for (auto _ : state) {
+    auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+    llt.compute(std::move(d_A));
+    if (llt.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// Solve from host matrix (H2D + potrs + D2H).
+static void BM_GpuLLT_Solve_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  gpu::LLT<Scalar> llt(A);
+
+  for (auto _ : state) {
+    Mat X = llt.solve(B);
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// Solve from DeviceMatrix (D2D + potrs, async, toHost at end).
+static void BM_GpuLLT_Solve_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  gpu::LLT<Scalar> llt(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// Solve staying entirely on device (no toHost — measures pure GPU time).
+static void BM_GpuLLT_Solve_DeviceOnly(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = make_spd(n);
+  Mat B = Mat::Random(n, nrhs);
+  gpu::LLT<Scalar> llt(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    gpu::DeviceMatrix<Scalar> d_X = llt.solve(d_B);
+    // Force completion without D2H transfer.
+    cudaStreamSynchronize(llt.stream());
+    benchmark::DoNotOptimize(d_X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// --------------------------------------------------------------------------
+// GpuLU benchmarks
+// --------------------------------------------------------------------------
+
+static void BM_GpuLU_Compute_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  gpu::LU<Scalar> lu;
+
+  for (auto _ : state) {
+    lu.compute(A);
+    if (lu.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_GpuLU_Compute_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  gpu::LU<Scalar> lu;
+
+  for (auto _ : state) {
+    lu.compute(d_A);
+    if (lu.info() != Success) state.SkipWithError("factorization failed");
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_GpuLU_Solve_Host(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+  gpu::LU<Scalar> lu(A);
+
+  for (auto _ : state) {
+    Mat X = lu.solve(B);
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+static void BM_GpuLU_Solve_Device(benchmark::State& state) {
+  cuda_warmup();
+  const Index n = state.range(0);
+  const Index nrhs = state.range(1);
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+  gpu::LU<Scalar> lu(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  for (auto _ : state) {
+    gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+    Mat X = d_X.toHost();
+    benchmark::DoNotOptimize(X.data());
+  }
+
+  state.counters["n"] = n;
+  state.counters["nrhs"] = nrhs;
+}
+
+// --------------------------------------------------------------------------
+// CPU baselines for comparison
+// --------------------------------------------------------------------------
+
+static void BM_CpuLLT_Compute(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat A = make_spd(n);
+  LLT<Mat> llt;
+
+  for (auto _ : state) {
+    llt.compute(A);
+    benchmark::DoNotOptimize(llt.matrixLLT().data());
+  }
+
+  double flops = static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n) / 3.0;
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+static void BM_CpuLU_Compute(benchmark::State& state) {
+  const Index n = state.range(0);
+  Mat A = Mat::Random(n, n);
+  PartialPivLU<Mat> lu;
+
+  for (auto _ : state) {
+    lu.compute(A);
+    benchmark::DoNotOptimize(lu.matrixLU().data());
+  }
+
+  double flops = 2.0 / 3.0 * static_cast<double>(n) * static_cast<double>(n) * static_cast<double>(n);
+  state.counters["GFLOPS"] =
+      benchmark::Counter(flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::kIs1000);
+  state.counters["n"] = n;
+}
+
+// --------------------------------------------------------------------------
+// Registration
+// --------------------------------------------------------------------------
+
+// clang-format off
+BENCHMARK(BM_GpuLLT_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Compute_DeviceMove)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLLT_Solve_DeviceOnly)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_GpuLU_Compute_Host)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Compute_Device)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Solve_Host)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_GpuLU_Solve_Device)->ArgsProduct({{64, 256, 1024, 4096}, {1, 16}})->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_CpuLLT_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+BENCHMARK(BM_CpuLU_Compute)->ArgsProduct({{64, 128, 256, 512, 1024, 2048, 4096}})->Unit(benchmark::kMicrosecond);
+// clang-format on
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index a521b85..e7d628f 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -287,6 +287,8 @@
   ei_add_test(tensor_random_gpu)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+  add_subdirectory(GPU)
 endif()
 
 # Add HIP specific tests
diff --git a/unsupported/test/GPU/CMakeLists.txt b/unsupported/test/GPU/CMakeLists.txt
new file mode 100644
index 0000000..02d6832
--- /dev/null
+++ b/unsupported/test/GPU/CMakeLists.txt
@@ -0,0 +1,73 @@
+# GPU module tests. Included from unsupported/test/CMakeLists.txt inside the
+# if(CUDA_FOUND AND EIGEN_TEST_CUDA) block.
+
+# CUDA::cudart_static is linked unconditionally below, so the toolkit is required.
+find_package(CUDAToolkit REQUIRED)
+
+# ei_add_gpu_test(<name> [EXTRA_LIBS <lib>...] [EXTRA_INCLUDES <dir>...] [EXTRA_DEFINES <def>...])
+#
+# Thin wrapper around ei_add_test for GPU module tests. Delegates to
+# ei_add_test so CALL_SUBTEST_N splitting and EIGEN_TEST_MAX_SIZE handling
+# work the same way as the rest of the test suite, then layers on the
+# GPU-specific properties (EXTRA_{LIBS,INCLUDES,DEFINES}, the
+# "Official;gpu" labels, SKIP_RETURN_CODE=77, buildtests_gpu dependency).
+#
+# GPU module tests are plain .cpp (host-compiled) by design: they
+# orchestrate NVIDIA library calls and never launch kernels themselves,
+# which avoids NVCC instantiating Eigen CPU packet ops for CUDA vector
+# types. That keeps them outside ei_add_test's "is_gpu_test" path (which
+# triggers on EIGEN_ADD_TEST_FILENAME_EXTENSION=cu), so we set the
+# SKIP_RETURN_CODE and labels ourselves.
+function(ei_add_gpu_test test_name)
+  cmake_parse_arguments(ARG "" "" "EXTRA_LIBS;EXTRA_INCLUDES;EXTRA_DEFINES" ${ARGN})
+
+  # Delegate to ei_add_test. The third arg is a semicolon-separated library
+  # list; CUDA::cudart_static is always needed for any host test that
+  # touches a device-allocated buffer.
+  ei_add_test(${test_name} "" "CUDA::cudart_static;${ARG_EXTRA_LIBS}")
+
+  # Discover the targets ei_add_test created. The scan mirrors ei_add_test's
+  # own, so we see the same set of suffixes: with EIGEN_SPLIT_LARGE_TESTS,
+  # one target per distinct CALL_SUBTEST_N; otherwise a single target.
+  file(READ "${test_name}.cpp" _src)
+  string(REGEX MATCHALL "CALL_SUBTEST_[0-9]+" _matches "${_src}")
+  string(REGEX REPLACE "CALL_SUBTEST_" "" _suffixes "${_matches}")
+  list(REMOVE_DUPLICATES _suffixes)
+  set(_targets)
+  if(EIGEN_SPLIT_LARGE_TESTS AND _suffixes)
+    foreach(s ${_suffixes})
+      list(APPEND _targets "${test_name}_${s}")
+    endforeach()
+  else()
+    list(APPEND _targets "${test_name}")
+  endif()
+
+  foreach(t ${_targets})
+    # CUDAToolkit_INCLUDE_DIRS comes from find_package(CUDAToolkit) above;
+    # prefer it over the legacy FindCUDA variable CUDA_TOOLKIT_ROOT_DIR.
+    target_include_directories(${t} PRIVATE
+      ${CUDAToolkit_INCLUDE_DIRS}
+      "${CMAKE_CURRENT_BINARY_DIR}"
+      ${ARG_EXTRA_INCLUDES})
+    if(ARG_EXTRA_DEFINES)
+      target_compile_definitions(${t} PRIVATE ${ARG_EXTRA_DEFINES})
+    endif()
+    add_dependencies(buildtests_gpu ${t})
+    set_property(TEST ${t} APPEND PROPERTY LABELS "Official;gpu")
+    set_property(TEST ${t} PROPERTY SKIP_RETURN_CODE 77)
+  endforeach()
+endfunction()
+
+# DeviceMatrix core: CUDA runtime only.
+ei_add_gpu_test(device_matrix)
+
+# cuBLAS integration (BLAS-3, triangular solves, rank-k updates, ...). The
+# cublas test also exercises LLT/LU solve expressions, so it links (and must
+# therefore guard on) CUDA::cusolver as well.
+option(EIGEN_TEST_CUBLAS "Test cuBLAS integration" ON)
+if(EIGEN_TEST_CUBLAS AND TARGET CUDA::cublas AND TARGET CUDA::cusolver)
+  ei_add_gpu_test(cublas EXTRA_LIBS CUDA::cublas CUDA::cusolver)
+endif()
+
+# cuSOLVER dense factorizations (LLT, LU).
+option(EIGEN_TEST_CUSOLVER "Test cuSOLVER integration" ON)
+if(EIGEN_TEST_CUSOLVER AND TARGET CUDA::cusolver)
+  foreach(_cusolver_test IN ITEMS cusolver_llt cusolver_lu)
+    ei_add_gpu_test(${_cusolver_test} EXTRA_LIBS CUDA::cusolver CUDA::cublas)
+  endforeach()
+endif()
diff --git a/unsupported/test/GPU/cublas.cpp b/unsupported/test/GPU/cublas.cpp
new file mode 100644
index 0000000..7c04eb2
--- /dev/null
+++ b/unsupported/test/GPU/cublas.cpp
@@ -0,0 +1,739 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for cuBLAS dispatch via DeviceMatrix expression syntax.
+// Covers GEMM variants (plain, adjoint, transpose, scaled, +=, -=,
+// .device(ctx)), solver expressions (llt()/lu().solve), TRSM, SYMM/HEMM,
+// and SYRK/HERK.
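+//
+// Every test follows the same pattern: run an expression on device, mirror it
+// on the host with plain Eigen as the oracle, and compare, e.g.
+//   d_C = d_A.adjoint() * d_B;    // device (dispatched to cuBLAS)
+//   Mat C_ref = A.adjoint() * B;  // host oracle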
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Basic GEMM: C = A * B -------------------------------------------------
+
+template <typename Scalar>
+void test_gemm_basic(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  // Expression: d_C = d_A * d_B
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with adjoint: C = A^H * B ----------------------------------------
+
+template <typename Scalar>
+void test_gemm_adjoint_lhs(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(k, m);  // A is k×m, A^H is m×k
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A.adjoint() * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A.adjoint() * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with transpose: C = A * B^T --------------------------------------
+
+template <typename Scalar>
+void test_gemm_transpose_rhs(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(n, k);  // B is n×k, B^T is k×n
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B.transpose();
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B.transpose();
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with view/view operands: C = alpha * A^T * B^H -------------------
+
+template <typename Scalar>
+void test_gemm_view_pairs(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(k, m);  // A^T is m x k
+  Mat B = Mat::Random(n, k);  // B^H is k x n
+  Scalar alpha = Scalar(2);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = (alpha * d_A.transpose()) * d_B.adjoint();
+
+  Mat C = d_C.toHost();
+  Mat C_ref = (alpha * A.transpose()) * B.adjoint();
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with scaled: C = alpha * A * B ------------------------------------
+
+template <typename Scalar>
+void test_gemm_scaled(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Scalar alpha = Scalar(2.5);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = alpha * d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = alpha * A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM accumulate: C += A * B (beta=1) -----------------------------------
+
+template <typename Scalar>
+void test_gemm_accumulate(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Mat C_init = Mat::Random(m, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  auto d_C = gpu::DeviceMatrix<Scalar>::fromHost(C_init);
+
+  d_C += d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = C_init + A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM accumulate into empty destination ---------------------------------
+
+template <typename Scalar>
+void test_gemm_accumulate_empty(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  gpu::DeviceMatrix<Scalar> d_C;
+
+  d_C += d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM subtract: C -= A * B (beta=1, alpha=-1) --------------------------
+
+template <typename Scalar>
+void test_gemm_subtract(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Mat C_init = Mat::Random(m, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  auto d_C = gpu::DeviceMatrix<Scalar>::fromHost(C_init);
+
+  gpu::Context ctx;
+  d_C.device(ctx) -= d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = C_init - A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM subtract from empty destination -----------------------------------
+
+template <typename Scalar>
+void test_gemm_subtract_empty(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx) -= d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = -(A * B);
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM with scaled RHS: C = A * (alpha * B) -----------------------------
+
+template <typename Scalar>
+void test_gemm_scaled_rhs(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+  Scalar alpha = Scalar(3.0);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A * (alpha * d_B);
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * (alpha * B);
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM dimension mismatch (intentionally untested) ------------------------
+// A mismatched product should trip an eigen_assert inside dispatch, but we do
+// NOT verify that with VERIFY_RAISES_ASSERT: it relies on setjmp/longjmp,
+// which skips the RAII destructors for DeviceMatrix (GPU memory) and the
+// cuBLAS/cuSOLVER handles, corrupting state for subsequent tests.
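+//
+// For reference, the test we deliberately do not write would look like this
+// (hypothetical sketch; the assert fires inside dispatch):
+//   gpu::DeviceMatrix<float> d_A(4, 8), d_B(4, 8), d_C;
+//   d_C = d_A * d_B;  // inner dimensions 8 vs 4 -> eigen_assert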
+
+// ---- GEMM with explicit gpu::Context ------------------------------------------
+
+template <typename Scalar>
+void test_gemm_explicit_context(Index m, Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(m, k);
+  Mat B = Mat::Random(k, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx) = d_A * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM cross-context reuse of the same destination -----------------------
+
+template <typename Scalar>
+void test_gemm_cross_context_reuse(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, n);
+  Mat D = Mat::Random(n, n);
+  Mat E = Mat::Random(n, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D);
+  auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E);
+
+  gpu::Context ctx1;
+  gpu::Context ctx2;
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx1) = d_A * d_B;
+  d_C.device(ctx2) += d_D * d_E;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B + D * E;
+
+  RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM cross-context resize of the destination ---------------------------
+
+template <typename Scalar>
+void test_gemm_cross_context_resize() {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(64, 64);
+  Mat B = Mat::Random(64, 64);
+  Mat D = Mat::Random(32, 16);
+  Mat E = Mat::Random(16, 8);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D);
+  auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E);
+
+  gpu::Context ctx1;
+  gpu::Context ctx2;
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C.device(ctx1) = d_A * d_B;
+  d_C.device(ctx2) = d_D * d_E;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = D * E;
+
+  RealScalar tol = RealScalar(16) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- GEMM chaining: C = (A * B) then D = C * E -----------------------------
+
+template <typename Scalar>
+void test_gemm_chain(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, n);
+  Mat E = Mat::Random(n, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+  auto d_E = gpu::DeviceMatrix<Scalar>::fromHost(E);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_B;
+  gpu::DeviceMatrix<Scalar> d_D;
+  d_D = d_C * d_E;
+
+  Mat D = d_D.toHost();
+  Mat D_ref = (A * B) * E;
+
+  RealScalar tol = RealScalar(2) * RealScalar(n) * NumTraits<Scalar>::epsilon() * D_ref.norm();
+  VERIFY((D - D_ref).norm() < tol);
+}
+
+// ---- Square identity check: A * I = A ---------------------------------------
+
+template <typename Scalar>
+void test_gemm_identity(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = Mat::Random(n, n);
+  Mat eye = Mat::Identity(n, n);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_I = gpu::DeviceMatrix<Scalar>::fromHost(eye);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A * d_I;
+
+  Mat C = d_C.toHost();
+  VERIFY_IS_APPROX(C, A);
+}
+
+// ---- LLT solve expression: d_X = d_A.llt().solve(d_B) ----------------------
+
+template <typename MatrixType>
+MatrixType make_spd(Index n) {
+  using Scalar = typename MatrixType::Scalar;
+  MatrixType M = MatrixType::Random(n, n);
+  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+template <typename Scalar>
+void test_llt_solve_expr(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.llt().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- LLT solve with explicit context ----------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_expr_context(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X.device(ctx) = d_A.llt().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- LU solve expression: d_X = d_A.lu().solve(d_B) ------------------------
+
+template <typename Scalar>
+void test_lu_solve_expr(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.lu().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- GEMM + solver chain: C = A * B, X = C.llt().solve(D) ------------------
+
+template <typename Scalar>
+void test_gemm_then_solve(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat D = Mat::Random(n, 1);
+
+  // Make SPD: C = A^H * A + n*I
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A.adjoint() * d_A;
+
+  // Add n*I on host (no element-wise ops on DeviceMatrix yet).
+  Mat C = d_C.toHost();
+  C += Mat::Identity(n, n) * static_cast<Scalar>(n);
+  d_C = gpu::DeviceMatrix<Scalar>::fromHost(C);
+
+  auto d_D = gpu::DeviceMatrix<Scalar>::fromHost(D);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_C.llt().solve(d_D);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (C * X - D).norm() / D.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- LLT solve with Upper triangle -----------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_upper(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.template llt<Upper>().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- LU solve with explicit context -----------------------------------------
+
+template <typename Scalar>
+void test_lu_solve_expr_context(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::Context ctx;
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X.device(ctx) = d_A.lu().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Zero-nrhs solver expressions ------------------------------------------
+
+template <typename Scalar>
+void test_llt_solve_zero_nrhs(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = make_spd<Mat>(n);
+  Mat B = Mat::Random(n, 0);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.llt().solve(d_B);
+
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), 0);
+}
+
+template <typename Scalar>
+void test_lu_solve_zero_nrhs(Index n) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+
+  Mat A = Mat::Random(n, n);
+  Mat B = Mat::Random(n, 0);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.lu().solve(d_B);
+
+  VERIFY_IS_EQUAL(d_X.rows(), n);
+  VERIFY_IS_EQUAL(d_X.cols(), 0);
+}
+
+// ---- TRSM: triangularView<UpLo>().solve(B) ----------------------------------
+
+template <typename Scalar, int UpLo>
+void test_trsm(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  // Build a well-conditioned triangular matrix.
+  Mat A = Mat::Random(n, n);
+  A.diagonal().array() += static_cast<Scalar>(n);  // ensure non-singular
+  if (UpLo == Lower)
+    A = A.template triangularView<Lower>();
+  else
+    A = A.template triangularView<Upper>();
+
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_X;
+  d_X = d_A.template triangularView<UpLo>().solve(d_B);
+
+  Mat X = d_X.toHost();
+  RealScalar residual = (A * X - B).norm() / B.norm();
+  VERIFY(residual < RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- SYMM/HEMM: selfadjointView<UpLo>() * B --------------------------------
+
+template <typename Scalar, int UpLo>
+void test_symm(Index n, Index nrhs) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = make_spd<Mat>(n);  // SPD is also self-adjoint
+  Mat B = Mat::Random(n, nrhs);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C = d_A.template selfadjointView<UpLo>() * d_B;
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * B;  // A is self-adjoint (Hermitian when complex), so full multiply == SYMM/HEMM
+
+  RealScalar tol = RealScalar(n) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C - C_ref).norm() < tol);
+}
+
+// ---- SYRK/HERK: rankUpdate(A) → C = A * A^H --------------------------------
+
+template <typename Scalar>
+void test_syrk(Index n, Index k) {
+  using Mat = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  Mat A = Mat::Random(n, k);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+
+  gpu::DeviceMatrix<Scalar> d_C;
+  d_C.template selfadjointView<Lower>().rankUpdate(d_A);
+
+  Mat C = d_C.toHost();
+  Mat C_ref = A * A.adjoint();
+
+  // Only the lower triangle is meaningful for SYRK, so compare triangles only.
+  Mat C_lower = C.template triangularView<Lower>();
+  Mat C_ref_lower = C_ref.template triangularView<Lower>();
+
+  RealScalar tol = RealScalar(k) * NumTraits<Scalar>::epsilon() * C_ref.norm();
+  VERIFY((C_lower - C_ref_lower).norm() < tol);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_gemm_basic<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(128, 64, 32));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(1, 1, 1));
+  CALL_SUBTEST(test_gemm_basic<Scalar>(256, 256, 256));
+
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_adjoint_lhs<Scalar>(128, 32, 64));
+
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_transpose_rhs<Scalar>(128, 32, 64));
+  CALL_SUBTEST(test_gemm_view_pairs<Scalar>(64, 32, 48));
+
+  CALL_SUBTEST(test_gemm_scaled<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_scaled_rhs<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_accumulate_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_subtract_empty<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_explicit_context<Scalar>(64, 64, 64));
+  CALL_SUBTEST(test_gemm_cross_context_reuse<Scalar>(64));
+  CALL_SUBTEST(test_gemm_cross_context_resize<Scalar>());
+  CALL_SUBTEST(test_gemm_chain<Scalar>(64));
+  CALL_SUBTEST(test_gemm_identity<Scalar>(64));
+
+  // Solver expressions. The zero-nrhs cases only check result shapes, since a
+  // residual test is meaningless for an empty RHS.
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_llt_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_upper<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 1));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(64, 4));
+  CALL_SUBTEST(test_lu_solve_expr<Scalar>(256, 8));
+  CALL_SUBTEST(test_lu_solve_expr_context<Scalar>(64, 4));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_llt_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(64));
+  CALL_SUBTEST(test_lu_solve_zero_nrhs<Scalar>(0));
+  CALL_SUBTEST(test_gemm_then_solve<Scalar>(64));
+
+  // TRSM
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 1)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_trsm<Scalar, Lower>(256, 8)));
+
+  // SYMM/HEMM
+  CALL_SUBTEST((test_symm<Scalar, Lower>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Upper>(64, 4)));
+  CALL_SUBTEST((test_symm<Scalar, Lower>(128, 8)));
+
+  // SYRK/HERK
+  CALL_SUBTEST(test_syrk<Scalar>(64, 64));
+  CALL_SUBTEST(test_syrk<Scalar>(64, 32));
+  CALL_SUBTEST(test_syrk<Scalar>(128, 64));
+}
+
+// ---- Solver failure mode tests (not templated on Scalar) --------------------
+// Use the cached GpuLLT/GpuLU API which reports failure via info() rather than
+// the expression API which asserts inside dispatch (incompatible with
+// VERIFY_RAISES_ASSERT due to longjmp skipping RAII destructors).
+
+void test_llt_not_spd() {
+  // Negative definite matrix — LLT factorization must fail.
+  MatrixXd A = -MatrixXd::Identity(8, 8);
+  gpu::LLT<double> llt(A);
+  VERIFY_IS_EQUAL(llt.info(), NumericalIssue);
+}
+
+void test_lu_singular() {
+  // Zero matrix — LU factorization must detect singularity.
+  MatrixXd A = MatrixXd::Zero(8, 8);
+  gpu::LU<double> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
+}
+
+EIGEN_DECLARE_TEST(gpu_cublas) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_llt_not_spd());
+  CALL_SUBTEST_5(test_lu_singular());
+}
diff --git a/unsupported/test/GPU/cusolver_llt.cpp b/unsupported/test/GPU/cusolver_llt.cpp
new file mode 100644
index 0000000..011c02a
--- /dev/null
+++ b/unsupported/test/GPU/cusolver_llt.cpp
@@ -0,0 +1,220 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuLLT: GPU Cholesky (LL^T) using cuSOLVER.
+// Covers cusolverDnXpotrf (factorization) and cusolverDnXpotrs (solve)
+// for float, double, complex<float>, complex<double>, Lower and Upper.
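+//
+// The host-Matrix overloads exercised first do all transfers internally:
+//   Eigen::gpu::LLT<double, Eigen::Lower> llt(h_A);  // upload + factor
+//   Eigen::MatrixXd h_X = llt.solve(h_B);            // solve + download
+// The DeviceMatrix-native section below covers the explicit-transfer path.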
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <Eigen/Cholesky>
+#include <unsupported/Eigen/GPU>
+
+// Identifier convention throughout this file:
+//   h_ prefix for host-resident Eigen::Matrix values
+//   d_ prefix for device-resident Eigen::gpu::DeviceMatrix values
+// We also keep namespaces explicit (Eigen::, Eigen::gpu::) so the CPU vs GPU
+// path is obvious at every call site.
+
+// Build a random symmetric positive-definite matrix: A = M^H*M + n*I.
+template <typename MatrixType>
+MatrixType make_spd(Eigen::Index n) {
+  using Scalar = typename MatrixType::Scalar;
+  MatrixType M = MatrixType::Random(n, n);
+  return M.adjoint() * M + MatrixType::Identity(n, n) * static_cast<Scalar>(n);
+}
+
+// Test factorization: L*L^H must reconstruct A to within floating-point tolerance.
+template <typename Scalar, int UpLo>
+void test_potrf(Eigen::Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+
+  // GPU factorization under test: factor stays on device.
+  Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  // CPU LLT is the oracle. GpuLLT does not expose the device-resident factor,
+  // so we validate correctness via the CPU-reconstructed matrix.
+  Eigen::LLT<MatrixType, UpLo> cpu_llt(h_A);
+  VERIFY_IS_EQUAL(cpu_llt.info(), Eigen::Success);
+  MatrixType h_A_reconstructed = cpu_llt.reconstructedMatrix();
+
+  RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_A.norm();
+  VERIFY((h_A_reconstructed - h_A).norm() < tol);
+
+  // Cross-check: GPU and CPU solves must agree on the same RHS. `solve(Matrix)`
+  // uploads/downloads internally, so both outputs end up on the host.
+  MatrixType h_b = MatrixType::Random(n, 1);
+  MatrixType h_x_gpu = gpu_llt.solve(h_b);
+  MatrixType h_x_cpu = cpu_llt.solve(h_b);
+  VERIFY((h_x_gpu - h_x_cpu).norm() < tol);
+}
+
+// Test solve: residual ||A*X - B|| / ||B|| must be small.
+template <typename Scalar, int UpLo>
+void test_potrs(Eigen::Index n, Eigen::Index nrhs) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+  MatrixType h_B = MatrixType::Random(n, nrhs);
+
+  Eigen::gpu::LLT<Scalar, UpLo> gpu_llt(h_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  MatrixType h_X = gpu_llt.solve(h_B);
+
+  RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
+  RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon();
+  VERIFY(residual < tol);
+}
+
+// Test that multiple solves against the same factor all produce correct results.
+// This exercises the key design property: L stays on device across calls.
+template <typename Scalar>
+void test_multiple_solves(Eigen::Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+  Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt(h_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  RealScalar tol = RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    MatrixType h_B = MatrixType::Random(n, 3);
+    MatrixType h_X = gpu_llt.solve(h_B);
+    RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
+    VERIFY(residual < tol);
+  }
+}
+
+// Test that GpuLLT correctly detects a non-SPD matrix.
+void test_not_spd() {
+  Eigen::MatrixXd h_A = -Eigen::MatrixXd::Identity(8, 8);  // negative definite
+  Eigen::gpu::LLT<double> gpu_llt(h_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::NumericalIssue);
+}
+
+// ---- DeviceMatrix-native API --------------------------------------------
+// These tests exercise the device-resident path: compute(DeviceMatrix) +
+// solve(DeviceMatrix) -> DeviceMatrix, with the user explicitly managing
+// upload/download. The tests above use the host-Matrix overloads which do
+// the transfers internally; this section covers the no-implicit-transfer
+// surface that keeps data on device across a chain of calls.
+
+// compute(DeviceMatrix) + solve(DeviceMatrix) → toHost
+template <typename Scalar, int UpLo>
+void test_device_matrix_solve(Eigen::Index n, Eigen::Index nrhs) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+  MatrixType h_B = MatrixType::Random(n, nrhs);
+
+  auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
+  auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B);
+
+  Eigen::gpu::LLT<Scalar, UpLo> gpu_llt;
+  gpu_llt.compute(d_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B);
+  MatrixType h_X = d_X.toHost();
+
+  RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
+  VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon());
+}
+
+// compute(DeviceMatrix&&) — move path
+template <typename Scalar>
+void test_device_matrix_move_compute(Eigen::Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+  MatrixType h_B = MatrixType::Random(n, 1);
+
+  auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
+  Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt;
+  gpu_llt.compute(std::move(d_A));
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  // d_A should be empty after move.
+  VERIFY(d_A.empty());
+
+  MatrixType h_X = gpu_llt.solve(h_B);
+  RealScalar residual = (h_A * h_X - h_B).norm() / h_B.norm();
+  VERIFY(residual < RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon());
+}
+
+// Full async chain: compute → solve → solve again with result as RHS → toHost
+template <typename Scalar>
+void test_chaining(Eigen::Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  MatrixType h_A = make_spd<MatrixType>(n);
+  MatrixType h_B = MatrixType::Random(n, 3);
+
+  auto d_A = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_A);
+  auto d_B = Eigen::gpu::DeviceMatrix<Scalar>::fromHost(h_B);
+
+  Eigen::gpu::LLT<Scalar, Eigen::Lower> gpu_llt;
+  gpu_llt.compute(d_A);
+  VERIFY_IS_EQUAL(gpu_llt.info(), Eigen::Success);
+
+  // Chain: solve → use result as RHS for another solve. Everything stays on
+  // device until the final toHost() below; that's the only sync point.
+  Eigen::gpu::DeviceMatrix<Scalar> d_X = gpu_llt.solve(d_B);
+  Eigen::gpu::DeviceMatrix<Scalar> d_Y = gpu_llt.solve(d_X);
+
+  MatrixType h_Y = d_Y.toHost();
+
+  // Verify: Y = A^{-2} * B using CPU oracle.
+  MatrixType h_X_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_B);
+  MatrixType h_Y_ref = Eigen::LLT<MatrixType, Eigen::Lower>(h_A).solve(h_X_ref);
+
+  RealScalar tol = RealScalar(4) * RealScalar(n) * Eigen::NumTraits<Scalar>::epsilon() * h_Y_ref.norm();
+  VERIFY((h_Y - h_Y_ref).norm() < tol);
+}
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(1)));
+  CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(64)));
+  CALL_SUBTEST((test_potrf<Scalar, Eigen::Lower>(256)));
+  CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(64)));
+  CALL_SUBTEST((test_potrf<Scalar, Eigen::Upper>(256)));
+
+  CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 1)));
+  CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(64, 4)));
+  CALL_SUBTEST((test_potrs<Scalar, Eigen::Lower>(256, 8)));
+  CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(64, 1)));
+  CALL_SUBTEST((test_potrs<Scalar, Eigen::Upper>(256, 4)));
+
+  CALL_SUBTEST(test_multiple_solves<Scalar>(128));
+
+  CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Lower>(64, 4)));
+  CALL_SUBTEST((test_device_matrix_solve<Scalar, Eigen::Upper>(128, 1)));
+  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
+  CALL_SUBTEST(test_chaining<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_llt) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_not_spd());
+}
diff --git a/unsupported/test/GPU/cusolver_lu.cpp b/unsupported/test/GPU/cusolver_lu.cpp
new file mode 100644
index 0000000..08bfde8
--- /dev/null
+++ b/unsupported/test/GPU/cusolver_lu.cpp
@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Eigen Authors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for GpuLU: GPU partial-pivoting LU decomposition via cuSOLVER.
+// Covers cusolverDnXgetrf (factorization) and cusolverDnXgetrs (solve)
+// for float, double, complex<float>, complex<double>.
+//
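+// Typical usage exercised below (both API surfaces; names match the tests):
+//   gpu::LU<double> lu(A);             // host overload: upload + factor
+//   MatrixXd X = lu.solve(B);          //   solve + download internally
+//   lu.compute(d_A);                   // device-resident: explicit transfers
+//   gpu::DeviceMatrix<double> d_X = lu.solve(d_B);
+//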
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Test factorization + NoTrans solve: residual ||A*X - B|| / ||B|| -------
+
+template <typename Scalar>
+void test_getrf(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 4);
+
+  gpu::LU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  MatrixType X = lu.solve(B);
+  // Backward error bound for LU: ||A*X - B|| <= O(n*u) * ||A|| * ||X||.
+  // Normalize by ||A||*||X|| rather than ||B|| to be condition-number agnostic.
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Test solve: A^T*X = B and A^H*X = B ------------------------------------
+
+template <typename Scalar>
+void test_getrs_trans(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 3);
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+
+  gpu::LU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  MatrixType Xt = lu.solve(B, gpu::GpuOp::Trans);
+  VERIFY((A.transpose() * Xt - B).norm() / (A.norm() * Xt.norm()) < tol);
+
+  MatrixType Xc = lu.solve(B, gpu::GpuOp::ConjTrans);
+  VERIFY((A.adjoint() * Xc - B).norm() / (A.norm() * Xc.norm()) < tol);
+}
+
+// ---- Test multiple solves reuse the device-resident LU ----------------------
+
+template <typename Scalar>
+void test_multiple_solves(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  gpu::LU<Scalar> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  for (int k = 0; k < 5; ++k) {
+    MatrixType B = MatrixType::Random(n, 3);
+    MatrixType X = lu.solve(B);
+    VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
+  }
+}
+
+// ---- Residual check for the host-Matrix solve overload -----------------------
+
+template <typename Scalar>
+void test_host_solve_residual(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 5);
+
+  gpu::LU<Scalar> gpu_lu(A);
+  VERIFY_IS_EQUAL(gpu_lu.info(), Success);
+
+  MatrixType X_gpu = gpu_lu.solve(B);
+
+  RealScalar residual = (A * X_gpu - B).norm() / (A.norm() * X_gpu.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+// ---- Singular matrix detection ----------------------------------------------
+
+void test_singular() {
+  MatrixXd A = MatrixXd::Zero(8, 8);
+  gpu::LU<double> lu(A);
+  VERIFY_IS_EQUAL(lu.info(), NumericalIssue);
+}
+
+// ---- DeviceMatrix integration tests -----------------------------------------
+
+template <typename Scalar>
+void test_device_matrix_solve(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 4);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::LU<Scalar> lu;
+  lu.compute(d_A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+  MatrixType X = d_X.toHost();
+
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+template <typename Scalar>
+void test_device_matrix_move_compute(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 1);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  gpu::LU<Scalar> lu;
+  lu.compute(std::move(d_A));
+  VERIFY_IS_EQUAL(lu.info(), Success);
+  VERIFY(d_A.empty());
+
+  MatrixType X = lu.solve(B);
+  RealScalar residual = (A * X - B).norm() / (A.norm() * X.norm());
+  VERIFY(residual < RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon());
+}
+
+template <typename Scalar>
+void test_chaining(Index n) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  MatrixType A = MatrixType::Random(n, n);
+  MatrixType B = MatrixType::Random(n, 3);
+
+  auto d_A = gpu::DeviceMatrix<Scalar>::fromHost(A);
+  auto d_B = gpu::DeviceMatrix<Scalar>::fromHost(B);
+
+  gpu::LU<Scalar> lu;
+  lu.compute(d_A);
+  VERIFY_IS_EQUAL(lu.info(), Success);
+
+  // Chain: solve → use result as RHS
+  gpu::DeviceMatrix<Scalar> d_X = lu.solve(d_B);
+  gpu::DeviceMatrix<Scalar> d_Y = lu.solve(d_X);
+  MatrixType X = d_X.toHost();
+  MatrixType Y = d_Y.toHost();
+
+  RealScalar tol = RealScalar(10) * RealScalar(n) * NumTraits<Scalar>::epsilon();
+  VERIFY((A * X - B).norm() / (A.norm() * X.norm()) < tol);
+  VERIFY((A * Y - X).norm() / (A.norm() * Y.norm()) < tol);
+}
+
+// ---- Per-scalar driver -------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  CALL_SUBTEST(test_getrf<Scalar>(1));
+  CALL_SUBTEST(test_getrf<Scalar>(64));
+  CALL_SUBTEST(test_getrf<Scalar>(256));
+
+  CALL_SUBTEST(test_getrs_trans<Scalar>(64));
+  CALL_SUBTEST(test_getrs_trans<Scalar>(128));
+
+  CALL_SUBTEST(test_multiple_solves<Scalar>(128));
+
+  CALL_SUBTEST(test_host_solve_residual<Scalar>(64));
+  CALL_SUBTEST(test_host_solve_residual<Scalar>(256));
+
+  CALL_SUBTEST(test_device_matrix_solve<Scalar>(64));
+  CALL_SUBTEST(test_device_matrix_move_compute<Scalar>(64));
+  CALL_SUBTEST(test_chaining<Scalar>(64));
+}
+
+EIGEN_DECLARE_TEST(gpu_cusolver_lu) {
+  // Split by scalar so each part compiles in parallel.
+  CALL_SUBTEST_1(test_scalar<float>());
+  CALL_SUBTEST_2(test_scalar<double>());
+  CALL_SUBTEST_3(test_scalar<std::complex<float>>());
+  CALL_SUBTEST_4(test_scalar<std::complex<double>>());
+  CALL_SUBTEST_5(test_singular());
+}
diff --git a/unsupported/test/GPU/device_matrix.cpp b/unsupported/test/GPU/device_matrix.cpp
new file mode 100644
index 0000000..d09e641
--- /dev/null
+++ b/unsupported/test/GPU/device_matrix.cpp
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2026 Rasmus Munk Larsen <rmlarsen@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Tests for DeviceMatrix and HostTransfer: typed RAII GPU memory wrapper.
+// No cuSOLVER dependency — only CUDA runtime.
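+//
+// Core surface, one line each (all exercised in the tests below):
+//   auto dm = gpu::DeviceMatrix<double>::fromHost(h);  // blocking upload
+//   Eigen::MatrixXd back = dm.toHost();                // blocking download
+//   auto t = dm.toHostAsync(stream);                   // async download
+//   Eigen::MatrixXd m = t.get();                       // blocks until ready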
+
+#define EIGEN_USE_GPU
+#include "main.h"
+#include <unsupported/Eigen/GPU>
+
+using namespace Eigen;
+
+// ---- Default construction ---------------------------------------------------
+
+void test_default_construct() {
+  gpu::DeviceMatrix<double> dm;
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), 0);
+  VERIFY_IS_EQUAL(dm.cols(), 0);
+  VERIFY(dm.data() == nullptr);
+  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(0));
+}
+
+// ---- Allocate uninitialized -------------------------------------------------
+
+template <typename Scalar>
+void test_allocate(Index rows, Index cols) {
+  gpu::DeviceMatrix<Scalar> dm(rows, cols);
+  VERIFY(!dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+  VERIFY_IS_EQUAL(dm.outerStride(), rows);
+  VERIFY(dm.data() != nullptr);
+  VERIFY_IS_EQUAL(dm.sizeInBytes(), size_t(rows) * size_t(cols) * sizeof(Scalar));
+}
+
+// ---- fromHost / toHost roundtrip (synchronous) ------------------------------
+
+template <typename Scalar>
+void test_roundtrip(Index rows, Index cols) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host);
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+  VERIFY(!dm.empty());
+
+  MatrixType result = dm.toHost();
+  VERIFY_IS_EQUAL(result.rows(), rows);
+  VERIFY_IS_EQUAL(result.cols(), cols);
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- fromHostAsync / toHostAsync roundtrip (same-stream and cross-stream) ----
+
+template <typename Scalar>
+void test_roundtrip_async(Index rows, Index cols) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  cudaStream_t stream;
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&stream));
+
+  // Async upload from raw pointer.
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, stream);
+  VERIFY_IS_EQUAL(dm.rows(), rows);
+  VERIFY_IS_EQUAL(dm.cols(), cols);
+
+  // Async download via HostTransfer future.
+  auto transfer = dm.toHostAsync(stream);
+
+  // get() blocks and returns the matrix.
+  MatrixType result = transfer.get();
+  VERIFY_IS_APPROX(result, host);
+
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(stream));
+
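+  // Cross-stream variant: upload on one stream, download on another. This
+  // assumes the async transfer API orders the two streams internally; if it
+  // did not, get() below would race with the upload, which is exactly what
+  // this variant guards against.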
+  cudaStream_t producer_stream;
+  cudaStream_t consumer_stream;
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&producer_stream));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamCreate(&consumer_stream));
+
+  auto cross_stream_dm = gpu::DeviceMatrix<Scalar>::fromHostAsync(host.data(), rows, cols, rows, producer_stream);
+  auto cross_stream_transfer = cross_stream_dm.toHostAsync(consumer_stream);
+  MatrixType cross_stream_result = cross_stream_transfer.get();
+  VERIFY_IS_APPROX(cross_stream_result, host);
+
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(consumer_stream));
+  EIGEN_CUDA_RUNTIME_CHECK(cudaStreamDestroy(producer_stream));
+}
+
+// ---- HostTransfer::ready() and idempotent get() -----------------------------
+
+void test_host_transfer_ready() {
+  using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(100, 100);
+
+  auto dm = gpu::DeviceMatrix<double>::fromHost(host);
+  auto transfer = dm.toHostAsync();
+
+  // After get(), ready() must return true.
+  MatrixType result = transfer.get();
+  VERIFY(transfer.ready());
+  VERIFY_IS_APPROX(result, host);
+
+  // get() is idempotent.
+  MatrixType& result2 = transfer.get();
+  VERIFY_IS_APPROX(result2, host);
+}
+
+// ---- HostTransfer move ------------------------------------------------------
+
+void test_host_transfer_move() {
+  using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(50, 50);
+
+  auto dm = gpu::DeviceMatrix<double>::fromHost(host);
+  auto transfer = dm.toHostAsync();
+
+  gpu::HostTransfer<double> moved(std::move(transfer));
+  MatrixType result = moved.get();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- clone() produces independent copy --------------------------------------
+
+template <typename Scalar>
+void test_clone(Index rows, Index cols) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host);
+  auto cloned = dm.clone();
+
+  // Overwrite original with different data.
+  MatrixType other = MatrixType::Random(rows, cols);
+  dm = gpu::DeviceMatrix<Scalar>::fromHost(other);
+
+  // Clone still holds the original data.
+  MatrixType clone_result = cloned.toHost();
+  VERIFY_IS_APPROX(clone_result, host);
+
+  // Original holds the new data.
+  MatrixType dm_result = dm.toHost();
+  VERIFY_IS_APPROX(dm_result, other);
+}
+
+// ---- Move construct ---------------------------------------------------------
+
+template <typename Scalar>
+void test_move_construct(Index rows, Index cols) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host);
+  gpu::DeviceMatrix<Scalar> moved(std::move(dm));
+
+  VERIFY(dm.empty());
+  VERIFY(dm.data() == nullptr);
+
+  VERIFY_IS_EQUAL(moved.rows(), rows);
+  VERIFY_IS_EQUAL(moved.cols(), cols);
+  MatrixType result = moved.toHost();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- Move assign ------------------------------------------------------------
+
+template <typename Scalar>
+void test_move_assign(Index rows, Index cols) {
+  using MatrixType = Eigen::Matrix<Scalar, Dynamic, Dynamic>;
+  MatrixType host = MatrixType::Random(rows, cols);
+
+  auto dm = gpu::DeviceMatrix<Scalar>::fromHost(host);
+  gpu::DeviceMatrix<Scalar> dest;
+  dest = std::move(dm);
+
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dest.rows(), rows);
+  MatrixType result = dest.toHost();
+  VERIFY_IS_APPROX(result, host);
+}
+
+// ---- resize() ---------------------------------------------------------------
+
+void test_resize() {
+  gpu::DeviceMatrix<double> dm(10, 20);
+  VERIFY_IS_EQUAL(dm.rows(), 10);
+  VERIFY_IS_EQUAL(dm.cols(), 20);
+
+  dm.resize(50, 30);
+  VERIFY_IS_EQUAL(dm.rows(), 50);
+  VERIFY_IS_EQUAL(dm.cols(), 30);
+  VERIFY_IS_EQUAL(dm.outerStride(), 50);
+  VERIFY(dm.data() != nullptr);
+
+  // Resize to same dimensions is a no-op.
+  double* ptr_before = dm.data();
+  dm.resize(50, 30);
+  VERIFY(dm.data() == ptr_before);
+}
+
+// ---- Empty / 0x0 matrix -----------------------------------------------------
+
+void test_empty() {
+  using MatrixType = Eigen::Matrix<double, Dynamic, Dynamic>;
+  MatrixType empty_mat(0, 0);
+
+  auto dm = gpu::DeviceMatrix<double>::fromHost(empty_mat);
+  VERIFY(dm.empty());
+  VERIFY_IS_EQUAL(dm.rows(), 0);
+  VERIFY_IS_EQUAL(dm.cols(), 0);
+
+  MatrixType result = dm.toHost();
+  VERIFY_IS_EQUAL(result.rows(), 0);
+  VERIFY_IS_EQUAL(result.cols(), 0);
+}
+
+// ---- Per-scalar driver ------------------------------------------------------
+
+template <typename Scalar>
+void test_scalar() {
+  // Square.
+  CALL_SUBTEST(test_roundtrip<Scalar>(1, 1));
+  CALL_SUBTEST(test_roundtrip<Scalar>(64, 64));
+  CALL_SUBTEST(test_roundtrip<Scalar>(256, 256));
+
+  // Rectangular.
+  CALL_SUBTEST(test_roundtrip<Scalar>(100, 7));
+  CALL_SUBTEST(test_roundtrip<Scalar>(7, 100));
+
+  // Async roundtrip.
+  CALL_SUBTEST(test_roundtrip_async<Scalar>(64, 64));
+  CALL_SUBTEST(test_roundtrip_async<Scalar>(100, 7));
+
+  CALL_SUBTEST(test_clone<Scalar>(64, 64));
+  CALL_SUBTEST(test_move_construct<Scalar>(64, 64));
+  CALL_SUBTEST(test_move_assign<Scalar>(64, 64));
+}
+
+EIGEN_DECLARE_TEST(gpu_device_matrix) {
+  CALL_SUBTEST(test_default_construct());
+  CALL_SUBTEST(test_empty());
+  CALL_SUBTEST(test_resize());
+  CALL_SUBTEST(test_host_transfer_ready());
+  CALL_SUBTEST(test_host_transfer_move());
+  CALL_SUBTEST((test_allocate<float>(100, 50)));
+  CALL_SUBTEST((test_allocate<double>(100, 50)));
+  CALL_SUBTEST(test_scalar<float>());
+  CALL_SUBTEST(test_scalar<double>());
+  CALL_SUBTEST(test_scalar<std::complex<float>>());
+  CALL_SUBTEST(test_scalar<std::complex<double>>());
+}