test/product_sme.cpp - mirror - Git at Google

 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 // SME GEMM kernel tests.
 // Requires compiler flags: -march=armv9.2-a+sme2 -msve-vector-bits=512
 // and the define: -DEIGEN_ARM64_USE_SME

 #include "product.h"

 // Without the right -march flags, __ARM_FEATURE_SME is undefined and
 // EIGEN_VECTORIZE_SME never fires - the test would silently compile
 // against the NEON GEBP kernel and pass, making this a useless no-op.
 // Fail the build instead.
 #if !defined(EIGEN_VECTORIZE_SME)
 #error \
     "product_sme requires the SME backend.  Build with -march=armv9.2-a+sme2 " \
     "-msve-vector-bits=512 -DEIGEN_ARM64_USE_SME (see -DEIGEN_TEST_SME=ON in " \
     "test/CMakeLists.txt for the typical CMake invocation)."
 #endif

 using SmeColMajorMatF = Matrix<float, Dynamic, Dynamic, ColMajor>;
 using SmeRowMajorMatF = Matrix<float, Dynamic, Dynamic, RowMajor>;
 using SmeColMajorStridedMatF = Map<SmeColMajorMatF, 0, Stride<Dynamic, Dynamic>>;
 using SmeRowMajorStridedMatF = Map<SmeRowMajorMatF, 0, Stride<Dynamic, Dynamic>>;

 template <typename InputMat, typename ResultMat, typename ResultMap>
 static void verify_strided_result(int n, ResultMat& storage, const Stride<Dynamic, Dynamic>& stride) {
   InputMat A = InputMat::Random(n, n);
   InputMat B = InputMat::Random(n, n);
   ResultMap C(storage.data(), n, n, stride);
   C = ResultMat::Random(n, n);
   ResultMat c_before = C.eval();

   C.noalias() += A * B;

   ResultMat ref = c_before + (A.lazyProduct(B)).eval();
   ResultMat got = C;
   VERIFY_IS_APPROX(got, ref);
 }

 template <typename InputMat>
 static void test_general_strided_result(int n) {
   // General-stride C path: InputMat selects the source packers, while both C
   // strides are non-unit so sme_store_za_tile uses scalar scatter.
   SmeColMajorMatF storage = SmeColMajorMatF::Zero(2 * n, n);
   verify_strided_result<InputMat, SmeColMajorMatF, SmeColMajorStridedMatF>(
       n, storage, Stride<Dynamic, Dynamic>(/*outer=*/2 * n, /*inner=*/2));

   // Padding rows skipped by the strided Map should not be touched.
   for (int i = 0; i < n; ++i) {
     for (int j = 0; j < n; ++j) {
       VERIFY(storage(2 * i + 1, j) == float(0));
     }
   }
 }

 static void test_rowmajor_strided_result(int n) {
   // RowMajor C path: inner stride is one, with padded columns after the Map.
   SmeRowMajorMatF storage = SmeRowMajorMatF::Zero(n, 2 * n);
   verify_strided_result<SmeRowMajorMatF, SmeRowMajorMatF, SmeRowMajorStridedMatF>(
       n, storage, Stride<Dynamic, Dynamic>(/*outer=*/2 * n, /*inner=*/1));

   // Padding columns skipped by the strided Map should not be touched.
   for (int i = 0; i < n; ++i) {
     for (int j = n; j < 2 * n; ++j) {
       VERIFY(storage(i, j) == float(0));
     }
   }
 }

 static void test_deep_k_split() {
   constexpr int rows = 64;
   constexpr int depth = 2050;
   constexpr int cols = 64;
   SmeColMajorMatF A = SmeColMajorMatF::Random(rows, depth);
   SmeColMajorMatF B = SmeColMajorMatF::Random(depth, cols);
   SmeColMajorMatF C = SmeColMajorMatF::Random(rows, cols);
   SmeColMajorMatF c_before = C;

   C.noalias() += A * B;

   VERIFY_IS_APPROX(C, c_before + (A.lazyProduct(B)).eval());
 }

 EIGEN_DECLARE_TEST(product_sme) {
   // Square edge cases around the 2x2 tile-grid boundaries (MR=NR=32,
   // ZA tiles are 16x16 fp32; sizes near 16, 17, 31, 32, 33, 64, 65
   // exercise the VL-wide intra-tile splits and the MR/NR tails).
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 1)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(15, 15)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(16, 16)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(17, 17)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(31, 31)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(33, 33)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(63, 63)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(64, 64)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(65, 65)));

   // Thin / wide rectangular cases (M x 1, 1 x N)
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(32, 1)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 32)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 64)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(64, 1)));

   // Non-float scalar smoke tests: SME only specializes fp32, so these prove
   // unsupported scalar types still route through the generic product path.
   CALL_SUBTEST_2(product(Matrix<double, Dynamic, Dynamic>(33, 17)));
   CALL_SUBTEST_3(product(Matrix<std::complex<float>, Dynamic, Dynamic>(33, 17)));

   // Non-square cases that exercise tail paths for both M and N
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(17, 65)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(65, 17)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(15, 63)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(33, 7)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(7, 33)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(128, 3)));
   CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(3, 128)));

   // Exercise the kc split path just above the SME heuristic's 2048 depth cap.
   test_deep_k_split();

   // Random sizes
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE),
                                                            internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
   }

   // Exercise the RowMajor packers and RowMajor result path.  When the input
   // MatrixType is RowMajor, product() instantiates m1/m2/m3/res in RowMajor,
   // so every matrix-matrix product in the suite flows through:
   //   - the RowMajor LHS packer (gemm_pack_lhs<..., RowMajor>)
   //   - the RowMajor RHS packer (gemm_pack_rhs<..., RowMajor>)
   //   - the RowMajor-C dispatch in GeneralMatrixMatrix.h (which transposes
   //     the computation: C^T = B^T * A^T).
   CALL_SUBTEST_1(product(SmeRowMajorMatF(15, 15)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(16, 16)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(17, 17)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(31, 31)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(32, 32)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(33, 33)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(64, 64)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(65, 65)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(17, 65)));
   CALL_SUBTEST_1(product(SmeRowMajorMatF(65, 17)));
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(product(
         SmeRowMajorMatF(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
   }

   // Exercise the general-stride branch of sme_store_za_tile: fires when both
   // C_stride_row != 1 and C_stride_col != 1, e.g. a Map<Matrix> with an
   // explicit non-unit inner stride.  product.h never builds such a result, so
   // without this subtest the scalar-scatter path is effectively untested.
   for (int n : {15, 16, 17, 31, 32, 33, 63, 64, 65}) {
     test_general_strided_result<SmeColMajorMatF>(n);
     test_general_strided_result<SmeRowMajorMatF>(n);
     test_rowmajor_strided_result(n);
   }

   // Row-LHS x Row-RHS -> Col-C: the one LHS/RHS/C storage combination that
   // product.h's transpose-style expressions never build directly (it always
   // flips one side of the multiplication).  The code paths are the same as
   // other combinations via Eigen's dispatch, but exercise them explicitly.
   for (int n : {15, 16, 17, 31, 32, 33, 63, 64, 65}) {
     Matrix<float, Dynamic, Dynamic, RowMajor> A = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(n, n);
     Matrix<float, Dynamic, Dynamic, RowMajor> B = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(n, n);
     SmeColMajorMatF C = SmeColMajorMatF::Zero(n, n);
     C.noalias() += A * B;
     VERIFY_IS_APPROX(C, (A.lazyProduct(B)).eval());
   }
 }
	// This file is part of Eigen, a lightweight C++ template library
	// for linear algebra.
	//
	// This Source Code Form is subject to the terms of the Mozilla
	// Public License v. 2.0. If a copy of the MPL was not distributed
	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
	// SPDX-FileCopyrightText: The Eigen Authors
	// SPDX-License-Identifier: MPL-2.0

	// SME GEMM kernel tests.
	// Requires compiler flags: -march=armv9.2-a+sme2 -msve-vector-bits=512
	// and the define: -DEIGEN_ARM64_USE_SME

	#include "product.h"

	// Without the right -march flags, __ARM_FEATURE_SME is undefined and
	// EIGEN_VECTORIZE_SME never fires - the test would silently compile
	// against the NEON GEBP kernel and pass, making this a useless no-op.
	// Fail the build instead.
	#if !defined(EIGEN_VECTORIZE_SME)
	#error \
	"product_sme requires the SME backend. Build with -march=armv9.2-a+sme2 " \
	"-msve-vector-bits=512 -DEIGEN_ARM64_USE_SME (see -DEIGEN_TEST_SME=ON in " \
	"test/CMakeLists.txt for the typical CMake invocation)."
	#endif

	using SmeColMajorMatF = Matrix<float, Dynamic, Dynamic, ColMajor>;
	using SmeRowMajorMatF = Matrix<float, Dynamic, Dynamic, RowMajor>;
	using SmeColMajorStridedMatF = Map<SmeColMajorMatF, 0, Stride<Dynamic, Dynamic>>;
	using SmeRowMajorStridedMatF = Map<SmeRowMajorMatF, 0, Stride<Dynamic, Dynamic>>;

	template <typename InputMat, typename ResultMat, typename ResultMap>
	static void verify_strided_result(int n, ResultMat& storage, const Stride<Dynamic, Dynamic>& stride) {
	InputMat A = InputMat::Random(n, n);
	InputMat B = InputMat::Random(n, n);
	ResultMap C(storage.data(), n, n, stride);
	C = ResultMat::Random(n, n);
	ResultMat c_before = C.eval();

	C.noalias() += A * B;

	ResultMat ref = c_before + (A.lazyProduct(B)).eval();
	ResultMat got = C;
	VERIFY_IS_APPROX(got, ref);
	}

	template <typename InputMat>
	static void test_general_strided_result(int n) {
	// General-stride C path: InputMat selects the source packers, while both C
	// strides are non-unit so sme_store_za_tile uses scalar scatter.
	SmeColMajorMatF storage = SmeColMajorMatF::Zero(2 * n, n);
	verify_strided_result<InputMat, SmeColMajorMatF, SmeColMajorStridedMatF>(
	n, storage, Stride<Dynamic, Dynamic>(/outer=/2 * n, /inner=/2));

	// Padding rows skipped by the strided Map should not be touched.
	for (int i = 0; i < n; ++i) {
	for (int j = 0; j < n; ++j) {
	VERIFY(storage(2 * i + 1, j) == float(0));
	}
	}
	}

	static void test_rowmajor_strided_result(int n) {
	// RowMajor C path: inner stride is one, with padded columns after the Map.
	SmeRowMajorMatF storage = SmeRowMajorMatF::Zero(n, 2 * n);
	verify_strided_result<SmeRowMajorMatF, SmeRowMajorMatF, SmeRowMajorStridedMatF>(
	n, storage, Stride<Dynamic, Dynamic>(/outer=/2 * n, /inner=/1));

	// Padding columns skipped by the strided Map should not be touched.
	for (int i = 0; i < n; ++i) {
	for (int j = n; j < 2 * n; ++j) {
	VERIFY(storage(i, j) == float(0));
	}
	}
	}

	static void test_deep_k_split() {
	constexpr int rows = 64;
	constexpr int depth = 2050;
	constexpr int cols = 64;
	SmeColMajorMatF A = SmeColMajorMatF::Random(rows, depth);
	SmeColMajorMatF B = SmeColMajorMatF::Random(depth, cols);
	SmeColMajorMatF C = SmeColMajorMatF::Random(rows, cols);
	SmeColMajorMatF c_before = C;

	C.noalias() += A * B;

	VERIFY_IS_APPROX(C, c_before + (A.lazyProduct(B)).eval());
	}

	EIGEN_DECLARE_TEST(product_sme) {
	// Square edge cases around the 2x2 tile-grid boundaries (MR=NR=32,
	// ZA tiles are 16x16 fp32; sizes near 16, 17, 31, 32, 33, 64, 65
	// exercise the VL-wide intra-tile splits and the MR/NR tails).
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 1)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(15, 15)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(16, 16)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(17, 17)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(31, 31)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(33, 33)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(63, 63)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(64, 64)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(65, 65)));

	// Thin / wide rectangular cases (M x 1, 1 x N)
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(32, 1)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 32)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(1, 64)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(64, 1)));

	// Non-float scalar smoke tests: SME only specializes fp32, so these prove
	// unsupported scalar types still route through the generic product path.
	CALL_SUBTEST_2(product(Matrix<double, Dynamic, Dynamic>(33, 17)));
	CALL_SUBTEST_3(product(Matrix<std::complex<float>, Dynamic, Dynamic>(33, 17)));

	// Non-square cases that exercise tail paths for both M and N
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(17, 65)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(65, 17)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(15, 63)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(33, 7)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(7, 33)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(128, 3)));
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(3, 128)));

	// Exercise the kc split path just above the SME heuristic's 2048 depth cap.
	test_deep_k_split();

	// Random sizes
	for (int i = 0; i < g_repeat; i++) {
	CALL_SUBTEST_1(product(Matrix<float, Dynamic, Dynamic>(internal::random<int>(1, EIGEN_TEST_MAX_SIZE),
	internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
	}

	// Exercise the RowMajor packers and RowMajor result path. When the input
	// MatrixType is RowMajor, product() instantiates m1/m2/m3/res in RowMajor,
	// so every matrix-matrix product in the suite flows through:
	// - the RowMajor LHS packer (gemm_pack_lhs<..., RowMajor>)
	// - the RowMajor RHS packer (gemm_pack_rhs<..., RowMajor>)
	// - the RowMajor-C dispatch in GeneralMatrixMatrix.h (which transposes
	// the computation: C^T = B^T * A^T).
	CALL_SUBTEST_1(product(SmeRowMajorMatF(15, 15)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(16, 16)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(17, 17)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(31, 31)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(32, 32)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(33, 33)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(64, 64)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(65, 65)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(17, 65)));
	CALL_SUBTEST_1(product(SmeRowMajorMatF(65, 17)));
	for (int i = 0; i < g_repeat; i++) {
	CALL_SUBTEST_1(product(
	SmeRowMajorMatF(internal::random<int>(1, EIGEN_TEST_MAX_SIZE), internal::random<int>(1, EIGEN_TEST_MAX_SIZE))));
	}

	// Exercise the general-stride branch of sme_store_za_tile: fires when both
	// C_stride_row != 1 and C_stride_col != 1, e.g. a Map<Matrix> with an
	// explicit non-unit inner stride. product.h never builds such a result, so
	// without this subtest the scalar-scatter path is effectively untested.
	for (int n : {15, 16, 17, 31, 32, 33, 63, 64, 65}) {
	test_general_strided_result<SmeColMajorMatF>(n);
	test_general_strided_result<SmeRowMajorMatF>(n);
	test_rowmajor_strided_result(n);
	}

	// Row-LHS x Row-RHS -> Col-C: the one LHS/RHS/C storage combination that
	// product.h's transpose-style expressions never build directly (it always
	// flips one side of the multiplication). The code paths are the same as
	// other combinations via Eigen's dispatch, but exercise them explicitly.
	for (int n : {15, 16, 17, 31, 32, 33, 63, 64, 65}) {
	Matrix<float, Dynamic, Dynamic, RowMajor> A = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(n, n);
	Matrix<float, Dynamic, Dynamic, RowMajor> B = Matrix<float, Dynamic, Dynamic, RowMajor>::Random(n, n);
	SmeColMajorMatF C = SmeColMajorMatF::Zero(n, n);
	C.noalias() += A * B;
	VERIFY_IS_APPROX(C, (A.lazyProduct(B)).eval());
	}
	}