benchmarks/Core/bench_segment_tail.cpp - mirror - Git at Google

 // Benchmarks for the masked partial-packet ("segment") tail of vectorized
 // assignments -- the path gated by internal::has_packet_segment<Packet>.
 // See libeigen/eigen#3083 (the regression) and #3086 (segment-tail placement).
 //
 // Part A measures end-to-end assignments whose length is not a multiple of
 // the packet size, so the tail loop is on the hot path:
 //   * fixed-size  -> LinearVectorizedTraversal / CompleteUnrolling
 //                    (compile-time tail count; the #3083 regression shape)
 //   * dynamic     -> LinearVectorizedTraversal / NoUnrolling
 //                    (runtime tail count)
 // Part A's destinations are not read back, so it does NOT exercise the
 // store-to-load forwarding hazard -- see Part C.
 //
 // Part B is a direct A/B of the segment primitives (ploaduSegment /
 // pstoreuSegment) against an equivalent scalar loop, swept over the tail
 // length. This isolates the masked load/store hardware cost (e.g. AVX2
 // vmaskmov) and the store-to-load forwarding stall, and is the data that
 // drives the per-ISA has_packet_segment decision.
 //
 // Part C runs the #3083 regression kernels: chained small fixed-size matrix
 // ops where each result is consumed by the next. A partial-packet tail store
 // then collides with the consumer's packet load -- the store-to-load
 // forwarding hazard that makes a masked tail far more costly than a scalar
 // one. This is the cost Parts A and B cannot see in isolation.
 //
 // The active packet size and the value of has_packet_segment per scalar type
 // are emitted as Google Benchmark custom context, so a captured run is
 // self-describing. To sweep ISAs, build once per target, e.g.:
 //   g++ -O3 -DNDEBUG -std=c++17 -msse4.2      bench_segment_tail.cpp ...
 //   g++ -O3 -DNDEBUG -std=c++17 -mavx         bench_segment_tail.cpp ...
 //   g++ -O3 -DNDEBUG -std=c++17 -mavx2 -mfma  bench_segment_tail.cpp ...
 //   g++ -O3 -DNDEBUG -std=c++17 -march=native bench_segment_tail.cpp ...
 // or, via CMake, configure benchmarks/ with -DCMAKE_CXX_FLAGS=<isa flags>.
 //
 // SPDX-FileCopyrightText: The Eigen Authors
 // SPDX-License-Identifier: MPL-2.0

 #include <benchmark/benchmark.h>

 #include <complex>
 #include <cstdint>
 #include <string>
 #include <tuple>

 #include <Eigen/Core>
 #include <Eigen/LU>

 using namespace Eigen;

 namespace {

 // Capacity of the Part B scratch buffers (Buffers<T>). Must be >= the largest
 // packet size of any benchmarked scalar type, since the primitives load and
 // store a full packet; the static_assert in Buffers enforces it.
 constexpr int kMaxCount = 16;

 // ---------------------------------------------------------------------------
 // Records, per scalar type, the packet size in use and the value of
 // has_packet_segment for the active ISA. Emitted as Google Benchmark custom
 // context so it appears once in the report header and in any --benchmark_out
 // JSON, making the captured baseline self-describing.
 // ---------------------------------------------------------------------------
 template <typename T>
 void add_trait_context(const char* name) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   constexpr bool HasSegment = internal::has_packet_segment<Packet>::value;
   benchmark::AddCustomContext(
       std::string("segment.") + name,
       "PacketSize=" + std::to_string(PacketSize) + " has_packet_segment=" + (HasSegment ? "true" : "false"));
 }

 // ===========================================================================
 // Part A : assignment-level tail benchmarks
 // ===========================================================================

 // Fixed-size column vector: dst = a + b, with N a compile-time constant.
 // When N % PacketSize != 0 the completely-unrolled assignment ends in a
 // compile-time-length segment tail.
 template <typename T, int N>
 void BM_FixedAssign(benchmark::State& state) {
   using Vec = Matrix<T, N, 1>;
   Vec a = Vec::Random();
   Vec b = Vec::Random();
   Vec dst = Vec::Random();
   for (auto _ : state) {
     benchmark::DoNotOptimize(a.data());
     benchmark::DoNotOptimize(b.data());
     dst = a + b;
     benchmark::DoNotOptimize(dst.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(N) * sizeof(T) * 3);
 }

 // Dynamic-size column vector: dst = a + b, with a runtime length that is not a
 // multiple of the packet size, so the NoUnrolling tail loop runs every call.
 template <typename T>
 void BM_DynamicAssign(benchmark::State& state) {
   const Index size = state.range(0);
   Matrix<T, Dynamic, 1> a = Matrix<T, Dynamic, 1>::Random(size);
   Matrix<T, Dynamic, 1> b = Matrix<T, Dynamic, 1>::Random(size);
   Matrix<T, Dynamic, 1> dst = Matrix<T, Dynamic, 1>::Random(size);
   for (auto _ : state) {
     benchmark::DoNotOptimize(a.data());
     benchmark::DoNotOptimize(b.data());
     dst = a + b;
     benchmark::DoNotOptimize(dst.data());
     benchmark::ClobberMemory();
   }
   state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(size) * sizeof(T) * 3);
 }

 // ===========================================================================
 // Part B : segment-primitive A/B micro-benchmarks
 //
 // Each pair (segment vs scalar) is registered over the same count range so the
 // reports line up. count is taken from state.range(0): runtime for the
 // non-CT variants (matches the dynamic tail), compile-time for the *_CT
 // variants (matches the completely-unrolled tail).
 // ===========================================================================

 template <typename T>
 struct Buffers {
   static_assert(kMaxCount >= internal::unpacket_traits<typename internal::packet_traits<T>::type>::size,
                 "kMaxCount is smaller than this type's packet size: the scratch buffers cannot hold a full packet");
   EIGEN_ALIGN_MAX T src[kMaxCount];
   EIGEN_ALIGN_MAX T dst[kMaxCount];
   Buffers() {
     for (int i = 0; i < kMaxCount; ++i) {
       src[i] = T(i + 1);
       dst[i] = T(0);
     }
   }
 };

 // --- runtime-count store: masked segment store vs scalar store loop ---------
 template <typename T>
 void BM_PrimSegmentStore(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   Packet p = internal::ploadu<Packet>(buf.src);
   for (auto _ : state) {
     benchmark::DoNotOptimize(p);
     internal::pstoreuSegment<T, Packet>(buf.dst, p, 0, count);
     benchmark::DoNotOptimize(buf.dst);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 template <typename T>
 void BM_PrimScalarStore(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   for (auto _ : state) {
     benchmark::DoNotOptimize(buf.src);
     for (Index k = 0; k < count; ++k) buf.dst[k] = buf.src[k];
     benchmark::DoNotOptimize(buf.dst);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 // --- runtime-count store+reload: captures store-to-load forwarding stalls ---
 template <typename T>
 void BM_PrimSegmentStoreReload(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   Packet p = internal::ploadu<Packet>(buf.src);
   Packet acc = internal::pset1<Packet>(T(0));
   for (auto _ : state) {
     benchmark::DoNotOptimize(p);
     internal::pstoreuSegment<T, Packet>(buf.dst, p, 0, count);
     benchmark::DoNotOptimize(buf.dst);
     Packet q = internal::ploaduSegment<Packet>(buf.dst, 0, count);
     acc = internal::padd(acc, q);
     benchmark::DoNotOptimize(acc);
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 template <typename T>
 void BM_PrimScalarStoreReload(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   for (auto _ : state) {
     benchmark::DoNotOptimize(buf.src);
     for (Index k = 0; k < count; ++k) buf.dst[k] = buf.src[k];
     benchmark::DoNotOptimize(buf.dst);
     // acc is reset every iteration: a persistent accumulator would grow without
     // bound over the benchmark's iterations -- signed-integer overflow (UB) for
     // integer T. Reset keeps the reload + add live without that.
     T acc = T(0);
     for (Index k = 0; k < count; ++k) acc += buf.dst[k];
     benchmark::DoNotOptimize(acc);
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 // --- overlapping full-width store+reload: one unmasked packet, count-free ---
 // Models the "store a full packet ending at size" tail strategy: valid for a
 // plain assignment when size >= PacketSize. Mask-free, so it is available on
 // every ISA (including SSE/NEON), and a full-width store forwards cleanly to a
 // same-width reload -- unlike a masked store. Cost is independent of the tail
 // length, so this is registered once per type rather than swept over count.
 template <typename T>
 void BM_PrimOverlapStoreReload(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   if (PacketSize <= 1) {
     state.SkipWithError("type not vectorized");
     return;
   }
   Buffers<T> buf;
   Packet p = internal::ploadu<Packet>(buf.src);
   Packet acc = internal::pset1<Packet>(T(0));
   for (auto _ : state) {
     benchmark::DoNotOptimize(p);
     internal::pstoreu<T, Packet>(buf.dst, p);
     benchmark::DoNotOptimize(buf.dst);
     Packet q = internal::ploadu<Packet>(buf.dst);
     acc = internal::padd(acc, q);
     benchmark::DoNotOptimize(acc);
   }
   state.SetItemsProcessed(state.iterations() * PacketSize);
 }

 // --- runtime-count load: masked segment load vs scalar load loop -----------
 template <typename T>
 void BM_PrimSegmentLoad(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   Packet acc = internal::pset1<Packet>(T(0));
   for (auto _ : state) {
     benchmark::DoNotOptimize(buf.src);
     Packet q = internal::ploaduSegment<Packet>(buf.src, 0, count);
     acc = internal::padd(acc, q);
     benchmark::DoNotOptimize(acc);
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 template <typename T>
 void BM_PrimScalarLoad(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   const Index count = state.range(0);
   if (PacketSize <= 1 || count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   for (auto _ : state) {
     benchmark::DoNotOptimize(buf.src);
     // acc is reset every iteration -- see BM_PrimScalarStoreReload.
     T acc = T(0);
     for (Index k = 0; k < count; ++k) acc += buf.src[k];
     benchmark::DoNotOptimize(acc);
   }
   state.SetItemsProcessed(state.iterations() * count);
 }

 // --- compile-time-count store: matches the completely-unrolled tail --------
 // Here the scalar alternative is itself fully unrolled (no loop), which is why
 // it can beat a masked store for tiny tails.
 template <typename T, int Count>
 void BM_PrimSegmentStoreCT(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   if (PacketSize <= 1 || Count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   Packet p = internal::ploadu<Packet>(buf.src);
   for (auto _ : state) {
     benchmark::DoNotOptimize(p);
     internal::pstoreuSegment<T, Packet>(buf.dst, p, 0, Count);
     benchmark::DoNotOptimize(buf.dst);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(state.iterations() * Count);
 }

 template <typename T, int Count>
 void BM_PrimScalarStoreCT(benchmark::State& state) {
   using Packet = typename internal::packet_traits<T>::type;
   constexpr int PacketSize = internal::unpacket_traits<Packet>::size;
   if (PacketSize <= 1 || Count > PacketSize) {
     state.SkipWithError("count exceeds packet size");
     return;
   }
   Buffers<T> buf;
   for (auto _ : state) {
     benchmark::DoNotOptimize(buf.src);
     EIGEN_UNROLL_LOOP
     for (int k = 0; k < Count; ++k) buf.dst[k] = buf.src[k];
     benchmark::DoNotOptimize(buf.dst);
     benchmark::ClobberMemory();
   }
   state.SetItemsProcessed(state.iterations() * Count);
 }

 // ===========================================================================
 // Part C : chained fixed-size kernels (results consumed immediately)
 //
 // The #3083 regression kernels. Unlike Part A, each assignment's result feeds
 // the next op, so a partial-packet tail store is followed by a packet load of
 // the same destination. That store-to-load forwarding hazard is what makes a
 // masked (or overlapping) tail far more costly than a scalar one -- it is only
 // visible when the result is consumed, which these benchmarks do and Part A
 // deliberately does not. These are the kernels that reflect the real cost.
 // ===========================================================================

 // #3083 Example 2: double 3x3 inverse() * t -- the headline regression.
 EIGEN_DONT_INLINE void kernel_inverse(const Vector3d& a, const Vector3d& b, const Vector3d& c, const Matrix3d& R,
                                       const Vector3d& t, Vector3d& result) {
   Matrix3d A;
   A << a, -R * b, c;
   result = A.inverse() * t;
 }

 void BM_Chained_Inverse3x3(benchmark::State& state) {
   Vector3d a = Vector3d::Random(), b = Vector3d::Random(), c = Vector3d::Random(), t = Vector3d::Random(), result;
   Matrix3d R = Matrix3d::Random();
   for (auto _ : state) {
     benchmark::DoNotOptimize(a);
     benchmark::DoNotOptimize(b);
     benchmark::DoNotOptimize(R);
     kernel_inverse(a, b, c, R, t, result);
     benchmark::DoNotOptimize(result);
     benchmark::ClobberMemory();
   }
 }

 // #3083 Example 1: camera kernel chaining Matrix<double,2,3> blocks.
 using Blk23 = Matrix<double, 2, 3, RowMajor>;
 using Blocks5 = std::tuple<Blk23, Blk23, Blk23, Blk23, Blk23>;

 EIGEN_DONT_INLINE void kernel_camera(const Matrix3d& A, const Matrix3d& B, const Vector3d& u, const Vector3d& v,
                                      const Vector3d& w, const Vector3d& x, const Vector4d& params, Vector2d& output,
                                      Blocks5& blocks) {
   const Vector3d a = A * u + v;
   const Vector3d b = B.transpose() * (a - w);
   const Vector3d c = b - x;
   const double iz = 1.0 / c.z();
   const double iz2 = iz * iz;
   const double k0 = params(0), k1 = params(1);
   Blk23 J;
   J << k0 * iz, 0.0, -k0 * c.x() * iz2, 0.0, k1 * iz, -k1 * c.y() * iz2;
   std::get<0>(blocks) = J * B.transpose();
   std::get<1>(blocks) = std::get<0>(blocks) * A;
   std::get<2>(blocks) = -std::get<1>(blocks);
   std::get<3>(blocks) = std::get<0>(blocks) + std::get<1>(blocks);
   std::get<4>(blocks) = std::get<3>(blocks) - std::get<2>(blocks);
   output << k0 * c.x() * iz + params(2), k1 * c.y() * iz + params(3);
 }

 void BM_Chained_Camera(benchmark::State& state) {
   Matrix3d A = Matrix3d::Random(), B = Matrix3d::Random();
   Vector3d u = Vector3d::Random(), v = Vector3d::Random(), w = Vector3d::Random(), x = Vector3d::Random();
   Vector4d params = Vector4d::Random();
   Vector2d output;
   Blocks5 blocks;
   for (auto _ : state) {
     benchmark::DoNotOptimize(A);
     benchmark::DoNotOptimize(B);
     benchmark::DoNotOptimize(u);
     kernel_camera(A, B, u, v, w, x, params, output, blocks);
     benchmark::DoNotOptimize(&blocks);
     benchmark::DoNotOptimize(output);
     benchmark::ClobberMemory();
   }
 }

 // Direct chained Matrix<double,2,3> sums: isolates the 6-element tail with a
 // read-after-write dependency between consecutive assignments.
 void BM_Chained_Block23(benchmark::State& state) {
   Blk23 m0 = Blk23::Random(), m1 = Blk23::Random(), m2 = Blk23::Random(), m3 = Blk23::Random();
   for (auto _ : state) {
     benchmark::DoNotOptimize(m0);
     m1 = m0 + m1;
     m2 = m1 + m2;
     m3 = m2 + m3;
     benchmark::DoNotOptimize(m3);
     benchmark::ClobberMemory();
   }
 }

 // ===========================================================================
 // Registration
 //
 // Done programmatically (rather than via the BENCHMARK macro) so that Part B's
 // tail count is clamped to the packet size actually in use: the report then
 // contains no inapplicable count > PacketSize entries, whatever ISA the source
 // was compiled for.
 // ===========================================================================

 template <typename T>
 const char* type_tag();
 template <>
 const char* type_tag<float>() {
   return "f32";
 }
 template <>
 const char* type_tag<double>() {
   return "f64";
 }
 template <>
 const char* type_tag<int32_t>() {
   return "i32";
 }
 template <>
 const char* type_tag<int64_t>() {
   return "i64";
 }
 template <>
 const char* type_tag<std::complex<float>>() {
   return "cf32";
 }
 template <>
 const char* type_tag<std::complex<double>>() {
   return "cf64";
 }

 template <typename T>
 int packet_size() {
   return internal::unpacket_traits<typename internal::packet_traits<T>::type>::size;
 }

 // Part A : fixed-size tails. Exact multiples (8/16/32/64) act as no-tail
 // controls; the rest cover a spread of remainders across SSE/AVX/AVX-512.
 template <typename T, int N>
 void add_fixed() {
   benchmark::RegisterBenchmark(std::string("FixedAssign/") + type_tag<T>() + "/" + std::to_string(N),
                                &BM_FixedAssign<T, N>);
 }

 template <typename T>
 void add_all_fixed() {
   add_fixed<T, 8>();
   add_fixed<T, 9>();
   add_fixed<T, 11>();
   add_fixed<T, 15>();
   add_fixed<T, 16>();
   add_fixed<T, 17>();
   add_fixed<T, 23>();
   add_fixed<T, 32>();
   add_fixed<T, 33>();
   add_fixed<T, 47>();
   add_fixed<T, 64>();
   add_fixed<T, 65>();
 }

 // Part A : dynamic-size tails. Small sizes keep the tail a meaningful fraction;
 // 1025 confirms there is no regression once the tail is amortized.
 template <typename T>
 void add_dynamic() {
   auto* b = benchmark::RegisterBenchmark(std::string("DynAssign/") + type_tag<T>(), &BM_DynamicAssign<T>);
   for (int s : {15, 17, 31, 33, 63, 65, 127, 129, 255, 1025}) b->Arg(s);
 }

 // Part B : segment vs scalar primitives, runtime count 1..PacketSize.
 void add_prim_range(const std::string& name, void (*fn)(benchmark::State&), int ps) {
   auto* b = benchmark::RegisterBenchmark(name, fn);
   for (int c = 1; c <= ps; ++c) b->Arg(c);
 }

 template <typename T>
 void add_prim() {
   const int ps = packet_size<T>();
   if (ps <= 1) return;  // type not vectorized on this ISA
   const std::string tag = type_tag<T>();
   add_prim_range("Prim/Store/seg/" + tag, &BM_PrimSegmentStore<T>, ps);
   add_prim_range("Prim/Store/scl/" + tag, &BM_PrimScalarStore<T>, ps);
   add_prim_range("Prim/StoreReload/seg/" + tag, &BM_PrimSegmentStoreReload<T>, ps);
   add_prim_range("Prim/StoreReload/scl/" + tag, &BM_PrimScalarStoreReload<T>, ps);
   benchmark::RegisterBenchmark("Prim/StoreReload/ovl/" + tag, &BM_PrimOverlapStoreReload<T>);
   add_prim_range("Prim/Load/seg/" + tag, &BM_PrimSegmentLoad<T>, ps);
   add_prim_range("Prim/Load/scl/" + tag, &BM_PrimScalarLoad<T>, ps);
 }

 // Part B : compile-time count store (matches the completely-unrolled tail).
 template <typename T, int Count>
 void add_prim_ct() {
   if (Count > packet_size<T>()) return;
   const std::string tag = std::string(type_tag<T>()) + "/" + std::to_string(Count);
   benchmark::RegisterBenchmark("PrimCT/Store/seg/" + tag, &BM_PrimSegmentStoreCT<T, Count>);
   benchmark::RegisterBenchmark("PrimCT/Store/scl/" + tag, &BM_PrimScalarStoreCT<T, Count>);
 }

 template <typename T>
 void add_all_prim_ct() {
   add_prim_ct<T, 1>();
   add_prim_ct<T, 3>();
   add_prim_ct<T, 5>();
   add_prim_ct<T, 7>();
 }

 int RegisterAll() {
   add_trait_context<float>("f32");
   add_trait_context<double>("f64");
   add_trait_context<int32_t>("i32");
   add_trait_context<int64_t>("i64");
   add_trait_context<std::complex<float>>("cf32");
   add_trait_context<std::complex<double>>("cf64");

   add_all_fixed<float>();
   add_all_fixed<double>();
   add_dynamic<float>();
   add_dynamic<double>();

   add_prim<float>();
   add_prim<double>();
   add_prim<int32_t>();
   add_prim<int64_t>();
   add_prim<std::complex<float>>();
   add_prim<std::complex<double>>();

   add_all_prim_ct<float>();
   add_all_prim_ct<double>();

   benchmark::RegisterBenchmark("Chained/Inverse3x3", &BM_Chained_Inverse3x3);
   benchmark::RegisterBenchmark("Chained/Camera", &BM_Chained_Camera);
   benchmark::RegisterBenchmark("Chained/Block23", &BM_Chained_Block23);
   return 0;
 }

 const int registered = RegisterAll();

 }  // namespace

 // main() is provided by benchmark::benchmark_main (linked via eigen_add_benchmark).