Vectorize TensorReverse / TensorRoll packet() inner-slice fast path

libeigen/eigen!2478
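
TensorReverse::packet() and TensorRoll::packet() previously assembled
every packet from PacketSize scalar coeff() calls. This change adds a
fast path: when the whole packet lies inside a single inner-most slice
of the output (and, for roll, the rolled inner position does not wrap),
the packet is produced by one contiguous unaligned load from the input,
plus a preverse when the inner dimension is reversed. costPerCoeff()
now forwards the vectorized flag instead of hard-coding false, and new
benchmarks (bench_reverse, bench_roll) plus packet-vs-coeff tests cover
the new code paths.

For illustration only (not part of the patch): a minimal scalar model of
the reversed-inner-dimension case, assuming a ColMajor rank-2 tensor and
mimicking internal::preverse with an explicit lane reversal.

    // Why one contiguous load plus a lane reversal reproduces PacketSize
    // coeff() reads when only the inner (ColMajor) dimension is reversed.
    #include <cassert>
    #include <vector>

    int main() {
      const int inner_size = 8, outer = 3, PacketSize = 4;
      std::vector<int> input(inner_size * outer);
      for (int i = 0; i < inner_size * outer; ++i) input[i] = i;

      // Linear index of the source element when the inner dim is reversed.
      auto reverse_index = [&](int index) {
        const int inner = index % inner_size, col = index / inner_size;
        return (inner_size - 1 - inner) + col * inner_size;
      };

      for (int index = 0; index + PacketSize <= inner_size * outer; ++index) {
        const int inner_pos = index % inner_size;
        if (inner_pos + PacketSize > inner_size) continue;  // slow path
        // Fast path: one load starting at reverse_index(index + PacketSize - 1),
        // then reverse the lanes (what internal::preverse does for a packet).
        const int start = reverse_index(index + PacketSize - 1);
        for (int lane = 0; lane < PacketSize; ++lane) {
          assert(input[start + (PacketSize - 1 - lane)] ==
                 input[reverse_index(index + lane)]);  // == coeff(index + lane)
        }
      }
      return 0;
    }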

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/unsupported/Eigen/src/Tensor/TensorReverse.h b/unsupported/Eigen/src/Tensor/TensorReverse.h
index ece85ca..158eac4 100644
--- a/unsupported/Eigen/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/src/Tensor/TensorReverse.h
@@ -191,15 +191,28 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
     eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
 
-    // TODO(ndjaitly): write a better packing routine that uses
-    // local structure.
+    // Fast path: when the whole packet stays inside a single inner-most
+    // slice of the input, replace PacketSize coeff() calls with one packet
+    // load (plus a preverse when the inner dim is reversed).
+    constexpr int inner_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumDims - 1;
+    const Index inner_size = m_dimensions[inner_dim];
+    const Index inner_pos = index - (index / inner_size) * inner_size;
+    if (inner_pos + PacketSize <= inner_size) {
+      if (m_reverse[inner_dim]) {
+        const Index input_index = reverseIndex(index + PacketSize - 1);
+        return internal::preverse(m_impl.template packet<Unaligned>(input_index));
+      }
+      return m_impl.template packet<Unaligned>(reverseIndex(index));
+    }
+
+    // Slow path: the packet crosses an inner-slice boundary, so the
+    // contiguous-load trick does not apply.
     EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
     EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index + i);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+    return internal::pload<PacketReturnType>(values);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
@@ -322,7 +335,9 @@
         compute_cost += 2 * TensorOpCost::AddCost<Index>();
       }
     }
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+    // The inner-slice fast path performs the per-coeff index math once per
+    // packet, so the compute cost is amortized when vectorization is requested.
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
diff --git a/unsupported/Eigen/src/Tensor/TensorRoll.h b/unsupported/Eigen/src/Tensor/TensorRoll.h
index e4b5181..2184b4d 100644
--- a/unsupported/Eigen/src/Tensor/TensorRoll.h
+++ b/unsupported/Eigen/src/Tensor/TensorRoll.h
@@ -189,13 +189,28 @@
   template <int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
     eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    // Fast path: when the entire packet stays inside one inner-most slice
+    // of both the output and the rolled input (no modular wrap on the
+    // inner dim), the PacketSize coeff() calls collapse to a single
+    // contiguous packet load from the underlying tensor.
+    constexpr int inner_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumDims - 1;
+    const Index inner_size = m_dimensions[inner_dim];
+    const Index inner_pos = index - (index / inner_size) * inner_size;
+    if (inner_pos + PacketSize <= inner_size) {
+      const Index rolled_inner_pos = roll(inner_pos, m_rolls[inner_dim], inner_size);
+      if (rolled_inner_pos + PacketSize <= inner_size) {
+        return m_impl.template packet<Unaligned>(rollIndex(index));
+      }
+    }
+
+    // Slow path: the packet crosses a slice boundary in the output or in the rolled input.
     EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
     EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index + i);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+    return internal::pload<PacketReturnType>(values);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
@@ -273,7 +288,9 @@
     for (int i = 0; i < NumDims; ++i) {
       compute_cost += 2 * TensorOpCost::AddCost<Index>();
     }
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+    // The inner-slice fast path performs the per-coeff index math once per
+    // packet, so the compute cost is amortized when vectorization is requested.
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; }
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
index c0a7c9c..301ce39 100644
--- a/unsupported/benchmarks/Tensor/CMakeLists.txt
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -7,3 +7,5 @@
 eigen_add_benchmark(bench_morphing bench_morphing.cpp)
 eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)
 eigen_add_benchmark(bench_image_patch bench_image_patch.cpp)
+eigen_add_benchmark(bench_reverse bench_reverse.cpp)
+eigen_add_benchmark(bench_roll bench_roll.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_reverse.cpp b/unsupported/benchmarks/Tensor/bench_reverse.cpp
new file mode 100644
index 0000000..852df68
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_reverse.cpp
@@ -0,0 +1,103 @@
+// Benchmarks for Eigen TensorReverse.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Reverse only the inner-most (contiguous) dimension. SIMD preverse case. ---
+static void BM_Reverse_Inner(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {true, false};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Reverse only an outer dimension. Inner dim stays contiguous. ---
+static void BM_Reverse_Outer(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {false, true};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Reverse every dimension. ---
+static void BM_Reverse_All(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {true, true};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- 3D reverse with the inner dim reversed (typical CNN-style layout). ---
+static void BM_Reverse_3D_Inner(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  array<bool, 3> dim_rev = {true, false, false};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
+}
+
+// Sweep sizes that span L1 (~32 KB), L2 (~256 KB), and LLC (~MBs) for float
+// tensors. At 4 bytes per element, each tensor (input or output) occupies:
+//   64x64    = 16 KB (L1)
+//   256x256  = 256 KB (L2)
+//   1024x1024 = 4 MB (LLC / DRAM)
+static void ReverseSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 1024}) {
+    b->Args({size, size});
+  }
+}
+
+static void Reverse3DSizes(::benchmark::Benchmark* b) {
+  b->Args({32, 32, 32});     // 128 KB
+  b->Args({64, 64, 64});     // 1 MB
+  b->Args({128, 128, 128});  // 8 MB
+}
+
+BENCHMARK(BM_Reverse_Inner)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_Outer)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_All)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_3D_Inner)->Apply(Reverse3DSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_roll.cpp b/unsupported/benchmarks/Tensor/bench_roll.cpp
new file mode 100644
index 0000000..219a8ff
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_roll.cpp
@@ -0,0 +1,103 @@
+// Benchmarks for Eigen TensorRoll.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Roll only the inner-most (contiguous) dimension. ---
+static void BM_Roll_Inner(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {shift, 0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Roll only an outer dimension. Inner dim stays contiguous. ---
+static void BM_Roll_Outer(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {0, shift};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Roll every dimension. ---
+static void BM_Roll_All(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {shift, shift};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- 3D roll with the inner dim shifted. ---
+static void BM_Roll_3D_Inner(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  array<Index, 3> rolls = {D0 / 4, 0, 0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
+}
+
+static void RollSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 1024}) {
+    for (int shift : {1, 13}) {
+      b->Args({size, size, shift});
+    }
+  }
+}
+
+static void Roll3DSizes(::benchmark::Benchmark* b) {
+  b->Args({32, 32, 32});
+  b->Args({64, 64, 64});
+  b->Args({128, 128, 128});
+}
+
+BENCHMARK(BM_Roll_Inner)->Apply(RollSizes);
+BENCHMARK(BM_Roll_Outer)->Apply(RollSizes);
+BENCHMARK(BM_Roll_All)->Apply(RollSizes);
+BENCHMARK(BM_Roll_3D_Inner)->Apply(Roll3DSizes);
diff --git a/unsupported/test/tensor_reverse.cpp b/unsupported/test/tensor_reverse.cpp
index 37a836a..d6b5a7c 100644
--- a/unsupported/test/tensor_reverse.cpp
+++ b/unsupported/test/tensor_reverse.cpp
@@ -169,6 +169,46 @@
   }
 }
 
+// Verify that the rvalue evaluator's packet() returns the same lanes as
+// coeff() at every aligned and unaligned packet offset. This guards against
+// regressions in the packet implementation that the executor-level tests
+// (which only compare the assembled result) would not surface.
+template <int DataLayout>
+static void test_packet_reverse() {
+  using namespace Eigen::internal;
+
+  Tensor<float, 3, DataLayout> tensor(8, 5, 7);
+  tensor.setRandom();
+
+  array<bool, 3> dim_rev_inner =
+      (DataLayout == ColMajor) ? array<bool, 3>{{true, false, false}} : array<bool, 3>{{false, false, true}};
+  array<bool, 3> dim_rev_outer =
+      (DataLayout == ColMajor) ? array<bool, 3>{{false, false, true}} : array<bool, 3>{{true, false, false}};
+  array<bool, 3> dim_rev_all{{true, true, true}};
+
+  for (const auto& dim_rev : {dim_rev_inner, dim_rev_outer, dim_rev_all}) {
+    auto expr = tensor.reverse(dim_rev);
+    using Eval = TensorEvaluator<const decltype(expr), DefaultDevice>;
+    using Packet = typename Eval::PacketReturnType;
+    constexpr int PacketSize = Eval::PacketSize;
+
+    DefaultDevice device;
+    Eval eval(expr, device);
+    eval.evalSubExprsIfNeeded(nullptr);
+
+    const Index total = tensor.size();
+    EIGEN_ALIGN_MAX float lanes[PacketSize];
+    for (Index offset = 0; offset + PacketSize <= total; ++offset) {
+      Packet p = eval.template packet<Unaligned>(offset);
+      pstoreu(lanes, p);
+      for (int i = 0; i < PacketSize; ++i) {
+        VERIFY_IS_EQUAL(lanes[i], eval.coeff(offset + i));
+      }
+    }
+    eval.cleanup();
+  }
+}
+
 EIGEN_DECLARE_TEST(tensor_reverse) {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
@@ -176,4 +216,6 @@
   CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
   CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
   CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
+  CALL_SUBTEST(test_packet_reverse<ColMajor>());
+  CALL_SUBTEST(test_packet_reverse<RowMajor>());
 }
diff --git a/unsupported/test/tensor_roll.cpp b/unsupported/test/tensor_roll.cpp
index 0142ba7..762b48e 100644
--- a/unsupported/test/tensor_roll.cpp
+++ b/unsupported/test/tensor_roll.cpp
@@ -146,6 +146,47 @@
   }
 }
 
+// Verify that the rvalue evaluator's packet() returns the same lanes as
+// coeff() for every offset across a range of shift values, including the
+// no-shift, in-slice, and wrap-around cases.
+template <int DataLayout>
+static void test_packet_roll() {
+  using namespace Eigen::internal;
+
+  Tensor<float, 3, DataLayout> tensor(8, 5, 7);
+  tensor.setRandom();
+
+  // Cover several shift configurations: no shift, small shifts on every dim,
+  // and shifts close to the dimension size that force a wrap within a packet.
+  array<array<Index, 3>, 4> roll_configs;
+  roll_configs[0] = {{0, 0, 0}};
+  roll_configs[1] = {{2, 1, 3}};
+  roll_configs[2] = {{6, 0, 0}};  // near-end shift on dim 0
+  roll_configs[3] = {{0, 0, 6}};  // near-end shift on dim 2
+
+  for (const auto& rolls : roll_configs) {
+    auto expr = tensor.roll(rolls);
+    using Eval = TensorEvaluator<const decltype(expr), DefaultDevice>;
+    using Packet = typename Eval::PacketReturnType;
+    constexpr int PacketSize = Eval::PacketSize;
+
+    DefaultDevice device;
+    Eval eval(expr, device);
+    eval.evalSubExprsIfNeeded(nullptr);
+
+    const Index total = tensor.size();
+    EIGEN_ALIGN_MAX float lanes[PacketSize];
+    for (Index offset = 0; offset + PacketSize <= total; ++offset) {
+      Packet p = eval.template packet<Unaligned>(offset);
+      pstoreu(lanes, p);
+      for (int i = 0; i < PacketSize; ++i) {
+        VERIFY_IS_EQUAL(lanes[i], eval.coeff(offset + i));
+      }
+    }
+    eval.cleanup();
+  }
+}
+
 EIGEN_DECLARE_TEST(tensor_roll) {
   CALL_SUBTEST(test_simple_roll<ColMajor>());
   CALL_SUBTEST(test_simple_roll<RowMajor>());
@@ -153,4 +194,6 @@
   CALL_SUBTEST(test_expr_roll<RowMajor>(true));
   CALL_SUBTEST(test_expr_roll<ColMajor>(false));
   CALL_SUBTEST(test_expr_roll<RowMajor>(false));
+  CALL_SUBTEST(test_packet_roll<ColMajor>());
+  CALL_SUBTEST(test_packet_roll<RowMajor>());
 }