Add block() / getResourceRequirements() to Tensor{Scan,FFT,LayoutSwap,Contraction}

libeigen/eigen!2477

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/unsupported/Eigen/src/Tensor/TensorContraction.h b/unsupported/Eigen/src/Tensor/TensorContraction.h
index 5ff3afe..cf59a80 100644
--- a/unsupported/Eigen/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/src/Tensor/TensorContraction.h
@@ -955,6 +955,17 @@
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
 
+  // Lets a contraction be used as the operand of operators whose own
+  // getResourceRequirements() forwards into m_impl (TensorPaddingOp,
+  // TensorBroadcastingOp, etc.). Without it, an expression such as
+  // `Tensor B = A.contract(C, dims).pad(p)` fails to compile: Pad's
+  // BlockAccess is gated on m_impl.RawAccess (which is true here), so
+  // instantiating Pad's getResourceRequirements() in turn requires this
+  // method to exist on the operand evaluator.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
  protected:
   Dimensions m_dimensions;
 
diff --git a/unsupported/Eigen/src/Tensor/TensorFFT.h b/unsupported/Eigen/src/Tensor/TensorFFT.h
index 141b416..2cffdae 100644
--- a/unsupported/Eigen/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/src/Tensor/TensorFFT.h
@@ -146,14 +146,22 @@
   enum {
     IsAligned = false,
     PacketAccess = true,
-    BlockAccess = false,
+    // FFT eagerly materializes its result into m_data; once that buffer
+    // exists, block access is just a wrapper around it. PreferBlockAccess
+    // stays false so the executor still uses the cheaper packet path by
+    // default; BlockAccess only comes into play when an outer expression
+    // calls block() directly.
+    BlockAccess = (NumDims > 0),
     PreferBlockAccess = false,
     CoordAccess = false,
     RawAccess = false
   };
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  typedef typename internal::TensorMaterializedBlock<std::remove_const_t<CoeffReturnType>, NumDims, Layout, Index>
+      TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -214,6 +222,16 @@
 
   EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();  // presumably "no constraints" -- TODO confirm
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_data != nullptr);  // the eagerly materialized FFT result must already exist
+    return TensorBlock::materialize(m_data, m_dimensions, desc, scratch);  // wrap the stored result
+  }
+
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
     const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
diff --git a/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
index 3ddcc1c..50bb5ec 100644
--- a/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
@@ -96,14 +96,27 @@
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
+    // Layout swap is a no-op at the flat-memory level; expose block access
+    // for non-scalar (NumDims > 0) expressions whose operand has a raw data
+    // pointer we can hand off to TensorMaterializedBlock.
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && NumDims > 0,
     PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
@@ -112,12 +125,6 @@
     }
   }
 
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
@@ -134,6 +141,16 @@
     return m_impl.costPerCoeff(vectorized);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();  // presumably "no constraints" -- TODO confirm
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != nullptr);  // needs the operand's raw buffer (BlockAccess is gated on RawAccess)
+    return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch);  // operand buffer, own m_dimensions
+  }
+
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/src/Tensor/TensorScan.h b/unsupported/Eigen/src/Tensor/TensorScan.h
index f3e5bd4..bf99601 100644
--- a/unsupported/Eigen/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/src/Tensor/TensorScan.h
@@ -365,14 +365,21 @@
   enum {
     IsAligned = false,
     PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess = false,
+    // Scan eagerly materializes its result into m_output; once that buffer
+    // exists, block access is just a wrapper around it. PreferBlockAccess
+    // stays false so the executor still uses the cheaper raw/packet paths
+    // by default; BlockAccess only comes into play when an outer
+    // expression calls block() directly.
+    BlockAccess = (NumDims > 0),
     PreferBlockAccess = false,
     CoordAccess = false,
     RawAccess = true
   };
 
   //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  typedef typename internal::TensorMaterializedBlock<Scalar, NumDims, Layout, Index> TensorBlock;
   //===--------------------------------------------------------------------===//
 
   EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -442,6 +449,16 @@
     return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();  // presumably "no constraints" -- TODO confirm
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_output != nullptr);  // the eagerly materialized scan result must already exist
+    return TensorBlock::materialize(m_output, m_impl.dimensions(), desc, scratch);  // dims come from the operand
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; }
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
index 301ce39..06ee4f9 100644
--- a/unsupported/benchmarks/Tensor/CMakeLists.txt
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -9,3 +9,4 @@
 eigen_add_benchmark(bench_image_patch bench_image_patch.cpp)
 eigen_add_benchmark(bench_reverse bench_reverse.cpp)
 eigen_add_benchmark(bench_roll bench_roll.cpp)
+eigen_add_benchmark(bench_layout_swap bench_layout_swap.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_layout_swap.cpp b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
new file mode 100644
index 0000000..8caeb57
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
@@ -0,0 +1,77 @@
+// Benchmarks for Eigen TensorLayoutSwap.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+static void BM_LayoutSwap_2D(benchmark::State& state) {
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+
+  Tensor<Scalar, 2, ColMajor> src(rows, cols);
+  src.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2, RowMajor> dst = src.swap_layout();
+    benchmark::DoNotOptimize(dst.data());
+    benchmark::ClobberMemory();
+  }
+  // Per iteration: one read of `src` plus one write of `dst`, src.size() coefficients each.
+  state.SetBytesProcessed(state.iterations() * 2ll * src.size() * sizeof(Scalar));
+}
+
+static void BM_LayoutSwap_3D(benchmark::State& state) {
+  const int dim0 = state.range(0);
+  const int dim1 = state.range(1);
+  const int dim2 = state.range(2);
+
+  Tensor<Scalar, 3, ColMajor> src(dim0, dim1, dim2);
+  src.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3, RowMajor> dst = src.swap_layout();
+    benchmark::DoNotOptimize(dst.data());
+    benchmark::ClobberMemory();
+  }
+  // Per iteration: one read of `src` plus one write of `dst`, src.size() coefficients each.
+  state.SetBytesProcessed(state.iterations() * 2ll * src.size() * sizeof(Scalar));
+}
+
+// swap_layout() applied to a coefficient-wise expression instead of a plain
+// tensor, so the sum is evaluated as part of the same assignment.
+static void BM_LayoutSwap_Composed(benchmark::State& state) {
+  const int rows = state.range(0);
+  const int cols = state.range(1);
+
+  Tensor<Scalar, 2, ColMajor> lhs(rows, cols);
+  Tensor<Scalar, 2, ColMajor> rhs(rows, cols);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2, RowMajor> sum_swapped = (lhs + rhs).swap_layout();
+    benchmark::DoNotOptimize(sum_swapped.data());
+    benchmark::ClobberMemory();
+  }
+  // Per iteration: two reads (`lhs`, `rhs`) plus one write, lhs.size() coefficients each.
+  state.SetBytesProcessed(state.iterations() * 3ll * lhs.size() * sizeof(Scalar));
+}
+
+static void LayoutSwapSizes(::benchmark::Benchmark* b) {
+  b->Args({64, 64});
+  b->Args({256, 256});
+  b->Args({1024, 1024});
+}
+
+static void LayoutSwap3DSizes(::benchmark::Benchmark* b) {
+  for (int edge : {32, 64, 128}) {
+    b->Args({edge, edge, edge});
+  }
+}
+
+BENCHMARK(BM_LayoutSwap_2D)->Apply(LayoutSwapSizes);
+BENCHMARK(BM_LayoutSwap_3D)->Apply(LayoutSwap3DSizes);
+BENCHMARK(BM_LayoutSwap_Composed)->Apply(LayoutSwapSizes);
diff --git a/unsupported/test/tensor_block_eval.cpp b/unsupported/test/tensor_block_eval.cpp
index aecd7b2..cedfcb9 100644
--- a/unsupported/test/tensor_block_eval.cpp
+++ b/unsupported/test/tensor_block_eval.cpp
@@ -203,6 +203,10 @@
   for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
     VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
   }
+
+  // Release evaluator-owned temporaries (e.g. the materialized buffer that
+  // TensorScan / TensorFFT allocate during evalSubExprsIfNeeded).
+  eval.cleanup();
 }
 
 // -------------------------------------------------------------------------- //
@@ -274,6 +278,118 @@
 }
 
 template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_scan() {
+  // evalSubExprsIfNeeded leaves the whole scan result in m_output, so block()
+  // only has to wrap that buffer; this drives that wrapper.
+  DSizes<Index, NumDims> shape = RandomDims<NumDims>(4, 12);
+  Tensor<T, NumDims, Layout> src(shape);
+  src.setRandom();
+
+  const Index scan_axis = (NumDims == 1) ? 0 : NumDims / 2;
+  auto random_block = [&shape]() { return RandomBlock<Layout>(shape, 1, 5); };
+  auto fixed_block = [&shape]() { return FixedSizeBlock(shape); };
+  VerifyBlockEvaluator<T, NumDims, Layout>(src.cumsum(scan_axis), random_block);
+  VerifyBlockEvaluator<T, NumDims, Layout>(src.cumsum(scan_axis), fixed_block);
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_fft() {
+  // evalSubExprsIfNeeded leaves the whole FFT result in m_data, so block()
+  // only has to wrap that buffer. RealPart keeps the output type equal to
+  // the input type, which keeps the harness's slice comparison
+  // straightforward.
+  DSizes<Index, NumDims> shape = RandomDims<NumDims>(4, 12);
+  Tensor<T, NumDims, Layout> src(shape);
+  src.setRandom();
+
+  // Transform along dimension 0 only.
+  Eigen::array<int, 1> transform_dims = {0};
+  auto random_block = [&shape]() { return RandomBlock<Layout>(shape, 1, 5); };
+  auto fixed_block = [&shape]() { return FixedSizeBlock(shape); };
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(src.template fft<RealPart, FFT_FORWARD>(transform_dims), random_block);
+  VerifyBlockEvaluator<T, NumDims, Layout>(src.template fft<RealPart, FFT_FORWARD>(transform_dims), fixed_block);
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_layout_swap() {
+  // swap_layout() yields an expression whose layout is the opposite of its
+  // operand's, so the operand is created in the other layout and the swapped
+  // expression is checked against the slice-based reference in `Layout`.
+  constexpr int OperandLayout = (Layout == ColMajor) ? RowMajor : ColMajor;
+  DSizes<Index, NumDims> operand_dims = RandomDims<NumDims>(10, 20);
+  Tensor<T, NumDims, OperandLayout> operand(operand_dims);
+  operand.setRandom();
+
+  // The swapped expression sees the operand's dimensions in reverse order.
+  DSizes<Index, NumDims> reversed_dims;
+  for (int dst = 0, src = NumDims - 1; dst < NumDims; ++dst, --src) {
+    reversed_dims[dst] = operand_dims[src];
+  }
+  auto random_block = [&reversed_dims]() { return RandomBlock<Layout>(reversed_dims, 1, 10); };
+  auto fixed_block = [&reversed_dims]() { return FixedSizeBlock(reversed_dims); };
+
+  VerifyBlockEvaluator<T, NumDims, Layout>(operand.swap_layout(), random_block);
+  VerifyBlockEvaluator<T, NumDims, Layout>(operand.swap_layout(), fixed_block);
+}
+
+// Regression for the original failure mode this MR fixes: TensorPaddingOp's
+// BlockAccess gates on m_impl.RawAccess (true for Scan/FFT/LayoutSwap/Contraction)
+// and its getResourceRequirements() unconditionally calls the operand's. Before
+// the operand-side block plumbing was added, instantiating the executor's
+// Tiling=On path through these compositions failed to compile. Driving block()
+// over the composed expressions exercises both sides of that compile path.
+template <typename T, int NumDims, int Layout>
+static void test_eval_composed_block_ops() {
+  DSizes<Index, NumDims> base_dims = RandomDims<NumDims>(4, 8);
+
+  // Each dimension is padded by (1, 2), so every padded extent grows by 3.
+  // swap_layout() reverses the dimension order before the padding is applied.
+  array<std::pair<Index, Index>, NumDims> pad_widths;
+  DSizes<Index, NumDims> scan_padded_dims;
+  DSizes<Index, NumDims> swap_padded_dims;
+  for (int d = 0; d < NumDims; ++d) {
+    pad_widths[d] = std::pair<Index, Index>(1, 2);
+    scan_padded_dims[d] = base_dims[d] + 3;
+    swap_padded_dims[d] = base_dims[NumDims - 1 - d] + 3;
+  }
+
+  // Block generators for the two padded shapes.
+  auto scan_block = [&scan_padded_dims]() { return RandomBlock<Layout>(scan_padded_dims, 1, 5); };
+  auto swap_block = [&swap_padded_dims]() { return RandomBlock<Layout>(swap_padded_dims, 1, 5); };
+
+  // TensorScan + TensorPadding: cumsum(...).pad(...).
+  const Index scan_axis = (NumDims == 1) ? 0 : NumDims / 2;
+  Tensor<T, NumDims, Layout> scan_src(base_dims);
+  scan_src.setRandom();
+  VerifyBlockEvaluator<T, NumDims, Layout>(scan_src.cumsum(scan_axis).pad(pad_widths), scan_block);
+
+  // TensorLayoutSwap + TensorPadding: the operand uses the opposite layout so
+  // that the composed expression evaluates in the test's Layout.
+  constexpr int OperandLayout = (Layout == ColMajor) ? RowMajor : ColMajor;
+  Tensor<T, NumDims, OperandLayout> swap_src(base_dims);
+  swap_src.setRandom();
+  VerifyBlockEvaluator<T, NumDims, Layout>(swap_src.swap_layout().pad(pad_widths), swap_block);
+}
+
+// 2D-specific regression: contract(...).pad(...) hit the same composition bug
+// because TensorContraction has RawAccess=true but lacked getResourceRequirements().
+template <typename T, int Layout>
+static void test_eval_contract_pad_composition() {
+  Tensor<T, 2, Layout> lhs(8, 6);
+  Tensor<T, 2, Layout> rhs(6, 4);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  // (8x6) contracted with (6x4) gives 8x4; padding by (1, 1) and (2, 2) gives 10x8.
+  Eigen::array<IndexPair<Index>, 1> contract_dims = {IndexPair<Index>(1, 0)};
+  array<std::pair<Index, Index>, 2> pad_widths = {std::pair<Index, Index>{1, 1}, std::pair<Index, Index>{2, 2}};
+  DSizes<Index, 2> padded_dims(8 + 2, 4 + 4);
+  auto random_block = [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 5); };
+  VerifyBlockEvaluator<T, 2, Layout>(lhs.contract(rhs, contract_dims).pad(pad_widths), random_block);
+}
+
+template <typename T, int NumDims, int Layout>
 static void test_eval_tensor_reshape() {
   DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
 
@@ -815,6 +931,27 @@
   CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
   CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
   CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_layout_swap);
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 2, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 3, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 4, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 2, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 3, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 4, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 2, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 3, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 4, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 2, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 3, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 4, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 2, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 3, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 4, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 2, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 3, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 4, ColMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_contract_pad_composition<float, RowMajor>()));
+  CALL_SUBTEST_PART(2)((test_eval_contract_pad_composition<float, ColMajor>()));
   CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
   CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
   CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);