Add block() / getResourceRequirements() to Tensor{Scan,FFT,LayoutSwap,Contraction}
libeigen/eigen!2477
Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
diff --git a/unsupported/Eigen/src/Tensor/TensorContraction.h b/unsupported/Eigen/src/Tensor/TensorContraction.h
index 5ff3afe..cf59a80 100644
--- a/unsupported/Eigen/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/src/Tensor/TensorContraction.h
@@ -955,6 +955,17 @@
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
+ // Required so a contraction can be composed with operators whose own
+ // getResourceRequirements() forwards into m_impl (TensorPaddingOp,
+ // TensorBroadcastingOp, etc.). Without this, e.g. an expression like
+ // `Tensor B = A.contract(C, dims).pad(p)` fails to compile because
+ // Pad's BlockAccess is gated on m_impl.RawAccess (which is true here)
+ // and instantiating Pad's getResourceRequirements then requires this
+ // method on the operand evaluator.
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
protected:
Dimensions m_dimensions;
diff --git a/unsupported/Eigen/src/Tensor/TensorFFT.h b/unsupported/Eigen/src/Tensor/TensorFFT.h
index 141b416..2cffdae 100644
--- a/unsupported/Eigen/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/src/Tensor/TensorFFT.h
@@ -146,14 +146,22 @@
enum {
IsAligned = false,
PacketAccess = true,
- BlockAccess = false,
+ // FFT eagerly materializes its result into m_data; once that buffer
+ // exists, exposing block access is just a wrapper around it. Leave
+ // PreferBlockAccess false so the executor still uses the cheaper
+ // packet path by default; this only matters when an outer expression
+ // calls block() directly.
+ BlockAccess = (NumDims > 0),
PreferBlockAccess = false,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+ typedef typename internal::TensorMaterializedBlock<std::remove_const_t<CoeffReturnType>, NumDims, Layout, Index>
+ TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -214,6 +222,16 @@
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ eigen_assert(m_data != nullptr);
+ return TensorBlock::materialize(m_data, m_dimensions, desc, scratch);
+ }
+
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
diff --git a/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
index 3ddcc1c..50bb5ec 100644
--- a/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
+++ b/unsupported/Eigen/src/Tensor/TensorLayoutSwap.h
@@ -96,14 +96,27 @@
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
- BlockAccess = false,
+ // Layout swap is a no-op at the flat-memory level; expose block access
+ // whenever the underlying expression has a raw data pointer we can
+ // hand off to TensorMaterializedBlock.
+ BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && NumDims > 0,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
+ typedef typename XprType::Scalar Scalar;
+ typedef typename XprType::CoeffReturnType CoeffReturnType;
+ typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+ typedef StorageMemory<CoeffReturnType, Device> Storage;
+ typedef typename Storage::Type EvaluatorPointerType;
+
+ typedef std::remove_const_t<Scalar> ScalarNoConst;
+
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+ typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
@@ -112,12 +125,6 @@
}
}
- typedef typename XprType::Scalar Scalar;
- typedef typename XprType::CoeffReturnType CoeffReturnType;
- typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
- typedef StorageMemory<CoeffReturnType, Device> Storage;
- typedef typename Storage::Type EvaluatorPointerType;
-
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
@@ -134,6 +141,16 @@
return m_impl.costPerCoeff(vectorized);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ eigen_assert(m_impl.data() != nullptr);
+ return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch);
+ }
+
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
diff --git a/unsupported/Eigen/src/Tensor/TensorScan.h b/unsupported/Eigen/src/Tensor/TensorScan.h
index f3e5bd4..bf99601 100644
--- a/unsupported/Eigen/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/src/Tensor/TensorScan.h
@@ -365,14 +365,21 @@
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
- BlockAccess = false,
+ // Scan eagerly materializes its result into m_output; once that buffer
+ // exists, exposing block access is just a wrapper around it. Leave
+ // PreferBlockAccess false so the executor still uses the cheaper
+ // raw/packet paths by default; the flag matters only when an outer
+ // expression calls block() directly.
+ BlockAccess = (NumDims > 0),
PreferBlockAccess = false,
CoordAccess = false,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockNotImplemented TensorBlock;
+ typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+ typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+ typedef typename internal::TensorMaterializedBlock<Scalar, NumDims, Layout, Index> TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -442,6 +449,16 @@
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
}
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+ return internal::TensorBlockResourceRequirements::any();
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+ bool /*root_of_expr_ast*/ = false) const {
+ eigen_assert(m_output != nullptr);
+ return TensorBlock::materialize(m_output, m_impl.dimensions(), desc, scratch);
+ }
+
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; }
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
index 301ce39..06ee4f9 100644
--- a/unsupported/benchmarks/Tensor/CMakeLists.txt
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -9,3 +9,4 @@
eigen_add_benchmark(bench_image_patch bench_image_patch.cpp)
eigen_add_benchmark(bench_reverse bench_reverse.cpp)
eigen_add_benchmark(bench_roll bench_roll.cpp)
+eigen_add_benchmark(bench_layout_swap bench_layout_swap.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_layout_swap.cpp b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
new file mode 100644
index 0000000..8caeb57
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_layout_swap.cpp
@@ -0,0 +1,77 @@
+// Benchmarks for Eigen TensorLayoutSwap.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+static void BM_LayoutSwap_2D(benchmark::State& state) {
+ const int M = state.range(0);
+ const int N = state.range(1);
+
+ Tensor<Scalar, 2, ColMajor> A(M, N);
+ A.setRandom();
+
+ for (auto _ : state) {
+ Tensor<Scalar, 2, RowMajor> B = A.swap_layout();
+ benchmark::DoNotOptimize(B.data());
+ benchmark::ClobberMemory();
+ }
+ // 1 read (A) + 1 write (B).
+ state.SetBytesProcessed(state.iterations() * 2ll * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+static void BM_LayoutSwap_3D(benchmark::State& state) {
+ const int D0 = state.range(0);
+ const int D1 = state.range(1);
+ const int D2 = state.range(2);
+
+ Tensor<Scalar, 3, ColMajor> A(D0, D1, D2);
+ A.setRandom();
+
+ for (auto _ : state) {
+ Tensor<Scalar, 3, RowMajor> B = A.swap_layout();
+ benchmark::DoNotOptimize(B.data());
+ benchmark::ClobberMemory();
+ }
+ // 1 read (A) + 1 write (B).
+ state.SetBytesProcessed(state.iterations() * 2ll * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
+}
+
+// Composing swap_layout with a coefficient-wise op forces evaluation through
+// the executor and exercises any subsequent block consumers.
+static void BM_LayoutSwap_Composed(benchmark::State& state) {
+ const int M = state.range(0);
+ const int N = state.range(1);
+
+ Tensor<Scalar, 2, ColMajor> A(M, N);
+ Tensor<Scalar, 2, ColMajor> B(M, N);
+ A.setRandom();
+ B.setRandom();
+
+ for (auto _ : state) {
+ Tensor<Scalar, 2, RowMajor> C = (A + B).swap_layout();
+ benchmark::DoNotOptimize(C.data());
+ benchmark::ClobberMemory();
+ }
+ // 2 reads (A, B) + 1 write (C).
+ state.SetBytesProcessed(state.iterations() * 3ll * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+static void LayoutSwapSizes(::benchmark::Benchmark* b) {
+ for (int size : {64, 256, 1024}) {
+ b->Args({size, size});
+ }
+}
+
+static void LayoutSwap3DSizes(::benchmark::Benchmark* b) {
+ b->Args({32, 32, 32});
+ b->Args({64, 64, 64});
+ b->Args({128, 128, 128});
+}
+
+BENCHMARK(BM_LayoutSwap_2D)->Apply(LayoutSwapSizes);
+BENCHMARK(BM_LayoutSwap_3D)->Apply(LayoutSwap3DSizes);
+BENCHMARK(BM_LayoutSwap_Composed)->Apply(LayoutSwapSizes);
diff --git a/unsupported/test/tensor_block_eval.cpp b/unsupported/test/tensor_block_eval.cpp
index aecd7b2..cedfcb9 100644
--- a/unsupported/test/tensor_block_eval.cpp
+++ b/unsupported/test/tensor_block_eval.cpp
@@ -203,6 +203,10 @@
for (Index i = 0; i < block.dimensions().TotalSize(); ++i) {
VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i));
}
+
+ // Release evaluator-owned temporaries (e.g. the materialized buffer that
+ // TensorScan / TensorFFT allocate during evalSubExprsIfNeeded).
+ eval.cleanup();
}
// -------------------------------------------------------------------------- //
@@ -274,6 +278,118 @@
}
template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_scan() {
+ // Scan eagerly materializes m_output during evalSubExprsIfNeeded; this
+ // exercises the block() wrapper around that buffer.
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(4, 12);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ const Index axis = NumDims == 1 ? 0 : NumDims / 2;
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.cumsum(axis), [&dims]() { return RandomBlock<Layout>(dims, 1, 5); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.cumsum(axis), [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_fft() {
+ // FFT eagerly materializes m_data during evalSubExprsIfNeeded; this
+ // exercises the block() wrapper around that buffer. Use RealPart so
+ // the output type matches the input type and the harness's slice
+ // comparison stays straightforward.
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(4, 12);
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ Eigen::array<int, 1> fft_dims = {0};
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.template fft<RealPart, FFT_FORWARD>(fft_dims),
+ [&dims]() { return RandomBlock<Layout>(dims, 1, 5); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.template fft<RealPart, FFT_FORWARD>(fft_dims),
+ [&dims]() { return FixedSizeBlock(dims); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_eval_tensor_layout_swap() {
+ // The swap_layout expression has the opposite layout of its operand. Build
+ // the input with the opposite layout and assert the block evaluator on the
+ // resulting expression matches the slice-based reference.
+ constexpr int InputLayout = (Layout == ColMajor) ? RowMajor : ColMajor;
+ DSizes<Index, NumDims> input_dims = RandomDims<NumDims>(10, 20);
+ Tensor<T, NumDims, InputLayout> input(input_dims);
+ input.setRandom();
+
+ DSizes<Index, NumDims> swapped_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ swapped_dims[i] = input_dims[NumDims - 1 - i];
+ }
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.swap_layout(),
+ [&swapped_dims]() { return RandomBlock<Layout>(swapped_dims, 1, 10); });
+
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.swap_layout(),
+ [&swapped_dims]() { return FixedSizeBlock(swapped_dims); });
+}
+
+// Regression for the original failure mode this MR fixes: TensorPaddingOp's
+// BlockAccess gates on m_impl.RawAccess (true for Scan/FFT/LayoutSwap/Contraction)
+// and its getResourceRequirements() unconditionally calls the operand's. Before
+// the operand-side block plumbing was added, instantiating the executor's
+// Tiling=On path through these compositions failed to compile. Driving block()
+// over the composed expressions exercises both sides of that compile path.
+template <typename T, int NumDims, int Layout>
+static void test_eval_composed_block_ops() {
+ DSizes<Index, NumDims> dims = RandomDims<NumDims>(4, 8);
+ array<std::pair<Index, Index>, NumDims> paddings;
+ DSizes<Index, NumDims> padded_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ paddings[i] = std::make_pair(1, 2);
+ padded_dims[i] = dims[i] + 3;
+ }
+
+ const Index axis = NumDims == 1 ? 0 : NumDims / 2;
+
+ Tensor<T, NumDims, Layout> input(dims);
+ input.setRandom();
+
+ // cumsum(...).pad(...) — TensorScan + TensorPadding.
+ VerifyBlockEvaluator<T, NumDims, Layout>(input.cumsum(axis).pad(paddings),
+ [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 5); });
+
+ // swap_layout().pad(...) — operand built with the opposite layout so the
+ // composed expression evaluates in the test's Layout.
+ constexpr int InputLayout = (Layout == ColMajor) ? RowMajor : ColMajor;
+ Tensor<T, NumDims, InputLayout> swap_input(dims);
+ swap_input.setRandom();
+ DSizes<Index, NumDims> swap_padded_dims;
+ for (int i = 0; i < NumDims; ++i) {
+ swap_padded_dims[i] = dims[NumDims - 1 - i] + 3;
+ }
+ VerifyBlockEvaluator<T, NumDims, Layout>(swap_input.swap_layout().pad(paddings), [&swap_padded_dims]() {
+ return RandomBlock<Layout>(swap_padded_dims, 1, 5);
+ });
+}
+
+// 2D-specific regression: contract(...).pad(...) hit the same composition bug
+// because TensorContraction has RawAccess=true but lacked getResourceRequirements().
+template <typename T, int Layout>
+static void test_eval_contract_pad_composition() {
+ Tensor<T, 2, Layout> A(8, 6);
+ Tensor<T, 2, Layout> B(6, 4);
+ A.setRandom();
+ B.setRandom();
+
+ Eigen::array<IndexPair<Index>, 1> contract_dims = {IndexPair<Index>(1, 0)};
+ array<std::pair<Index, Index>, 2> paddings = {std::pair<Index, Index>{1, 1}, std::pair<Index, Index>{2, 2}};
+ DSizes<Index, 2> padded_dims(8 + 2, 4 + 4);
+
+ VerifyBlockEvaluator<T, 2, Layout>(A.contract(B, contract_dims).pad(paddings),
+ [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 5); });
+}
+
+template <typename T, int NumDims, int Layout>
static void test_eval_tensor_reshape() {
DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10);
@@ -815,6 +931,27 @@
CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+ CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_layout_swap);
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 2, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 3, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 4, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 2, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 3, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_scan<float, 4, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 2, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 3, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 4, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 2, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 3, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_tensor_fft<float, 4, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 2, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 3, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 4, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 2, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 3, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_composed_block_ops<float, 4, ColMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_contract_pad_composition<float, RowMajor>()));
+ CALL_SUBTEST_PART(2)((test_eval_contract_pad_composition<float, ColMajor>()));
CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);