Vectorize TensorReverse / TensorRoll packet() inner-slice fast path (libeigen/eigen!2478)

Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
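For context, the expressions this change targets are ordinary reverse/roll views assigned into a dense tensor. The snippet below is an illustrative sketch, not part of the patch (the 256x256 shape and the shift of 13 are arbitrary); it shows the kind of code whose packet() evaluation now takes the inner-slice fast path whenever the inner dimension can hold a whole packet.

#include <unsupported/Eigen/CXX11/Tensor>
using namespace Eigen;

int main() {
  Tensor<float, 2> A(256, 256);
  A.setRandom();

  // Reverse along the inner (contiguous) dimension: with the default
  // ColMajor layout this is dimension 0, the case served by one packet
  // load plus a preverse per output packet.
  array<bool, 2> rev = {true, false};
  Tensor<float, 2> B = A.reverse(rev);

  // Roll along the inner dimension: packets that do not straddle the
  // wrap-around point become a single contiguous packet load.
  array<Index, 2> shifts = {13, 0};
  Tensor<float, 2> C = A.roll(shifts);

  return 0;
}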
diff --git a/unsupported/Eigen/src/Tensor/TensorReverse.h b/unsupported/Eigen/src/Tensor/TensorReverse.h
index ece85ca..158eac4 100644
--- a/unsupported/Eigen/src/Tensor/TensorReverse.h
+++ b/unsupported/Eigen/src/Tensor/TensorReverse.h
@@ -191,15 +191,28 @@
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
     eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
 
-    // TODO(ndjaitly): write a better packing routine that uses
-    // local structure.
+    // Fast path: when the whole packet stays inside a single inner-most
+    // slice of the input, replace PacketSize coeff() calls with one packet
+    // load (plus a preverse when the inner dim is reversed).
+    constexpr int inner_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumDims - 1;
+    const Index inner_size = m_dimensions[inner_dim];
+    const Index inner_pos = index - (index / inner_size) * inner_size;
+    if (inner_pos + PacketSize <= inner_size) {
+      if (m_reverse[inner_dim]) {
+        const Index input_index = reverseIndex(index + PacketSize - 1);
+        return internal::preverse(m_impl.template packet<Unaligned>(input_index));
+      }
+      return m_impl.template packet<Unaligned>(reverseIndex(index));
+    }
+
+    // Slow path: the packet crosses an inner-slice boundary, so the
+    // contiguous-load trick does not apply.
     EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
     EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index + i);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+    return internal::pload<PacketReturnType>(values);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
@@ -322,7 +335,9 @@
         compute_cost += 2 * TensorOpCost::AddCost<Index>();
       }
     }
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+    // The inner-slice fast path runs the per-coeff index math once per packet,
+    // so the amortized compute cost matches the vectorized convention.
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
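The reversed-inner-dimension branch relies on a simple identity: the output packet starting at offset i is the lane-reversed copy of the input packet that ends at the mirrored position, which is why the code loads at reverseIndex(index + PacketSize - 1) and then applies internal::preverse. Below is a minimal standalone sketch of that identity on a plain 1-D buffer, not part of the patch; it assumes Eigen's internal ploadu/preverse/pstoreu helpers and an arbitrary buffer length and offset.

#include <unsupported/Eigen/CXX11/Tensor>
#include <cassert>

int main() {
  using namespace Eigen;
  using Packet = internal::packet_traits<float>::type;
  constexpr int kPacketSize = internal::unpacket_traits<Packet>::size;

  // 1-D reversed view: out[i] = src[N - 1 - i].
  constexpr int N = 64;
  float src[N];
  for (int n = 0; n < N; ++n) src[n] = static_cast<float>(n);

  const int i = 5;  // arbitrary output offset with i + kPacketSize <= N
  // Mirrored input position of the packet's *last* lane, as in the fast path.
  const int input_index = N - 1 - (i + kPacketSize - 1);
  Packet p = internal::preverse(internal::ploadu<Packet>(src + input_index));

  float lanes[kPacketSize];
  internal::pstoreu(lanes, p);
  for (int k = 0; k < kPacketSize; ++k) {
    // Each lane matches what the scalar coeff() path would have produced.
    assert(lanes[k] == src[N - 1 - (i + k)]);
  }
  return 0;
}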
diff --git a/unsupported/Eigen/src/Tensor/TensorRoll.h b/unsupported/Eigen/src/Tensor/TensorRoll.h
index e4b5181..2184b4d 100644
--- a/unsupported/Eigen/src/Tensor/TensorRoll.h
+++ b/unsupported/Eigen/src/Tensor/TensorRoll.h
@@ -189,13 +189,28 @@
   template <int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
     eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    // Fast path: when the entire packet stays inside one inner-most slice
+    // of both the output and the rolled input (no modular wrap on the
+    // inner dim), the PacketSize coeff() calls collapse to a single
+    // contiguous packet load from the underlying tensor.
+    constexpr int inner_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumDims - 1;
+    const Index inner_size = m_dimensions[inner_dim];
+    const Index inner_pos = index - (index / inner_size) * inner_size;
+    if (inner_pos + PacketSize <= inner_size) {
+      const Index rolled_inner_pos = roll(inner_pos, m_rolls[inner_dim], inner_size);
+      if (rolled_inner_pos + PacketSize <= inner_size) {
+        return m_impl.template packet<Unaligned>(rollIndex(index));
+      }
+    }
+
+    // Slow path: the packet straddles a slice boundary on either side.
     EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
     EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index + i);
     }
-    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-    return rslt;
+    return internal::pload<PacketReturnType>(values);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
@@ -273,7 +288,9 @@
     for (int i = 0; i < NumDims; ++i) {
       compute_cost += 2 * TensorOpCost::AddCost<Index>();
     }
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+    // The inner-slice fast path runs the per-coeff index math once per packet,
+    // so the amortized compute cost matches the vectorized convention.
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
   EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; }
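For the roll case the guard has to hold in two coordinate systems: the packet must fit inside the output's inner slice, and the rolled starting position must also leave room for a whole packet before the wrap-around point. Below is a rough numeric illustration of that guard, not part of the patch; it assumes a numpy.roll-style mapping where output position i on a dimension of size n reads input position (i - shift) mod n, and rolled_pos is a hypothetical stand-in for the evaluator's own roll()/rollIndex() helpers.

#include <cstdio>

// Hypothetical stand-in for the inner-dimension index mapping.
static long rolled_pos(long i, long shift, long n) { return ((i - shift) % n + n) % n; }

int main() {
  const long inner_size = 16;   // inner dimension length (arbitrary)
  const long packet_size = 4;   // SIMD width (arbitrary)
  const long shift = 5;

  for (long inner_pos = 0; inner_pos + packet_size <= inner_size; ++inner_pos) {
    const long in = rolled_pos(inner_pos, shift, inner_size);
    // Fast path requires contiguity in *both* coordinate systems:
    // no wrap in the output slice and no wrap after rolling.
    const bool fast = (in + packet_size <= inner_size);
    std::printf("out %2ld -> in %2ld : %s\n", inner_pos, in, fast ? "packet load" : "coeff loop");
  }
  return 0;
}

With n = 16, shift = 5, and a 4-lane packet, output positions 0-1 and 5-12 take the contiguous load, while 2-4 hit the wrap-around and fall back to the per-coefficient loop.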
diff --git a/unsupported/benchmarks/Tensor/CMakeLists.txt b/unsupported/benchmarks/Tensor/CMakeLists.txt
index c0a7c9c..301ce39 100644
--- a/unsupported/benchmarks/Tensor/CMakeLists.txt
+++ b/unsupported/benchmarks/Tensor/CMakeLists.txt
@@ -7,3 +7,5 @@
 eigen_add_benchmark(bench_morphing bench_morphing.cpp)
 eigen_add_benchmark(bench_coefficient_wise bench_coefficient_wise.cpp)
 eigen_add_benchmark(bench_image_patch bench_image_patch.cpp)
+eigen_add_benchmark(bench_reverse bench_reverse.cpp)
+eigen_add_benchmark(bench_roll bench_roll.cpp)
diff --git a/unsupported/benchmarks/Tensor/bench_reverse.cpp b/unsupported/benchmarks/Tensor/bench_reverse.cpp
new file mode 100644
index 0000000..852df68
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_reverse.cpp
@@ -0,0 +1,103 @@
+// Benchmarks for Eigen TensorReverse.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Reverse only the inner-most (contiguous) dimension. SIMD preverse case. ---
+static void BM_Reverse_Inner(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {true, false};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Reverse only an outer dimension. Inner dim stays contiguous. ---
+static void BM_Reverse_Outer(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {false, true};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Reverse every dimension. ---
+static void BM_Reverse_All(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<bool, 2> dim_rev = {true, true};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- 3D reverse with the inner dim reversed (typical CNN-style layout). ---
+static void BM_Reverse_3D_Inner(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  array<bool, 3> dim_rev = {true, false, false};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> B = A.reverse(dim_rev);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
+}
+
+// Sweep sizes that span L1 (~32 KB), L2 (~256 KB), and LLC (~MBs) for float
+// tensors. Bytes per element = 4, so per-side sizes:
+//   64x64     =  16 KB (L1)
+//   256x256   = 256 KB (L2)
+//   1024x1024 =   4 MB (LLC / DRAM)
+static void ReverseSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 1024}) {
+    b->Args({size, size});
+  }
+}
+
+static void Reverse3DSizes(::benchmark::Benchmark* b) {
+  b->Args({32, 32, 32});      // 128 KB
+  b->Args({64, 64, 64});      // 1 MB
+  b->Args({128, 128, 128});   // 8 MB
+}
+
+BENCHMARK(BM_Reverse_Inner)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_Outer)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_All)->Apply(ReverseSizes);
+BENCHMARK(BM_Reverse_3D_Inner)->Apply(Reverse3DSizes);
diff --git a/unsupported/benchmarks/Tensor/bench_roll.cpp b/unsupported/benchmarks/Tensor/bench_roll.cpp
new file mode 100644
index 0000000..219a8ff
--- /dev/null
+++ b/unsupported/benchmarks/Tensor/bench_roll.cpp
@@ -0,0 +1,103 @@
+// Benchmarks for Eigen TensorRoll.
+
+#include <benchmark/benchmark.h>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using namespace Eigen;
+
+typedef float Scalar;
+
+// --- Roll only the inner-most (contiguous) dimension. ---
+static void BM_Roll_Inner(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {shift, 0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Roll only an outer dimension. Inner dim stays contiguous. ---
+static void BM_Roll_Outer(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {0, shift};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- Roll every dimension. ---
+static void BM_Roll_All(benchmark::State& state) {
+  const int M = state.range(0);
+  const int N = state.range(1);
+  const int shift = state.range(2);
+
+  Tensor<Scalar, 2> A(M, N);
+  A.setRandom();
+
+  array<Index, 2> rolls = {shift, shift};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 2> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(M) * N * sizeof(Scalar));
+}
+
+// --- 3D roll with the inner dim shifted. ---
+static void BM_Roll_3D_Inner(benchmark::State& state) {
+  const int D0 = state.range(0);
+  const int D1 = state.range(1);
+  const int D2 = state.range(2);
+
+  Tensor<Scalar, 3> A(D0, D1, D2);
+  A.setRandom();
+
+  array<Index, 3> rolls = {D0 / 4, 0, 0};
+
+  for (auto _ : state) {
+    Tensor<Scalar, 3> B = A.roll(rolls);
+    benchmark::DoNotOptimize(B.data());
+    benchmark::ClobberMemory();
+  }
+  state.SetBytesProcessed(state.iterations() * static_cast<int64_t>(D0) * D1 * D2 * sizeof(Scalar));
+}
+
+static void RollSizes(::benchmark::Benchmark* b) {
+  for (int size : {64, 256, 1024}) {
+    for (int shift : {1, 13}) {
+      b->Args({size, size, shift});
+    }
+  }
+}
+
+static void Roll3DSizes(::benchmark::Benchmark* b) {
+  b->Args({32, 32, 32});
+  b->Args({64, 64, 64});
+  b->Args({128, 128, 128});
+}
+
+BENCHMARK(BM_Roll_Inner)->Apply(RollSizes);
+BENCHMARK(BM_Roll_Outer)->Apply(RollSizes);
+BENCHMARK(BM_Roll_All)->Apply(RollSizes);
+BENCHMARK(BM_Roll_3D_Inner)->Apply(Roll3DSizes);
diff --git a/unsupported/test/tensor_reverse.cpp b/unsupported/test/tensor_reverse.cpp
index 37a836a..d6b5a7c 100644
--- a/unsupported/test/tensor_reverse.cpp
+++ b/unsupported/test/tensor_reverse.cpp
@@ -169,6 +169,46 @@
   }
 }
 
+// Verify that the rvalue evaluator's packet() returns the same lanes as
+// coeff() at every aligned and unaligned packet offset. This guards against
+// regressions in the packet implementation that the executor-level tests
+// (which only compare the assembled result) would not surface.
+template <int DataLayout>
+static void test_packet_reverse() {
+  using namespace Eigen::internal;
+
+  Tensor<float, 3, DataLayout> tensor(8, 5, 7);
+  tensor.setRandom();
+
+  array<bool, 3> dim_rev_inner =
+      (DataLayout == ColMajor) ? array<bool, 3>{{true, false, false}} : array<bool, 3>{{false, false, true}};
+  array<bool, 3> dim_rev_outer =
+      (DataLayout == ColMajor) ? array<bool, 3>{{false, false, true}} : array<bool, 3>{{true, false, false}};
+  array<bool, 3> dim_rev_all{{true, true, true}};
+
+  for (const auto& dim_rev : {dim_rev_inner, dim_rev_outer, dim_rev_all}) {
+    auto expr = tensor.reverse(dim_rev);
+    using Eval = TensorEvaluator<const decltype(expr), DefaultDevice>;
+    using Packet = typename Eval::PacketReturnType;
+    constexpr int PacketSize = Eval::PacketSize;
+
+    DefaultDevice device;
+    Eval eval(expr, device);
+    eval.evalSubExprsIfNeeded(nullptr);
+
+    const Index total = tensor.size();
+    EIGEN_ALIGN_MAX float lanes[PacketSize];
+    for (Index offset = 0; offset + PacketSize <= total; ++offset) {
+      Packet p = eval.template packet<Unaligned>(offset);
+      pstoreu(lanes, p);
+      for (int i = 0; i < PacketSize; ++i) {
+        VERIFY_IS_EQUAL(lanes[i], eval.coeff(offset + i));
+      }
+    }
+    eval.cleanup();
+  }
+}
+
 EIGEN_DECLARE_TEST(tensor_reverse) {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
@@ -176,4 +216,6 @@
   CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
   CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
   CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
+  CALL_SUBTEST(test_packet_reverse<ColMajor>());
+  CALL_SUBTEST(test_packet_reverse<RowMajor>());
 }
diff --git a/unsupported/test/tensor_roll.cpp b/unsupported/test/tensor_roll.cpp
index 0142ba7..762b48e 100644
--- a/unsupported/test/tensor_roll.cpp
+++ b/unsupported/test/tensor_roll.cpp
@@ -146,6 +146,47 @@
   }
 }
 
+// Verify that the rvalue evaluator's packet() returns the same lanes as
+// coeff() for every offset across a range of shift values, including the
+// no-shift, in-slice, and wrap-around cases.
+template <int DataLayout>
+static void test_packet_roll() {
+  using namespace Eigen::internal;
+
+  Tensor<float, 3, DataLayout> tensor(8, 5, 7);
+  tensor.setRandom();
+
+  // Cover several shift configurations: zero, in-bounds, and a shift that
+  // forces a modular wrap-around inside any reasonable packet.
+  array<array<Index, 3>, 4> roll_configs;
+  roll_configs[0] = {{0, 0, 0}};
+  roll_configs[1] = {{2, 1, 3}};
+  roll_configs[2] = {{6, 0, 0}};  // near-end shift on dim 0
+  roll_configs[3] = {{0, 0, 6}};  // near-end shift on dim 2
+
+  for (const auto& rolls : roll_configs) {
+    auto expr = tensor.roll(rolls);
+    using Eval = TensorEvaluator<const decltype(expr), DefaultDevice>;
+    using Packet = typename Eval::PacketReturnType;
+    constexpr int PacketSize = Eval::PacketSize;
+
+    DefaultDevice device;
+    Eval eval(expr, device);
+    eval.evalSubExprsIfNeeded(nullptr);
+
+    const Index total = tensor.size();
+    EIGEN_ALIGN_MAX float lanes[PacketSize];
+    for (Index offset = 0; offset + PacketSize <= total; ++offset) {
+      Packet p = eval.template packet<Unaligned>(offset);
+      pstoreu(lanes, p);
+      for (int i = 0; i < PacketSize; ++i) {
+        VERIFY_IS_EQUAL(lanes[i], eval.coeff(offset + i));
+      }
+    }
+    eval.cleanup();
+  }
+}
+
 EIGEN_DECLARE_TEST(tensor_roll) {
   CALL_SUBTEST(test_simple_roll<ColMajor>());
   CALL_SUBTEST(test_simple_roll<RowMajor>());
@@ -153,4 +194,6 @@
   CALL_SUBTEST(test_expr_roll<RowMajor>(true));
   CALL_SUBTEST(test_expr_roll<ColMajor>(false));
   CALL_SUBTEST(test_expr_roll<RowMajor>(false));
+  CALL_SUBTEST(test_packet_roll<ColMajor>());
+  CALL_SUBTEST(test_packet_roll<RowMajor>());
 }