Avoid vectorizing tiny fixed-size assignments

libeigen/eigen!2480

Closes #1342
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 5373222..3c30d22 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h
@@ -83,20 +83,36 @@ (OuterStride != Dynamic) && (OuterStride % InnerPacketSize == 0) && (EIGEN_UNALIGNED_VECTORIZE || JointAlignment >= InnerRequiredAlignment); static constexpr bool MayLinearize = StorageOrdersAgree && (DstFlags & SrcFlags & LinearAccessBit); + static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost); + static constexpr bool SmallAssignmentScalarPathIsCheap = + (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= EIGEN_UNROLLING_LIMIT); + /* Packet traversal has enough setup/tail overhead that it is not worth it + for very small fixed-size assignments when the scalar path can be fully + unrolled. More expensive RHS expressions can still amortize packet setup. */ + static constexpr int SmallAssignmentPacketThreshold = 3; + static constexpr int LinearPacketThreshold = SmallAssignmentScalarPathIsCheap ? SmallAssignmentPacketThreshold : 1; + static constexpr int LinearSizeThreshold = LinearPacketThreshold * LinearPacketSize; static constexpr bool MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess && (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment) || MaxSizeAtCompileTime == Dynamic) && - (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearPacketSize); - /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, - so it's only good for large enough sizes. */ - static constexpr int InnerSizeThreshold = (EIGEN_UNALIGNED_VECTORIZE ? 1 : 3) * InnerPacketSize; + (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearSizeThreshold); + /* Slice vectorization can be slow, so use MaxInnerSize rather than InnerSize: + a dynamic block in a fixed-size matrix can still have large slices. With + EIGEN_UNALIGNED_VECTORIZE and unrolling, one packet is still worthwhile for + non-vector slices. 
Cheap fixed-size vector blocks can otherwise fall back to + slice vectorization after the linear path is rejected, so use the same + conservative cutoff there. */ + static constexpr bool UseConservativeVectorInnerThreshold = IsVectorAtCompileTime && SmallAssignmentScalarPathIsCheap; + static constexpr int VectorInnerPacketThreshold = + (UseConservativeVectorInnerThreshold || !EIGEN_UNALIGNED_VECTORIZE) ? SmallAssignmentPacketThreshold : 1; + static constexpr int VectorInnerSizeThreshold = VectorInnerPacketThreshold * InnerPacketSize; + static constexpr int NonVectorInnerSizeThreshold = + (EIGEN_UNALIGNED_VECTORIZE ? 1 : SmallAssignmentPacketThreshold) * InnerPacketSize; + static constexpr int InnerSizeThreshold = + IsVectorAtCompileTime ? VectorInnerSizeThreshold : NonVectorInnerSizeThreshold; static constexpr bool MaySliceVectorize = MightVectorize && DstHasDirectAccess && (MaxInnerSizeAtCompileTime == Dynamic || MaxInnerSizeAtCompileTime >= InnerSizeThreshold); - /* slice vectorization can be slow, so we only want it if the slices are big, which is - indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block - in a fixed-size matrix - However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */ public: static constexpr int Traversal = SizeAtCompileTime == 0 ? AllAtOnceTraversal @@ -115,7 +131,6 @@ private: static constexpr int ActualPacketSize = Vectorized ? unpacket_traits<PacketType>::size : 1; static constexpr int UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize; - static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost); static constexpr bool MayUnrollCompletely = (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= UnrollingLimit); static constexpr bool MayUnrollInner =
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index ab83d7e..edd9285 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp
@@ -84,6 +84,56 @@ return res; } +template <typename Dst, typename Src> +bool test_add_assign(int traversal, int unrolling) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src); + typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>, internal::evaluator<Src>, + internal::add_assign_op<typename Dst::Scalar, typename Src::Scalar> > + traits; + bool res = traits::Traversal == traversal && traits::Unrolling == unrolling; + if (!res) { + std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; + std::cerr << " " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl; + std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl; + std::cerr << " " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl; + traits::debug(); + std::cerr << " Expected Traversal == " << demangle_traversal(traversal) << " got " + << demangle_traversal(traits::Traversal) << "\n"; + std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling) << " got " + << demangle_unrolling(traits::Unrolling) << "\n"; + } + return res; +} + +template <typename Dst, typename Src> +bool test_add_assign(const Dst&, const Src&, int traversal, int unrolling) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src); + typedef internal::copy_using_evaluator_traits<internal::evaluator<Dst>, internal::evaluator<Src>, + internal::add_assign_op<typename Dst::Scalar, typename Src::Scalar> > + traits; + // If traversal or unrolling are negative, ignore. + bool res = traversal > -1 ? 
traits::Traversal == traversal : true; + if (unrolling > -1) { + if (unrolling == InnerUnrolling + CompleteUnrolling) { + res = res && (int(traits::Unrolling) == InnerUnrolling || int(traits::Unrolling) == CompleteUnrolling); + } else { + res = res && int(traits::Unrolling) == unrolling; + } + } + if (!res) { + std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; + std::cerr << " " << demangle_flags(internal::evaluator<Src>::Flags) << std::endl; + std::cerr << "Dst: " << demangle_flags(Dst::Flags) << std::endl; + std::cerr << " " << demangle_flags(internal::evaluator<Dst>::Flags) << std::endl; + traits::debug(); + std::cerr << " Expected Traversal == " << demangle_traversal(traversal) << " got " + << demangle_traversal(traits::Traversal) << "\n"; + std::cerr << " Expected Unrolling == " << demangle_unrolling(unrolling) << " got " + << demangle_unrolling(traits::Unrolling) << "\n"; + } + return res; +} + template <typename Xpr> bool test_redux(const Xpr&, int traversal, int unrolling) { typedef typename Xpr::Scalar Scalar; @@ -157,6 +207,9 @@ : /*PacketSize==1 ?*/ 1), DontAlign | ((Matrix1::Flags & RowMajorBit) ? RowMajor : ColMajor)> Matrix1u; + constexpr int Matrix1InnerPacketSize = internal::unpacket_traits< + typename internal::find_best_packet<Scalar, Matrix1::InnerSizeAtCompileTime>::type>::size; + constexpr bool Matrix1CanInnerVectorize = int(Matrix1::InnerSizeAtCompileTime) % Matrix1InnerPacketSize == 0; // this type is made such that it can only be vectorized when viewed as a linear 1D vector typedef Matrix<Scalar, @@ -185,14 +238,11 @@ EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1(), Matrix1() + Matrix1(), - (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize)) == 0 ? InnerVectorizedTraversal - : LinearVectorizedTraversal, - CompleteUnrolling)); + Matrix1CanInnerVectorize ? 
InnerVectorizedTraversal : LinearTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix1u(), Matrix1() + Matrix1(), EIGEN_UNALIGNED_VECTORIZE - ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize)) == 0 ? InnerVectorizedTraversal - : LinearVectorizedTraversal) + ? (Matrix1CanInnerVectorize ? InnerVectorizedTraversal : LinearTraversal) : LinearTraversal, CompleteUnrolling)); @@ -203,6 +253,55 @@ CompleteUnrolling)); if (PacketSize > 1) { + typedef Matrix<Scalar, 5, 1> Vector5; + typedef Matrix<Scalar, 8, 1> Vector8; + typedef Block<Vector8, 5, 1> Vector8Block5; + typedef Block<Vector8, 3, 1> Vector8Block3; + enum { + Vector5PacketSize = internal::unpacket_traits<typename internal::find_best_packet<Scalar, 5>::type>::size, + Vector3PacketSize = internal::unpacket_traits<typename internal::find_best_packet<Scalar, 3>::type>::size + }; + enum { + ThresholdPacketSize = internal::unpacket_traits<typename internal::find_best_packet<Scalar, 3>::type>::size + }; + typedef Matrix<Scalar, 2 * ThresholdPacketSize, 1> Vector2Packets; + typedef Matrix<Scalar, 3 * ThresholdPacketSize - 1, 1> VectorAlmost3Packets; + typedef Matrix<Scalar, 3 * ThresholdPacketSize, 1> Vector3Packets; + typedef Matrix<Scalar, 3 * ThresholdPacketSize + 1, 1> Vector3PacketsPlus1; + typedef Block<Vector3PacketsPlus1, 2 * ThresholdPacketSize, 1> VectorBlock2Packets; + typedef Block<Vector3Packets, 3 * ThresholdPacketSize - 1, 1> VectorBlockAlmost3Packets; + typedef Block<Vector3PacketsPlus1, 3 * ThresholdPacketSize, 1> VectorBlock3Packets; + if (Vector5PacketSize > 1) { + VERIFY((test_assign<Vector8Block5, Vector5>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<Vector8Block5, Vector5>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<Vector5, Vector8Block5>(LinearTraversal, CompleteUnrolling))); + } + if (Vector3PacketSize > 1) { + VERIFY((test_assign<Vector8Block3, Vector8Block3>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<Vector8Block3, 
Vector8Block3>(LinearTraversal, CompleteUnrolling))); + } + if (ThresholdPacketSize > 1) { + VERIFY((test_assign<VectorBlock2Packets, Vector2Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<VectorBlock2Packets, Vector2Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_assign<VectorBlockAlmost3Packets, VectorAlmost3Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<VectorBlockAlmost3Packets, VectorAlmost3Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_assign<VectorAlmost3Packets, VectorBlockAlmost3Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<VectorAlmost3Packets, VectorBlockAlmost3Packets>(LinearTraversal, CompleteUnrolling))); + VERIFY((test_assign<VectorBlock3Packets, Vector3Packets>( + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<VectorBlock3Packets, Vector3Packets>( + EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal, CompleteUnrolling))); + VERIFY((test_assign<Vector3Packets, VectorBlock3Packets>( + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal, CompleteUnrolling))); + VERIFY((test_add_assign<Vector3Packets, VectorBlock3Packets>( + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearTraversal, CompleteUnrolling))); + Vector3PacketsPlus1 vector3_packets_plus1; + VERIFY(test_add_assign( + vector3_packets_plus1.template segment<2 * ThresholdPacketSize>(1).array(), + vector3_packets_plus1.template tail<2 * ThresholdPacketSize>().array().log(), + (EIGEN_UNALIGNED_VECTORIZE && PacketTraits::HasLog) ? 
LinearVectorizedTraversal : LinearTraversal, -1)); + } + typedef Matrix<Scalar, 3, 3, ColMajor> Matrix33c; typedef Matrix<Scalar, 3, 1, ColMajor> Vector3; VERIFY( @@ -325,6 +424,12 @@ typedef Matrix<Scalar, 5 * MinVSize, 7, ColMajor> Matrix57; typedef Matrix<Scalar, 3 * MinVSize, 5, ColMajor> Matrix35; typedef Matrix<Scalar, 5 * MinVSize, 7, DontAlign | ColMajor> Matrix57u; + constexpr int Vector1LinearPacketSize = + internal::unpacket_traits<typename internal::find_best_packet<Scalar, Vector1::SizeAtCompileTime>::type>::size; + constexpr bool Vector1CanLinearVectorize = int(Vector1::SizeAtCompileTime) >= 3 * Vector1LinearPacketSize; + constexpr int Vector1SegmentTraversal = + EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal + : (Vector1CanLinearVectorize ? LinearVectorizedTraversal : LinearTraversal); typedef Matrix<Scalar, (PacketSize == 16 ? 8 @@ -352,6 +457,9 @@ : /*PacketSize==1 ?*/ 1), DontAlign | ((Matrix1::Flags & RowMajorBit) ? RowMajor : ColMajor)> Matrix1u; + constexpr int Matrix1InnerPacketSize = internal::unpacket_traits< + typename internal::find_best_packet<Scalar, Matrix1::InnerSizeAtCompileTime>::type>::size; + constexpr bool Matrix1CanInnerVectorize = int(Matrix1::InnerSizeAtCompileTime) % Matrix1InnerPacketSize == 0; // this type is made such that it can only be vectorized when viewed as a linear 1D vector typedef Matrix<Scalar, @@ -370,8 +478,7 @@ #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT VERIFY(test_assign(Vector1(), Vector1(), InnerVectorizedTraversal, CompleteUnrolling)); VERIFY(test_assign(Vector1(), Vector1() + Vector1(), InnerVectorizedTraversal, CompleteUnrolling)); - VERIFY(test_assign(Vector1(), Vector1().template segment<MinVSize>(0).derived(), - EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : LinearVectorizedTraversal, + VERIFY(test_assign(Vector1(), Vector1().template segment<MinVSize>(0).derived(), Vector1SegmentTraversal, CompleteUnrolling)); VERIFY(test_assign(Vector1(), Scalar(RealScalar(2.1)) * Vector1() - Vector1(), InnerVectorizedTraversal, CompleteUnrolling)); @@ -379,7 +486,7 @@ Vector1(), (Scalar(RealScalar(2.1)) * Vector1().template segment<MinVSize>(0) - Vector1().template segment<MinVSize>(0)) .derived(), - EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal, CompleteUnrolling)); + Vector1SegmentTraversal, CompleteUnrolling)); VERIFY(test_assign(Vector1(), Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal, CompleteUnrolling)); VERIFY(test_assign(Vector1(), Vector1().template cast<Scalar>(), InnerVectorizedTraversal, CompleteUnrolling)); @@ -391,8 +498,7 @@ VERIFY(test_assign(Matrix1u(), Matrix1() + Matrix1(), EIGEN_UNALIGNED_VECTORIZE - ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize)) == 0 ? InnerVectorizedTraversal - : LinearVectorizedTraversal) + ? (Matrix1CanInnerVectorize ? InnerVectorizedTraversal : LinearTraversal) : LinearTraversal, CompleteUnrolling)); @@ -471,15 +577,15 @@ CALL_SUBTEST(vectorization_logic_half<std::complex<double> >::run()); // For backends without sub-packet types (e.g. the generic clang backend), - // find_best_packet may return a packet larger than the matrix, making - // MayLinearVectorize false in the assignment path (which requires - // SizeAtCompileTime >= PacketSize). The redux path has no such constraint. + // find_best_packet may return a packet too large for tiny fixed-size assignments, + // making MayLinearVectorize false in the assignment path (which requires + // SizeAtCompileTime >= 3 * PacketSize). The redux path has no such constraint. 
if (internal::packet_traits<float>::Vectorizable) { constexpr int kFloatBestPacketSize3x3 = internal::unpacket_traits<typename internal::find_best_packet<float, 9>::type>::size; VERIFY(test_assign( Matrix<float, 3, 3>(), Matrix<float, 3, 3>() + Matrix<float, 3, 3>(), - EIGEN_UNALIGNED_VECTORIZE && kFloatBestPacketSize3x3 <= 9 ? LinearVectorizedTraversal : LinearTraversal, + EIGEN_UNALIGNED_VECTORIZE && 3 * kFloatBestPacketSize3x3 <= 9 ? LinearVectorizedTraversal : LinearTraversal, CompleteUnrolling)); VERIFY(test_redux(Matrix<float, 5, 2>(), EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal, @@ -491,7 +597,7 @@ internal::unpacket_traits<typename internal::find_best_packet<double, 9>::type>::size; VERIFY(test_assign( Matrix<double, 3, 3>(), Matrix<double, 3, 3>() + Matrix<double, 3, 3>(), - EIGEN_UNALIGNED_VECTORIZE && kDoubleBestPacketSize3x3 <= 9 ? LinearVectorizedTraversal : LinearTraversal, + EIGEN_UNALIGNED_VECTORIZE && 3 * kDoubleBestPacketSize3x3 <= 9 ? LinearVectorizedTraversal : LinearTraversal, CompleteUnrolling)); VERIFY(test_redux(Matrix<double, 7, 3>(), EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal,