Speed up StableNorm for non-trivial sizes and improve consistency between aligned and unaligned inputs.
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h
index de84d81..711ee3f 100644
--- a/Eigen/src/Core/StableNorm.h
+++ b/Eigen/src/Core/StableNorm.h
@@ -48,34 +48,16 @@
template <typename VectorType, typename RealScalar>
void stable_norm_impl_inner_step(const VectorType& vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) {
- typedef typename VectorType::Scalar Scalar;
const Index blockSize = 4096;
- typedef typename internal::nested_eval<VectorType, 2>::type VectorTypeCopy;
- typedef internal::remove_all_t<VectorTypeCopy> VectorTypeCopyClean;
- const VectorTypeCopy copy(vec);
-
- enum {
- CanAlign =
- ((int(VectorTypeCopyClean::Flags) & DirectAccessBit) ||
- (int(internal::evaluator<VectorTypeCopyClean>::Alignment) > 0) // FIXME Alignment)>0 might not be enough
- ) &&
- (blockSize * sizeof(Scalar) * 2 < EIGEN_STACK_ALLOCATION_LIMIT) &&
- (EIGEN_MAX_STATIC_ALIGN_BYTES >
- 0) // if we cannot allocate on the stack, then let's not bother about this optimization
- };
- typedef std::conditional_t<
- CanAlign,
- Ref<const Matrix<Scalar, Dynamic, 1, 0, blockSize, 1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
- typename VectorTypeCopyClean::ConstSegmentReturnType>
- SegmentWrapper;
Index n = vec.size();
-
- Index bi = internal::first_default_aligned(copy);
- if (bi > 0) internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
- for (; bi < n; bi += blockSize)
- internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi, numext::mini(blockSize, n - bi))), ssq, scale,
- invScale);
+ Index blockEnd = numext::round_down(n, blockSize);
+ for (Index i = 0; i < blockEnd; i += blockSize) {
+ internal::stable_norm_kernel(vec.template segment<blockSize>(i), ssq, scale, invScale);
+ }
+ if (n > blockEnd) {
+ internal::stable_norm_kernel(vec.tail(n - blockEnd), ssq, scale, invScale);
+ }
}
template <typename VectorType>
@@ -85,8 +67,7 @@
using std::sqrt;
Index n = vec.size();
-
- if (n == 1) return abs(vec.coeff(0));
+ if (EIGEN_PREDICT_FALSE(n == 1)) return abs(vec.coeff(0));
typedef typename VectorType::RealScalar RealScalar;
RealScalar scale(0);