AVX2 - double->int64_t casting
diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 3b94af5..4a0c72e 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -560,7 +560,7 @@
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
- return _mm256_shuffle_epi32(_mm256_srai_epi32(a, 31), (shuffle_mask<1, 1, 3, 3>::mask));
+ return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
}
template <int N>
EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
@@ -1802,14 +1802,12 @@
// pabs should be ok
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
- const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF,
- 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF));
+ const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
return _mm256_and_ps(a, mask);
}
template <>
EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
- const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF,
- 0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF));
+ const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
return _mm256_and_pd(a, mask);
}
template <>
@@ -1829,28 +1827,28 @@
template <>
EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
- return _mm_srai_epi16(a, 15);
+ return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
}
template <>
EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
- return _mm_srai_epi16(a, 15);
+ return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
}
template <>
EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
- return _mm256_castsi256_ps(parithmetic_shift_right<31>((Packet8i)_mm256_castps_si256(a)));
+ return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
}
template <>
EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& a) {
- return pzero(a);
+ return _mm256_setzero_si256();
}
#ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
- return _mm256_castsi256_pd(parithmetic_shift_right<63>((Packet4l)_mm256_castpd_si256(a)));
+ return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
}
template <>
EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& a) {
- return pzero(a);
+ return _mm256_setzero_si256();
}
#endif
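
Note on the PacketMath.h hunks above: parithmetic_shift_right<63> and the psignbit overloads only need the sign bit replicated across every bit of the lane, and a signed compare against zero (_mm256_cmpgt_epi64(_mm256_setzero_si256(), a)) yields exactly that all-ones/all-zeros mask. AVX2 has no 64-bit arithmetic right shift, which is presumably why the compare form is preferred throughout. A minimal scalar sketch of the identity, assuming an arithmetic shift on the target (function names are illustrative, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Right shift by width-1 replicates the sign bit; a signed compare against zero produces the
    // same all-ones / all-zeros mask per lane. (Right-shifting a negative signed value is
    // implementation-defined before C++20, arithmetic on mainstream compilers.)
    inline int64_t sign_mask_via_shift(int64_t x) { return x >> 63; }
    inline int64_t sign_mask_via_cmp(int64_t x) { return (0 > x) ? int64_t(-1) : int64_t(0); }

    int main() {
      const int64_t samples[] = {INT64_MIN, -5, 0, 7, INT64_MAX};
      for (int64_t x : samples) assert(sign_mask_via_shift(x) == sign_mask_via_cmp(x));
      return 0;
    }
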
diff --git a/Eigen/src/Core/arch/AVX/TypeCasting.h b/Eigen/src/Core/arch/AVX/TypeCasting.h
index 2581eff..9dcd6ef 100644
--- a/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -200,10 +200,38 @@
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
return _mm256_cvttpd_epi64(a);
#else
- EIGEN_ALIGN16 double aux[4];
- pstore(aux, a);
- return _mm256_set_epi64x(static_cast<int64_t>(aux[3]), static_cast<int64_t>(aux[2]), static_cast<int64_t>(aux[1]),
- static_cast<int64_t>(aux[0]));
+
+ // if 'a' exceeds the numerical limits of int64_t, the behavior is undefined
+
+ // e < 0 corresponds to |a| < 1, which should result in zero. incidentally, intel intrinsics with shift arguments
+ // greater than or equal to 64 produce zero. furthermore, negative shift counts are interpreted as large positive
+ // counts (two's complement), which also result in zero. therefore, e does not need to be clamped to [0, 64)
+
+ constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+ kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+ const __m256i cst_one = _mm256_set1_epi64x(1);
+ const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
+ const __m256i cst_bias = _mm256_set1_epi64x(kBias);
+
+ __m256i a_bits = _mm256_castpd_si256(a);
+ // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover the biased exponent
+ __m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
+ __m256i e = _mm256_sub_epi64(biased_e, cst_bias);
+
+ // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+ __m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
+ // shift to the right by kTotalBits - e to convert the significand to an integer
+ __m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));
+
+ // add the implied bit
+ __m256i result_exponent = _mm256_sllv_epi64(cst_one, e);
+ // for e < 0, the shift count above is interpreted as a large positive count (two's complement), which also conveniently results in zero
+ __m256i result = _mm256_add_epi64(result_significand, result_exponent);
+ // handle negative arguments: (result ^ sign_mask) - sign_mask negates result when sign_mask is all ones
+ __m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
+ result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
+ return result;
#endif
}
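
For reference, a scalar model of the bit-manipulation path added above, assuming the same IEEE-754 layout; the function name and the explicit e == 0 / e < 0 guards are illustrative and not part of the patch (the intrinsics get the same effect because shift counts outside [0, 64) collapse to zero):

    #include <climits>
    #include <cstdint>
    #include <cstring>
    #include <limits>

    // Illustrative scalar equivalent of the vectorized double -> int64_t truncation.
    // As in the SIMD code, behavior is undefined when |x| exceeds the range of int64_t.
    int64_t cast_double_to_int64(double x) {
      constexpr int kTotalBits = sizeof(double) * CHAR_BIT;                   // 64
      constexpr int kMantissaBits = std::numeric_limits<double>::digits - 1;  // 52
      constexpr int kExponentBits = kTotalBits - kMantissaBits - 1;           // 11
      constexpr int kBias = (1 << (kExponentBits - 1)) - 1;                   // 1023

      uint64_t bits;
      std::memcpy(&bits, &x, sizeof(bits));

      // unbiased exponent e, so that |x| = (1 + m / 2^52) * 2^e for normal x
      int64_t e = static_cast<int64_t>((bits << 1) >> (kMantissaBits + 1)) - kBias;

      uint64_t magnitude = 0;
      if (e >= 0) {  // e < 0 means |x| < 1, which truncates to zero
        uint64_t shifted_mantissa = bits << (kExponentBits + 1);  // fraction bits, left-aligned
        // integer part of m * 2^(e - 52); scalar C++ must avoid a shift by 64, the intrinsic simply returns zero
        uint64_t significand = (e == 0) ? 0 : (shifted_mantissa >> (kTotalBits - e));
        uint64_t implied_bit = uint64_t(1) << e;  // contribution of the implicit leading 1, i.e. 2^e
        magnitude = significand + implied_bit;
      }

      // negate via two's complement when the sign bit is set: (r ^ mask) - mask
      uint64_t sign_mask = (bits >> (kTotalBits - 1)) ? ~uint64_t(0) : uint64_t(0);
      return static_cast<int64_t>((magnitude ^ sign_mask) - sign_mask);
    }

For example, cast_double_to_int64(-5.75) takes the e = 2 path: the significand contributes 1, the implied bit contributes 4, and the sign mask turns the magnitude 5 into -5, matching static_cast<int64_t>(-5.75).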