Update the padding computation for PADDING_SAME to be consistent with TensorFlow.
diff --git a/.hgignore b/.hgignore index dcd9f44..ebbf746 100644 --- a/.hgignore +++ b/.hgignore
@@ -13,7 +13,7 @@ core.* *.bak *~ -build* +*build* *.moc.* *.moc ui_*
diff --git a/Eigen/Core b/Eigen/Core index c66359b..5a6dec8 100644 --- a/Eigen/Core +++ b/Eigen/Core
@@ -54,9 +54,9 @@ #endif #define EIGEN_DEVICE_FUNC __host__ __device__ - // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro + // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro // works properly on the device side - #include <math_functions.hpp> + #include <cuda_runtime.h> #else #define EIGEN_DEVICE_FUNC #endif
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 15b361b..d145411 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h
@@ -136,7 +136,9 @@ public: EIGEN_DEVICE_FUNC plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) { - EIGEN_ONLY_USED_FOR_DEBUG(outerStride); +#ifndef EIGEN_INTERNAL_DEBUGGING + EIGEN_UNUSED_VARIABLE(outerStride); +#endif eigen_internal_assert(outerStride==OuterStride); } EIGEN_DEVICE_FUNC Index outerStride() const { return OuterStride; } @@ -1034,7 +1036,7 @@ OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time<ArgType>::ret) : int(inner_stride_at_compile_time<ArgType>::ret), - MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator<ArgType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, @@ -1044,7 +1046,9 @@ Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, PacketAlignment = unpacket_traits<PacketScalar>::alignment, - Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) + && (OuterStrideAtCompileTime!=0) + && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0) }; typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
diff --git a/Eigen/src/Core/StableNorm.h b/Eigen/src/Core/StableNorm.h index 4ea598b..e0e7a0c 100644 --- a/Eigen/src/Core/StableNorm.h +++ b/Eigen/src/Core/StableNorm.h
@@ -165,7 +165,7 @@ typedef typename internal::nested_eval<Derived,2>::type DerivedCopy; typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean; - DerivedCopy copy(derived()); + const DerivedCopy copy(derived()); enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit)
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 3a56b0e..e351b7a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h
@@ -514,11 +514,13 @@ // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC, // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. +#ifndef EIGEN_STRONG_INLINE #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline #endif +#endif // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible // attribute to maximize inlining. This should only be used when really necessary: in particular,
diff --git a/Eigen/src/Geometry/Scaling.h b/Eigen/src/Geometry/Scaling.h index f58ca03..c1899a0 100755 --- a/Eigen/src/Geometry/Scaling.h +++ b/Eigen/src/Geometry/Scaling.h
@@ -29,6 +29,22 @@ * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ + +namespace internal +{ + // This helper helps nvcc+MSVC to properly parse this file. + // See bug 1412. + template <typename Scalar, int Dim, int Mode> + struct uniformscaling_times_affine_returntype + { + enum + { + NewMode = int(Mode) == int(Isometry) ? Affine : Mode + }; + typedef Transform <Scalar, Dim, NewMode> type; + }; +} + template<typename _Scalar> class UniformScaling { @@ -60,9 +76,11 @@ /** Concatenates a uniform scaling and an affine transformation */ template<int Dim, int Mode, int Options> - inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const + inline typename + internal::uniformscaling_times_affine_returntype <Scalar,Dim,Mode>::type + operator* (const Transform<Scalar, Dim, Mode, Options>& t) const { - Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t; + typename internal::uniformscaling_times_affine_returntype <Scalar,Dim,Mode> res = t; res.prescale(factor()); return res; } @@ -70,7 +88,7 @@ /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template<typename Derived> - inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const + inline typename Eigen::internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const { return other * m_factor; } template<typename Derived,int Dim>
diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 0abd4c1..06865a3 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h
@@ -217,7 +217,7 @@ // Method to allocate and initialize matrix and attributes template<typename MatrixType> -void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions) +void BDCSVD<MatrixType>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -393,7 +393,7 @@ //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template<typename MatrixType> -void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) +void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { // requires rows = cols + 1; using std::pow; @@ -573,7 +573,7 @@ // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template <typename MatrixType> -void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)(); using std::abs; @@ -1059,7 +1059,7 @@ // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M template <typename MatrixType> -void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size) +void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1088,7 +1088,7 @@ // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template <typename MatrixType> -void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) +void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) { using std::abs; using std::sqrt; @@ -1128,7 +1128,7 @@ // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template <typename MatrixType> -void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) +void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) { using std::sqrt; using std::abs;
diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 43488b1..1c7c803 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h
@@ -610,7 +610,7 @@ }; template<typename MatrixType, int QRPreconditioner> -void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions) +void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0);
diff --git a/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/Eigen/src/SVD/JacobiSVD_LAPACKE.h index 5027215..ff0516f 100644 --- a/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -61,9 +61,10 @@ u = (LAPACKE_TYPE*)m_matrixU.data(); \ } else { ldu=1; u=&dummy; }\ MatrixType localV; \ - ldvt = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \ + lapack_int vt_rows = (m_computeFullV) ? internal::convert_index<lapack_int>(m_cols) : (m_computeThinV) ? internal::convert_index<lapack_int>(m_diagSize) : 1; \ if (computeV()) { \ - localV.resize(ldvt, m_cols); \ + localV.resize(vt_rows, m_cols); \ + ldvt = internal::convert_index<lapack_int>(localV.outerStride()); \ vt = (LAPACKE_TYPE*)localV.data(); \ } else { ldvt=1; vt=&dummy; }\ Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt index 9327356..029ba6d 100644 --- a/bench/spbench/CMakeLists.txt +++ b/bench/spbench/CMakeLists.txt
@@ -60,7 +60,7 @@ endif(SCOTCH_FOUND) set(SPARSE_LIBS ${SPARSE_LIBS} ${PASTIX_LIBRARIES_DEP} ${ORDERING_LIBRARIES}) set(PASTIX_ALL_LIBS ${PASTIX_LIBRARIES_DEP}) -endif(PASTIX_FOUND AND BLAS_FOUND) +endif() if(METIS_FOUND) include_directories(${METIS_INCLUDE_DIRS})
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox index 0919d41..b6d08c7 100644 --- a/doc/PreprocessorDirectives.dox +++ b/doc/PreprocessorDirectives.dox
@@ -122,6 +122,10 @@ this threshold raises a compile time assertion. Use 0 to set no limit. Default is 128 KB. - \b \c EIGEN_NO_CUDA - disables CUDA support when defined. Might be useful in .cu files for which Eigen is used on the host only, and never called from device code. + - \b \c EIGEN_STRONG_INLINE - This macro is used to qualify critical functions and methods that we expect the compiler to inline. + By default it is defined to \c __forceinline for MSVC and ICC, and to \c inline for other compilers. A tipical usage is to + define it to \c inline for MSVC users wanting faster compilation times, at the risk of performance degradations in some rare + cases for which MSVC inliner fails to do a good job. - \c EIGEN_DONT_ALIGN - Deprecated, it is a synonym for \c EIGEN_MAX_ALIGN_BYTES=0. It disables alignment completely. %Eigen will not try to align its objects and does not expect that any objects passed to it are aligned. This will turn off vectorization if \b EIGEN_UNALIGNED_VECTORIZE=1. Not defined by default.
diff --git a/test/klu_support.cpp b/test/klu_support.cpp index 8b1fdeb..138dcc3 100644 --- a/test/klu_support.cpp +++ b/test/klu_support.cpp
@@ -10,7 +10,7 @@ #define EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS #include "sparse_solver.h" -#include <unsupported/Eigen/KLUSupport> +#include <Eigen/KLUSupport> template<typename T> void test_klu_support_T() {
diff --git a/test/sparse_basic.cpp b/test/sparse_basic.cpp index 3849850..f84b6e3 100644 --- a/test/sparse_basic.cpp +++ b/test/sparse_basic.cpp
@@ -228,8 +228,8 @@ VERIFY_RAISES_ASSERT( m1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 -= m1.innerVector(0) ); VERIFY_RAISES_ASSERT( refM1 += m1.innerVector(0) ); - m1 = m4; refM1 = refM4; } + m1 = m4; refM1 = refM4; // test aliasing VERIFY_IS_APPROX((m1 = -m1), (refM1 = -refM1));
diff --git a/test/stable_norm.cpp b/test/stable_norm.cpp index c3eb5ff..3c02474 100644 --- a/test/stable_norm.cpp +++ b/test/stable_norm.cpp
@@ -65,6 +65,8 @@ factor = internal::random<Scalar>(); Scalar small = factor * ((std::numeric_limits<RealScalar>::min)() * RealScalar(1e4)); + Scalar one(1); + MatrixType vzero = MatrixType::Zero(rows, cols), vrand = MatrixType::Random(rows, cols), vbig(rows, cols), @@ -78,6 +80,14 @@ VERIFY_IS_APPROX(vrand.blueNorm(), vrand.norm()); VERIFY_IS_APPROX(vrand.hypotNorm(), vrand.norm()); + // test with expressions as input + VERIFY_IS_APPROX((one*vrand).stableNorm(), vrand.norm()); + VERIFY_IS_APPROX((one*vrand).blueNorm(), vrand.norm()); + VERIFY_IS_APPROX((one*vrand).hypotNorm(), vrand.norm()); + VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).stableNorm(), vrand.norm()); + VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).blueNorm(), vrand.norm()); + VERIFY_IS_APPROX((one*vrand+one*vrand-one*vrand).hypotNorm(), vrand.norm()); + RealScalar size = static_cast<RealScalar>(m.size()); // test numext::isfinite
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 83c1439..37e7495 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp
@@ -207,6 +207,12 @@ VERIFY(test_redux(Vector1(), LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Vector1().array()*Vector1().array(), + LinearVectorizedTraversal,CompleteUnrolling)); + + VERIFY(test_redux((Vector1().array()*Vector1().array()).col(0), + LinearVectorizedTraversal,CompleteUnrolling)); + VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(), LinearVectorizedTraversal,CompleteUnrolling));
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 0d6331e..1d459a3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -836,7 +836,8 @@ protected: template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - template <typename OtherDerived, int AccessLevel> friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } }; @@ -852,7 +853,8 @@ template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase; + // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 + template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index 10e0a8a..f81da31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -231,20 +231,32 @@ // t_n = exp(sqrt(-1) * pi * n^2 / line_len) // for n = 0, 1,..., line_len-1. // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - pos_j_base_powered[0] = ComplexScalar(1, 0); - if (line_len > 1) { - const RealScalar pi_over_len(EIGEN_PI / line_len); - const ComplexScalar pos_j_base = ComplexScalar( - std::cos(pi_over_len), std::sin(pi_over_len)); - pos_j_base_powered[1] = pos_j_base; - if (line_len > 2) { - const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - for (int j = 2; j < line_len + 1; ++j) { - pos_j_base_powered[j] = pos_j_base_powered[j - 1] * - pos_j_base_powered[j - 1] / - pos_j_base_powered[j - 2] * pos_j_base_sq; - } - } + + // The recurrence is correct in exact arithmetic, but causes + // numerical issues for large transforms, especially in + // single-precision floating point. + // + // pos_j_base_powered[0] = ComplexScalar(1, 0); + // if (line_len > 1) { + // const ComplexScalar pos_j_base = ComplexScalar( + // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); + // pos_j_base_powered[1] = pos_j_base; + // if (line_len > 2) { + // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + // for (int i = 2; i < line_len + 1; ++i) { + // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * + // pos_j_base_powered[i - 1] / + // pos_j_base_powered[i - 2] * + // pos_j_base_sq; + // } + // } + // } + // TODO(rmlarsen): Find a way to use Eigen's vectorized sin + // and cosine functions here. + for (int j = 0; j < line_len + 1; ++j) { + double arg = ((EIGEN_PI * j) * j) / line_len; + std::complex<double> tmp(numext::cos(arg), numext::sin(arg)); + pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp); } }
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp index 2f14ebc..a553694 100644 --- a/unsupported/test/cxx11_tensor_fft.cpp +++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,6 +224,32 @@ } } +template <typename RealScalar> +static void test_fft_non_power_of_2_round_trip(int exponent) { + int n = (1 << exponent) + 1; + + Eigen::DSizes<long, 1> dimensions; + dimensions[0] = n; + const DSizes<long, 1> arr = dimensions; + Tensor<RealScalar, 1, ColMajor, long> input; + + input.resize(arr); + input.setRandom(); + + array<int, 1> fft; + fft[0] = 0; + + Tensor<std::complex<RealScalar>, 1, ColMajor> forward = + input.template fft<BothParts, FFT_FORWARD>(fft); + + Tensor<RealScalar, 1, ColMajor, long> output = + forward.template fft<RealPart, FFT_REVERSE>(fft); + + for (int i = 0; i < n; ++i) { + VERIFY_IS_APPROX(input[i], output[i]); + } +} + void test_cxx11_tensor_fft() { test_fft_complex_input_golden(); test_fft_real_input_golden(); @@ -270,4 +296,6 @@ test_fft_real_input_energy<RowMajor, double, true, Eigen::BothParts, FFT_FORWARD, 4>(); test_fft_real_input_energy<RowMajor, float, false, Eigen::BothParts, FFT_FORWARD, 4>(); test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>(); + + test_fft_non_power_of_2_round_trip<float>(7); }