Merge from eigen/eigen

diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 05462c5..954863c 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h

@@ -1282,17 +1282,17 @@
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 std::complex<float> exp(const std::complex<float>& x) {
-  auto com = ::expf(x.real());
-  auto res_real = com * ::cosf(x.imag());
-  auto res_imag = com * ::sinf(x.imag());
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
   return std::complex<float>(res_real, res_imag);
 }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 std::complex<double> exp(const std::complex<double>& x) {
-  auto com = ::exp(x.real());
-  auto res_real = com * ::cos(x.imag());
-  auto res_imag = com * ::sin(x.imag());
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
   return std::complex<double>(res_real, res_imag);
 }
 #endif

diff --git a/Eigen/src/Core/SelfCwiseBinaryOp.h b/Eigen/src/Core/SelfCwiseBinaryOp.h
index 50099df..7c89c2e 100644
--- a/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/Eigen/src/Core/SelfCwiseBinaryOp.h

@@ -17,7 +17,6 @@
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op<Scalar,Scalar>());
   return derived();
 }
@@ -25,7 +24,6 @@
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator+=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op<Scalar,Scalar>());
   return derived();
 }
@@ -33,7 +31,6 @@
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase<Derived>::operator-=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op<Scalar,Scalar>());
   return derived();
 }
@@ -41,7 +38,6 @@
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other)
 {
-  typedef typename Derived::PlainObject PlainObject;
   internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op<Scalar,Scalar>());
   return derived();
 }

diff --git a/Eigen/src/Core/Solve.h b/Eigen/src/Core/Solve.h
index a8daea5..2bf940a 100644
--- a/Eigen/src/Core/Solve.h
+++ b/Eigen/src/Core/Solve.h

@@ -181,7 +181,7 @@
   }
 };
 
-} // end namepsace internal
+} // end namespace internal
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h
index 59aebd9..774e649 100644
--- a/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX/PacketMath.h

@@ -318,9 +318,9 @@
 }
 
 #ifndef EIGEN_VECTORIZE_AVX512
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet8f>(const Packet8f& a) {

diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index eb5de43..4e2e916 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h

@@ -604,9 +604,9 @@
   pstore(to, pa);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template <>
 EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {

diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h
index c618cfa..d075043 100644
--- a/Eigen/src/Core/arch/SSE/Complex.h
+++ b/Eigen/src/Core/arch/SSE/Complex.h

@@ -128,7 +128,7 @@
                                      _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
@@ -324,7 +324,7 @@
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {

diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h
index d1a7c65..c944d2c 100755
--- a/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/Eigen/src/Core/arch/SSE/PacketMath.h

@@ -461,10 +461,16 @@
   pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 
+#if EIGEN_COMP_PGI
+typedef const void * SsePrefetchPtrType;
+#else
+typedef const char * SsePrefetchPtrType;
+#endif
+
 #ifndef EIGEN_VECTORIZE_AVX
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const void*)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
 #endif
 
 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64

diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index e4d1310..15b5c5f 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h

@@ -91,7 +91,7 @@
   // FIXME the transpose variable is only needed to properly split
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redisigned anyway.
+  // parallelizer mechanism has to be redesigned anyway.
   EIGEN_UNUSED_VARIABLE(depth);
   EIGEN_UNUSED_VARIABLE(transpose);
   func(0,rows, 0,cols);

diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 9c68ecb..8927bd4 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h

@@ -702,7 +702,7 @@
 // If the user explicitly disable vectorization, then we also disable alignment
 #if defined(EIGEN_DONT_VECTORIZE)
   #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
-#elif defined(EIGEN_VECTORIZE_AVX512)
+#elif defined(__AVX512F__)
   // 64 bytes static alignment is preferred only if really required
   #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #elif defined(__AVX__)
@@ -1030,7 +1030,13 @@
 #   define EIGEN_NOEXCEPT
 #   define EIGEN_NOEXCEPT_IF(x)
 #   define EIGEN_NO_THROW throw()
-#   define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#   if EIGEN_COMP_MSVC
+      // MSVC does not support exception specifications (warning C4290),
+      // and they are deprecated in c++11 anyway.
+#     define EIGEN_EXCEPTION_SPEC(X) throw()
+#   else
+#     define EIGEN_EXCEPTION_SPEC(X) throw(X)
+#   endif
 #endif
 
 #endif // EIGEN_MACROS_H

diff --git a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 0ace451..79e1e48 100644
--- a/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h

@@ -108,7 +108,7 @@
   }
 };
 
-} // end namepsace internal
+} // end namespace internal
 
 } // end namespace Eigen
 

diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 84a1bf2..0aa92f8 100644
--- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h

@@ -5,7 +5,7 @@
 
 /*
 
-NOTE: thes functions vave been adapted from the LDL library:
+NOTE: these functions have been adapted from the LDL library:
 
 LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
 

diff --git a/doc/examples/Cwise_lgamma.cpp b/doc/examples/Cwise_lgamma.cpp
index f1c4f50..6bfaccb 100644
--- a/doc/examples/Cwise_lgamma.cpp
+++ b/doc/examples/Cwise_lgamma.cpp

@@ -6,4 +6,4 @@
 {
   Array4d v(0.5,10,0,-1);
   std::cout << v.lgamma() << std::endl;
-}
\ No newline at end of file
+}

diff --git a/doc/examples/Tutorial_simple_example_dynamic_size.cpp b/doc/examples/Tutorial_simple_example_dynamic_size.cpp
index 0f0280e..defcb1e 100644
--- a/doc/examples/Tutorial_simple_example_dynamic_size.cpp
+++ b/doc/examples/Tutorial_simple_example_dynamic_size.cpp

@@ -10,7 +10,7 @@
     MatrixXi m(size,size+1);         // a (size)x(size+1)-matrix of int's
     for (int j=0; j<m.cols(); ++j)   // loop over columns
       for (int i=0; i<m.rows(); ++i) // loop over rows
-        m(i,j) = i+j*m.rows();       // to access matrix coefficients,
+        m(i,j) = i+j*size;           // to access matrix coefficients,
                                      // use operator()(int,int)
     std::cout << m << "\n\n";
   }

diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp
index 6a205ae..7469938 100644
--- a/doc/examples/matrixfree_cg.cpp
+++ b/doc/examples/matrixfree_cg.cpp

@@ -67,6 +67,7 @@
       // This method should implement "dst += alpha * lhs * rhs" inplace,
       // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it.
       assert(alpha==Scalar(1) && "scaling is not implemented");
+      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
 
       // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs,
       // but let's do something fancier (and less efficient):

diff --git a/doc/special_examples/Tutorial_sparse_example.cpp b/doc/special_examples/Tutorial_sparse_example.cpp
index 89937b4..8850db0 100644
--- a/doc/special_examples/Tutorial_sparse_example.cpp
+++ b/doc/special_examples/Tutorial_sparse_example.cpp

@@ -1,5 +1,6 @@
 #include <Eigen/Sparse>
 #include <vector>
+#include <iostream>
 
 typedef Eigen::SparseMatrix<double> SpMat; // declares a column-major sparse matrix type of double
 typedef Eigen::Triplet<double> T;
@@ -9,7 +10,10 @@
 
 int main(int argc, char** argv)
 {
-  assert(argc==2);
+  if(argc!=2) {
+    std::cerr << "Error: expected one and only one argument.\n";
+    return -1;
+  }
   
   int n = 300;  // size of the image
   int m = n*n;  // number of unknowns (=number of pixels)

diff --git a/test/indexed_view.cpp b/test/indexed_view.cpp
index 033d883..2d46ffd 100644
--- a/test/indexed_view.cpp
+++ b/test/indexed_view.cpp

@@ -140,7 +140,7 @@
     "500  501  502  503  504  505  506  507  508  509")
   );
 
-  // takes the row numer 3, and repeat it 5 times
+  // take row number 3, and repeat it 5 times
   VERIFY( MATCH( A(seqN(3,5,0), all),
     "300  301  302  303  304  305  306  307  308  309\n"
     "300  301  302  303  304  305  306  307  308  309\n"

diff --git a/test/main.h b/test/main.h
index 0fcd6cb..a94e1ab 100644
--- a/test/main.h
+++ b/test/main.h

@@ -359,7 +359,7 @@
 inline bool test_isApprox(const short& a, const short& b)
 { return internal::isApprox(a, b, test_precision<short>()); }
 inline bool test_isApprox(const unsigned short& a, const unsigned short& b)
-{ return internal::isApprox(a, b, test_precision<unsigned long>()); }
+{ return internal::isApprox(a, b, test_precision<unsigned short>()); }
 inline bool test_isApprox(const unsigned int& a, const unsigned int& b)
 { return internal::isApprox(a, b, test_precision<unsigned int>()); }
 inline bool test_isApprox(const long& a, const long& b)

diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 49cc33c..dfd7ab7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md

@@ -581,7 +581,7 @@
 
 Creates a tensor mapping an existing array of data. The data must not be freed
 until the TensorMap is discarded, and the size of the data must be large enough
-to accomodate of the coefficients of the tensor.
+to accommodate the coefficients of the tensor.
 
     float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
     Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 1940a96..4fd9644 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h

@@ -48,7 +48,7 @@
   *
   * <dl>
   * <dt><b>Relation to other parts of Eigen:</b></dt>
-  * <dd>The midterm developement goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+  * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
   * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
   * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
   * class does not provide any of these features and is only available as a stand-alone class that just allows for

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 88b0af0..bdc1a17 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h

@@ -20,7 +20,7 @@
   * \brief The tensor base class.
   *
   * This class is the common parent of the Tensor and TensorMap class, thus
-  * making it possible to use either class interchangably in expressions.
+  * making it possible to use either class interchangeably in expressions.
   */
 
 template<typename Derived>

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index b6c93af..9ab6b35 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h

@@ -105,6 +105,7 @@
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  bool nByOne = false, oneByN = false;
 
   enum {
     IsAligned = true,
@@ -142,6 +143,24 @@
         m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
       }
     }
+
+    if (input_dims[0] == 1) {
+      oneByN = true;
+      for (int i = 1; i < NumDims; ++i) {
+        if (broadcast[i] != 1) {
+          oneByN = false;
+          break;
+        }
+      }
+    } else if (input_dims[NumDims-1] == 1) {
+      nByOne = true;
+      for (int i = 0; i < NumDims-1; ++i) {
+        if (broadcast[i] != 1) {
+          nByOne = false;
+          break;
+        }
+      }
+    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
@@ -237,9 +256,84 @@
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      return packetColMajor<LoadMode>(index);
+      if (oneByN) {
+        return packetNByOne<LoadMode>(index);
+      } else if (nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else {
+        return packetColMajor<LoadMode>(index);
+      }
     } else {
-      return packetRowMajor<LoadMode>(index);
+      if (oneByN) {
+        return packetOneByN<LoadMode>(index);
+      } else if (nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else {
+        return packetRowMajor<LoadMode>(index);
+      }
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    Index dim, inputIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = NumDims - 1;
+    } else {
+      dim = 0;
+    }
+
+    inputIndex = index % m_inputStrides[dim];
+    if (inputIndex + PacketSize <= m_inputStrides[dim]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        if (inputIndex > m_inputStrides[dim]-1) {
+          inputIndex = 0;
+        }
+        values[i] = m_impl.coeff(inputIndex++);
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
+  {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    Index dim, inputIndex, outputOffset;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      dim = 1;
+    } else {
+      dim = NumDims - 2;
+    }
+
+    inputIndex   = index / m_outputStrides[dim];
+    outputOffset = index % m_outputStrides[dim];
+    if (outputOffset + PacketSize <= m_outputStrides[dim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[dim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
     }
   }
 
@@ -290,7 +384,11 @@
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffColMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[0]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffColMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -342,7 +440,11 @@
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
       for (int i = 1; i < PacketSize; ++i) {
-        values[i] = coeffRowMajor(originalIndex+i);
+        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
+          values[i] = m_impl.coeff(inputIndex+i);
+        } else {
+          values[i] = coeffRowMajor(originalIndex+i);
+        }
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index d34f9ca..639d99f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h

@@ -75,7 +75,7 @@
       outer_n_ = outer_n_ != 0 ? outer_n_ : n;
     }
 #else
-    // Defaults, possibly overriden per-platform.
+    // Defaults, possibly overridden per-platform.
     copyA_ = true;
     copyB_ = false;
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index d30cc96..3c007b1 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h

@@ -350,7 +350,7 @@
         // Normal number of notifications for k slice switch is
         // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
         // nm_ + nn_ notifications, because they will not receive notifications
-        // from preceeding kernels.
+        // from preceding kernels.
         state_switch_[x] =
             x == 0
                 ? 1
@@ -530,7 +530,7 @@
 
     void kernel(Index m, Index n, Index k) {
       // Note: order of iteration matters here. Iteration over m is innermost
-      // because we want to reuse the same packed rhs in consequetive tasks
+      // because we want to reuse the same packed rhs in consecutive tasks
       // (rhs fits into L2$ while lhs only into L3$).
       const Index nend = n * gn_ + gn(n);
       const Index mend = m * gm_ + gm(m);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index b148dae..bb63bae 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h

@@ -195,7 +195,7 @@
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not intersting
+    // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index 6158acb..e7beb2c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h

@@ -286,7 +286,7 @@
     tileSize =static_cast<Index>(m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>());
     auto s=  m_queue.get_device().template get_info<cl::sycl::info::device::vendor>();
     std::transform(s.begin(), s.end(), s.begin(), ::tolower);
-    if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
       tileSize=std::min(static_cast<Index>(256), static_cast<Index>(tileSize));
     }
     rng = n;
@@ -303,7 +303,7 @@
   template<typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1, Index &tileSize0, Index &tileSize1, Index &rng0, Index &rng1, Index &GRange0, Index &GRange1)  const {
     Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-    if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
       max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
     }
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -331,7 +331,7 @@
   template<typename Index>
   EIGEN_STRONG_INLINE void parallel_for_setup(Index dim0, Index dim1,Index dim2, Index &tileSize0, Index &tileSize1, Index &tileSize2, Index &rng0, Index &rng1, Index &rng2, Index &GRange0, Index &GRange1, Index &GRange2)  const {
     Index max_workgroup_Size = static_cast<Index>(maxSyclThreadsPerBlock());
-    if(m_queue.get_device().is_cpu()){ // intel doesnot allow to use max workgroup size
+    if(m_queue.get_device().is_cpu()){ // intel doesn't allow to use max workgroup size
       max_workgroup_Size=std::min(static_cast<Index>(256), static_cast<Index>(max_workgroup_Size));
     }
     Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
@@ -377,7 +377,7 @@
   EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
 
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
+    // OpenCL doesn't have such concept
     return 2;
   }
 
@@ -519,7 +519,7 @@
     return m_queue_stream->maxSyclThreadsPerBlock();
   }
   EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
+    // OpenCL doesn't have such concept
     return m_queue_stream->maxSyclThreadsPerMultiProcessor();
   //  return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
@@ -544,7 +544,7 @@
 };
 // This is used as a distingushable device inside the kernel as the sycl device class is not Standard layout.
 // This is internal and must not be used by user. This dummy device allow us to specialise the tensor evaluator
-// inside the kenrel. So we can have two types of eval for host and device. This is required for TensorArgMax operation
+// inside the kernel. So we can have two types of eval for host and device. This is required for TensorArgMax operation
 struct SyclKernelDevice:DefaultDevice{};
 
 }  // end namespace Eigen

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index f81da31..d6ab4d9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h

@@ -274,7 +274,7 @@
           }
         }
 
-        // processs the line
+        // process the line
         if (is_power_of_two) {
           processDataLineCooleyTukey(line_buf, line_len, log_len);
         }

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 354bbe8..6c237ba 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h

@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-// MakePointer class is used as a container of the adress space of the pointer
+// MakePointer class is used as a container of the address space of the pointer
 // on the host and on the device. From the host side it generates the T* pointer
 // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
 // T* m_data on the host. It is always called on the device.

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 91d4ead..f0f7c78 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h

@@ -272,8 +272,8 @@
           break;
         default:
           eigen_assert(false && "unexpected padding");
-          m_outputCols=0; // silence the uninitialised warnig;
-          m_outputRows=0; //// silence the uninitialised warnig;
+          m_outputCols=0; // silence the uninitialised warning;
+          m_outputRows=0; //// silence the uninitialised warning;
       }
     }
     eigen_assert(m_outputRows > 0);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index bf5c532..25ba200 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h

@@ -167,7 +167,7 @@
     shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
-  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
+  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
   // type numerator should also be less than 2^32-1.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
     eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 9489925..a379f5a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h

@@ -106,7 +106,7 @@
     /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
     if (GRange < outTileSize) outTileSize=GRange;
     /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can
     /// recursively apply reduction on it in order to reduce the whole.
     auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
     typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
@@ -150,7 +150,7 @@
 
     // getting final out buffer at the moment the created buffer is true because there is no need for assign
     /// creating the shared memory for calculating reduction.
-    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// This one is used to collect all the reduced value of shared memory as we don't have global barrier on GPU. Once it is saved we can
     /// recursively apply reduction on it in order to reduce the whole.
       dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
       dev.sycl_queue().submit([&](cl::sycl::handler &cgh) {

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 99245f7..b2b4fd8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h

@@ -31,7 +31,7 @@
   int refCount() const { return m_refcount; }
 
  private:
-  // No copy, no assigment;
+  // No copy, no assignment;
   TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
   TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
 

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
index a790570..a248e30 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h

@@ -117,7 +117,7 @@
 
 
 
-//TensorCustomOp must be specialised otherewise it will be captured by UnaryCategory while its action is different
+//TensorCustomOp must be specialised otherwise it will be captured by UnaryCategory while its action is different
 //from the UnaryCategory and it is similar to the general FunctorExtractor.
 /// specialisation of TensorCustomOp
 #define SYCLEXTRFUNCCUSTOMUNARYOP(CVQual)\

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
index e5b892f..a447c3f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclFunctors.h

@@ -80,7 +80,7 @@
     typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
     auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
     /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
     /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -121,7 +121,7 @@
     typedef typename ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
     auto device_expr = createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
     /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
     /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -168,7 +168,7 @@
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
     auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
     /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
     /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
@@ -215,7 +215,7 @@
     typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
     auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
     /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
-    /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
+    /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leafnode to pass the
     /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
     const auto device_self_expr= Eigen::TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, op);
     /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
index 58ab0f0..9e6c3e4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h

@@ -143,7 +143,7 @@
 /// \brief Collects internal details for generating index ranges [MIN, MAX)
 /// Declare primary template for index range builder
 /// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam N represents sizeof..(elements)- sizeof...(Is)
 /// \tparam Is... are the list of generated index so far
 template <size_t MIN, size_t N, size_t... Is>
 struct RangeBuilder;
@@ -161,7 +161,7 @@
 /// in this case we are recursively subtracting N by one and adding one
 /// index to Is... list until MIN==N
 /// \tparam MIN is the starting index in the tuple
-/// \tparam N represents sizeof..(elemens)- sizeof...(Is)
+/// \tparam N represents sizeof..(elements)- sizeof...(Is)
 /// \tparam Is... are the list of generated index so far
 template <size_t MIN, size_t N, size_t... Is>
 struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 51c0995..ef199bf 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h

@@ -568,7 +568,7 @@
 
   Dimensions m_dimensions;
 
-  // Parameters passed to the costructor.
+  // Parameters passed to the constructor.
   Index m_plane_strides;
   Index m_row_strides;
   Index m_col_strides;

diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 0fe0b7c..04d6d6b 100644
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h

@@ -241,7 +241,7 @@
   * multiplying all elements in the given subgroup with the new
   * coset representative. Note that the first element of the
   * subgroup is always the identity element, so the first element of
-  * ther result of this template is going to be the coset
+  * the result of this template is going to be the coset
   * representative itself.
   *
   * Note that this template accepts an additional boolean parameter

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 71d5555..0a71811 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h

@@ -33,10 +33,10 @@
 //   ec.Notify(true);
 //
 // Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceeding predicate check has
+// cheap, but they are executed only if the preceding predicate check has
 // failed.
 //
-// Algorihtm outline:
+// Algorithm outline:
 // There are two main variables: predicate (managed by user) and state_.
 // Operation closely resembles Dekker mutual algorithm:
 // https://en.wikipedia.org/wiki/Dekker%27s_algorithm
@@ -79,7 +79,7 @@
     uint64_t state = state_.load(std::memory_order_seq_cst);
     for (;;) {
       if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
+        // The preceding waiter has not decided on its fate. Wait until it
         // calls either CancelWait or CommitWait, or is notified.
         EIGEN_THREAD_YIELD();
         state = state_.load(std::memory_order_seq_cst);
@@ -110,7 +110,7 @@
     uint64_t state = state_.load(std::memory_order_relaxed);
     for (;;) {
       if (int64_t((state & kEpochMask) - epoch) < 0) {
-        // The preceeding waiter has not decided on its fate. Wait until it
+        // The preceding waiter has not decided on its fate. Wait until it
         // calls either CancelWait or CommitWait, or is notified.
         EIGEN_THREAD_YIELD();
         state = state_.load(std::memory_order_relaxed);

diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index 49d0cdc..cb3690a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h

@@ -198,7 +198,7 @@
   };
   std::mutex mutex_;
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
-  // front/back, repsectively. The remaining bits contain modification counters
+  // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
   // between empty and full conditions (if we would use log(kSize) bits for
   // position, these conditions would be indistinguishable); (2) obtain

diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 96b3a82..ddd54f4 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h

@@ -219,7 +219,7 @@
 
 #else
 
-// The compiler supports c++11, and we're not targetting cuda: use std::array as Eigen::array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
 #include <array>
 namespace Eigen {
 

diff --git a/unsupported/Eigen/NonLinearOptimization b/unsupported/Eigen/NonLinearOptimization
index 600ab4c..c22b890 100644
--- a/unsupported/Eigen/NonLinearOptimization
+++ b/unsupported/Eigen/NonLinearOptimization

@@ -35,7 +35,7 @@
   * a zero for the system (Powell hybrid "dogleg" method).
   *
   * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
-  * Minpack is a very famous, old, robust and well-reknown package, written in 
+  * Minpack is a very famous, old, robust and well renowned package, written in
   * fortran. Those implementations have been carefully tuned, tested, and used
   * for several decades.
   *
@@ -63,7 +63,7 @@
   * Other tests were added by myself at the very beginning of the 
   * process and check the results for levenberg-marquardt using the reference data 
   * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
-  * carefully checked that the same results were obtained when modifiying the 
+  * carefully checked that the same results were obtained when modifying the
   * code. Please note that we do not always get the exact same decimals as they do,
   * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
   * which is 64 bits on most platforms (x86 and amd64, at least).

diff --git a/unsupported/Eigen/OpenGLSupport b/unsupported/Eigen/OpenGLSupport
index 87f5094..11d9956 100644
--- a/unsupported/Eigen/OpenGLSupport
+++ b/unsupported/Eigen/OpenGLSupport

@@ -25,7 +25,7 @@
   *
   * This module provides wrapper functions for a couple of OpenGL functions
   * which simplify the way to pass Eigen's object to openGL.
-  * Here is an exmaple:
+  * Here is an example:
   * 
   * \code
   * // You need to add path_to_eigen/unsupported to your include path.

diff --git a/unsupported/Eigen/src/BVH/KdBVH.h b/unsupported/Eigen/src/BVH/KdBVH.h
index 1b8d758..13f792c 100644
--- a/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/unsupported/Eigen/src/BVH/KdBVH.h

@@ -170,7 +170,7 @@
   typedef internal::vector_int_pair<Scalar, Dim> VIPair;
   typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
   typedef Matrix<Scalar, Dim, 1> VectorType;
-  struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
   {
     VectorComparator(int inDim) : dim(inDim) {}
     inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }

diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 866a8a4..9f7bff7 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h

@@ -300,7 +300,7 @@
 
   /** \brief Reports whether previous computation was successful.
    *
-   * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
    */
   ComputationInfo info() const
   {

diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 28f52da..65c2e94 100644
--- a/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h

@@ -12,7 +12,7 @@
 
 namespace Eigen
 {
-  // Forward declerations
+  // Forward declarations
   template <typename _Scalar, class _System>
   class EulerAngles;
   

diff --git a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
index dc0093e..37d5b4c 100644
--- a/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ b/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h

@@ -99,7 +99,7 @@
 /** \ingroup IterativeSolvers_Module
   * Constrained conjugate gradient
   *
-  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
   */
 template<typename TMatrix, typename CMatrix,
          typename VectorX, typename VectorB, typename VectorF>

diff --git a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index d603ba3..f40b80e 100644
--- a/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/unsupported/Eigen/src/IterativeSolvers/DGMRES.h

@@ -39,7 +39,6 @@
 void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
 {
   eigen_assert(vec.size() == perm.size());
-  typedef typename IndexType::Scalar Index; 
   bool flag; 
   for (Index k  = 0; k < ncut; k++)
   {
@@ -112,7 +111,6 @@
     using Base::_solve_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef _Preconditioner Preconditioner;
@@ -146,7 +144,7 @@
   void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {    
     bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
+    for(Index j=0; j<b.cols(); ++j)
     {
       m_iterations = Base::maxIterations();
       m_error = Base::m_tolerance;
@@ -170,17 +168,17 @@
   /** 
    * Get the restart value
     */
-  int restart() { return m_restart; }
+  Index restart() { return m_restart; }
   
   /** 
    * Set the restart value (default is 30)  
    */
-  void set_restart(const int restart) { m_restart=restart; }
+  Index set_restart(const Index restart) { m_restart=restart; }
   
   /** 
    * Set the number of eigenvalues to deflate at each restart 
    */
-  void setEigenv(const int neig) 
+  void setEigenv(const Index neig) 
   {
     m_neig = neig;
     if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
@@ -189,12 +187,12 @@
   /** 
    * Get the size of the deflation subspace size
    */ 
-  int deflSize() {return m_r; }
+  Index deflSize() {return m_r; }
   
   /**
    * Set the maximum size of the deflation subspace
    */
-  void setMaxEigenv(const int maxNeig) { m_maxNeig = maxNeig; }
+  void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
   
   protected:
     // DGMRES algorithm 
@@ -202,27 +200,27 @@
     void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
     // Perform one cycle of GMRES
     template<typename Dest>
-    int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; 
+    Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const; 
     // Compute data to use for deflation 
-    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
+    Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const;
     // Apply deflation to a vector
     template<typename RhsType, typename DestType>
-    int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
+    Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
     ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
     ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
     // Init data for deflation
     void dgmresInitDeflation(Index& rows) const; 
     mutable DenseMatrix m_V; // Krylov basis vectors
     mutable DenseMatrix m_H; // Hessenberg matrix 
-    mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
+    mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
     mutable Index m_restart; // Maximum size of the Krylov subspace
     mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
     mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
     mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
     mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
     mutable StorageIndex m_neig; //Number of eigenvalues to extract at each restart
-    mutable int m_r; // Current number of deflated eigenvalues, size of m_U
-    mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
+    mutable Index m_r; // Current number of deflated eigenvalues, size of m_U
+    mutable Index m_maxNeig; // Maximum number of eigenvalues to deflate
     mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
     mutable bool m_isDeflAllocated;
     mutable bool m_isDeflInitialized;
@@ -244,13 +242,13 @@
               const Preconditioner& precond) const
 {
   //Initialization
-  int n = mat.rows(); 
+  Index n = mat.rows(); 
   DenseVector r0(n); 
-  int nbIts = 0; 
+  Index nbIts = 0; 
   m_H.resize(m_restart+1, m_restart);
   m_Hes.resize(m_restart, m_restart);
   m_V.resize(n,m_restart+1);
-  //Initial residual vector and intial norm
+  //Initial residual vector and initial norm
   x = precond.solve(x);
   r0 = rhs - mat * x; 
   RealScalar beta = r0.norm(); 
@@ -284,7 +282,7 @@
  */
 template< typename _MatrixType, typename _Preconditioner>
 template<typename Dest>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, Index& nbIts) const
 {
   //Initialization 
   DenseVector g(m_restart+1); // Right hand side of the least square problem
@@ -293,8 +291,8 @@
   m_V.col(0) = r0/beta; 
   m_info = NoConvergence; 
   std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
-  int it = 0; // Number of inner iterations 
-  int n = mat.rows();
+  Index it = 0; // Number of inner iterations 
+  Index n = mat.rows();
   DenseVector tv1(n), tv2(n);  //Temporary vectors
   while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
   {    
@@ -312,7 +310,7 @@
    
     // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
     Scalar coef; 
-    for (int i = 0; i <= it; ++i)
+    for (Index i = 0; i <= it; ++i)
     { 
       coef = tv1.dot(m_V.col(i));
       tv1 = tv1 - coef * m_V.col(i); 
@@ -328,7 +326,7 @@
     // FIXME Check for happy breakdown 
     
     // Update Hessenberg matrix with Givens rotations
-    for (int i = 1; i <= it; ++i) 
+    for (Index i = 1; i <= it; ++i) 
     {
       m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
     }
@@ -418,7 +416,7 @@
 }
 
 template< typename _MatrixType, typename _Preconditioner>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, StorageIndex& neig) const
 {
   // First, find the Schur form of the Hessenberg matrix H
   typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
@@ -433,8 +431,8 @@
   
   // Reorder the absolute values of Schur values
   DenseRealVector modulEig(it); 
-  for (int j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
-  perm.setLinSpaced(it,0,it-1);
+  for (Index j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
+  perm.setLinSpaced(it,0,internal::convert_index<StorageIndex>(it-1));
   internal::sortWithPermutation(modulEig, perm, neig);
   
   if (!m_lambdaN)
@@ -442,7 +440,7 @@
     m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
   }
   //Count the real number of extracted eigenvalues (with complex conjugates)
-  int nbrEig = 0; 
+  Index nbrEig = 0; 
   while (nbrEig < neig)
   {
     if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; 
@@ -451,7 +449,7 @@
   // Extract the  Schur vectors corresponding to the smallest Ritz values
   DenseMatrix Sr(it, nbrEig); 
   Sr.setZero();
-  for (int j = 0; j < nbrEig; j++)
+  for (Index j = 0; j < nbrEig; j++)
   {
     Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
   }
@@ -462,8 +460,8 @@
   if (m_r)
   {
    // Orthogonalize X against m_U using modified Gram-Schmidt
-   for (int j = 0; j < nbrEig; j++)
-     for (int k =0; k < m_r; k++)
+   for (Index j = 0; j < nbrEig; j++)
+     for (Index k =0; k < m_r; k++)
       X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); 
   }
   
@@ -473,7 +471,7 @@
     dgmresInitDeflation(m); 
   DenseMatrix MX(m, nbrEig);
   DenseVector tv1(m);
-  for (int j = 0; j < nbrEig; j++)
+  for (Index j = 0; j < nbrEig; j++)
   {
     tv1 = mat * X.col(j);
     MX.col(j) = precond.solve(tv1);
@@ -488,8 +486,8 @@
   }
   
   // Save X into m_U and m_MX in m_MU
-  for (int j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
-  for (int j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
   // Increase the size of the invariant subspace
   m_r += nbrEig; 
   
@@ -502,7 +500,7 @@
 }
 template<typename _MatrixType, typename _Preconditioner>
 template<typename RhsType, typename DestType>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
+Index DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
 {
   DenseVector x1 = m_U.leftCols(m_r).transpose() * x; 
   y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index ae9d793..1234858 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h

@@ -73,7 +73,7 @@
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];

diff --git a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 9954279..5f64501 100644
--- a/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h

@@ -233,9 +233,9 @@
     
     /** 
      * \brief Reports whether the minimization was successful
-     * \returns \c Success if the minimization was succesful,
+     * \returns \c Success if the minimization was successful,
      *         \c NumericalIssue if a numerical problem arises during the 
-     *          minimization process, for exemple during the QR factorization
+     *          minimization process, for example during the QR factorization
      *         \c NoConvergence if the minimization did not converge after 
      *          the maximum number of function evaluation allowed
      *          \c InvalidInput if the input matrix is invalid

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 85ab3d9..0335699 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h

@@ -313,7 +313,7 @@
       matrix_exp_pade17(A, U, V);
     }
   
-#elif LDBL_MANT_DIG <= 112  // quadruple precison
+#elif LDBL_MANT_DIG <= 112  // quadruple precision
   
     if (l1norm < 1.639394610288918690547467954466970e-005L) {
       matrix_exp_pade3(arg, U, V);

diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index ebc433d..33609ae 100644
--- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h

@@ -81,7 +81,7 @@
  *
  * \note Currently this class is only used by MatrixPower. One may
  * insist that this be nested into MatrixPower. This class is here to
- * faciliate future development of triangular matrix functions.
+ * facilitate future development of triangular matrix functions.
  */
 template<typename MatrixType>
 class MatrixPowerAtomic : internal::noncopyable

diff --git a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index feafd62..4f2f560 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h

@@ -61,7 +61,7 @@
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the tranformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];

diff --git a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index f287660..09fc652 100644
--- a/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/unsupported/Eigen/src/NonLinearOptimization/r1updt.h

@@ -22,7 +22,7 @@
     Scalar temp;
     JacobiRotation<Scalar> givens;
 
-    // r1updt had a broader usecase, but we dont use it here. And, more
+    // r1updt had a broader usecase, but we don't use it here. And, more
     // importantly, we can not test it.
     eigen_assert(m==n);
     eigen_assert(u.size()==m);

diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h
index e0af6eb..41a4efc 100644
--- a/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/unsupported/Eigen/src/Polynomials/Companion.h

@@ -104,7 +104,7 @@
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balanced( RealScalar colNorm, RealScalar rowNorm,
@@ -113,7 +113,7 @@
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balancedR( RealScalar colNorm, RealScalar rowNorm,

diff --git a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
index a1f54ed..bda057a 100644
--- a/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ b/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h

@@ -41,7 +41,7 @@
 
     /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
      *
-     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
+     * Setting a value greater than zero speeds up computation, and yields to an incomplete
      * factorization with fewer non zero coefficients. Such approximate factors are especially
      * useful to initialize an iterative solver.
      *

diff --git a/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index a2a8933..f77d79a 100644
--- a/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/unsupported/Eigen/src/Skyline/SkylineMatrix.h

@@ -206,26 +206,26 @@
             if (col > row) //upper matrix
             {
                 const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
             }
             if (col < row) //lower matrix
             {
                 const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
             }
         } else {
             if (outer > inner) //upper matrix
             {
                 const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
             }
             if (outer < inner) //lower matrix
             {
                 const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+                eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
             }
         }
@@ -300,11 +300,11 @@
 
         if (IsRowMajor) {
             const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
         } else {
             const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
         }
     }
@@ -336,11 +336,11 @@
 
         if (IsRowMajor) {
             const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
         } else {
             const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
+            eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
         }
     }

diff --git a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index 037a13f..3f1ff14 100644
--- a/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h

@@ -187,7 +187,7 @@
     /** Does nothing: provided for compatibility with SparseMatrix */
     inline void finalize() {}
 
-    /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
     void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       for (Index j=0; j<outerSize(); ++j)
@@ -224,21 +224,21 @@
       }
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix()
       : m_innerSize(0), m_data(0)
     {
       eigen_assert(innerSize()==0 && outerSize()==0);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
       : m_innerSize(0)
     {
       resize(rows, cols);
     }
 
-    /** The class DynamicSparseMatrix is deprectaed */
+    /** The class DynamicSparseMatrix is deprecated */
     template<typename OtherDerived>
     EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
       : m_innerSize(0)

diff --git a/unsupported/Eigen/src/SparseExtra/MarketIO.h b/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 6d57ab2..1618b09 100644
--- a/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/unsupported/Eigen/src/SparseExtra/MarketIO.h

@@ -104,7 +104,7 @@
     out << value.real << " " << value.imag()<< "\n"; 
   }
 
-} // end namepsace internal
+} // end namespace internal
 
 inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
 {

diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 7870290..444fd14 100644
--- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h

@@ -1720,6 +1720,8 @@
   }
 };
 
+#endif  // EIGEN_HAS_C99_MATH
+
 /****************************************************************************
  * Implementation of Bessel function, based on Cephes                       *
  ****************************************************************************/
@@ -2048,8 +2050,6 @@
   }
 };
 
-#endif  // EIGEN_HAS_C99_MATH
-
 }  // end namespace internal
 
 namespace numext {

diff --git a/unsupported/Eigen/src/Splines/SplineFitting.h b/unsupported/Eigen/src/Splines/SplineFitting.h
index c761a9b..1a4b80a 100644
--- a/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/unsupported/Eigen/src/Splines/SplineFitting.h

@@ -181,7 +181,7 @@
    * \ingroup Splines_Module
    *
    * \param[in] pts The data points to which a spline should be fit.
-   * \param[out] chord_lengths The resulting chord lenggth vector.
+   * \param[out] chord_lengths The resulting chord length vector.
    *
    * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
    **/   

diff --git a/unsupported/README.txt b/unsupported/README.txt
index 83479ff..70793bf 100644
--- a/unsupported/README.txt
+++ b/unsupported/README.txt

@@ -20,7 +20,7 @@
  - must rely on Eigen,
  - must be highly related to math,
  - should have some general purpose in the sense that it could
-   potentially become an offical Eigen module (or be merged into another one).
+   potentially become an official Eigen module (or be merged into another one).
 
 In doubt feel free to contact us. For instance, if your addons is very too specific
 but it shows an interesting way of using Eigen, then it could be a nice demo.

diff --git a/unsupported/bench/bench_svd.cpp b/unsupported/bench/bench_svd.cpp
index 01d8231..e7028a2 100644
--- a/unsupported/bench/bench_svd.cpp
+++ b/unsupported/bench/bench_svd.cpp

@@ -70,7 +70,7 @@
   std::cout<< std::endl;
   timerJacobi.reset();
   timerBDC.reset();
-  cout << " Computes rotaion matrix" <<endl;
+  cout << " Computes rotation matrix" <<endl;
   for (int k=1; k<=NUMBER_SAMPLE; ++k)
   {
     timerBDC.start();

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 42b790d..e99eab0 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt

@@ -1,5 +1,5 @@
 # generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
+# in order to prevent a rebuild every time cmake is configured
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
   file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
   foreach(i RANGE 1 999)

diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index a917ec3..1d44182 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp

@@ -81,7 +81,7 @@
   typedef std::numeric_limits<AD> A;
   typedef std::numeric_limits<Scalar> B;
 
-  // workaround "unsed typedef" warning:
+  // workaround "unused typedef" warning:
   VERIFY(!bool(internal::is_same<B, A>::value));
 
 #if EIGEN_HAS_CXX11

diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp
index 5c0ea58..a9d268e 100644
--- a/unsupported/test/cxx11_tensor_broadcasting.cpp
+++ b/unsupported/test/cxx11_tensor_broadcasting.cpp

@@ -180,6 +180,64 @@
 #endif
 }
 
+template <int DataLayout>
+static void test_simple_broadcasting_one_by_n()
+{
+  Tensor<float, 4, DataLayout> tensor(1,13,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 9;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 13);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 9; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_simple_broadcasting_n_by_one()
+{
+  Tensor<float, 4, DataLayout> tensor(7,3,5,1);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 19;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 7);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 19);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
 
 void test_cxx11_tensor_broadcasting()
 {
@@ -191,4 +249,8 @@
   CALL_SUBTEST(test_static_broadcasting<RowMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
   CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>());
 }

diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp
index f2f87f7..cf3e29f 100644
--- a/unsupported/test/cxx11_tensor_inflation_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp

@@ -22,10 +22,10 @@
 
 using Eigen::Tensor;
 
-// Inflation Defenition for each dimention the inflated val would be
+// Inflation Definition for each dimension the inflated val would be
 //((dim-1)*strid[dim] +1)
 
-// for 1 dimnention vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
+// for 1 dimension vector of size 3 with value (4,4,4) with the inflated stride value of 3 would be changed to
 // tensor of size (2*3) +1 = 7 with the value of
 // (4, 0, 0, 4, 0, 0, 4).
 

diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 167b75d..7a751ff 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu

@@ -247,7 +247,7 @@
   }
   for (int i = 0; i < num_elem; ++i) {
     std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
-    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1
       VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
     else
       VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));

diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a467..389c0a8 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu

@@ -37,7 +37,7 @@
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
   assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
 
-  // For now we just check thes code doesn't crash.
+  // For now we just check this code doesn't crash.
   // TODO: come up with a valid test of randomness
 }
 

diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e..6d0ae73 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp

@@ -132,7 +132,7 @@
   }
 
   {
-    // simple instanciation tests
+    // simple instantiation tests
     Matrix<adtl::adouble,2,1> x;
     foo(x);
     Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;

diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp
index 4f6723d..7cf4a77 100644
--- a/unsupported/test/sparse_extra.cpp
+++ b/unsupported/test/sparse_extra.cpp

@@ -8,7 +8,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 
-// import basic and product tests for deprectaed DynamicSparseMatrix
+// import basic and product tests for deprecated DynamicSparseMatrix
 #define EIGEN_NO_DEPRECATED_WARNING
 #include "sparse_basic.cpp"
 #include "sparse_product.cpp"

diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp
index 29ba620..802e161 100644
--- a/unsupported/test/special_functions.cpp
+++ b/unsupported/test/special_functions.cpp

@@ -335,6 +335,7 @@
         ArrayType test = betainc(a, b + one, x) + eps;
         verify_component_wise(test, expected););
   }
+#endif  // EIGEN_HAS_C99_MATH
 
   // Test Bessel function i0e. Reference results obtained with SciPy.
   {
@@ -403,6 +404,7 @@
     v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x)
   */
 
+#if EIGEN_HAS_C99_MATH
   // Test igamma_der_a
   {
     ArrayType a(30);
@@ -467,7 +469,7 @@
     CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample);
                  verify_component_wise(res, v););
   }
-#endif
+#endif  // EIGEN_HAS_C99_MATH
 }
 
 void test_special_functions()