Added support for tensor reductions and concatenations

diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index ebe6419..11161a5 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor

@@ -34,12 +34,15 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceType.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBase.h"
 
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h"

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 2da8f8c..2f7c9ec 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h

@@ -204,12 +204,40 @@
       return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
     }
 
+    // Reductions.
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::SumReducer<Scalar>, const Dims, const Derived>
+    sum(const Dims& dims) const {
+      return TensorReductionOp<internal::SumReducer<Scalar>, const Dims, const Derived>(derived(), dims, internal::SumReducer<Scalar>());
+    }
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<Scalar>, const Dims, const Derived>
+    maximum(const Dims& dims) const {
+      return TensorReductionOp<internal::MaxReducer<Scalar>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<Scalar>());
+    }
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<Scalar>, const Dims, const Derived>
+    minimum(const Dims& dims) const {
+      return TensorReductionOp<internal::MinReducer<Scalar>, const Dims, const Derived>(derived(), dims, internal::MinReducer<Scalar>());
+    }
+    template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<Reducer, const Dims, const Derived>
+    reduce(const Dims& dims, const Reducer& reducer) const {
+      return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
+    }
+
     template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorBroadcastingOp<const Broadcast, const Derived>
     broadcast(const Broadcast& broadcast) const {
       return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
     }
 
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, Axis axis) const {
+      return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
+    }
+
     // Morphing operators.
     template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorReshapingOp<const NewDimensions, const Derived>

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
new file mode 100644
index 0000000..b8e43f4
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h

@@ -0,0 +1,217 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+
+namespace Eigen {
+
+/** \class TensorConcatenationOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor concatenation class.
+  *
+  *
+  */
+namespace internal {
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
+{
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename LhsXprType::Scalar,
+                                        typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<LhsXprType>::Index,
+                                      typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef typename remove_reference<LhsNested>::type _LhsNested;
+  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  enum { Flags = 0 };
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
+{
+  typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
+};
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
+{
+  typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+
+template<typename Axis, typename LhsXprType, typename RhsXprType>
+class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
+{
+  public:
+    typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
+    typedef typename internal::traits<TensorConcatenationOp>::Packet Packet;
+    typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
+    typedef typename internal::traits<TensorConcatenationOp>::Index Index;
+    typedef typename internal::nested<TensorConcatenationOp>::type Nested;
+    typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+                                                    typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+    typedef typename internal::promote_storage_type<typename LhsXprType::PacketReturnType,
+                                                    typename RhsXprType::PacketReturnType>::ret PacketReturnType;
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
+        : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename LhsXprType::Nested>::type&
+    lhsExpression() const { return m_lhs_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename RhsXprType::Nested>::type&
+    rhsExpression() const { return m_rhs_xpr; }
+
+    EIGEN_DEVICE_FUNC Axis axis() const { return m_axis; }
+
+  protected:
+    typename LhsXprType::Nested m_lhs_xpr;
+    typename RhsXprType::Nested m_rhs_xpr;
+    const Axis m_axis;
+};
+
+
+// Eval as rvalue
+template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+{
+  typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+  static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
+  {
+    EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(0 <= m_axis && m_axis < NumDims);
+    const Dimensions& lhs_dims = m_leftImpl.dimensions();
+    const Dimensions& rhs_dims = m_rightImpl.dimensions();
+    int i = 0;
+    for (; i < m_axis; ++i) {
+      eigen_assert(lhs_dims[i] > 0);
+      eigen_assert(lhs_dims[i] == rhs_dims[i]);
+      m_dimensions[i] = lhs_dims[i];
+    }
+    eigen_assert(lhs_dims[i] > 0);  // Now i == m_axis.
+    eigen_assert(rhs_dims[i] > 0);
+    m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+    for (++i; i < NumDims; ++i) {
+      eigen_assert(lhs_dims[i] > 0);
+      eigen_assert(lhs_dims[i] == rhs_dims[i]);
+      m_dimensions[i] = lhs_dims[i];
+    }
+
+    m_leftStrides[0] = 1;
+    m_rightStrides[0] = 1;
+    m_outputStrides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_leftStrides[i] = m_leftStrides[i-1] * lhs_dims[i-1];
+      m_rightStrides[i] = m_rightStrides[i-1] * rhs_dims[i-1];
+      m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
+  {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
+  {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
+  // See CL/76180724 comments for more ideas.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    // Collect dimension-wise indices (subs).
+    array<Index, NumDims> subs;
+    for (int i = NumDims - 1; i > 0; --i) {
+      subs[i] = index / m_outputStrides[i];
+      index -= subs[i] * m_outputStrides[i];
+    }
+    subs[0] = index;
+
+    const Dimensions& left_dims = m_leftImpl.dimensions();
+    if (subs[m_axis] < left_dims[m_axis]) {
+      Index left_index = subs[0];
+      for (int i = 1; i < NumDims; ++i) {
+        left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+      }
+      return m_leftImpl.coeff(left_index);
+    } else {
+      subs[m_axis] -= left_dims[m_axis];
+      const Dimensions& right_dims = m_rightImpl.dimensions();
+      Index right_index = subs[0];
+      for (int i = 1; i < NumDims; ++i) {
+        right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+      }
+      return m_rightImpl.coeff(right_index);
+    }
+  }
+
+  // TODO(phli): Add a real vectorization.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    static const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Scalar* data() const { return NULL; }
+
+  protected:
+    const Axis m_axis;
+    Dimensions m_dimensions;
+    array<Index, NumDims> m_outputStrides;
+    array<Index, NumDims> m_leftStrides;
+    array<Index, NumDims> m_rightStrides;
+    TensorEvaluator<LeftArgType, Device> m_leftImpl;
+    TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index afbcc94..bc67586 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h

@@ -21,8 +21,9 @@
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
-template<typename XprType> class TensorReductionOp;
 template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
+template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
+template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
 template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename NewDimensions, typename XprType> class TensorReshapingOp;

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
new file mode 100644
index 0000000..9298433
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h

@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+
+namespace Eigen {
+namespace internal {
+
+// Standard reduction functors
+template <typename T> struct SumReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SumReducer() : m_sum(0) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) {
+    m_sum += t;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const {
+    return m_sum;
+  }
+
+ private:
+  T m_sum;
+};
+
+template <typename T> struct MaxReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxReducer() : m_max((std::numeric_limits<T>::min)()) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) {
+    if (t > m_max) { m_max = t; }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const {
+    return m_max;
+  }
+
+ private:
+  T m_max;
+};
+
+template <typename T> struct MinReducer
+{
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MinReducer() : m_min((std::numeric_limits<T>::max)()) { }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t) {
+    if (t < m_min) { m_min = t; }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize() const {
+    return m_min;
+  }
+
+ private:
+  T m_min;
+};
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 0000000..eef9921
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

@@ -0,0 +1,226 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+namespace Eigen {
+
+/** \class TensorReduction
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Tensor reduction class.
+  *
+  */
+
+namespace internal {
+template<typename Op, typename Dims, typename XprType>
+struct traits<TensorReductionOp<Op, Dims, XprType> >
+ : traits<XprType>
+{
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type Packet;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::Index Index;
+  typedef typename XprType::Nested Nested;
+};
+
+template<typename Op, typename Dims, typename XprType>
+struct eval<TensorReductionOp<Op, Dims, XprType>, Eigen::Dense>
+{
+  typedef const TensorReductionOp<Op, Dims, XprType>& type;
+};
+
+template<typename Op, typename Dims, typename XprType>
+struct nested<TensorReductionOp<Op, Dims, XprType>, 1, typename eval<TensorReductionOp<Op, Dims, XprType> >::type>
+{
+  typedef TensorReductionOp<Op, Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+
+template <typename Op, typename Dims, typename XprType>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType>, ReadOnlyAccessors> {
+  public:
+    typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
+    typedef typename Eigen::internal::traits<TensorReductionOp>::Packet Packet;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef typename XprType::CoeffReturnType CoeffReturnType;
+    typedef typename XprType::PacketReturnType PacketReturnType;
+    typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
+    { }
+    TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
+    { }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const XprType& expression() const { return m_expr; }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const Dims& dims() const { return m_dims; }
+    const Op& reducer() const { return m_reducer; }
+
+  protected:
+    typename XprType::Nested m_expr;
+    const Dims m_dims;
+    const Op m_reducer;
+};
+
+
+// Eval as rvalue
+template<typename Op, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
+{
+  typedef TensorReductionOp<Op, Dims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static const int NumReducedDims = internal::array_size<Dims>::value;
+  static const int NumDims = (NumInputDims==NumReducedDims) ? 1 : NumInputDims - NumReducedDims;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,  // The code isn't vectorized properly yet
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer())
+  {
+    EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    array<bool, NumInputDims> reduced;
+    for (int i = 0; i < NumInputDims; ++i) {
+      reduced[i] = false;
+    }
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op.dims()[i] >= 0);
+      eigen_assert(op.dims()[i] < NumInputDims);
+      reduced[op.dims()[i]] = true;
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    int outputIndex = 0;
+    int reduceIndex = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (reduced[i]) {
+        m_reducedDims[reduceIndex] = input_dims[i];
+        ++reduceIndex;
+      } else {
+        m_dimensions[outputIndex] = input_dims[i];
+        ++outputIndex;
+      }
+    }
+
+    m_outputStrides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
+    }
+
+    array<Index, NumInputDims> strides;
+    strides[0] = 1;
+    for (int i = 1; i < NumInputDims; ++i) {
+      strides[i] = strides[i-1] * input_dims[i-1];
+    }
+    outputIndex = 0;
+    reduceIndex = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (reduced[i]) {
+        m_reducedStrides[reduceIndex] = strides[i];
+        ++reduceIndex;
+      } else {
+        m_preservedStrides[outputIndex] = strides[i];
+        ++outputIndex;
+      }
+    }
+
+    // Special case for full reductions
+    if (NumInputDims == NumReducedDims) {
+      m_dimensions[0] = 1;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketReturnType PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    Op reducer(m_reducer);
+    reduce(firstInput(index), 0, reducer);
+    return reducer.finalize();
+  }
+
+  // TODO(bsteiner): provide a more efficient implementation.
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
+    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index+i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Scalar* data() const { return NULL; }
+
+  private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      startInput += idx * m_preservedStrides[i];
+      index -= idx * m_outputStrides[i];
+    }
+    startInput += index * m_preservedStrides[0];
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void reduce(Index firstIndex, int DimIndex, Op& reducer) const {
+    for (int j = 0; j < m_reducedDims[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_reducedStrides[DimIndex];
+      if (DimIndex < NumReducedDims-1) {
+        reduce(input, DimIndex+1, reducer);
+      } else {
+        reducer.reduce(m_impl.coeff(input));
+      }
+    }
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_preservedStrides;
+  array<Index, NumReducedDims> m_reducedStrides;
+  array<Index, NumReducedDims> m_reducedDims;
+  Op m_reducer;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 8d4e7db..e83d8b5 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt

@@ -106,14 +106,16 @@
   ei_add_test(cxx11_tensor_convolution "-std=c++0x")
   ei_add_test(cxx11_tensor_expr "-std=c++0x")
 #  ei_add_test(cxx11_tensor_fixed_size "-std=c++0x")
-  ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
+#  ei_add_test(cxx11_tensor_of_const_values "-std=c++0x")
   ei_add_test(cxx11_tensor_of_strings "-std=c++0x")
   ei_add_test(cxx11_tensor_intdiv "-std=c++0x")
   ei_add_test(cxx11_tensor_lvalue "-std=c++0x")
   ei_add_test(cxx11_tensor_map "-std=c++0x")
   ei_add_test(cxx11_tensor_broadcasting "-std=c++0x")
+  ei_add_test(cxx11_tensor_concatenation "-std=c++0x")
   ei_add_test(cxx11_tensor_morphing "-std=c++0x")
   ei_add_test(cxx11_tensor_padding "-std=c++0x")
+  ei_add_test(cxx11_tensor_reduction "-std=c++0x")
 #  ei_add_test(cxx11_tensor_shuffling "-std=c++0x")
   ei_add_test(cxx11_tensor_striding "-std=c++0x")
 #  ei_add_test(cxx11_tensor_device  "-std=c++0x")

diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 0000000..8fd4f5f
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_concatenation.cpp

@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_dimension_failures()
+{
+  Tensor<int, 3> left(2, 3, 1);
+  Tensor<int, 3> right(3, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  // Okay; other dimensions are equal.
+  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+
+  // Dimension mismatches.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 1));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 2));
+
+  // Axis > NumDims or < 0.
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, 3));
+  VERIFY_RAISES_ASSERT(concatenation = left.concatenate(right, -1));
+}
+
+static void test_static_dimension_failure()
+{
+  Tensor<int, 2> left(2, 3);
+  Tensor<int, 3> right(2, 3, 1);
+
+#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
+  // Technically compatible, but we static assert that the inputs have same
+  // NumDims.
+  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+#endif
+
+  // This can be worked around in this case.
+  Tensor<int, 3> concatenation = left
+      .reshape(Tensor<int, 3>::Dimensions{{2, 3, 1}})
+      .concatenate(right, 0);
+  Tensor<int, 2> alternative = left
+      .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{2, 3}}), 0);
+}
+
+static void test_simple_concatenation()
+{
+  Tensor<int, 3> left(2, 3, 1);
+  Tensor<int, 3> right(2, 3, 1);
+  left.setRandom();
+  right.setRandom();
+
+  Tensor<int, 3> concatenation = left.concatenate(right, 0);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 4);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int j = 0; j < 3; ++j) {
+    for (int i = 0; i < 2; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int i = 2; i < 4; ++i) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i - 2, j, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 1);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 6);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 1);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+    }
+    for (int j = 3; j < 6; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), right(i, j - 3, 0));
+    }
+  }
+
+  concatenation = left.concatenate(right, 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(0), 2);
+  VERIFY_IS_EQUAL(concatenation.dimension(1), 3);
+  VERIFY_IS_EQUAL(concatenation.dimension(2), 2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(concatenation(i, j, 0), left(i, j, 0));
+      VERIFY_IS_EQUAL(concatenation(i, j, 1), right(i, j, 0));
+    }
+  }
+}
+
+
+// TODO(phli): Add test once we have a real vectorized implementation.
+// static void test_vectorized_concatenation() {}
+
+
+void test_cxx11_tensor_concatenation()
+{
+   CALL_SUBTEST(test_dimension_failures());
+   CALL_SUBTEST(test_static_dimension_failure());
+   CALL_SUBTEST(test_simple_concatenation());
+   // CALL_SUBTEST(test_vectorized_concatenation());
+}

diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
new file mode 100644
index 0000000..27135b9
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_reduction.cpp

@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_simple_reductions()
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  Tensor<float, 2> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum);
+    }
+  }
+
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 2;
+  result = tensor.maximum(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float max_val = std::numeric_limits<float>::lowest();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          max_val = (std::max)(max_val, tensor(k, i, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), max_val);
+    }
+  }
+
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+  result = tensor.minimum(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float min_val = (std::numeric_limits<float>::max)();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          min_val = (std::min)(min_val, tensor(k,  l, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), min_val);
+    }
+  }
+}
+
+
+static void test_full_reductions()
+{
+  Tensor<float, 2> tensor(2,3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.dimension(0), 1);
+
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(0), sum);
+
+  result = tensor.square().sum(reduction_axis).sqrt();
+  VERIFY_IS_EQUAL(result.dimension(0), 1);
+
+  sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j) * tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(0), sqrtf(sum));
+}
+
+
+struct UserReducer {
+  UserReducer(float offset) : offset_(offset), sum_(0.0f) {}
+  void reduce(const float val) {
+    sum_ += val * val;
+  }
+  float finalize() const {
+    return 1.0f / (sum_ + offset_);
+  }
+
+ private:
+  float offset_;
+  float sum_;
+};
+
+static void test_user_defined_reductions()
+{
+  Tensor<float, 2> tensor(5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> reduction_axis;
+  reduction_axis[0] = 1;
+
+  UserReducer reducer(10.0f);
+  Tensor<float, 1> result = tensor.reduce(reduction_axis, reducer);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  for (int i = 0; i < 5; ++i) {
+    float expected = 10.0f;
+    for (int j = 0; j < 7; ++j) {
+      expected += tensor(i, j) * tensor(i, j);
+    }
+    expected = 1.0f / expected;
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+
+void test_cxx11_tensor_reduction()
+{
+   CALL_SUBTEST(test_simple_reductions());
+   CALL_SUBTEST(test_full_reductions());
+   CALL_SUBTEST(test_user_defined_reductions());
+}