Use numext::mini/numext::maxi instead of std::min/std::max in the tensor code
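
numext::mini and numext::maxi are Eigen's portable replacements for
std::min and std::max: they are declared EIGEN_DEVICE_FUNC, so the same
call compiles on the host and inside CUDA device code (which matters for
the EIGEN_DEVICE_FUNC reducers in TensorFunctors.h and the GPU launch
code in TensorConvolution.h), and they do not need the "(std::min)"
parenthesization that guards against the Windows min/max macros.

Illustration only, not part of the patch (the clamp_block helper and its
name are made up for this sketch; numext::mini, EIGEN_DEVICE_FUNC and
<Eigen/Core> are real Eigen facilities):

    #include <Eigen/Core>

    // numext::mini is EIGEN_DEVICE_FUNC, so this helper can be called
    // from host code and from CUDA device code alike.
    template <typename Index>
    EIGEN_DEVICE_FUNC Index clamp_block(Index start, Index block, Index total) {
      // Size of the current block, clamped so we never overshoot 'total'.
      return Eigen::numext::mini(start + block, total) - start;
    }

    // Equivalent to the actual_mc/actual_kc/actual_nc computations in the
    // packing loops below, e.g.:
    //   const Index actual_mc = clamp_block(i2, mc, m);
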
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 8a04f7d..e60fab7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -934,8 +934,8 @@
     // Sizes of the blocks to load in cache. See the Goto paper for details.
     BlockingType blocking(m, n, k, 1, true);
     const Index kc = blocking.kc();
-    const Index mc = (std::min)(m, blocking.mc());
-    const Index nc = (std::min)(n, blocking.nc());
+    const Index mc = numext::mini(m, blocking.mc());
+    const Index nc = numext::mini(n, blocking.nc());
     const Index sizeA = mc * kc;
     const Index sizeB = kc * nc;
 
@@ -944,16 +944,16 @@
 
     for(Index i2=0; i2<m; i2+=mc)
     {
-      const Index actual_mc = (std::min)(i2+mc,m)-i2;
+      const Index actual_mc = numext::mini(i2+mc,m)-i2;
       for (Index k2 = 0; k2 < k; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = (std::min)(k2 + kc, k) - k2;
+        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
         pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
-          const Index actual_nc = (std::min)(j2 + nc, n) - j2;
+          const Index actual_nc = numext::mini(j2 + nc, n) - j2;
           pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
 
           // call gebp (matrix kernel)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 5703022..576bea2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@
     // note: You can get away with allocating just a single blockA and offsets and meet the
     //       the alignment requirements with the assumption that
     //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
-    const Index numBlockAs = (std::min)(num_threads, m_blocks);
+    const Index numBlockAs = numext::mini(num_threads, m_blocks);
     std::vector<LhsScalar *> blockAs;
     blockAs.reserve(num_threads);
     for (int i = 0; i < num_threads; i++) {
@@ -230,14 +230,14 @@
     for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
       const Index k_start = k_block_idx * kc;
       // make sure we don't overshoot right edge of left matrix
-      const Index actual_kc = (std::min)(k_start + kc, k) - k_start;
+      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
 
       for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
-        const Index num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
+        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
 
         for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
           const Index m_start = mt_block_idx * mc;
-          const Index actual_mc = (std::min)(m_start + mc, m) - m_start;
+          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
           eigen_assert(actual_mc > 0);
 
           Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
@@ -275,7 +275,7 @@
 
         for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
           const Index n_start = n_block_idx * nc;
-          const Index actual_nc = (std::min)(n_start + nc, n) - n_start;
+          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
 
           // first make sure the previous kernels are all done before overwriting rhs. Also wait if
           // we're going to start new k. In both cases need_to_pack is true.
@@ -376,7 +376,7 @@
       if (m_base_start < arg.max_m) {
         Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
         wait_until_ready((*arg.lhs_notifications)[blockAId]);
-        const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
+        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
         gebp(arg.output.getSubMapper(m_base_start, arg.n),
              (*arg.blockAs)[blockAId], arg.blockB,
              actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 23500e1..a82bfc0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -857,29 +857,29 @@
         if (m_indices[0] == single_stride_dim) {
           // Maximum the reuse
           const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
-          maxX = (std::min<int>)(inner_dim, numX);
-          const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
-          block_size.x = (std::min)(maxThreadsPerBlock, maxX);
-          block_size.y = (std::min<int>)(maxThreadsPerBlock / block_size.x, maxP);
+          maxX = numext::mini<int>(inner_dim, numX);
+          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
         }
         else {
           // Read as much as possible alongside the inner most dimension, that is the plane
           const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
-          const int maxP = (std::min<int>)(inner_dim, numP);
-          maxX = (std::min<int>)(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+          const int maxP = numext::mini<int>(inner_dim, numP);
+          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
 
-          block_size.x = (std::min)(warpSize, maxX);
-          block_size.y = (std::min<int>)(maxThreadsPerBlock/block_size.x, maxP);
+          block_size.x = numext::mini(warpSize, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
         }
 
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
         assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
-        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
         const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
 
-        dim3 num_blocks(num_x_blocks, std::min<int>(num_y_blocks, ceil(numP, block_size.y)));
+        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
 
 
         //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -920,24 +920,24 @@
 
         // Snap maxX to warp size
         int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
-        const int maxX = (std::min<int>)(inner_dim, numX);
-        const int maxY = (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
-        const int maxP = (std::min<int>)(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+        const int maxX = numext::mini<int>(inner_dim, numX);
+        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
 
         dim3 block_size;
-        block_size.x = (std::min)(1024, maxX);
-        block_size.y = (std::min<int>)(1024/block_size.x, maxY);
-        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxP);
+        block_size.x = numext::mini(1024, maxX);
+        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
 
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
         assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
-        const int blocksPerProcessor = (std::min)(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
         const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
 
-        dim3 num_blocks(num_x_blocks, num_y_blocks, std::min<int>(num_z_blocks, ceil(numP, block_size.z)));
+        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
 
 
         //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
@@ -999,14 +999,14 @@
         const int numZ = dimensions()[m_indices[idxZ]];
         const int numP = dimensions().TotalSize() / (numX*numY*numZ);
 
-        const int maxX = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
-        const int maxY = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
-        const int maxZ = (std::min<int>)(128, (std::min<int>)(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
+        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
+        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
+        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
 
         dim3 block_size;
-        block_size.x = (std::min)(32, maxX);
-        block_size.y = (std::min)(32, maxY);
-        block_size.z = (std::min<int>)(1024/(block_size.x*block_size.y), maxZ);
+        block_size.x = numext::mini(32, maxX);
+        block_size.y = numext::mini(32, maxY);
+        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
 
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 2c5e67f..b2800ae 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -123,7 +123,7 @@
       static const int PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
 
       int blocksz = std::ceil<int>(static_cast<float>(size)/device.numThreads()) + PacketSize - 1;
-      const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+      const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
       const Index numblocks = size / blocksize;
 
       std::vector<Notification*> results;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 9f88197..d9061c2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -147,7 +147,7 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (std::max)(saccum, predux_max(vaccum));
+    return numext::maxi(saccum, predux_max(vaccum));
   }
 };
 
@@ -180,7 +180,7 @@
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (std::min)(saccum, predux_min(vaccum));
+    return numext::mini(saccum, predux_min(vaccum));
   }
 };