Fix tensor stridedlinearbuffercopy
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
index 7e92487..afa3d5b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -1054,28 +1054,28 @@
       }
       return;
     }
-
-    const IndexType vectorized_size = count - PacketSize;
+    
+    const IndexType vectorized_size = PacketSize * (count / PacketSize);
     IndexType i = 0;
 
     if (kind == StridedLinearBufferCopy::Kind::Linear) {
       // ******************************************************************** //
       // Linear copy from `src` to `dst`.
-      const IndexType unrolled_size = count - 4 * PacketSize;
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
       eigen_assert(src_stride == 1 && dst_stride == 1);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
+      for (; i < unrolled_size; i += 4 * PacketSize) {
         for (int j = 0; j < 4; ++j) {
           Packet p = ploadu<Packet>(src + i + j * PacketSize);
           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         }
       }
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         Packet p = ploadu<Packet>(src + i);
         pstoreu<Scalar, Packet>(dst + i, p);
       }
       if (HasHalfPacket) {
-        const IndexType vectorized_half_size = count - HalfPacketSize;
-        if (i <= vectorized_half_size) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
           HalfPacket p = ploadu<HalfPacket>(src + i);
           pstoreu<Scalar, HalfPacket>(dst + i, p);
           i += HalfPacketSize;
@@ -1088,13 +1088,13 @@
     } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
       // Scatter from `src` to `dst`.
       eigen_assert(src_stride == 1 && dst_stride != 1);
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         Packet p = ploadu<Packet>(src + i);
         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
       }
       if (HasHalfPacket) {
-        const IndexType vectorized_half_size = count - HalfPacketSize;
-        if (i <= vectorized_half_size) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
           HalfPacket p = ploadu<HalfPacket>(src + i);
           pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
           i += HalfPacketSize;
@@ -1107,20 +1107,21 @@
     } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
       // Fill `dst` with value at `*src`.
       eigen_assert(src_stride == 0 && dst_stride == 1);
-      const IndexType unrolled_size = count - 4 * PacketSize;
+
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
       Scalar s = *src;
       Packet p = pset1<Packet>(s);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
+      for (; i < unrolled_size; i += 4 * PacketSize) {
         for (int j = 0; j < 4; ++j) {
           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
         }
       }
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         pstoreu<Scalar, Packet>(dst + i, p);
       }
       if (HasHalfPacket) {
-        const IndexType vectorized_half_size = count - HalfPacketSize;
-        if (i <= vectorized_half_size) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
           HalfPacket hp = pset1<HalfPacket>(s);
           pstoreu<Scalar, HalfPacket>(dst + i, hp);
           i += HalfPacketSize;
@@ -1135,12 +1136,12 @@
       eigen_assert(src_stride == 0 && dst_stride != 1);
       Scalar s = *src;
       Packet p = pset1<Packet>(s);
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
       }
       if (HasHalfPacket) {
-        const IndexType vectorized_half_size = count - HalfPacketSize;
-        if (i <= vectorized_half_size) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
           HalfPacket hp = pset1<HalfPacket>(s);
           pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
           i += HalfPacketSize;
@@ -1153,13 +1154,13 @@
     } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
       // Gather from `src` into `dst`.
       eigen_assert(dst_stride == 1);
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
         pstoreu<Scalar, Packet>(dst + i, p);
       }
       if (HasHalfPacket) {
-        const IndexType vectorized_half_size = count - HalfPacketSize;
-        if (i <= vectorized_half_size) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
           HalfPacket p =
               pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
           pstoreu<Scalar, HalfPacket>(dst + i, p);
@@ -1456,11 +1457,11 @@
                                         IndexType eval_offset) {
       typedef typename packet_traits<Scalar>::type Packet;
 
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      const IndexType vectorized_size = count - PacketSize;
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
+      const IndexType vectorized_size = PacketSize * (count / PacketSize);
       IndexType i = 0;
 
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
+      for (; i < unrolled_size; i += 4 * PacketSize) {
         for (int j = 0; j < 4; ++j) {
           const IndexType idx = eval_offset + i + j * PacketSize;
           Packet p = eval.template packet<Unaligned>(idx);
@@ -1468,7 +1469,7 @@
         }
       }
 
-      for (; i <= vectorized_size; i += PacketSize) {
+      for (; i < vectorized_size; i += PacketSize) {
         Packet p = eval.template packet<Unaligned>(eval_offset + i);
         pstoreu<Scalar>(target + i, p);
       }