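Fixed the evaluation of expressions involving tensors of 2 or 3 elements on CUDA devices.
The scalar tail loop now starts at vectorized_size + first_index instead of continuing from the index at which the packet loop stopped, which could already lie past the remaining coefficients.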
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 4fa8e83..f27f643 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -168,11 +168,10 @@
     const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
     const Index vectorized_step_size = step_size * PacketSize;
     const Index vectorized_size = (size / PacketSize) * PacketSize;
-    Index i = first_index * PacketSize;
-    for ( ; i < vectorized_size; i += vectorized_step_size) {
+    for (Index i = first_index * PacketSize; i < vectorized_size; i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for ( ; i < size; i += step_size) {
+    for (Index i = vectorized_size + first_index; i < size; i += step_size) {
       eval.evalScalar(i);
     }
   }