Avoid integer overflow in EigenMetaKernel indexing
- The current implementation computes `size + total_threads`, which can
overflow and cause CUDA_ERROR_ILLEGAL_ADDRESS when size is close to
the maximum representable value.
- The num_blocks calculation can also overflow due to the implementation
of divup().
- This patch prevents these overflows and allows the kernel to work
correctly for the full representable range of tensor sizes.
- Also adds relevant tests.
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index 279be34..9a513dc 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -553,11 +553,59 @@
};
#if defined(EIGEN_GPUCC)
+// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
+ Index rhs) {
+ const Index highest = NumTraits<Index>::highest();
+ const Index lowest = NumTraits<Index>::lowest();
+ if (lhs > 0 && rhs > 0) {
+ return lhs > highest - rhs ? 1 : 0;
+ } else if (lhs < 0 && rhs < 0) {
+ return lhs < lowest - rhs ? -1 : 0;
+ } else {
+ return 0;
+ }
+}
+
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+ const Index highest = NumTraits<Index>::highest();
+ const Index lowest = NumTraits<Index>::lowest();
+ int overflow = sum_will_overflow(lhs, rhs);
+ return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
+}
+
+// A functor that adds step_size to a given index, saturating to avoid
+// overflow/underflow. If overflow/underflow is not possible, regular addition
+// is used (for efficiency).
+template <typename Index>
+struct SafeStep {
+ // lastIdx is one past the end of the possible indexes.
+ // step_size is the value that will be added to the given index when the
+ // functor is called.
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
+ : can_overflow_(sum_will_overflow(lastIdx, step_size)),
+ step_size_(step_size) {}
+
+ // Adds step_size to index, saturating on overflow (if overflow is possible).
+ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
+ return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
+ }
+
+ private:
+ const bool can_overflow_;
+ const Index step_size_;
+};
+
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
- for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+ SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+ for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
eval.evalScalar(i);
}
}
@@ -571,12 +619,16 @@
const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
const StorageIndex vectorized_step_size = step_size * PacketSize;
+ SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
+ vectorized_step_size);
// Use the vector path
for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
- i += vectorized_step_size) {
+ i = safe_vectorized_step(i)) {
eval.evalPacket(i);
}
- for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+ SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+ for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
+ i = safe_step(i)) {
eval.evalScalar(i);
}
}
@@ -603,8 +655,11 @@
if (needs_assign) {
const int block_size = device.maxGpuThreadsPerBlock();
- const int max_blocks = device.getNumGpuMultiProcessors() *
- device.maxGpuThreadsPerMultiProcessor() / block_size;
+ const int max_blocks =
+ numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
+ device.maxGpuThreadsPerMultiProcessor(),
+ NumTraits<StorageIndex>::highest()) /
+ block_size;
const StorageIndex size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b3f4a1c..cf891eb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -30,13 +30,15 @@
template <typename T, typename X, typename Y>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const X x, const Y y) {
- return static_cast<T>((x + y - 1) / y);
+ // Note: This form is used because it cannot overflow.
+ return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const T x, const T y) {
- return static_cast<T>((x + y - 1) / y);
+ // Note: This form is used because it cannot overflow.
+ return static_cast<T>(x == 0 ? 0 : (x - 1) / y + 1);
}
template <size_t n> struct max_n_1 {
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0a37c02..83b150d 100644
--- a/unsupported/test/cxx11_tensor_gpu.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -66,6 +66,47 @@
gpuFree(d_in2);
}
+// Tests that there are no indexing overflows when computing tensors with the
+// max representable size.
+template <typename IndexType,
+ IndexType N = (std::numeric_limits<IndexType>::max)()>
+void test_gpu_nullary_max_size()
+{
+ typedef int8_t DataType;
+ typedef Tensor<DataType, 1, 0, IndexType> TensorType;
+ typedef Eigen::array<IndexType, 1> ArrayType;
+
+ const IndexType n = N;
+ TensorType in1((ArrayType(n)));
+ in1.setZero();
+
+ std::size_t in1_bytes = in1.size() * sizeof(DataType);
+
+ DataType* d_in1;
+ gpuMalloc((void**)(&d_in1), in1_bytes);
+
+ gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
+
+ Eigen::GpuStreamDevice stream;
+ Eigen::GpuDevice gpu_device(&stream);
+
+ Eigen::TensorMap<TensorType> gpu_in1(d_in1, ArrayType(n));
+
+ gpu_in1.device(gpu_device) = gpu_in1.constant(123);
+
+ TensorType new1((ArrayType(n)));
+
+ assert(gpuMemcpyAsync(new1.data(), d_in1, in1_bytes, gpuMemcpyDeviceToHost,
+ gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+ for (IndexType i = 0; i < n; ++i) {
+ VERIFY_IS_EQUAL(new1(ArrayType(i)), 123);
+ }
+
+ gpuFree(d_in1);
+}
+
void test_gpu_elementwise_small() {
Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
@@ -1524,6 +1565,10 @@
EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
{
CALL_SUBTEST_1(test_gpu_nullary());
+ CALL_SUBTEST_1(test_gpu_nullary_max_size<int16_t>());
+ CALL_SUBTEST_1(test_gpu_nullary_max_size<int32_t>());
+ CALL_SUBTEST_1((test_gpu_nullary_max_size<
+ int64_t, (std::numeric_limits<int32_t>::max)() + 100ll>()));
CALL_SUBTEST_1(test_gpu_elementwise_small());
CALL_SUBTEST_1(test_gpu_elementwise());
CALL_SUBTEST_1(test_gpu_props());