|  | #ifndef GPU_TEST_HELPER_H | 
|  | #define GPU_TEST_HELPER_H | 
|  |  | 
#include <Eigen/Core>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>
|  |  | 
// Allow gpu* macros (e.g. gpuMalloc, gpuMemcpy) for generic tests.
|  | #include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> | 
|  |  | 
|  | // std::tuple cannot be used on device, and there is a bug in cuda < 9.2 that | 
|  | // doesn't allow std::tuple to compile for host code either. In these cases, | 
|  | // use our custom implementation. | 
|  | #if defined(EIGEN_GPU_COMPILE_PHASE) || (defined(EIGEN_CUDACC) && EIGEN_CUDA_SDK_VER < 92000) | 
|  | #define EIGEN_USE_CUSTOM_TUPLE 1 | 
|  | #else | 
|  | #define EIGEN_USE_CUSTOM_TUPLE 0 | 
|  | #endif | 
|  |  | 
|  | #if EIGEN_USE_CUSTOM_TUPLE | 
|  | #include "../Eigen/src/Core/arch/GPU/Tuple.h" | 
|  | #else | 
|  | #include <tuple> | 
|  | #endif | 
|  | namespace Eigen { | 
|  |  | 
|  | namespace internal { | 
|  |  | 
|  | // Note: cannot re-use tuple_impl, since that will cause havoc for | 
|  | // tuple_test. | 
|  | namespace test_detail { | 
|  | // Use std::tuple on CPU, otherwise use the GPU-specific versions. | 
|  | #if !EIGEN_USE_CUSTOM_TUPLE | 
|  | using std::get; | 
|  | using std::make_tuple; | 
|  | using std::tie; | 
|  | using std::tuple; | 
|  | #else | 
|  | using tuple_impl::get; | 
|  | using tuple_impl::make_tuple; | 
|  | using tuple_impl::tie; | 
|  | using tuple_impl::tuple; | 
|  | #endif | 
|  | #undef EIGEN_USE_CUSTOM_TUPLE | 
|  | }  // namespace test_detail | 
|  |  | 
|  | template <size_t N, size_t Idx, typename OutputIndexSequence, typename... Ts> | 
|  | struct extract_output_indices_helper; | 
|  |  | 
|  | /** | 
|  | * Extracts a set of indices corresponding to non-const l-value reference | 
|  | * output types. | 
|  | * | 
|  | * \internal | 
|  | * \tparam N the number of types {T1, Ts...}. | 
|  | * \tparam Idx the "index" to append if T1 is an output type. | 
|  | * \tparam OutputIndices the current set of output indices. | 
|  | * \tparam T1 the next type to consider, with index Idx. | 
|  | * \tparam Ts the remaining types. | 
|  | */ | 
template <size_t N, size_t Idx, size_t... OutputIndices, typename T1, typename... Ts>
struct extract_output_indices_helper<N, Idx, std::index_sequence<OutputIndices...>, T1, Ts...> {
  using type = typename extract_output_indices_helper<
      N - 1, Idx + 1,
      typename std::conditional<
          // If T1 is a non-const l-value reference, append Idx.
          std::is_lvalue_reference<T1>::value && !std::is_const<std::remove_reference_t<T1>>::value,
          std::index_sequence<OutputIndices..., Idx>, std::index_sequence<OutputIndices...>>::type,
      Ts...>::type;
};
|  |  | 
|  | // Base case. | 
|  | template <size_t Idx, size_t... OutputIndices> | 
|  | struct extract_output_indices_helper<0, Idx, std::index_sequence<OutputIndices...>> { | 
  using type = std::index_sequence<OutputIndices...>;
|  | }; | 
|  |  | 
|  | // Extracts a set of indices into Types... that correspond to non-const | 
|  | // l-value references. | 
|  | template <typename... Types> | 
|  | using extract_output_indices = | 
    typename extract_output_indices_helper<sizeof...(Types), 0, std::index_sequence<>, Types...>::type;
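
// For illustration (a hypothetical instantiation): only non-const l-value
// reference types are treated as outputs, so
//   extract_output_indices<int, float&, const double&, Eigen::Vector3f&>
// is std::index_sequence<1, 3>; the value parameter (index 0) and the const
// reference (index 2) are inputs only.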
|  |  | 
|  | // Helper struct for dealing with Generic functors that may return void. | 
struct void_helper {
  struct Void {};

  // Converts void -> Void, T otherwise.
  template <typename T>
  using ReturnType = typename std::conditional<std::is_same<T, void>::value, Void, T>::type;

  // Non-void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
      -> std::enable_if_t<!std::is_same<decltype(func(args...)), void>::value, decltype(func(args...))> {
    return func(std::forward<Args>(args)...);
  }

  // Void return value.
  template <typename Func, typename... Args>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC auto call(Func&& func, Args&&... args)
      -> std::enable_if_t<std::is_same<decltype(func(args...)), void>::value, Void> {
    func(std::forward<Args>(args)...);
    return Void{};
  }

  // Restores the original return type, Void -> void, T otherwise.
  template <typename T>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC
      std::enable_if_t<!std::is_same<typename std::decay<T>::type, Void>::value, T>
      restore(T&& val) {
    return val;
  }

  // Void case.
  template <typename T = void>
  static EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void restore(const Void&) {}
};
|  |  | 
// Runs a kernel via a serialized buffer.  Does this by deserializing the buffer
// to construct the arguments, calling the kernel, then re-serializing the outputs.
// The buffer contains
//     [ input_buffer_size, args ]
// After the kernel call, it is then populated with
//     [ output_buffer_size, output_parameters, return_value ]
// If output_buffer_size exceeds the buffer's capacity, then only
// output_buffer_size is populated.
template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
EIGEN_DEVICE_FUNC void run_serialized(std::index_sequence<Indices...>, std::index_sequence<OutputIndices...>,
                                      Kernel kernel, uint8_t* buffer, size_t capacity) {
  using test_detail::get;
  using test_detail::make_tuple;
  using test_detail::tuple;
  // Deserialize the input size and inputs.
  size_t input_size;
  const uint8_t* read_ptr = buffer;
  const uint8_t* read_end = buffer + capacity;
  read_ptr = Eigen::deserialize(read_ptr, read_end, input_size);
  // Create value-type instances to populate.
  auto args = make_tuple(typename std::decay<Args>::type{}...);
  EIGEN_UNUSED_VARIABLE(args)  // Avoid NVCC compile warning.
  // NVCC 9.1 requires us to spell out the template parameters explicitly.
  read_ptr = Eigen::deserialize(read_ptr, read_end, get<Indices, typename std::decay<Args>::type...>(args)...);

  // Call the kernel, with void -> Void conversion so we are guaranteed a
  // complete output type.
  auto result = void_helper::call(kernel, get<Indices, typename std::decay<Args>::type...>(args)...);

  // Determine the required output size: the serialized size header itself
  // (`capacity` serves as a convenient size_t to measure), the output
  // parameters, and the return value.
  size_t output_size = Eigen::serialize_size(capacity);
  output_size += Eigen::serialize_size(get<OutputIndices, typename std::decay<Args>::type...>(args)...);
  output_size += Eigen::serialize_size(result);

  // Always serialize the required buffer size.
  uint8_t* write_ptr = buffer;
  uint8_t* write_end = buffer + capacity;
  write_ptr = Eigen::serialize(write_ptr, write_end, output_size);
  // A null `write_ptr` can be safely passed along.
  // Serialize the outputs only if they fit in the buffer.
  if (output_size <= capacity) {
    // Collect output parameters and the result.
    write_ptr = Eigen::serialize(write_ptr, write_end, get<OutputIndices, typename std::decay<Args>::type...>(args)...);
    write_ptr = Eigen::serialize(write_ptr, write_end, result);
  }
}
|  |  | 
|  | template <typename Kernel, typename... Args> | 
|  | EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_serialized(Kernel kernel, uint8_t* buffer, size_t capacity) { | 
  run_serialized<Kernel, Args...>(std::make_index_sequence<sizeof...(Args)>{}, extract_output_indices<Args...>{},
                                  kernel, buffer, capacity);
|  | } | 
|  |  | 
|  | #ifdef EIGEN_GPUCC | 
|  |  | 
|  | // Checks for GPU errors and asserts / prints the error message. | 
|  | #define GPU_CHECK(expr)                                                 \ | 
  do {                                                                  \
    gpuError_t err = expr;                                              \
    if (err != gpuSuccess) {                                            \
      printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); \
      gpu_assert(false);                                                \
    }                                                                   \
  } while (0)
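
// Example usage (a sketch; `dst`, `src`, and `n` are placeholder variables):
//   GPU_CHECK(gpuMemcpy(dst, src, n, gpuMemcpyDeviceToHost));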
|  |  | 
|  | // Calls run_serialized on the GPU. | 
|  | template <typename Kernel, typename... Args> | 
|  | __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void run_serialized_on_gpu_meta_kernel(const Kernel kernel, uint8_t* buffer, | 
                                                                               size_t capacity) {
  run_serialized<Kernel, Args...>(kernel, buffer, capacity);
|  | } | 
|  |  | 
// Runs kernel(args...) on the GPU via the serialization mechanism.
//
// Note: if the serialized outputs do not fit in the transfer buffer, this
// fails and prints the capacity to pass to `run_with_hint`.
template <typename Kernel, typename... Args, size_t... Indices, size_t... OutputIndices>
auto run_serialized_on_gpu(size_t buffer_capacity_hint, std::index_sequence<Indices...>,
                           std::index_sequence<OutputIndices...>, Kernel kernel, Args&&... args)
    -> decltype(kernel(args...)) {
  // Compute the required serialization buffer capacity.
  size_t input_data_size = sizeof(size_t) + Eigen::serialize_size(args...);

  size_t capacity;
  if (buffer_capacity_hint == 0) {
    // Estimate as the next power of two larger than the total input size,
    // which gives a little extra room for outputs.
    capacity = sizeof(size_t);
    while (capacity <= input_data_size) {
      capacity *= 2;
    }
  } else {
    // Use the larger of the hint and the total input size.
    // Add sizeof(size_t) to the hint to account for storing the buffer
    // capacity itself, so the user doesn't need to think about this.
    capacity = std::max<size_t>(buffer_capacity_hint + sizeof(size_t), input_data_size);
  }
  std::vector<uint8_t> buffer(capacity);

  // Serialize the input size and inputs into the host buffer.
  uint8_t* host_data = buffer.data();
  uint8_t* host_data_end = buffer.data() + capacity;
  uint8_t* host_ptr = Eigen::serialize(host_data, host_data_end, input_data_size);
  host_ptr = Eigen::serialize(host_ptr, host_data_end, args...);

  // Copy the inputs to the device.
  uint8_t* device_data = nullptr;
  gpuMalloc((void**)(&device_data), capacity);
  gpuMemcpy(device_data, buffer.data(), input_data_size, gpuMemcpyHostToDevice);
  GPU_CHECK(gpuDeviceSynchronize());

  // Run the kernel.
#ifdef EIGEN_USE_HIP
  hipLaunchKernelGGL(HIP_KERNEL_NAME(run_serialized_on_gpu_meta_kernel<Kernel, Args...>), 1, 1, 0, 0, kernel,
                     device_data, capacity);
#else
  run_serialized_on_gpu_meta_kernel<Kernel, Args...><<<1, 1>>>(kernel, device_data, capacity);
#endif
  // Check pre-launch and kernel execution errors.
  GPU_CHECK(gpuGetLastError());
  GPU_CHECK(gpuDeviceSynchronize());
  // Copy the outputs back to the host.
  gpuMemcpy(host_data, device_data, capacity, gpuMemcpyDeviceToHost);
  gpuFree(device_data);
  GPU_CHECK(gpuDeviceSynchronize());

  // Determine the serialized output size.
  size_t output_data_size = 0;
  const uint8_t* c_host_ptr = Eigen::deserialize(host_data, host_data_end, output_data_size);
  // If the output doesn't fit in the buffer, print an error and fail.
  if (output_data_size > capacity) {
    std::cerr << "The serialized output does not fit in the output buffer, " << output_data_size << " vs capacity "
              << capacity << "." << std::endl
              << "Try specifying a minimum buffer capacity: " << std::endl
              << "  run_with_hint(" << output_data_size << ", ...)" << std::endl;
    VERIFY(false);
  }

  // Deserialize the outputs back into the original arguments.
  auto args_tuple = test_detail::tie(args...);
  EIGEN_UNUSED_VARIABLE(args_tuple)  // Avoid NVCC compile warning.
  c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, test_detail::get<OutputIndices, Args&...>(args_tuple)...);

  // Deserialize the return value, properly handling void.
  typename void_helper::ReturnType<decltype(kernel(args...))> result;
  c_host_ptr = Eigen::deserialize(c_host_ptr, host_data_end, result);
  return void_helper::restore(result);
}
|  |  | 
|  | #endif  // EIGEN_GPUCC | 
|  |  | 
|  | }  // namespace internal | 
|  |  | 
|  | /** | 
|  | * Runs a kernel on the CPU, returning the results. | 
|  | * \param kernel kernel to run. | 
|  | * \param args ... input arguments. | 
|  | * \return kernel(args...). | 
|  | */ | 
|  | template <typename Kernel, typename... Args> | 
|  | auto run_on_cpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { | 
  return kernel(std::forward<Args>(args)...);
|  | } | 
|  |  | 
|  | #ifdef EIGEN_GPUCC | 
|  |  | 
|  | /** | 
|  | * Runs a kernel on the GPU, returning the results. | 
|  | * | 
|  | * The kernel must be able to be passed directly as an input to a global | 
|  | * function (i.e. empty or POD).  Its inputs must be "Serializable" so we | 
|  | * can transfer them to the device, and the output must be a Serializable value | 
|  | * type so it can be transferred back from the device. | 
|  | * | 
|  | * \param kernel kernel to run. | 
|  | * \param args ... input arguments, must be "Serializable". | 
|  | * \return kernel(args...). | 
|  | */ | 
|  | template <typename Kernel, typename... Args> | 
|  | auto run_on_gpu(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { | 
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      /*buffer_capacity_hint=*/0, std::make_index_sequence<sizeof...(Args)>{},
      internal::extract_output_indices<Args...>{}, kernel, std::forward<Args>(args)...);
|  | } | 
|  |  | 
|  | /** | 
|  | * Runs a kernel on the GPU, returning the results. | 
|  | * | 
|  | * This version allows specifying a minimum buffer capacity size required for | 
|  | * serializing the puts to transfer results from device to host.  Use this when | 
|  | * `run_on_gpu(...)` fails to determine an appropriate capacity by default. | 
|  | * | 
|  | * \param buffer_capacity_hint minimum required buffer size for serializing | 
|  | *        outputs. | 
|  | * \param kernel kernel to run. | 
|  | * \param args ... input arguments, must be "Serializable". | 
|  | * \return kernel(args...). | 
|  | * \sa run_on_gpu | 
|  | */ | 
|  | template <typename Kernel, typename... Args> | 
|  | auto run_on_gpu_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) { | 
  return internal::run_serialized_on_gpu<Kernel, Args...>(
      buffer_capacity_hint, std::make_index_sequence<sizeof...(Args)>{}, internal::extract_output_indices<Args...>{},
      kernel, std::forward<Args>(args)...);
|  | } | 
|  |  | 
|  | /** | 
|  | * Kernel for determining basic Eigen compile-time information | 
|  | * (i.e. the cuda/hip arch) | 
|  | */ | 
struct CompileTimeDeviceInfoKernel {
  struct Info {
    int cuda;
    int hip;
  };

  EIGEN_DEVICE_FUNC Info operator()() const {
    Info info = {-1, -1};
#if defined(__CUDA_ARCH__)
    info.cuda = static_cast<int>(__CUDA_ARCH__ + 0);
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
    info.hip = static_cast<int>(EIGEN_HIP_DEVICE_COMPILE + 0);
#endif
    return info;
  }
};
|  |  | 
|  | /** | 
|  | * Queries and prints the compile-time and runtime GPU info. | 
|  | */ | 
inline void print_gpu_device_info() {
  int device = 0;
  gpuDeviceProp_t deviceProp;
  gpuGetDeviceProperties(&deviceProp, device);

  auto info = run_on_gpu(CompileTimeDeviceInfoKernel());

  std::cout << "GPU compile-time info:\n";

#ifdef EIGEN_CUDACC
  std::cout << "  EIGEN_CUDACC:                " << int(EIGEN_CUDACC) << std::endl;
#endif

#ifdef EIGEN_CUDA_SDK_VER
  std::cout << "  EIGEN_CUDA_SDK_VER:          " << int(EIGEN_CUDA_SDK_VER) << std::endl;
#endif

#if EIGEN_COMP_NVCC
  std::cout << "  EIGEN_COMP_NVCC:             " << int(EIGEN_COMP_NVCC) << std::endl;
#endif

#ifdef EIGEN_HIPCC
  std::cout << "  EIGEN_HIPCC:                 " << int(EIGEN_HIPCC) << std::endl;
#endif

  std::cout << "  EIGEN_CUDA_ARCH:             " << info.cuda << std::endl;
  std::cout << "  EIGEN_HIP_DEVICE_COMPILE:    " << info.hip << std::endl;

  std::cout << "GPU device info:\n";
  std::cout << "  name:                        " << deviceProp.name << std::endl;
  std::cout << "  capability:                  " << deviceProp.major << "." << deviceProp.minor << std::endl;
  std::cout << "  multiProcessorCount:         " << deviceProp.multiProcessorCount << std::endl;
  std::cout << "  maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl;
  std::cout << "  warpSize:                    " << deviceProp.warpSize << std::endl;
  std::cout << "  regsPerBlock:                " << deviceProp.regsPerBlock << std::endl;
  std::cout << "  concurrentKernels:           " << deviceProp.concurrentKernels << std::endl;
  std::cout << "  clockRate:                   " << deviceProp.clockRate << std::endl;
  std::cout << "  canMapHostMemory:            " << deviceProp.canMapHostMemory << std::endl;
  std::cout << "  computeMode:                 " << deviceProp.computeMode << std::endl;
}
|  |  | 
|  | #endif  // EIGEN_GPUCC | 
|  |  | 
|  | /** | 
|  | * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. | 
|  | * | 
|  | * This is to better support creating generic tests. | 
|  | * | 
|  | * The kernel must be able to be passed directly as an input to a global | 
|  | * function (i.e. empty or POD).  Its inputs must be "Serializable" so we | 
|  | * can transfer them to the device, and the output must be a Serializable value | 
|  | * type so it can be transferred back from the device. | 
|  | * | 
|  | * \param kernel kernel to run. | 
|  | * \param args ... input arguments, must be "Serializable". | 
|  | * \return kernel(args...). | 
|  | */ | 
template <typename Kernel, typename... Args>
auto run(Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
  return run_on_gpu(kernel, std::forward<Args>(args)...);
#else
  return run_on_cpu(kernel, std::forward<Args>(args)...);
#endif
}
|  |  | 
|  | /** | 
|  | * Runs a kernel on the GPU (if EIGEN_GPUCC), or CPU otherwise. | 
|  | * | 
|  | * This version allows specifying a minimum buffer capacity size required for | 
|  | * serializing the puts to transfer results from device to host.  Use this when | 
|  | * `run(...)` fails to determine an appropriate capacity by default. | 
|  | * | 
|  | * \param buffer_capacity_hint minimum required buffer size for serializing | 
|  | *        outputs. | 
|  | * \param kernel kernel to run. | 
|  | * \param args ... input arguments, must be "Serializable". | 
|  | * \return kernel(args...). | 
|  | * \sa run | 
|  | */ | 
template <typename Kernel, typename... Args>
auto run_with_hint(size_t buffer_capacity_hint, Kernel kernel, Args&&... args) -> decltype(kernel(args...)) {
#ifdef EIGEN_GPUCC
  return run_on_gpu_with_hint(buffer_capacity_hint, kernel, std::forward<Args>(args)...);
#else
  EIGEN_UNUSED_VARIABLE(buffer_capacity_hint)
  return run_on_cpu(kernel, std::forward<Args>(args)...);
#endif
}
|  |  | 
|  | }  // namespace Eigen | 
|  |  | 
|  | #endif  // GPU_TEST_HELPER_H |