parallel_for's throw_on_error results in terminate
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Description
We're using Thrust with Torch 1.7.1 and MinkowskiEngine 0.5.4 and experience a deterministic issue which makes the library unusable for long-running processes.
When we run parallel_for on large arrays there are several memory allocation steps, and if we encounter an OOM error it simply results in a terminate instead of an exception that could be handled. This is usually the result of an error being thrown from a noexcept function.
Environment:
- Ubuntu 18.04
- CUDA 10.2
- Driver Version: 460.73.01
Code to reproduce:
#include <algorithm> #include <chrono> #include <iostream> #include <new> #include <numeric> #include <random> #include <vector> #include <cuda_runtime.h> #include <thrust/copy.h> #include <thrust/device_vector.h> #include <thrust/execution_policy.h> #include <thrust/iterator/constant_iterator.h> #include <thrust/iterator/counting_iterator.h> #include <thrust/sort.h> int main() { std::random_device rd; std::mt19937 engine; engine.seed(rd()); std::uniform_real_distribution<float> u(0, 90.); // Parameter for Tesla V100 16GB VRAM (Use 250M for 8GB VRAM): int N = 550000000; std::vector<float> v(N); std::generate(v.begin(), v.end(), [&]() { return u(engine); }); thrust::host_vector<float> hv(v.begin(), v.end()); thrust::device_vector<float> dv = hv; thrust::device_vector<float> res(dv.begin(), dv.end()); thrust::device_vector<int> index(N); thrust::sequence(thrust::device, index.begin(), index.end(), 0, 1); while (1) { try { std::cout << "step" << std::endl; thrust::sort_by_key(thrust::device, // dv.begin(), // key begin dv.end(), // key end thrust::make_zip_iterator( // value begin thrust::make_tuple( // dv.begin(), // index.begin() // ))); } catch (std::bad_alloc) { std::cout << "bad_alloc" << std::endl; } catch (...) { std::cout << "other error" << std::endl; } // thrust exception cannot be caught } cudaDeviceSynchronize(); return 0; }
Traceback:
#0 0x00007f07d8108fb7 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f07d810a921 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f07d8afd957 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#3 0x00007f07d8b03ae6 in std::rethrow_exception(std::__exception_ptr::exception_ptr) () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#4 0x00007f07d8b02b49 in __cxa_throw_bad_array_new_length () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#5 0x00007f07d8b034b8 in __gxx_personality_v0 () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#6 0x00007f07d84cb573 in _Unwind_GetTextRelBase () from /lib/x86_64-linux-gnu/libgcc_s.so.1
#7 0x00007f07d84cbad1 in _Unwind_RaiseException () from /lib/x86_64-linux-gnu/libgcc_s.so.1
#8 0x00007f07d8b03d47 in __cxa_throw () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#9 0x000056485a33fb4d in thrust::cuda_cub::throw_on_error(cudaError, char const*) ()
#10 0x000056485a35c134 in void thrust::cuda_cub::parallel_for<thrust::cuda_cub::par_t, thrust::cuda_cub::for_each_f<thrust::pointer<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default>, thrust::detail::wrapped_function<thrust::detail::allocator_traits_detail::gozer, void> >, long>(thrust::cuda_cub::execution_policy<thrust::cuda_cub::par_t>&, thrust::cuda_cub::for_each_f<thrust::pointer<thrust::tuple<float, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>, thrust::cuda_cub::par_t, thrust::use_default, thrust::use_default>, thrust::detail::wrapped_function<thrust::detail::allocator_traits_detail::gozer, void> >, long) ()
I found that the issue is not present on CUDA 11.1, so we have started migrating the codebase over; I am posting this in case it is useful to someone else who encounters the same issue.