diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 097a41d4c4174..1f90dd6db5b15 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -591,6 +591,7 @@ set(files __numeric/transform_exclusive_scan.h __numeric/transform_inclusive_scan.h __numeric/transform_reduce.h + __pstl/cpu_algos/cpu_traits.h __random/bernoulli_distribution.h __random/binomial_distribution.h __random/cauchy_distribution.h diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h index 6980ded189ea2..53eae58f96095 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h @@ -9,52 +9,6 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H -#include <__config> - -/* - - // _Functor takes a subrange for [__first, __last) that should be executed in serial - template - optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); - - template - optional<_Tp> - __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction); - - // Cancel the execution of other jobs - they aren't needed anymore - void __cancel_execution(); - - template - optional __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge); - - template - void __parallel_stable_sort(_RandomAccessIterator __first, - _RandomAccessIterator __last, - _Comp __comp, - _LeafSort __leaf_sort); - - TODO: Document the parallel backend - -Exception handling -================== - -CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their -implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned -into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the -frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user. 
-*/ - #include <__algorithm/pstl_backends/cpu_backends/any_of.h> #include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/pstl_backends/cpu_backends/fill.h> @@ -64,5 +18,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the #include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> #include <__algorithm/pstl_backends/cpu_backends/transform.h> #include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> +#include <__config> #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h index 13dff80086e72..be5e54f3fa5c8 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h @@ -17,6 +17,7 @@ #include <__config> #include <__functional/operations.h> #include <__iterator/concepts.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include <__utility/pair.h> @@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <class _Index, class _Brick> +template <class _Backend, class _Index, class _Brick> _LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) { std::atomic<bool> __found(false); - auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) { + auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) { if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) { __found.store(true, std::memory_order_relaxed); - __par_backend::__cancel_execution(); + __pstl::__cpu_traits<_Backend>::__cancel_execution(); } }); if (!__ret) @@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool> __pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return std::__parallel_or( + return std::__parallel_or<__cpu_backend_tag>( __first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __pred); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h index ea2210a4a7adb..cb9425862a2b0 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h @@ -26,16 +26,20 @@ # pragma GCC system_header #endif -#if _LIBCPP_STD_VER >= 17 +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD -struct __cpu_backend_tag {}; - -inline constexpr size_t __lane_size = 64; +# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) +using __cpu_backend_tag = __pstl::__serial_backend_tag; +# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) +using __cpu_backend_tag = __pstl::__std_thread_backend_tag; +# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) +using __cpu_backend_tag = __pstl::__libdispatch_backend_tag; +# endif _LIBCPP_END_NAMESPACE_STD -#endif // _LIBCPP_STD_VER >= 17 +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H diff --git
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 64babe9fd2bda..49a32f6c5ce55 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -13,6 +13,7 @@ #include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> #include @@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __par_backend::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __value); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h index 170470e4fb7ed..11a5668bf25af 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h @@ -16,6 +16,7 @@ #include <__functional/operations.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include <__utility/pair.h> @@ -33,7 +34,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template +template _LIBCPP_HIDE_FROM_ABI optional<_Index> __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) { typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType; @@ -41,8 +42,8 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool _DifferenceType __initial_dist = __b_first ? __n : -1; std::atomic<_DifferenceType> __extremum(__initial_dist); // TODO: find out what is better here: parallel_for or parallel_reduce - auto __res = - __par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) { + auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for( + __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) { // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of // why using a shared variable scales fairly well in this situation. if (__comp(__i - __first, __extremum)) { @@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool return __extremum.load() != __initial_dist ? 
__first + __extremum.load() : __last; } -template +template _LIBCPP_HIDE_FROM_ABI _Index __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept { // Experiments show good block sizes like this - const _DifferenceType __block_size = 8; - alignas(__lane_size) _DifferenceType __lane[__block_size] = {0}; + const _DifferenceType __block_size = 8; + alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0}; while (__end - __begin >= __block_size) { _DifferenceType __found = 0; _PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) { @@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator> __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return std::__parallel_find( + return std::__parallel_find<__cpu_backend_tag>( __first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { @@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { using __diff_t = __iter_diff_t<_ForwardIterator>; - return std::__simd_first(__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) { - return __pred(__iter[__i]); - }); + return std::__simd_first<__cpu_backend_tag>( + __first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) { + return __pred(__iter[__i]); + }); } else { return std::find_if(__first, __last, __pred); } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index 81fd4526b8dbf..1667ec0f0c4f4 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -13,6 +13,7 @@ #include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> #include @@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return std::__par_backend::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __func); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h index e885e7f225172..8757f24968037 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h @@ -23,6 +23,7 @@ #include <__memory/construct_at.h> #include <__memory/unique_ptr.h> #include 
<__numeric/reduce.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/exception_guard.h> #include <__utility/move.h> @@ -37,10 +38,11 @@ _LIBCPP_PUSH_MACROS #if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 _LIBCPP_BEGIN_NAMESPACE_STD +namespace __pstl { -namespace __par_backend { -inline namespace __libdispatch { +struct __libdispatch_backend_tag {}; +namespace __libdispatch { // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do // we. // TODO: Do we want to add [[_Clang::__callback__(__func, __context, __)]]? @@ -77,267 +79,270 @@ __dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator _ return __empty{}; } +} // namespace __libdispatch -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> -__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { - return __libdispatch::__dispatch_parallel_for( - __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); -} - -template -struct __merge_range { - __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result) - : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {} +template <> +struct __cpu_traits<__libdispatch_backend_tag> { + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { + return __libdispatch::__dispatch_parallel_for( + __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); + } - _RandomAccessIterator1 __mid1_; - _RandomAccessIterator2 __mid2_; - _RandomAccessIteratorOut __result_; -}; + template + struct __merge_range { + __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result) + : __mid1_(__mid1), __mid2_(__mid2), __result_(__result) {} -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, - _Compare __comp, - _LeafMerge __leaf_merge) noexcept { - __chunk_partitions __partitions = - __libdispatch::__partition_chunks(std::max(__last1 - __first1, __last2 - __first2)); - - if (__partitions.__chunk_count_ == 0) - return __empty{}; + _RandomAccessIterator1 __mid1_; + _RandomAccessIterator2 __mid2_; + _RandomAccessIteratorOut __result_; + }; - if (__partitions.__chunk_count_ == 1) { - __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp); - return __empty{}; - } + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __result, + _Compare __comp, + _LeafMerge __leaf_merge) noexcept { + __libdispatch::__chunk_partitions __partitions = + __libdispatch::__partition_chunks(std::max(__last1 - __first1, __last2 - __first2)); + + if (__partitions.__chunk_count_ == 0) + return __empty{}; + + if (__partitions.__chunk_count_ == 1) { + __leaf_merge(__first1, __last1, __first2, __last2, __result, __comp); + return __empty{}; + } - using __merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>; - auto const __n_ranges = __partitions.__chunk_count_ + 1; + using 
__merge_range_t = __merge_range<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3>; + auto const __n_ranges = __partitions.__chunk_count_ + 1; - // TODO: use __uninitialized_buffer - auto __destroy = [=](__merge_range_t* __ptr) { - std::destroy_n(__ptr, __n_ranges); - std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges); - }; + // TODO: use __uninitialized_buffer + auto __destroy = [=](__merge_range_t* __ptr) { + std::destroy_n(__ptr, __n_ranges); + std::allocator<__merge_range_t>().deallocate(__ptr, __n_ranges); + }; - unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( - [&]() -> __merge_range_t* { + unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( + [&]() -> __merge_range_t* { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS - try { + try { # endif - return std::allocator<__merge_range_t>().allocate(__n_ranges); + return std::allocator<__merge_range_t>().allocate(__n_ranges); # ifndef _LIBCPP_HAS_NO_EXCEPTIONS - } catch (const std::bad_alloc&) { - return nullptr; - } + } catch (const std::bad_alloc&) { + return nullptr; + } # endif - }(), - __destroy); - - if (!__ranges) - return nullopt; + }(), + __destroy); + + if (!__ranges) + return nullopt; + + // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case + __merge_range_t* __r = __ranges.get(); + std::__construct_at(__r++, __first1, __first2, __result); + + bool __iterate_first_range = __last1 - __first1 > __last2 - __first2; + + auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t { + auto [__mid1, __mid2] = [&] { + if (__iterate_first_range) { + auto __m1 = __first1 + __chunk_size; + auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp); + return std::make_pair(__m1, __m2); + } else { + auto __m2 = __first2 + __chunk_size; + auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp); + return std::make_pair(__m1, __m2); + } + }(); - // TODO: Improve the case where the smaller range is merged into just a few (or even one) chunks of the larger case - __merge_range_t* __r = __ranges.get(); - std::__construct_at(__r++, __first1, __first2, __result); + __result += (__mid1 - __first1) + (__mid2 - __first2); + __first1 = __mid1; + __first2 = __mid2; + return {std::move(__mid1), std::move(__mid2), __result}; + }; - bool __iterate_first_range = __last1 - __first1 > __last2 - __first2; + // handle first chunk + std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_)); - auto __compute_chunk = [&](size_t __chunk_size) -> __merge_range_t { - auto [__mid1, __mid2] = [&] { - if (__iterate_first_range) { - auto __m1 = __first1 + __chunk_size; - auto __m2 = std::lower_bound(__first2, __last2, __m1[-1], __comp); - return std::make_pair(__m1, __m2); - } else { - auto __m2 = __first2 + __chunk_size; - auto __m1 = std::lower_bound(__first1, __last1, __m2[-1], __comp); - return std::make_pair(__m1, __m2); - } - }(); + // handle 2 -> N - 1 chunks + for (ptrdiff_t __i = 0; __i != __partitions.__chunk_count_ - 2; ++__i) + std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_)); - __result += (__mid1 - __first1) + (__mid2 - __first2); - __first1 = __mid1; - __first2 = __mid2; - return {std::move(__mid1), std::move(__mid2), __result}; - }; + // handle last chunk + std::__construct_at(__r, __last1, __last2, __result); - // handle first chunk - std::__construct_at(__r++, __compute_chunk(__partitions.__first_chunk_size_)); - - // handle 2 -> N - 1 chunks - for (ptrdiff_t __i = 0; __i != 
__partitions.__chunk_count_ - 2; ++__i) - std::__construct_at(__r++, __compute_chunk(__partitions.__chunk_size_)); - - // handle last chunk - std::__construct_at(__r, __last1, __last2, __result); - - __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) { - auto __first_iters = __ranges[__index]; - auto __last_iters = __ranges[__index + 1]; - __leaf_merge( - __first_iters.__mid1_, - __last_iters.__mid1_, - __first_iters.__mid2_, - __last_iters.__mid2_, - __first_iters.__result_, - __comp); - }); + __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __index) { + auto __first_iters = __ranges[__index]; + auto __last_iters = __ranges[__index + 1]; + __leaf_merge( + __first_iters.__mid1_, + __last_iters.__mid1_, + __first_iters.__mid2_, + __last_iters.__mid2_, + __first_iters.__result_, + __comp); + }); - return __empty{}; -} + return __empty{}; + } -template -_LIBCPP_HIDE_FROM_ABI optional<_Value> __parallel_transform_reduce( - _RandomAccessIterator __first, - _RandomAccessIterator __last, - _Transform __transform, - _Value __init, - _Combiner __combiner, - _Reduction __reduction) { - if (__first == __last) - return __init; - - auto __partitions = __libdispatch::__partition_chunks(__last - __first); - - auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) { - std::destroy_n(__ptr, __count); - std::allocator<_Value>().deallocate(__ptr, __count); - }; + template + _LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce( + _RandomAccessIterator __first, + _RandomAccessIterator __last, + _Transform __transform, + _Value __init, + _Combiner __combiner, + _Reduction __reduction) { + if (__first == __last) + return __init; + + auto __partitions = __libdispatch::__partition_chunks(__last - __first); + + auto __destroy = [__count = __partitions.__chunk_count_](_Value* __ptr) { + std::destroy_n(__ptr, __count); + std::allocator<_Value>().deallocate(__ptr, __count); + }; - // TODO: use __uninitialized_buffer - // TODO: allocate one element per worker instead of one element per chunk - unique_ptr<_Value[], decltype(__destroy)> __values( - std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy); + // TODO: use __uninitialized_buffer + // TODO: allocate one element per worker instead of one element per chunk + unique_ptr<_Value[], decltype(__destroy)> __values( + std::allocator<_Value>().allocate(__partitions.__chunk_count_), __destroy); + + // __dispatch_apply is noexcept + __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { + auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; + auto __index = __chunk == 0 ? 0 + : (__chunk * __partitions.__chunk_size_) + + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); + if (__this_chunk_size != 1) { + std::__construct_at( + __values.get() + __chunk, + __reduction(__first + __index + 2, + __first + __index + __this_chunk_size, + __combiner(__transform(__first + __index), __transform(__first + __index + 1)))); + } else { + std::__construct_at(__values.get() + __chunk, __transform(__first + __index)); + } + }); - // __dispatch_apply is noexcept - __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { - auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; - auto __index = - __chunk == 0 - ? 
0 - : (__chunk * __partitions.__chunk_size_) + (__partitions.__first_chunk_size_ - __partitions.__chunk_size_); - if (__this_chunk_size != 1) { - std::__construct_at( - __values.get() + __chunk, - __reduction(__first + __index + 2, - __first + __index + __this_chunk_size, - __combiner(__transform(__first + __index), __transform(__first + __index + 1)))); - } else { - std::__construct_at(__values.get() + __chunk, __transform(__first + __index)); - } - }); + return std::reduce( + std::make_move_iterator(__values.get()), + std::make_move_iterator(__values.get() + __partitions.__chunk_count_), + std::move(__init), + __combiner); + } - return std::reduce( - std::make_move_iterator(__values.get()), - std::make_move_iterator(__values.get() + __partitions.__chunk_count_), - std::move(__init), - __combiner); -} + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( + _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { + const auto __size = __last - __first; + auto __partitions = __libdispatch::__partition_chunks(__size); -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { - const auto __size = __last - __first; - auto __partitions = __libdispatch::__partition_chunks(__size); + if (__partitions.__chunk_count_ == 0) + return __empty{}; - if (__partitions.__chunk_count_ == 0) - return __empty{}; + if (__partitions.__chunk_count_ == 1) { + __leaf_sort(__first, __last, __comp); + return __empty{}; + } - if (__partitions.__chunk_count_ == 1) { - __leaf_sort(__first, __last, __comp); - return __empty{}; - } + using _Value = __iter_value_type<_RandomAccessIterator>; - using _Value = __iter_value_type<_RandomAccessIterator>; + auto __destroy = [__size](_Value* __ptr) { + std::destroy_n(__ptr, __size); + std::allocator<_Value>().deallocate(__ptr, __size); + }; - auto __destroy = [__size](_Value* __ptr) { - std::destroy_n(__ptr, __size); - std::allocator<_Value>().deallocate(__ptr, __size); - }; + // TODO: use __uninitialized_buffer + unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy); - // TODO: use __uninitialized_buffer - unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy); + // Initialize all elements to a moved-from state + // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928 + std::__construct_at(__values.get(), std::move(*__first)); + for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) { + std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1])); + } + *__first = std::move(__values.get()[__size - 1]); + + __libdispatch::__dispatch_parallel_for( + __partitions, + __first, + [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) { + __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp); + }); + + bool __objects_are_in_buffer = false; + do { + const auto __old_chunk_size = __partitions.__chunk_size_; + if (__partitions.__chunk_count_ % 2 == 1) { + auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) { + std::inplace_merge( + __first_chunk_begin, + __first_chunk_begin + __partitions.__first_chunk_size_, + __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_, + __comp); + }; + if (__objects_are_in_buffer) + 
__inplace_merge_chunks(__values.get()); + else + __inplace_merge_chunks(__first); + __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_; + } else { + __partitions.__first_chunk_size_ += __partitions.__chunk_size_; + } - // Initialize all elements to a moved-from state - // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928 - std::__construct_at(__values.get(), std::move(*__first)); - for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) { - std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1])); - } - *__first = std::move(__values.get()[__size - 1]); - - __libdispatch::__dispatch_parallel_for( - __partitions, - __first, - [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) { - __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp); - }); - - bool __objects_are_in_buffer = false; - do { - const auto __old_chunk_size = __partitions.__chunk_size_; - if (__partitions.__chunk_count_ % 2 == 1) { - auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) { - std::inplace_merge( - __first_chunk_begin, - __first_chunk_begin + __partitions.__first_chunk_size_, - __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_, - __comp); + __partitions.__chunk_size_ *= 2; + __partitions.__chunk_count_ /= 2; + + auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) { + __libdispatch::__dispatch_parallel_for( + __partitions, + __from_first, + [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) { + std::merge(std::make_move_iterator(__chunk_first), + std::make_move_iterator(__chunk_last - __old_chunk_size), + std::make_move_iterator(__chunk_last - __old_chunk_size), + std::make_move_iterator(__chunk_last), + __to_first + (__chunk_first - __from_first), + __comp); + }); }; + if (__objects_are_in_buffer) - __inplace_merge_chunks(__values.get()); + __merge_chunks(__values.get(), __first); else - __inplace_merge_chunks(__first); - __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_; - } else { - __partitions.__first_chunk_size_ += __partitions.__chunk_size_; - } - - __partitions.__chunk_size_ *= 2; - __partitions.__chunk_count_ /= 2; - - auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) { - __libdispatch::__dispatch_parallel_for( - __partitions, - __from_first, - [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) { - std::merge(std::make_move_iterator(__chunk_first), - std::make_move_iterator(__chunk_last - __old_chunk_size), - std::make_move_iterator(__chunk_last - __old_chunk_size), - std::make_move_iterator(__chunk_last), - __to_first + (__chunk_first - __from_first), - __comp); - }); - }; + __merge_chunks(__first, __values.get()); + __objects_are_in_buffer = !__objects_are_in_buffer; + } while (__partitions.__chunk_count_ > 1); - if (__objects_are_in_buffer) - __merge_chunks(__values.get(), __first); - else - __merge_chunks(__first, __values.get()); - __objects_are_in_buffer = !__objects_are_in_buffer; - } while (__partitions.__chunk_count_ > 1); + if (__objects_are_in_buffer) { + std::move(__values.get(), __values.get() + __size, __first); + } - if (__objects_are_in_buffer) { - std::move(__values.get(), __values.get() + __size, __first); + return __empty{}; } - return __empty{}; -} + _LIBCPP_HIDE_FROM_ABI static 
void __cancel_execution() {} -_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} - -} // namespace __libdispatch -} // namespace __par_backend + static constexpr size_t __lane_size = 64; +}; +} // namespace __pstl _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h index b0db70f58b2ef..d034447904872 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -13,6 +13,7 @@ #include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include @@ -45,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __par_backend::__parallel_merge( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge( __first1, __last1, __first2, diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h index afcc7ffb26613..c3d2905daed17 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h @@ -11,6 +11,7 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H #include <__config> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/move.h> #include @@ -26,54 +27,55 @@ _LIBCPP_PUSH_MACROS # include <__undef_macros> _LIBCPP_BEGIN_NAMESPACE_STD - -namespace __par_backend { -inline namespace __serial_cpu_backend { - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> -__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { - __f(__first, __last); - return __empty{}; -} - -template -_LIBCPP_HIDE_FROM_ABI optional<_Tp> -__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { - return __reduce(std::move(__first), std::move(__last), std::move(__init)); -} - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { - __leaf_sort(__first, __last, __comp); - return __empty{}; -} - -_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { - __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); - return __empty{}; -} - -// TODO: Complete this list - -} // namespace __serial_cpu_backend -} // namespace __par_backend - +namespace __pstl { + +struct __serial_backend_tag {}; + +template <> +struct __cpu_traits<__serial_backend_tag> { + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + 
__f(__first, __last); + return __empty{}; + } + + template + _LIBCPP_HIDE_FROM_ABI static optional<_Tp> + __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + return __reduce(std::move(__first), std::move(__last), std::move(__init)); + } + + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( + _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + __leaf_sort(__first, __last, __comp); + return __empty{}; + } + + _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {} + + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { + __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); + return __empty{}; + } + + static constexpr size_t __lane_size = 64; +}; + +} // namespace __pstl _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h index 34c423586c4b7..ebfa0fc69147d 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h @@ -12,6 +12,7 @@ #include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/stable_sort.h> #include <__config> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> #include @@ -28,7 +29,7 @@ template _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) { - return __par_backend::__parallel_stable_sort( + return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort( __first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) { std::stable_sort(__g_first, __g_last, __g_comp); }); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h index eb11a961b760c..8d1cb221c3d82 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h @@ -11,6 +11,7 @@ #include <__assert> #include <__config> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/move.h> #include @@ -29,52 +30,55 @@ _LIBCPP_PUSH_MACROS // by a proper implementation once the PSTL implementation is somewhat stable. 
_LIBCPP_BEGIN_NAMESPACE_STD - -namespace __par_backend { -inline namespace __thread_cpu_backend { - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> -__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { - __f(__first, __last); - return __empty{}; -} - -template -_LIBCPP_HIDE_FROM_ABI optional<_Tp> -__parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { - return __reduce(std::move(__first), std::move(__last), std::move(__init)); -} - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { - __leaf_sort(__first, __last, __comp); - return __empty{}; -} - -_LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} - -template -_LIBCPP_HIDE_FROM_ABI optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { - __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); - return __empty{}; -} - -} // namespace __thread_cpu_backend -} // namespace __par_backend - +namespace __pstl { + +struct __std_thread_backend_tag {}; + +template <> +struct __cpu_traits<__std_thread_backend_tag> { + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __f(__first, __last); + return __empty{}; + } + + template + _LIBCPP_HIDE_FROM_ABI static optional<_Tp> + __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + return __reduce(std::move(__first), std::move(__last), std::move(__init)); + } + + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( + _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + __leaf_sort(__first, __last, __comp); + return __empty{}; + } + + _LIBCPP_HIDE_FROM_ABI static void __cancel_execution() {} + + template + _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( + _RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { + __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); + return __empty{}; + } + + static constexpr size_t __lane_size = 64; +}; + +} // namespace __pstl _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17 diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index fdf1a2e78dad9..d4c383997a67a 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -14,6 +14,7 @@ #include <__config> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -49,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && 
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - std::__par_backend::__parallel_for( + __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op); @@ -97,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = std::__par_backend::__parallel_for( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( __first1, __last1, [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) { diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index 376abd39fa36e..956c7d6a88ce2 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -14,6 +14,7 @@ #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/transform_reduce.h> +#include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/desugars_to.h> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_execution_policy.h> @@ -32,7 +33,8 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template = 0> _LIBCPP_HIDE_FROM_ABI _Tp __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept { - const _Size __block_size = __lane_size / sizeof(_Tp); + constexpr size_t __lane_size = __pstl::__cpu_traits<_Backend>::__lane_size; + const _Size __block_size = __lane_size / sizeof(_Tp); if (__n > 2 * __block_size && __block_size > 1) { alignas(__lane_size) char __lane_buffer[__lane_size]; _Tp* __lane = reinterpret_cast<_Tp*>(__lane_buffer); @@ -116,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) { - return __par_backend::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( __first1, std::move(__last1), [__first1, __first2, __transform](_ForwardIterator1 __iter) { @@ -138,7 +142,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) { - return std::__simd_transform_reduce( + return std::__simd_transform_reduce<__cpu_backend_tag>( __last1 - __first1, std::move(__init), std::move(__reduce), [&](__iter_diff_t<_ForwardIterator1> __i) { return __transform(__first1[__i], __first2[__i]); }); @@ -163,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( _UnaryOperation __transform) { if 
constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __par_backend::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( std::move(__first), std::move(__last), [__transform](_ForwardIterator __iter) { return __transform(*__iter); }, @@ -182,7 +186,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( }); } else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return std::__simd_transform_reduce( + return std::__simd_transform_reduce<__cpu_backend_tag>( __last - __first, std::move(__init), std::move(__reduce), diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h new file mode 100644 index 0000000000000..2f0db46e9be83 --- /dev/null +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -0,0 +1,87 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H +#define _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H + +#include <__config> +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +_LIBCPP_BEGIN_NAMESPACE_STD +namespace __pstl { + +// __cpu_traits +// +// This traits class encapsulates the basis operations for a CPU-based implementation of the PSTL. +// All the operations in the PSTL can be implemented from these basis operations, so a pure CPU backend +// only needs to customize these traits in order to get an implementation of the whole PSTL. +// +// Basis operations +// ================ +// +// template +// optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); +// - __func must take a subrange of [__first, __last) that should be executed in serial +// +// template +// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, +// _Reduction); +// +// template +// optional<_RandomAccessIterator3> __parallel_merge(_RandomAccessIterator1 __first1, +// _RandomAccessIterator1 __last1, +// _RandomAccessIterator2 __first2, +// _RandomAccessIterator2 __last2, +// _RandomAccessIterator3 __outit, +// _Compare __comp, +// _LeafMerge __leaf_merge); +// +// template +// optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first, +// _RandomAccessIterator __last, +// _Comp __comp, +// _LeafSort __leaf_sort); +// +// void __cancel_execution(); +// Cancel the execution of other jobs - they aren't needed anymore. This is not a binding request, +// some backends may not actually be able to cancel jobs. +// +// constexpr size_t __lane_size; +// Size of SIMD lanes. +// TODO: Merge this with __native_vector_size from __algorithm/simd_utils.h +// +// +// Exception handling +// ================== +// +// CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their +// implementation. 
Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are +// turned into a program termination at the front-end level. When a backend returns a disengaged `optional` to the +// frontend, the frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to +// the user. + +template +struct __cpu_traits; + +} // namespace __pstl +_LIBCPP_END_NAMESPACE_STD + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index ed45a1b183389..546e5dad1ccd5 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1604,6 +1604,8 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } +module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } + module std_private_queue_fwd [system] { header "__fwd/queue.h" } module std_private_random_bernoulli_distribution [system] { header "__random/bernoulli_distribution.h" } diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp index 52d4afbcce6e0..d997a9c73463d 100644 --- a/libcxx/src/pstl/libdispatch.cpp +++ b/libcxx/src/pstl/libdispatch.cpp @@ -12,8 +12,7 @@ #include _LIBCPP_BEGIN_NAMESPACE_STD - -namespace __par_backend::inline __libdispatch { +namespace __pstl::__libdispatch { void __dispatch_apply(size_t chunk_count, void* context, void (*func)(void* context, size_t chunk)) noexcept { ::dispatch_apply_f(chunk_count, DISPATCH_APPLY_AUTO, context, func); @@ -29,7 +28,5 @@ __chunk_partitions __partition_chunks(ptrdiff_t element_count) noexcept { return partitions; } -// NOLINTNEXTLINE(llvm-namespace-comment) // This is https://llvm.org/PR56804 -} // namespace __par_backend::inline __libdispatch - +} // namespace __pstl::__libdispatch _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp index 554924a0179d5..8c7016a80b811 100644 --- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp +++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp @@ -18,21 +18,21 @@ int main(int, char**) { { - auto chunks = std::__par_backend::__libdispatch::__partition_chunks(0); + auto chunks = std::__pstl::__libdispatch::__partition_chunks(0); assert(chunks.__chunk_count_ == 1); assert(chunks.__first_chunk_size_ == 0); assert(chunks.__chunk_size_ == 0); } { - auto chunks = std::__par_backend::__libdispatch::__partition_chunks(1); + auto chunks = std::__pstl::__libdispatch::__partition_chunks(1); assert(chunks.__chunk_count_ == 1); assert(chunks.__first_chunk_size_ == 1); assert(chunks.__chunk_size_ == 1); } for (std::ptrdiff_t i = 2; i != 2ll << 20; ++i) { - auto chunks = std::__par_backend::__libdispatch::__partition_chunks(i); + auto chunks = std::__pstl::__libdispatch::__partition_chunks(i); assert(chunks.__chunk_count_ >= 1); assert(chunks.__chunk_count_ <= i); assert((chunks.__chunk_count_ - 1) * chunks.__chunk_size_ + chunks.__first_chunk_size_ == i); diff --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py index 8ab7b86299edc..b8a8580ea30f3 100644 --- 
a/libcxx/utils/generate_iwyu_mapping.py +++ b/libcxx/utils/generate_iwyu_mapping.py @@ -10,6 +10,7 @@ def IWYU_mapping(header: str) -> typing.Optional[typing.List[str]]: ignore = [ "__debug_utils/.+", "__fwd/get[.]h", + "__pstl/.+", "__support/.+", ] if any(re.match(pattern, header) for pattern in ignore):
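
For illustration, the customization pattern this patch introduces can be summarized with a small, self-contained sketch (not part of the patch): each backend is an empty tag type, a traits class specialized on that tag carries the basis operations as static members, and the algorithm layer is written once against the traits, parameterized on the backend tag. The names below (cpu_traits, serial_backend_tag, parallel_or) and the use of std::monostate in place of __empty are simplified stand-ins, not the actual libc++ internals.

#include <atomic>
#include <cstddef>
#include <optional>
#include <variant>

// A backend is an empty tag type. The primary traits template is only declared,
// so selecting a backend without a specialization fails at compile time.
struct serial_backend_tag {};

template <class Backend>
struct cpu_traits;

template <>
struct cpu_traits<serial_backend_tag> {
  // Run func over [first, last) in serial. A disengaged optional would report an
  // internal failure (e.g. failure to allocate) back to the algorithm layer.
  template <class RandomAccessIterator, class Functor>
  static std::optional<std::monostate>
  parallel_for(RandomAccessIterator first, RandomAccessIterator last, Functor func) {
    func(first, last);
    return std::monostate{};
  }

  // Nothing to cancel in a serial backend.
  static void cancel_execution() {}

  static constexpr std::size_t lane_size = 64;
};

// Algorithm layer: written once against the traits and parameterized on the
// backend tag, mirroring how __parallel_or dispatches through __cpu_traits<_Backend>.
template <class Backend, class Iterator, class Predicate>
std::optional<bool> parallel_or(Iterator first, Iterator last, Predicate pred) {
  std::atomic<bool> found(false);
  auto ok = cpu_traits<Backend>::parallel_for(first, last, [&](Iterator f, Iterator l) {
    for (; f != l; ++f) {
      if (pred(*f)) {
        found.store(true, std::memory_order_relaxed);
        cpu_traits<Backend>::cancel_execution();
        return;
      }
    }
  });
  if (!ok)
    return std::nullopt; // allocation failure propagates; the frontend turns it into bad_alloc
  return found.load();
}

int main() {
  int values[] = {1, 3, 5, 8};
  auto has_even = parallel_or<serial_backend_tag>(values, values + 4, [](int v) { return v % 2 == 0; });
  return (has_even && *has_even) ? 0 : 1;
}

Declaring the primary template without a definition mirrors the `template <class _Backend> struct __cpu_traits;` declaration in __pstl/cpu_algos/cpu_traits.h, and it is what turns a missing backend specialization into a hard compile-time error rather than a silent fallback.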