From 69e5edba7a9788563f189bc06642c82b3570238d Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 17 Jul 2024 15:24:01 -0700 Subject: [PATCH 1/3] Align proposed extension with implementation --- .../sycl_ext_oneapi_group_sort.asciidoc | 61 +++++++++++-------- .../experimental/group_helpers_sorters.hpp | 14 ++--- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc index b1e305ec649cd..17f7e52eb8d5a 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc @@ -14,7 +14,7 @@ == Notice [%hardbreaks] -Copyright (c) 2021-2022 Intel Corporation. All rights reserved. +Copyright (c) 2021-2024 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -34,19 +34,16 @@ SYCL specification refer to that revision. This extension also depends on the following other SYCL extensions: -* link:../experimental/sycl_ext_oneapi_properties.asciidoc[ - sycl_ext_oneapi_properties]. +* link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] == Status -This is a proposed update to an existing experimental extension. -Interfaces defined in this -specification may not be implemented yet or may be in a preliminary state. The -specification itself may also change in incompatible ways before it is -finalized. *Shipping software products should not rely on APIs defined in this -specification.* See -link:../experimental/sycl_ext_oneapi_group_sort.asciidoc[here] for the existing -extension, which is implemented. +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* == Introduction @@ -963,20 +960,37 @@ Consider 2 layouts: |{2, 5, 8, 11} |=== -There are 2 properties that satisfy -link:sycl_ext_oneapi_properties.asciidoc[SYCL Properties Extension] +There are 2 compile-time properties that satisfy +link:../experimental/sycl_ext_oneapi_properties.asciidoc[SYCL Properties Extension] requirements: [source,c++] ---- -namespace sycl::ext::oneapi::experimental::property -{ - template - struct input_data_placement; // (1) +namespace sycl::ext::oneapi::experimental { - template - struct output_data_placement; // (2) -} +struct input_data_placement_key : /* unspecified */ { + template + using value_t = + property_value>; +}; + +struct output_data_placement_key : /* unspecified */ { + template + using value_t = + property_value>; +}; + +template +inline constexpr input_data_placement_key::value_t + input_data_placement; // (1) + +template +inline constexpr output_data_placement_key::value_t + output_data_placement; // (2) + +} // namespace sycl::ext::oneapi::experimental ---- 1. `input_data_placement` specifies the data placement for input. This is @@ -1192,12 +1206,6 @@ because it's easy to pass different comparator types. overloads with `Compare` objects seems extra and overloads with sorters, without sorters are enough. -== Non-implemented features -Please, note that following is not inplemented yet for the open-source repo: - -. `radix_sorter`, `radix_order`. -. fixed-size arrays and properties. - == Revision History [cols="5,15,15,70"] @@ -1211,4 +1219,5 @@ Please, note that following is not inplemented yet for the open-source repo: making the entire extension experimental |4|2022-11-14|Andrey Fedorov|Fixed size arrays, key-value sorting and properties |5|2023-11-09|Andrey Fedorov|Changed `memory_required` functions for default sorters +|6|2024-07-17|Artur Gainullin|Align the description of data placement properties with the implementation |======================================== diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp index 7ec5435771151..a12cff7ad8eb0 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp @@ -37,22 +37,22 @@ namespace sycl { inline namespace _V1 { namespace ext::oneapi::experimental { -enum class group_algorithm_data_placement { blocked, striped }; +enum class group_algorithm_data_placement : std::uint8_t { blocked, striped }; struct input_data_placement_key : detail::compile_time_property_key { template - using value_t = - property_value(Placement)>>; + using value_t = property_value< + input_data_placement_key, + std::integral_constant>; }; struct output_data_placement_key : detail::compile_time_property_key { template - using value_t = - property_value(Placement)>>; + using value_t = property_value< + output_data_placement_key, + std::integral_constant>; }; template From b6aceb7d73e9d49e6d6f6ca019fd76c046ae8b20 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 17 Jul 2024 15:25:47 -0700 Subject: [PATCH 2/3] Move extension from proposed to experimental directory --- .../sycl_ext_oneapi_group_sort.asciidoc | 1058 ++++++++++---- .../sycl_ext_oneapi_group_sort.asciidoc | 1223 ----------------- 2 files changed, 820 insertions(+), 1461 deletions(-) delete mode 100644 sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_sort.asciidoc index 8b2098f092826..17f7e52eb8d5a 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_sort.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_sort.asciidoc @@ -8,53 +8,64 @@ :toc: left :encoding: utf-8 :lang: en - -:blank: pass:[ +] - -// Set the default source code type in this document to C++, -// for syntax highlighting purposes. This is needed because -// docbook uses c++ and html5 uses cpp. +:dpcpp: pass:[DPC++] :language: {basebackend@docbook:c++:cpp} == Notice -Copyright (c) 2021 Intel Corporation. All rights reserved. +[%hardbreaks] +Copyright (c) 2021-2024 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. -IMPORTANT: This specification is a draft. +== Contact -NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are -trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. -used by permission by Khronos. +To report problems with this extension, please open a new issue at: -NOTE: This document is better viewed when rendered as html with asciidoctor. -GitHub does not render image icons. +https://github.com/intel/llvm/issues -This extension is written against the SYCL 2020 revision 3 specification. All +== Dependencies + +This extension is written against the SYCL 2020 revision 5 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision. -NOTE: This extension is experimental: interfaces are subject to change later. +This extension also depends on the following other SYCL extensions: + +* link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* == Introduction -This extension introduces sorting functions to the group algorithms library, along with -associated Sorter objects and Group Helper objects. +This extension introduces sorting functions to the group algorithms +library, along with associated Sorter objects and Group Helper objects. == Feature test macro This extension provides a feature-test macro as described in the core SYCL -specification section 6.3.3 "Feature test macros". Therefore, an -implementation supporting this extension must predefine the macro -`SYCL_EXT_ONEAPI_GROUP_SORT` to one of the values defined in the table below. -Applications can test for the existence of this macro to determine if the -implementation supports this feature, or applications can test the macro's -value to determine which of the extension's APIs the implementation supports. +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_GROUP_SORT` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. Table 1. Values of the `SYCL_EXT_ONEAPI_GROUP_SORT` macro. [%header,cols="1,5"] |=== |Value |Description |1 |Initial extension version. Base features are supported. +|2 |Interfaces with fixed-size arrays and key-value sorting are supported. |=== == Sorting functions @@ -75,90 +86,301 @@ position `i` in the ordered range. namespace sycl::ext::oneapi::experimental { template - void joint_sort(GroupHelper exec, Ptr first, Ptr last); // (1) + void joint_sort(GroupHelper gh, Ptr first, Ptr last); // (1) template - void joint_sort(GroupHelper exec, Ptr first, Ptr last, Compare comp); // (2) + void joint_sort(GroupHelper gh, Ptr first, Ptr last, Compare comp); // (2) template void joint_sort(Group g, Ptr first, Ptr last, Sorter sorter); // (3) template - T sort_over_group(GroupHelper exec, T val); // (4) + T sort_over_group(GroupHelper gh, T value); // (4) template - T sort_over_group(GroupHelper exec, T val, Compare comp); // (5) + T sort_over_group(GroupHelper gh, T value, Compare comp); // (5) template - T sort_over_group(Group g, T val, Sorter sorter); // (6) + T sort_over_group(Group g, T value, Sorter sorter); // (6) + + template + std::tuple + sort_key_value_over_group(GroupHelper gh, T key, U value); // (7) + + template + std::tuple + sort_key_value_over_group(GroupHelper gh, T key, U value, Compare comp); // (8) + + template + std::tuple + sort_key_value_over_group(Group g, T key, U value, Sorter sorter); // (9) + } ---- -1._Preconditions_: `first`, `last` must be the same for all work-items in the group. +(1) _Preconditions_: `first`, `last` must be the same for all work-items +in the group. + +_Constraints_: Only available if `GroupHelper` was created with a +work-group or sub-group and some associated scratch space. _Effects_: Sort the elements in the range `[first, last)` -using the `exec` group helper object. Elements are compared by `operator<`. +using the `gh` group helper object. Elements are compared by `operator<`. _Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. -_Constraints_: Only available if `GroupHelper` was created with a work-group or sub-group and -some associated scratch space. +(2) _Preconditions_: `first`, `last` must be the same for all work-items +in the group. -2._Preconditions_: `first`, `last` must be the same for all work-items in the group. +_Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space. _Mandates_: `comp` must satisfy the requirements of `Compare` from the {cpp} standard. _Effects_: Sort the elements in the range `[first, last)` with respect to the -binary comparison function object `comp` using the `exec` group helper object. +binary comparison function object `comp` using the `gh` group helper object. _Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. -_Constraints_: Only available if `GroupHelper` was created with a work-group or a sub-group and -some associated scratch space. +(3) _Preconditions_: `first`, `last` must be the same +for all work-items in the group. -3._Preconditions_: `first`, `last` must be the same for all work-items in the group. +_Constraints_: All functions are available only if `Sorter` is +a SYCL Sorter and it provides `operator()(Group, Ptr, Ptr)` overload. _Effects_: Equivalent to: `sorter(g, first, last)`. -_Constraints_: All functions are available only if `Sorter` is a SYCL Sorter and -it provides `operator()(Group, Ptr, Ptr)` overload. +(4) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space. -4._Returns_: The value returned on work-item `i` is the value in position `i` -of the ordered range resulting from sorting `val` from all work-items in the group. -Elements are compared by `operator<` -using the `exec` group helper object. +_Returns_: The value returned on work-item `i` is the value in position `i` +of the ordered range resulting from sorting `value` from all work-items +in the group. Elements are compared by `operator<` +using the `gh` group helper object. For multi-dimensional groups, the order of work-items in the group is determined by their linear id. _Complexity_: Let `N` be the work-group size. `O(N*log(N)*log(N))` comparisons. -_Constraints_: Only available if `GroupHelper` was created with a work-group or a sub-group and -some associated scratch space. +(5) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space. -5._Mandates_: `comp` must satisfy the requirements of `Compare` from the {cpp} standard. +_Mandates_: `comp` must satisfy the requirements of `Compare` from +the {cpp} standard. _Returns_: The value returned on work-item `i` is the value in position `i` -of the ordered range resulting from sorting `val` from all work-items in the -`g` group with respect to the binary comparison function object `comp` -using the `exec` group helper object. +of the ordered range resulting from sorting `value` from all work-items in the +group with respect to the binary comparison function object `comp` +using the `gh` group helper object. +For multi-dimensional groups, the order of work-items in the group is +determined by their linear id. + +_Complexity_: Let `N` be the work-group or sub-group size. +`O(N*log(N)*log(N))` comparisons. + +(6) _Constraints_: All functions are available only if `Sorter` is +a SYCL Sorter and it provides `operator()(Group, T)` overload. + +_Effects_: Equivalent to: `return sorter(g, value)`. + +(7) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space. + +_Returns_: The value returned on work-item `i` is the tuple of values +that are in position `i` +of the ordered range resulting from key-value sorting of `key` and `value` +from all work-items +in the group. Elements are compared by `operator<` +using the `gh` group helper object. For multi-dimensional groups, the order of work-items in the group is determined by their linear id. -_Complexity_: Let `N` be the work-group or sub-group size. `O(N*log(N)*log(N))` comparisons. +_Complexity_: Let `N` be the work-group size. `O(N*log(N)*log(N))` comparisons. + +(8) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space. + +_Mandates_: `comp` must satisfy the requirements of `Compare` from +the {cpp} standard. + +_Returns_: The value returned on work-item `i` is the tuple of values +that are in position `i` +of the ordered range resulting from key-value sorting of `key` and `value` +from all work-items in the +group with respect to the binary comparison function object `comp` +using the `gh` group helper object. +For multi-dimensional groups, the order of work-items in the group is +determined by their linear id. + +_Complexity_: Let `N` be the work-group or sub-group size. +`O(N*log(N)*log(N))` comparisons. + +(9) _Constraints_: All functions are available only if `Sorter` is +a SYCL Sorter and it provides `operator()(Group, T, U)` overload. + +_Effects_: Equivalent to: `return sorter(g, key, value)`. + +NOTE: (7), (8), (9) functions are available starting in revision 2 of this extension. + +=== Functions with fixed-size arrays + +The functions in this section are additional overloads for functions defined above, +except one thing: each work-item provides a fixed-size array of elements rather than +a single element. + +NOTE: These functions are available starting in revision 2 of this extension. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + template + void sort_over_group(GroupHelper gh, + sycl::span values, + Properties properties = {}); // (1) + + template + void sort_over_group(GroupHelper gh, + sycl::span values, + Compare comp, + Properties properties = {}); // (2) + + template + void sort_over_group(Group g, + sycl::span values, + Sorter sorter, + Properties properties = {}); // (3) + + template + void sort_key_value_over_group(GroupHelper gh, + sycl::span keys, + sycl::span values, + Properties properties = {}); // (4) + + template + void sort_key_value_over_group(GroupHelper gh, + sycl::span keys, + sycl::span values, + Compare comp, + Properties properties = {}); // (5) + + template + void sort_key_value_over_group(Group g, + sycl::span keys, + sycl::span values, + Sorter sorter, + Properties properties = {}); // (6) +} +---- + +NOTE: (4), (5), (6) functions below perform sorting +including key-value variant. +Key value sorting is a sorting algorithm where keys are compared, +but keys and values are reordered both. + +(1) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space and +`sycl::ext::oneapi::is_property_list_v>` is true. + +_Effects_: Sort elements in the range containing of elements inside +`values` from all work-items from the group. +Result of sorting is placed into `values` with data placements +specified by `properties`. +Default data placements are those that are specified by the +`group_algorithm_data_placement::blocked` property. +Elements are compared by `operator<` using +the `gh` group helper object. + +_Complexity_: Let `N` be the group size. `O(N*log(N)*log(N))` comparisons. + +(2) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space and +`sycl::ext::oneapi::is_property_list_v>` is true. + +_Mandates_: `comp` must satisfy the requirements of +`Compare` from the {cpp} standard. + +_Effects_: Sort elements in the range containing of elements +inside `values` from all work-items from the group with respect to +the binary comparison function object `comp` using the `gh` group +helper object. +Result of sorting is placed into `values` with data placements +specified by `properties`. +Default data placements are those that are specified by the +`group_algorithm_data_placement::blocked` property. + +_Complexity_: Let `N` be the work-group or sub-group size. +`O(N*log(N)*log(N))` comparisons. + +(3) _Constraints_: All functions are available only if `Sorter` is +a SYCL Sorter and it provides `operator()(Group, sycl::span)` overload and +`sycl::ext::oneapi::is_property_list_v>` is true. + +_Effects_: Equivalent to: `return sorter(g, values, properties)`. + +(4) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space and +`sycl::ext::oneapi::is_property_list_v>` is true. + +_Effects_: Perform key-value sorting for elements in ranges +containing of elements inside `keys` and `values` from all work-items +from the group. +Result of sorting is placed into `keys` and `values` with +data placements specified by `properties`. +Default data placements are those that are specified by the +`group_algorithm_data_placement::blocked` property. +Elements are compared by `operator<` using the `gh` group helper object. + +_Complexity_: Let `N` be the group size. `O(N*log(N)*log(N))` comparisons. + +(5) _Constraints_: Only available if `GroupHelper` was created with +a work-group or a sub-group and some associated scratch space and +`sycl::ext::oneapi::is_property_list_v>` is true. + +_Mandates_: `comp` must satisfy the requirements of `Compare` from +the {cpp} standard. -_Constraints_: Only available if `GroupHelper` was created with a work-group or a sub-group and -some associated scratch space. +_Effects_: Perform key-value sorting for elements in ranges containing +of elements inside `keys` and `values` from all work-items from +the group with respect to the binary comparison +function object `comp` using the `gh` group helper object. +Result of sorting is placed into `keys` and `values` with data placements +specified by `properties`. +Default data placements are those that are specified by the +`group_algorithm_data_placement::blocked` property. +Elements are compared by `operator<`. -6._Effects_: Equivalent to: `return sorter(g, val)`. +_Complexity_: Let `N` be the work-group or sub-group size. +`O(N*log(N)*log(N))` comparisons. -_Constraints_: All functions are available only if `Sorter` is a SYCL Sorter and -it provides `operator()(Group, T)` overload. +(6) _Constraints_: All functions are available only if `Sorter` is +a SYCL Sorter and it provides `operator()(Group, sycl::span, sycl::span) +overload and `sycl::ext::oneapi::is_property_list_v>` +is true. + +_Effects_: Equivalent to: `return sorter(g, keys, values, properties)`. == Sorters -Sorter is a special type that encapsulates a sorting algorithm. Sorter may contain parameters -that help to get better performance. Data for sorting are provided to the `operator()` +Sorter is a special type that encapsulates a sorting algorithm. +Sorter may contain parameters +that help to get better performance. Data for sorting are provided +to `operator()` that should contain an implementation of a sorting algorithm. General semantics of `operator()` is following: @@ -168,14 +390,33 @@ template void operator()(Group g, Ptr first, Ptr last); template -T operator()(Group g, T val); +T operator()(Group g, T value); + +template +void operator()(Group g, + sycl::span values, + Properties properties); + +template +std::tuple operator()(Group g, T key, U value); + +template +void operator()(Group g, + sycl::span keys, + sycl::span values, + Properties properties); ---- -NOTE: At least one overload for `operator()` is required. -If only `void operator()(Group g, Ptr first, Ptr last);` is defined then a Sorter can be passed -to `joint_sort` function only. If it's passed to `sort_over_group`, it leads to a compilation -error. If only `T operator()(Group g, T val);` is defined then a Sorter can be passed to -`sort_over_group` function only. If it's passed to `joint_sort`, it leads to a compilation error. +NOTE: At least one `operator()` overload must be presented. +For example, if only `void operator()(Group g, Ptr first, Ptr last);` +is defined then a Sorter can be passed to `joint_sort` function only. +If it's passed to `sort_over_group`, it leads to a compilation +error. If only `T operator()(Group g, T value);` is defined then a Sorter +can be passed to `sort_over_group` function only. If it's passed to +`joint_sort`, it leads to a compilation error. Table 2. `operator()` for Sorters. |=== @@ -188,21 +429,55 @@ Available only if `sycl::is_group_v>` is true. `first`, `last` must be the same for all work-items in the group. |`template -T operator()(Group g, T val);` +T operator()(Group g, T value);` |Implements a sorting algorithm that calls by `sort_over_group`. Available only if `sycl::is_group_v>` is true. + +|`template +std::tuple operator()(Group g, T key, U value);` +|Implements a sorting algorithm that calls by `sort_key_value_over_group`. +Available only if `sycl::is_group_v>` is true. + +|`template +void operator()(Group g, sycl::span values, + Properties properties);` +|Implements a sorting algorithm that is called by `sort_over_group` and +that accepts +the `sycl::span` value as an input parameter. +Result of sorting is placed into `values` with data placements specified by +`properties`. +Default data placements are those that are specified by the +`group_algorithm_data_placement::blocked` property. +Available only if `sycl::is_group_v>` is true and +`ElementsPerWorkItem` is not equal to `sycl::dynamic_extent`. + +|`template +void operator()(Group g, sycl::span keys, + sycl::span values, + Properties properties);` +|Implements a sorting algorithm that is called by +`sort_key_value_over_group` and that +accepts two `sycl::span` values as input parameters. +Result of sorting is placed into `keys` and `values` with data placements +specified by `properties`. Default data placements are those that are +specified by the `group_algorithm_data_placement::blocked` property. +Available only if `sycl::is_group_v>` is true and +`ElementsPerWorkItem` is not equal to `sycl::dynamic_extent`. |=== SYCL provides some predefined sorters mentioned below. -However, custom sorters are particularly useful when the application knows the data has some -special property. For example, an application could implement a fast bitonic sort -if it knows the data size is a power of 2. +However, custom sorters are particularly useful when the application +knows the data has some special property. For example, an application +could implement a fast bitonic sort if it knows the data size is a power of 2. === Predefined Sorters ==== Sorting Order -`sorting_order` is an `enum` that defines a sorting order when `radix_sorter` is used. +`sorting_order` is an `enum` that defines a sorting order when +`radix_sorter` is used. Only ascending and descending orders are applicable. [source,c++] @@ -221,51 +496,140 @@ SYCL provides the following predefined classes: [source,c++] ---- -namespace sycl::ext::oneapi::experimental { - - template> - class default_sorter { - public: - template - default_sorter(sycl::span scratch, Compare comp = Compare()); - - template - void operator()(Group g, Ptr first, Ptr last); - - template - T operator()(Group g, T val); - - template - static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); - - template - static constexpr std::size_t - memory_required(sycl::memory_scope scope, sycl::range local_range); - }; - - template - class radix_sorter { - public: - template - radix_sorter(sycl::span scratch, - const std::bitset mask = - std::bitset (std::numeric_limits::max())); - - template - void operator()(Group g, Ptr first, Ptr last); - template - T operator()(Group g, T val); - - static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); - - template - static constexpr std::size_t - memory_required(sycl::memory_scope scope, sycl::range local_range); - }; +namespace sycl::ext::oneapi::experimental { + namespace default_sorters { + + template> + class joint_sorter{ + public: + template + joint_sorter(sycl::span scratch, Compare comp = {}); // (1) + + template + void operator()(Group g, Ptr first, Ptr last); // (2) + + template + static size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (3) + }; + + template, + std::size_t ElementsPerWorkItem = 1> + class group_sorter{ + public: + template + group_sorter(sycl::span scratch, Compare comp = {}); // (4) + + template + T operator()(Group g, T value); // (5) + + template + void operator()(Group g, + sycl::span values, + Properties properties); // (6) + + static size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (7) + }; + + template, + std::size_t ElementsPerWorkItem = 1> + class group_key_value_sorter{ + public: + template + group_key_value_sorter(sycl::span scratch, + Compare comp = {}); // (8) + + template + std::tuple operator()(Group g, T key, U value); // (9) + + template + void operator()(Group g, + sycl::span keys, + sycl::span values, + Properties property); // (10) + + static std::size_t + memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (11) + }; + } + + namespace radix_sorters{ + + template + class joint_sorter + { + public: + template + joint_sorter(sycl::span scratch, + const std::bitset mask = + std::bitset (std::numeric_limits::max())); // (12) + + template + void operator()(Group g, Ptr first, Ptr last); // (13) + + static constexpr std::size_t + memory_required(sycl::memory_scope scope, std::size_t range_size); // (14) + }; + + template + class group_sorter + { + public: + template + group_sorter(sycl::span scratch, + const std::bitset mask = + std::bitset (std::numeric_limits::max())); // (15) + + template + T operator()(Group g, T value); // (16) + + template + void operator()(Group g, + sycl::span values, + Properties properties); // (17) + + static constexpr std::size_t + memory_required(sycl::memory_scope scope, std::size_t range_size); // (18) + }; + + template + class group_key_value_sorter + { + public: + template + group_key_value_sorter(sycl::span scratch, + const std::bitset mask = + std::bitset (std::numeric_limits::max())); // (19) + + template + std::tuple operator()(Group g, T key, U value); // (20) + + template + void operator()(Group g, + sycl::span keys, + sycl::span values, + Properties properties); // (21) + + static constexpr std::size_t + memory_required(sycl::memory_scope scope, std::size_t range_size); // (22) + }; + + } } ---- @@ -273,127 +637,163 @@ Table 3. Description of predefined Sorters. |=== |Sorter|Description -|`template> -default_sorter` +|default sorters |Use a default sorting method based on an implementation-defined heuristic using `Compare` as the binary comparison function object. -The algorithm requires an additional memory that must be allocated on callers side. +The algorithm requires an additional memory that must be allocated on +callers side. Size of required memory (bytes) is defined by calling `memory_required`. -|`template -radix_sorter` +|radix sorters |Use radix sort as a sorting method. `Order` specify the sorting order. Only arithmetic types as `T` can be passed to `radix_sorter`. `BitsPerPass` is a number of bits that values are split by. -For example, if a sequence of `int32_t` is sorted using `BitsPerPass == 4` then one -pass of the radix sort algorithm considers only 4 bits. The number of passes is `32/4=8`. -The algorithm requires an additional memory that must be allocated on callers side. +For example, if a sequence of `int32_t` is sorted using +`BitsPerPass == 4` then one pass of the radix sort algorithm considers +only 4 bits. The number of passes is `32/4=8`. +The algorithm requires an additional memory that must be allocated on +callers side. Size of required memory (bytes) is defined by calling `memory_required`. |=== -Table 4. Constructors of the `default_sorter` class. -|=== -|Constructor|Description - -|`template -default_sorter(sycl::span scratch, Compare comp = Compare())` -|Creates the `default_sorter` object using `comp`. +(1), (4), (8) create the object using `comp`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by -`memory_required`, behavior of the corresponding sorting algorithm is undefined. - -|=== +`memory_required`, behavior of the corresponding sorting algorithm +is undefined. -Table 5. Member functions of the `default_sorter` class. -|=== -|Member function|Description - -|`template -void operator()(Group g, Ptr first, Ptr last)` -|Implements a default sorting algorithm to be called by the `joint_sort` algorithm. +(2) Implements a default sorting algorithm to be called by +the `joint_sort` algorithm. _Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. -|`template -T operator()(Group g, T val)` -|Implements a default sorting algorithm to be called by the `sort_over_group` algorithm. - -_Complexity_: Let `N` be the `Group` size. `O(N*log(N)*log(N))` comparisons. - -|`template -static std::size_t memory_required(sycl::memory_scope scope, std::size_t range_size)` -|Returns size of temporary memory (in bytes) that is required by -the default sorting algorithm defined by the sorter calling by `joint_sort`. +(3) Returns size of temporary memory (in bytes) that is required by +the default sorting algorithm defined by the sorter calling by `joint_sort` +depending on `d`. `range_size` represents a range size for sorting, e.g. `last-first` from `operator()` arguments. +It mustn't be called within a SYCL kernel, only on host. Result depends on the `scope` parameter: -use `sycl::memory_scope::work_group` to get memory size required for each work-group; -use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. +use `sycl::memory_scope::work_group` to get memory size required +for each work-group; +use `sycl::memory_scope::sub_group` to get memory size required +for each sub-group. If other `scope` values are passed, behavior is unspecified. -|`static std::size_t memory_required(sycl::memory_scope scope, sycl::range local_range)` -|Returns size of temporary memory (in bytes) that is required by the default -sorting algorithm defined by the sorter calling by `sort_over_group`. -If `scope = sycl::memory_scope::work_group`, -`local_range` is a local range of `sycl::nd_range` that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `local_range` is a sub-group size. +(5) Implements a default sorting algorithm to be called by +the `sort_over_group` algorithm. + +_Complexity_: Let `N` be the `Group` size. `O(N*log(N)*log(N))` comparisons. + +(6) Implements a default sorting algorithm that is called by +`sort_over_group` and that accepts the `sycl::span` value as +an input parameter. + +_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. +`O(N*log(N)*log(N))` comparisons. + +(7) Returns the size of temporary memory (in bytes) that is required by the default +sorting algorithm defined by the sorter calling by `sort_over_group` +depending on `d`. +`ElementsPerWorkItem` is the extent parameter for `sycl::span` +that is an input parameter for `sort_over_group`. +It mustn't be called within a SYCL kernel, only on host. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -|=== -Table 6. Constructors of the `radix_sorter` class. -|=== -|Constructor|Description +(9) Implements a default key-value sorting algorithm that is called +by `sort_key_value_over_group` and that doesn't accept +`sycl::span` values as input parameters. + +_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. +`O(N*log(N)*log(N))` comparisons. + +(10) Implements a default key-value sorting algorithm that is called +by `sort_key_value_over_group` and that +accepts `sycl::span` values as input parameters. + +_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. +`O(N*log(N)*log(N))` comparisons. + +(11) Returns size of temporary memory (in bytes) that is required by +the default key-value +sorting algorithm defined by the sorter calling by `sort_key_value_over_group` +depending on `d`. +It mustn't be called within a SYCL kernel, only on host. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. +If other `scope` values are passed, behavior is unspecified. -|`template -radix_sorter(sycl::span scratch, const std::bitset mask = std::bitset -(std::numeric_limits::max()))` -|Creates the `radix_sorter` object to sort values considering only bits +(12), (15), (19) create +the class object to sort values considering only bits that corresponds to 1 in `mask`. Additional memory for the algorithm is provided using `scratch`. If `scratch.size()` is less than the value returned by `memory_required`, behavior of the corresponding sorting algorithm is undefined. -|=== +(13) Implements the radix sorting algorithm to be called by +the `joint_sort` algorithm. -Table 7. Member functions of the `radix_sorter` class. -|=== -|Member function|Description +(14) Returns size of temporary memory (in bytes) that is required by +the radix sort algorithm +calling by `joint_sort`. +`range_size` represents a range size for sorting, +e.g. `last-first` from `operator()` arguments. +Result depends on the `scope` parameter: +use `sycl::memory_scope::work_group` to get memory size required +for each work-group; +use `sycl::memory_scope::sub_group` to get memory size required +for each sub-group. +If other `scope` values are passed, behavior is unspecified. -|`template -void operator()(Group g, Ptr first, Ptr last)` -|Implements the radix sort algorithm to be called by the `joint_sort` algorithm. +(16) Implements the radix sorting algorithm to be called by +the `sort_over_group` algorithm. -|`template -T operator()(Group g, T val)` -|Implements the radix sort algorithm to be called by the `sort_over_group` algorithm. +(17) Implements the radix sorting algorithm that is called by +`sort_over_group` and that accepts +the `sycl::span` value as an input parameter. -|`static std::size_t -memory_required(sycl::memory_scope scope, std::size_t range_size)` -|Returns size of temporary memory (in bytes) that is required by the radix sort algorithm -calling by `joint_sort`. `range_size` represents a range size for sorting, -e.g. `last-first` from `operator()` arguments. -Result depends on the `scope` parameter: -use `sycl::memory_scope::work_group` to get memory size required for each work-group; -use `sycl::memory_scope::sub_group` to get memory size required for each sub-group. +(18) Returns size of temporary memory (in bytes) that is required by the radix +sorting algorithm defined by the sorter calling by `sort_over_group`. +`ElementsPerWorkItem` is a parameter for `sycl::span` +that is an input parameter for `sort_over_group`, where `T` is +a first template argument for `radix_sorter`. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -|`template -static std::size_t -memory_required(sycl::memory_scope scope, sycl::range local_range)` -|Returns size of temporary memory (in bytes) that is required by the radix sort algorithm -calling by `sort_over_group`. -If `scope = sycl::memory_scope::work_group`, `local_range` is a local range of -`sycl::nd_range` that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `local_range` is a sub-group size. +(20) Implements the radix sorting algorithm that is called +by `sort_key_value_over_group` and that doesn't accept +`sycl::span` values as input parameters. + +(21) Implements the radix key-value sorting algorithm that is called +by `sort_key_value_over_group` and that +accepts `sycl::span` values as input parameters. + +(22) Returns size of temporary memory (in bytes) that is required by the radix key-value +sorting algorithm defined by the sorter calling by `sort_key_value_over_group` +with `sycl::span` and +`sycl::span` as input parameters. +If `scope == sycl::memory_scope::work_group`, +`range_size` is the size of the local range for `sycl::nd_range` +that was used to run the kernel; +if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. If other `scope` values are passed, behavior is unspecified. -|=== === Group Helper -The overloads of `joint_sort` and `sort_over_group` that do not take a Sorter parameter implicitly -use the default sorter. Since the default sorter requires the application to allocate some -temporary memory, the application must use a Group Helper object to communicate the location of -this memory. A Group Helper object is any object +The overloads of `joint_sort`, `sort_over_group`, `sort_key_value_over_group` +that do not take a Sorter parameter implicitly use the default sorter. +Since the default sorter requires the application to allocate some +temporary memory, the application must use a Group Helper object to +communicate the location of this memory. A Group Helper object is an object that has the following two public member functions: [source,c++] @@ -403,17 +803,19 @@ that has the following two public member functions: sycl::span get_memory() const ---- -Table 8. Member functions of group helpers. +Table 4. Member functions of group helpers. |=== |Member function|Description |`/* unspecified */ get_group() const` |Returns the group that is handled by the group helper object. -Assuming `Group` is a type of method's result `sycl::is_group_v>` must be true. +Assuming `Group` is a type of method's result +`sycl::is_group_v>` must be true. |`sycl::span get_memory() const` |Returns the memory object that the default sorter can use. -The return type is aligned with the first parameter of constructor for `default_sorter`. +The return type is aligned with the first parameter of constructor +for `default_sorter`. |=== ==== Predefined Group Helpers @@ -443,28 +845,30 @@ namespace sycl::ext::oneapi::experimental { } ---- -For most applications it is enough to pass an instance of the `group_with_scratchpad` class -instead of their own classes creation. +For most applications it is enough to pass an instance of +the `group_with_scratchpad` class instead of their own classes creation. -Table 9. Constructors of the `group_with_scratchpad` class. +Table 5. Constructors of the `group_with_scratchpad` class. |=== |Constructor|Description |`group_with_scratchpad(Group group, sycl::span scratch)` |Creates the `group_with_scratchpad` object using `group` and `scratch`. `sycl::is_group_v>` must be true. -`scratch.size()` must not be less than value returned by the `memory_required` method -of `default_sorter`. Otherwise, -behavior of sorting algorithm, which is called with the constructed object, is undefined. +`scratch.size()` must not be less than value returned by +the `memory_required` method of `default_sorter`. Otherwise, +behavior of sorting algorithm, which is called with the constructed +object, is undefined. The `scratch` value must be the same for all work-items in `group`. |=== -Table 10. Member functions of the `group_with_scratchpad` class. +Table 6. Member functions of the `group_with_scratchpad` class. |=== |Member function|Description |`Group get_group() const` -|Returns the `Group` class object that is handled by the `group_with_scratchpad` object. +|Returns the `Group` class object that is handled by +the `group_with_scratchpad` object. |`sycl::span get_memory() const` @@ -473,26 +877,135 @@ that is handled by the `group_with_scratchpad` object. |=== -==== Group Helper type trait +=== SYCL Properties for Interfaces with Fixed-size Private Arrays + +Group algorithms using the fixed-size array interface are performed across +`N * ElementsPerWorkItem` elements in the group, where +.`N` is the work-group size and `ElementsPerWorkItem` is the number of +elements that are processed by one work-item. + +When a work-item contributes multiple values to a group algorithm, +there are multiple ways to interpret the order of that data. +Let `r` is a virtual range for sorting of `N * ElementsPerWorkItem` elements. +The extension supports two data placements: + +a) Data from the +`[r + id * ElementsPerWorkItem; r + (id + 1) * ElementsPerWorkItem)` +virtual range +placed into the private memory under the span for `id`-th work-item. + +b) `i * N + id` element of `r` fill the `i`-th element of the private memory +under the span for `id`-th work-item. + +To specify a correct data placement for placing of resulting data +there is a enum: [source,c++] ---- -namespace sycl::ext::oneapi::experimental::detail { -template struct is_group_helper : std::false_type {}; +enum class group_algorithm_data_placement{ + blocked, + striped +}; +---- + +1.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::blocked` +to specify a data placement described in a). + +2.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::striped` +to specify a data placement described in b). + +Example: + +N = 3; + +|=== +|Work-item id|Input private fixed-size array + +|0 +|{11, 10, 9, 8} +|1 +|{7, 6, 5, 4} +|2 +|{3, 2, 1, 0} +|=== + +After performing sorting by ascending there is the following virtual range: +`{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}`. + +Consider 2 layouts: + +1.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::blocked`. + +|=== +|Work-item id|Output private fixed-size array + +|0 +|{0, 1, 2, 3} +|1 +|{4, 5, 6, 7} +|2 +|{8, 9, 10, 11} +|=== + +2.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::striped`. + +|=== +|Work-item id|Output private fixed-size array + +|0 +|{0, 3, 6, 9} +|1 +|{1, 4, 7, 10} +|2 +|{2, 5, 8, 11} +|=== + +There are 2 compile-time properties that satisfy +link:../experimental/sycl_ext_oneapi_properties.asciidoc[SYCL Properties Extension] +requirements: -template -struct is_group_helper> : std::true_type { +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +struct input_data_placement_key : /* unspecified */ { + template + using value_t = + property_value>; +}; + +struct output_data_placement_key : /* unspecified */ { + template + using value_t = + property_value>; }; -} // namespace sycl::ext::oneapi::experimental::detail + +template +inline constexpr input_data_placement_key::value_t + input_data_placement; // (1) + +template +inline constexpr output_data_placement_key::value_t + output_data_placement; // (2) + +} // namespace sycl::ext::oneapi::experimental ---- -The `is_group_helper` type trait is used to determine which types of groups helpers are supported -by group functions, and to control when group functions participate in overload resolution. +1. `input_data_placement` specifies the data placement for input. This is +useful for stable sorts, which preserve the relative input order for elements +that compare equal or algorithms that can use a fact that sequences +can be partially sorted. +2. `output_data_placement` specifies the data placement for output. -`is_group_helper` is `std::true_type` if `T` is the type of group helper defined above and -`std::false_type` otherwise. A SYCL implementation may introduce additional specializations of -`is_group_helper` for implementation-defined group helper types, but users are not allowed to -define additional specializations for their own types. +Example: +`sort_over_group(g, my_span, properties, +output_data_placement>{});` + +It's specified that data initially in `my_span` satisfies the +`blocked` data placement. After sorting data will be placed to +`my_span` corresponding to the `striped` data placement. == Examples @@ -504,7 +1017,8 @@ define additional specializations for their own types. namespace my_sycl = sycl::ext::oneapi::experimental; // calculate required local memory size size_t temp_memory_size = - my_sycl::default_sorter<>::memory_required(sycl::memory_scope::work_group, n); + my_sycl::default_sorters::joint_sorter<>::memory_required( + d, sycl::memory_scope::work_group, n); q.submit([&](sycl::handler& h) { auto acc = sycl::accessor(buf, h); @@ -538,10 +1052,11 @@ namespace my_sycl = sycl::ext::oneapi::experimental; sycl::range<1> local_range{256}; // predefine radix_sorter to calculate local memory size -using RSorter = my_sycl::radix_sorter; +using RSorter = + my_sycl::radix_sorters::group_sorter; // calculate required local memory size size_t temp_memory_size = - RSorter::memory_required(sycl::memory_scope::work_group, local_range); + RSorter::memory_required(sycl::memory_scope::work_group, local_range.size()); q.submit([&](sycl::handler& h) { auto acc = sycl::accessor(buf, h); @@ -560,23 +1075,27 @@ q.submit([&](sycl::handler& h) { }); }); ... + ---- 3.Using `joint_sort` for key-value sorting (keys are compared, but keys and values are reordered both). NOTE: `oneapi::dpl::zip_iterator` is used here. -See https://spec.oneapi.com/versions/latest/elements/oneDPL/source/index.html[oneDPL Spec] +See https://spec.oneapi.com/versions/latest/elements/oneDPL/source/index.html[ + oneDPL Spec] for details. [source,c++] ---- ... namespace my_sycl = sycl::ext::oneapi::experimental; -using TupleType = typename std::iterator_traits>::value_type; +using TupleType = + typename std::iterator_traits>::value_type; // calculate required local memory size size_t temp_memory_size = - my_sycl::default_sorter<>::memory_required(sycl::memory_scope::work_group, n); + my_sycl::default_sorters::joint_sorter<>::memory_required( + d, sycl::memory_scope::work_group, n); q.submit([&](sycl::handler& h) { auto keys_acc = sycl::accessor(keys_buf, h); @@ -606,26 +1125,86 @@ q.submit([&](sycl::handler& h) { ... ---- -== Issues for later investigations +4.Using `sort_key_value_over_group` and `radix_sorter` with fixed-size arrays. -. Sort function can have interfaces with static arrays in private memory as well. -The concern is that it can require changes for other group algortihms as well since sort -basing on private memory is not very useful if other algorithms in the chain use local -memory only. It needs to make sure that we allow different layout for values in static arrays -between different work-items, e.g. "raw major" or "column major" format for storing. -. It can be a separate proposal for key-value sorting basing on Projections. -It needs to be investigated what is the response for that. -. Sorter traits can be useful if there are Finder, Reducer or other objects -will be added to the Spec to be used with other Group algorithms, e.g. find, reduce. -. Predefined sorters can be revisited to find a better interfaces for `memory_required` overloads. -. `group_with_scratchpad` can be replaced with concepts that requires 2 methods. +[source,c++] +---- +... +namespace my_sycl = sycl::ext::oneapi::experimental; -== Non-implemented features -Please, note that following is not inplemented yet for the open-source repo: +sycl::range<1> local_range{256}; +constexpr std::size_t ElementsPerWorkItem = 8; -. `radix_sorter`, `radix_order` -. sub-groups support. +// predefine radix_sorter to calculate local memory size +using RSorter = + my_sycl::radix_sorters::group_key_value_sorter; +// calculate required local memory size +size_t temp_memory_size = + RSorter::memory_required(sycl::memory_scope::work_group, local_range.size()); + +q.submit([&](sycl::handler& h) { + auto keys_acc = sycl::accessor(keys_buf, h); + auto vals_acc = sycl::accessor(vals_buf, h); + auto scratch = sycl::local_accessor( {temp_memory_size}, h); + + h.parallel_for( + sycl::nd_range<1>{ local_range, local_range }, + [=](sycl::nd_item<1> id) { + T keys_private[ElementsPerWorkItem]; + T vals_private[ElementsPerWorkItem]; + auto idx = id.get_global_id(); + for(std::size_t i = 0; i < ElementsPerWorkItem; ++i ) + { + keys_private[i] = keys_acc[idx * ElementsPerWorkItem + i]; + vals_private[i] = vals_acc[idx * ElementsPerWorkItem + i]; + } + + my_sycl::sort_key_value_over_group( + id.get_group(), + sycl::span{keys_private}, + sycl::span{vals_private}, + RSorter(sycl::span{scratch.get_pointer(), temp_memory_size}) + ); + ... + }); + }); +... +---- + +== Issues + +. It can be a separate proposal for key-value sorting basing on Projections. +It needs to be investigated what is the response for that. +. Sorter traits can be useful if there are Finder, Reducer or other objects +will be added to the Spec to be used with other Group algorithms, +e.g. find, reduce. +. Predefined sorters can be revisited to find a better interfaces +for `memory_required` overloads. +. `group_with_scratchpad` can be replaced with concepts that +requires 2 methods. +. Is Sorter needed to be applied to keys only or to keys and +values both in case of key-value sorting? +. Do we need to have separate predefined sorters for +fixed-size array interfaces? +e.g. instead of changing `default_sorter` and `radix_sorter` +to have new sorters `default_span_sorter`, `radix_span_sorter`. +. Will it be better to have an interface with `std::tuple` of `sycl::span` +to generalize key-value sorting? e.g. +`sort_over_group(group, std::make_tuple(sycl::span{keys}, sycl::span{values}), sorter);` +The thing is that tuple is not a span. It's better to have any _zip_span_ +that allows the SoA data pattern. Interfaces without tuple highlights that +we have parameters with different meaning: only keys are comparing, +but keys and values are moving both. However, it can look like +inconsistent comparing to other interfaces of sorting. +. It can be a problem that users need to put the same `Compare` type during +calling `memory_required` function and the `default_sorter` constructor +because it's easy to pass different comparator types. +. Think about reducing overloads for sorting functions. The thing is that +overloads with `Compare` objects seems extra and overloads with sorters, +without sorters are enough. == Revision History @@ -638,4 +1217,7 @@ Please, note that following is not inplemented yet for the open-source repo: |2|2021-09-15|Andrey Fedorov|Changes related to additional memory providing |3|2021-12-16|Andrey Fedorov|Some refactoring, sections reordering, making the entire extension experimental +|4|2022-11-14|Andrey Fedorov|Fixed size arrays, key-value sorting and properties +|5|2023-11-09|Andrey Fedorov|Changed `memory_required` functions for default sorters +|6|2024-07-17|Artur Gainullin|Align the description of data placement properties with the implementation |======================================== diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc deleted file mode 100644 index 17f7e52eb8d5a..0000000000000 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_group_sort.asciidoc +++ /dev/null @@ -1,1223 +0,0 @@ -= sycl_ext_oneapi_group_sort -:source-highlighter: coderay -:coderay-linenums-mode: table - -// This section needs to be after the document title. -:doctype: book -:toc2: -:toc: left -:encoding: utf-8 -:lang: en -:dpcpp: pass:[DPC++] -:language: {basebackend@docbook:c++:cpp} - -== Notice - -[%hardbreaks] -Copyright (c) 2021-2024 Intel Corporation. All rights reserved. - -Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks -of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by -permission by Khronos. - -== Contact - -To report problems with this extension, please open a new issue at: - -https://github.com/intel/llvm/issues - -== Dependencies - -This extension is written against the SYCL 2020 revision 5 specification. All -references below to the "core SYCL specification" or to section numbers in the -SYCL specification refer to that revision. - -This extension also depends on the following other SYCL extensions: - -* link:../experimental/sycl_ext_oneapi_properties.asciidoc[sycl_ext_oneapi_properties] - -== Status - -This is an experimental extension specification, intended to provide early -access to features and gather community feedback. Interfaces defined in this -specification are implemented in {dpcpp}, but they are not finalized and may -change incompatibly in future versions of {dpcpp} without prior notice. -*Shipping software products should not rely on APIs defined in this -specification.* - -== Introduction - -This extension introduces sorting functions to the group algorithms -library, along with associated Sorter objects and Group Helper objects. - -== Feature test macro - -This extension provides a feature-test macro as described in the core SYCL -specification. An implementation supporting this extension must predefine the -macro `SYCL_EXT_ONEAPI_GROUP_SORT` to one of the values defined in the table -below. Applications can test for the existence of this macro to determine if -the implementation supports this feature, or applications can test the macro's -value to determine which of the extension's features the implementation -supports. - -Table 1. Values of the `SYCL_EXT_ONEAPI_GROUP_SORT` macro. -[%header,cols="1,5"] -|=== -|Value |Description -|1 |Initial extension version. Base features are supported. -|2 |Interfaces with fixed-size arrays and key-value sorting are supported. -|=== - -== Sorting functions -The sort function from the {cpp} standard sorts elements with respect to -the binary comparison function object. - -SYCL provides two similar algorithms: - -`joint_sort` uses the work-items in a group to execute the corresponding -algorithm in parallel. - -`sort_over_group` performs a sort over values held directly by the work-items -in a group, and results returned to work-item `i` represent values that are in -position `i` in the ordered range. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - - template - void joint_sort(GroupHelper gh, Ptr first, Ptr last); // (1) - - template - void joint_sort(GroupHelper gh, Ptr first, Ptr last, Compare comp); // (2) - - template - void joint_sort(Group g, Ptr first, Ptr last, Sorter sorter); // (3) - - template - T sort_over_group(GroupHelper gh, T value); // (4) - - template - T sort_over_group(GroupHelper gh, T value, Compare comp); // (5) - - template - T sort_over_group(Group g, T value, Sorter sorter); // (6) - - template - std::tuple - sort_key_value_over_group(GroupHelper gh, T key, U value); // (7) - - template - std::tuple - sort_key_value_over_group(GroupHelper gh, T key, U value, Compare comp); // (8) - - template - std::tuple - sort_key_value_over_group(Group g, T key, U value, Sorter sorter); // (9) - -} ----- - -(1) _Preconditions_: `first`, `last` must be the same for all work-items -in the group. - -_Constraints_: Only available if `GroupHelper` was created with a -work-group or sub-group and some associated scratch space. - -_Effects_: Sort the elements in the range `[first, last)` -using the `gh` group helper object. Elements are compared by `operator<`. - -_Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. - -(2) _Preconditions_: `first`, `last` must be the same for all work-items -in the group. - -_Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space. - -_Mandates_: `comp` must satisfy the requirements of `Compare` from -the {cpp} standard. - -_Effects_: Sort the elements in the range `[first, last)` with respect to the -binary comparison function object `comp` using the `gh` group helper object. - -_Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. - -(3) _Preconditions_: `first`, `last` must be the same -for all work-items in the group. - -_Constraints_: All functions are available only if `Sorter` is -a SYCL Sorter and it provides `operator()(Group, Ptr, Ptr)` overload. - -_Effects_: Equivalent to: `sorter(g, first, last)`. - -(4) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space. - -_Returns_: The value returned on work-item `i` is the value in position `i` -of the ordered range resulting from sorting `value` from all work-items -in the group. Elements are compared by `operator<` -using the `gh` group helper object. -For multi-dimensional groups, the order of work-items in the group is -determined by their linear id. - -_Complexity_: Let `N` be the work-group size. `O(N*log(N)*log(N))` comparisons. - -(5) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space. - -_Mandates_: `comp` must satisfy the requirements of `Compare` from -the {cpp} standard. - -_Returns_: The value returned on work-item `i` is the value in position `i` -of the ordered range resulting from sorting `value` from all work-items in the -group with respect to the binary comparison function object `comp` -using the `gh` group helper object. -For multi-dimensional groups, the order of work-items in the group is -determined by their linear id. - -_Complexity_: Let `N` be the work-group or sub-group size. -`O(N*log(N)*log(N))` comparisons. - -(6) _Constraints_: All functions are available only if `Sorter` is -a SYCL Sorter and it provides `operator()(Group, T)` overload. - -_Effects_: Equivalent to: `return sorter(g, value)`. - -(7) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space. - -_Returns_: The value returned on work-item `i` is the tuple of values -that are in position `i` -of the ordered range resulting from key-value sorting of `key` and `value` -from all work-items -in the group. Elements are compared by `operator<` -using the `gh` group helper object. -For multi-dimensional groups, the order of work-items in the group is -determined by their linear id. - -_Complexity_: Let `N` be the work-group size. `O(N*log(N)*log(N))` comparisons. - -(8) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space. - -_Mandates_: `comp` must satisfy the requirements of `Compare` from -the {cpp} standard. - -_Returns_: The value returned on work-item `i` is the tuple of values -that are in position `i` -of the ordered range resulting from key-value sorting of `key` and `value` -from all work-items in the -group with respect to the binary comparison function object `comp` -using the `gh` group helper object. -For multi-dimensional groups, the order of work-items in the group is -determined by their linear id. - -_Complexity_: Let `N` be the work-group or sub-group size. -`O(N*log(N)*log(N))` comparisons. - -(9) _Constraints_: All functions are available only if `Sorter` is -a SYCL Sorter and it provides `operator()(Group, T, U)` overload. - -_Effects_: Equivalent to: `return sorter(g, key, value)`. - -NOTE: (7), (8), (9) functions are available starting in revision 2 of this extension. - -=== Functions with fixed-size arrays - -The functions in this section are additional overloads for functions defined above, -except one thing: each work-item provides a fixed-size array of elements rather than -a single element. - -NOTE: These functions are available starting in revision 2 of this extension. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - - template - void sort_over_group(GroupHelper gh, - sycl::span values, - Properties properties = {}); // (1) - - template - void sort_over_group(GroupHelper gh, - sycl::span values, - Compare comp, - Properties properties = {}); // (2) - - template - void sort_over_group(Group g, - sycl::span values, - Sorter sorter, - Properties properties = {}); // (3) - - template - void sort_key_value_over_group(GroupHelper gh, - sycl::span keys, - sycl::span values, - Properties properties = {}); // (4) - - template - void sort_key_value_over_group(GroupHelper gh, - sycl::span keys, - sycl::span values, - Compare comp, - Properties properties = {}); // (5) - - template - void sort_key_value_over_group(Group g, - sycl::span keys, - sycl::span values, - Sorter sorter, - Properties properties = {}); // (6) -} ----- - -NOTE: (4), (5), (6) functions below perform sorting -including key-value variant. -Key value sorting is a sorting algorithm where keys are compared, -but keys and values are reordered both. - -(1) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space and -`sycl::ext::oneapi::is_property_list_v>` is true. - -_Effects_: Sort elements in the range containing of elements inside -`values` from all work-items from the group. -Result of sorting is placed into `values` with data placements -specified by `properties`. -Default data placements are those that are specified by the -`group_algorithm_data_placement::blocked` property. -Elements are compared by `operator<` using -the `gh` group helper object. - -_Complexity_: Let `N` be the group size. `O(N*log(N)*log(N))` comparisons. - -(2) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space and -`sycl::ext::oneapi::is_property_list_v>` is true. - -_Mandates_: `comp` must satisfy the requirements of -`Compare` from the {cpp} standard. - -_Effects_: Sort elements in the range containing of elements -inside `values` from all work-items from the group with respect to -the binary comparison function object `comp` using the `gh` group -helper object. -Result of sorting is placed into `values` with data placements -specified by `properties`. -Default data placements are those that are specified by the -`group_algorithm_data_placement::blocked` property. - -_Complexity_: Let `N` be the work-group or sub-group size. -`O(N*log(N)*log(N))` comparisons. - -(3) _Constraints_: All functions are available only if `Sorter` is -a SYCL Sorter and it provides `operator()(Group, sycl::span)` overload and -`sycl::ext::oneapi::is_property_list_v>` is true. - -_Effects_: Equivalent to: `return sorter(g, values, properties)`. - -(4) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space and -`sycl::ext::oneapi::is_property_list_v>` is true. - -_Effects_: Perform key-value sorting for elements in ranges -containing of elements inside `keys` and `values` from all work-items -from the group. -Result of sorting is placed into `keys` and `values` with -data placements specified by `properties`. -Default data placements are those that are specified by the -`group_algorithm_data_placement::blocked` property. -Elements are compared by `operator<` using the `gh` group helper object. - -_Complexity_: Let `N` be the group size. `O(N*log(N)*log(N))` comparisons. - -(5) _Constraints_: Only available if `GroupHelper` was created with -a work-group or a sub-group and some associated scratch space and -`sycl::ext::oneapi::is_property_list_v>` is true. - -_Mandates_: `comp` must satisfy the requirements of `Compare` from -the {cpp} standard. - -_Effects_: Perform key-value sorting for elements in ranges containing -of elements inside `keys` and `values` from all work-items from -the group with respect to the binary comparison -function object `comp` using the `gh` group helper object. -Result of sorting is placed into `keys` and `values` with data placements -specified by `properties`. -Default data placements are those that are specified by the -`group_algorithm_data_placement::blocked` property. -Elements are compared by `operator<`. - -_Complexity_: Let `N` be the work-group or sub-group size. -`O(N*log(N)*log(N))` comparisons. - -(6) _Constraints_: All functions are available only if `Sorter` is -a SYCL Sorter and it provides `operator()(Group, sycl::span, sycl::span) -overload and `sycl::ext::oneapi::is_property_list_v>` -is true. - -_Effects_: Equivalent to: `return sorter(g, keys, values, properties)`. - -== Sorters - -Sorter is a special type that encapsulates a sorting algorithm. -Sorter may contain parameters -that help to get better performance. Data for sorting are provided -to `operator()` -that should contain an implementation of a sorting algorithm. -General semantics of `operator()` is following: - -[source,c++] ----- -template -void operator()(Group g, Ptr first, Ptr last); - -template -T operator()(Group g, T value); - -template -void operator()(Group g, - sycl::span values, - Properties properties); - -template -std::tuple operator()(Group g, T key, U value); - -template -void operator()(Group g, - sycl::span keys, - sycl::span values, - Properties properties); ----- - -NOTE: At least one `operator()` overload must be presented. -For example, if only `void operator()(Group g, Ptr first, Ptr last);` -is defined then a Sorter can be passed to `joint_sort` function only. -If it's passed to `sort_over_group`, it leads to a compilation -error. If only `T operator()(Group g, T value);` is defined then a Sorter -can be passed to `sort_over_group` function only. If it's passed to -`joint_sort`, it leads to a compilation error. - -Table 2. `operator()` for Sorters. -|=== -|`operator()`|Description - -|`template -void operator()(Group g, Ptr first, Ptr last);` -|Implements a sorting algorithm that calls by `joint_sort`. -Available only if `sycl::is_group_v>` is true. -`first`, `last` must be the same for all work-items in the group. - -|`template -T operator()(Group g, T value);` -|Implements a sorting algorithm that calls by `sort_over_group`. -Available only if `sycl::is_group_v>` is true. - -|`template -std::tuple operator()(Group g, T key, U value);` -|Implements a sorting algorithm that calls by `sort_key_value_over_group`. -Available only if `sycl::is_group_v>` is true. - -|`template -void operator()(Group g, sycl::span values, - Properties properties);` -|Implements a sorting algorithm that is called by `sort_over_group` and -that accepts -the `sycl::span` value as an input parameter. -Result of sorting is placed into `values` with data placements specified by -`properties`. -Default data placements are those that are specified by the -`group_algorithm_data_placement::blocked` property. -Available only if `sycl::is_group_v>` is true and -`ElementsPerWorkItem` is not equal to `sycl::dynamic_extent`. - -|`template -void operator()(Group g, sycl::span keys, - sycl::span values, - Properties properties);` -|Implements a sorting algorithm that is called by -`sort_key_value_over_group` and that -accepts two `sycl::span` values as input parameters. -Result of sorting is placed into `keys` and `values` with data placements -specified by `properties`. Default data placements are those that are -specified by the `group_algorithm_data_placement::blocked` property. -Available only if `sycl::is_group_v>` is true and -`ElementsPerWorkItem` is not equal to `sycl::dynamic_extent`. -|=== - -SYCL provides some predefined sorters mentioned below. -However, custom sorters are particularly useful when the application -knows the data has some special property. For example, an application -could implement a fast bitonic sort if it knows the data size is a power of 2. - -=== Predefined Sorters - -==== Sorting Order - -`sorting_order` is an `enum` that defines a sorting order when -`radix_sorter` is used. -Only ascending and descending orders are applicable. - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - - enum class sorting_order { - ascending, - descending - }; - -} ----- - -SYCL provides the following predefined classes: - -[source,c++] ----- - -namespace sycl::ext::oneapi::experimental { - - namespace default_sorters { - - template> - class joint_sorter{ - public: - template - joint_sorter(sycl::span scratch, Compare comp = {}); // (1) - - template - void operator()(Group g, Ptr first, Ptr last); // (2) - - template - static size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (3) - }; - - template, - std::size_t ElementsPerWorkItem = 1> - class group_sorter{ - public: - template - group_sorter(sycl::span scratch, Compare comp = {}); // (4) - - template - T operator()(Group g, T value); // (5) - - template - void operator()(Group g, - sycl::span values, - Properties properties); // (6) - - static size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (7) - }; - - template, - std::size_t ElementsPerWorkItem = 1> - class group_key_value_sorter{ - public: - template - group_key_value_sorter(sycl::span scratch, - Compare comp = {}); // (8) - - template - std::tuple operator()(Group g, T key, U value); // (9) - - template - void operator()(Group g, - sycl::span keys, - sycl::span values, - Properties property); // (10) - - static std::size_t - memory_required(sycl::device d, sycl::memory_scope scope, std::size_t range_size); // (11) - }; - } - - namespace radix_sorters{ - - template - class joint_sorter - { - public: - template - joint_sorter(sycl::span scratch, - const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (12) - - template - void operator()(Group g, Ptr first, Ptr last); // (13) - - static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (14) - }; - - template - class group_sorter - { - public: - template - group_sorter(sycl::span scratch, - const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (15) - - template - T operator()(Group g, T value); // (16) - - template - void operator()(Group g, - sycl::span values, - Properties properties); // (17) - - static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (18) - }; - - template - class group_key_value_sorter - { - public: - template - group_key_value_sorter(sycl::span scratch, - const std::bitset mask = - std::bitset (std::numeric_limits::max())); // (19) - - template - std::tuple operator()(Group g, T key, U value); // (20) - - template - void operator()(Group g, - sycl::span keys, - sycl::span values, - Properties properties); // (21) - - static constexpr std::size_t - memory_required(sycl::memory_scope scope, std::size_t range_size); // (22) - }; - - } -} ----- - -Table 3. Description of predefined Sorters. -|=== -|Sorter|Description - -|default sorters -|Use a default sorting method based on an implementation-defined heuristic -using `Compare` as the binary comparison function object. -The algorithm requires an additional memory that must be allocated on -callers side. -Size of required memory (bytes) is defined by calling `memory_required`. - -|radix sorters -|Use radix sort as a sorting method. `Order` specify the sorting order. -Only arithmetic types as `T` can be passed to `radix_sorter`. -`BitsPerPass` is a number of bits that values are split by. -For example, if a sequence of `int32_t` is sorted using -`BitsPerPass == 4` then one pass of the radix sort algorithm considers -only 4 bits. The number of passes is `32/4=8`. -The algorithm requires an additional memory that must be allocated on -callers side. -Size of required memory (bytes) is defined by calling `memory_required`. -|=== - -(1), (4), (8) create the object using `comp`. -Additional memory for the algorithm is provided using `scratch`. -If `scratch.size()` is less than the value returned by -`memory_required`, behavior of the corresponding sorting algorithm -is undefined. - -(2) Implements a default sorting algorithm to be called by -the `joint_sort` algorithm. - -_Complexity_: Let `N` be `last - first`. `O(N*log(N)*log(N))` comparisons. - -(3) Returns size of temporary memory (in bytes) that is required by -the default sorting algorithm defined by the sorter calling by `joint_sort` -depending on `d`. -`range_size` represents a range size for sorting, -e.g. `last-first` from `operator()` arguments. -It mustn't be called within a SYCL kernel, only on host. -Result depends on the `scope` parameter: -use `sycl::memory_scope::work_group` to get memory size required -for each work-group; -use `sycl::memory_scope::sub_group` to get memory size required -for each sub-group. -If other `scope` values are passed, behavior is unspecified. - -(5) Implements a default sorting algorithm to be called by -the `sort_over_group` algorithm. - -_Complexity_: Let `N` be the `Group` size. `O(N*log(N)*log(N))` comparisons. - -(6) Implements a default sorting algorithm that is called by -`sort_over_group` and that accepts the `sycl::span` value as -an input parameter. - -_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. -`O(N*log(N)*log(N))` comparisons. - -(7) Returns the size of temporary memory (in bytes) that is required by the default -sorting algorithm defined by the sorter calling by `sort_over_group` -depending on `d`. -`ElementsPerWorkItem` is the extent parameter for `sycl::span` -that is an input parameter for `sort_over_group`. -It mustn't be called within a SYCL kernel, only on host. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -(9) Implements a default key-value sorting algorithm that is called -by `sort_key_value_over_group` and that doesn't accept -`sycl::span` values as input parameters. - -_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. -`O(N*log(N)*log(N))` comparisons. - -(10) Implements a default key-value sorting algorithm that is called -by `sort_key_value_over_group` and that -accepts `sycl::span` values as input parameters. - -_Complexity_: Let `N` be the `Group` size multiplied by `ElementsPerWorkItem`. -`O(N*log(N)*log(N))` comparisons. - -(11) Returns size of temporary memory (in bytes) that is required by -the default key-value -sorting algorithm defined by the sorter calling by `sort_key_value_over_group` -depending on `d`. -It mustn't be called within a SYCL kernel, only on host. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -(12), (15), (19) create -the class object to sort values considering only bits -that corresponds to 1 in `mask`. -Additional memory for the algorithm is provided using `scratch`. -If `scratch.size()` is less than the value returned by `memory_required`, -behavior of the corresponding sorting algorithm is undefined. - -(13) Implements the radix sorting algorithm to be called by -the `joint_sort` algorithm. - -(14) Returns size of temporary memory (in bytes) that is required by -the radix sort algorithm -calling by `joint_sort`. -`range_size` represents a range size for sorting, -e.g. `last-first` from `operator()` arguments. -Result depends on the `scope` parameter: -use `sycl::memory_scope::work_group` to get memory size required -for each work-group; -use `sycl::memory_scope::sub_group` to get memory size required -for each sub-group. -If other `scope` values are passed, behavior is unspecified. - -(16) Implements the radix sorting algorithm to be called by -the `sort_over_group` algorithm. - -(17) Implements the radix sorting algorithm that is called by -`sort_over_group` and that accepts -the `sycl::span` value as an input parameter. - -(18) Returns size of temporary memory (in bytes) that is required by the radix -sorting algorithm defined by the sorter calling by `sort_over_group`. -`ElementsPerWorkItem` is a parameter for `sycl::span` -that is an input parameter for `sort_over_group`, where `T` is -a first template argument for `radix_sorter`. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -(20) Implements the radix sorting algorithm that is called -by `sort_key_value_over_group` and that doesn't accept -`sycl::span` values as input parameters. - -(21) Implements the radix key-value sorting algorithm that is called -by `sort_key_value_over_group` and that -accepts `sycl::span` values as input parameters. - -(22) Returns size of temporary memory (in bytes) that is required by the radix key-value -sorting algorithm defined by the sorter calling by `sort_key_value_over_group` -with `sycl::span` and -`sycl::span` as input parameters. -If `scope == sycl::memory_scope::work_group`, -`range_size` is the size of the local range for `sycl::nd_range` -that was used to run the kernel; -if `scope = sycl::memory_scope::sub_group`, `range_size` is a sub-group size. -If other `scope` values are passed, behavior is unspecified. - -=== Group Helper - -The overloads of `joint_sort`, `sort_over_group`, `sort_key_value_over_group` -that do not take a Sorter parameter implicitly use the default sorter. -Since the default sorter requires the application to allocate some -temporary memory, the application must use a Group Helper object to -communicate the location of this memory. A Group Helper object is an object -that has the following two public member functions: - -[source,c++] ----- -/* unspecified */ get_group() const; - -sycl::span get_memory() const ----- - -Table 4. Member functions of group helpers. -|=== -|Member function|Description - -|`/* unspecified */ get_group() const` -|Returns the group that is handled by the group helper object. -Assuming `Group` is a type of method's result -`sycl::is_group_v>` must be true. - -|`sycl::span get_memory() const` -|Returns the memory object that the default sorter can use. -The return type is aligned with the first parameter of constructor -for `default_sorter`. -|=== - -==== Predefined Group Helpers -SYCL introduces the following predefined group helper: - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - - // Exposition only: all template arguments except Group are unspecified - template - class group_with_scratchpad - { - public: - group_with_scratchpad(Group group, sycl::span scratch); - Group get_group() const; - - sycl::span - get_memory() const; - }; - - // Deduction guides - template - group_with_scratchpad(Group, sycl::span) - -> group_with_scratchpad; - -} ----- - -For most applications it is enough to pass an instance of -the `group_with_scratchpad` class instead of their own classes creation. - -Table 5. Constructors of the `group_with_scratchpad` class. -|=== -|Constructor|Description - -|`group_with_scratchpad(Group group, sycl::span scratch)` -|Creates the `group_with_scratchpad` object using `group` and `scratch`. -`sycl::is_group_v>` must be true. -`scratch.size()` must not be less than value returned by -the `memory_required` method of `default_sorter`. Otherwise, -behavior of sorting algorithm, which is called with the constructed -object, is undefined. -The `scratch` value must be the same for all work-items in `group`. -|=== - -Table 6. Member functions of the `group_with_scratchpad` class. -|=== -|Member function|Description - -|`Group get_group() const` -|Returns the `Group` class object that is handled by -the `group_with_scratchpad` object. - -|`sycl::span -get_memory() const` -|Returns `sycl::span` that represents an additional memory -that is handled by the `group_with_scratchpad` object. - -|=== - -=== SYCL Properties for Interfaces with Fixed-size Private Arrays - -Group algorithms using the fixed-size array interface are performed across -`N * ElementsPerWorkItem` elements in the group, where -.`N` is the work-group size and `ElementsPerWorkItem` is the number of -elements that are processed by one work-item. - -When a work-item contributes multiple values to a group algorithm, -there are multiple ways to interpret the order of that data. -Let `r` is a virtual range for sorting of `N * ElementsPerWorkItem` elements. -The extension supports two data placements: - -a) Data from the -`[r + id * ElementsPerWorkItem; r + (id + 1) * ElementsPerWorkItem)` -virtual range -placed into the private memory under the span for `id`-th work-item. - -b) `i * N + id` element of `r` fill the `i`-th element of the private memory -under the span for `id`-th work-item. - -To specify a correct data placement for placing of resulting data -there is a enum: - -[source,c++] ----- -enum class group_algorithm_data_placement{ - blocked, - striped -}; ----- - -1.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::blocked` -to specify a data placement described in a). - -2.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::striped` -to specify a data placement described in b). - -Example: - -N = 3; - -|=== -|Work-item id|Input private fixed-size array - -|0 -|{11, 10, 9, 8} -|1 -|{7, 6, 5, 4} -|2 -|{3, 2, 1, 0} -|=== - -After performing sorting by ascending there is the following virtual range: -`{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}`. - -Consider 2 layouts: - -1.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::blocked`. - -|=== -|Work-item id|Output private fixed-size array - -|0 -|{0, 1, 2, 3} -|1 -|{4, 5, 6, 7} -|2 -|{8, 9, 10, 11} -|=== - -2.`sycl::ext::oneapi::experimental::group_algorithm_data_placement::striped`. - -|=== -|Work-item id|Output private fixed-size array - -|0 -|{0, 3, 6, 9} -|1 -|{1, 4, 7, 10} -|2 -|{2, 5, 8, 11} -|=== - -There are 2 compile-time properties that satisfy -link:../experimental/sycl_ext_oneapi_properties.asciidoc[SYCL Properties Extension] -requirements: - -[source,c++] ----- -namespace sycl::ext::oneapi::experimental { - -struct input_data_placement_key : /* unspecified */ { - template - using value_t = - property_value>; -}; - -struct output_data_placement_key : /* unspecified */ { - template - using value_t = - property_value>; -}; - -template -inline constexpr input_data_placement_key::value_t - input_data_placement; // (1) - -template -inline constexpr output_data_placement_key::value_t - output_data_placement; // (2) - -} // namespace sycl::ext::oneapi::experimental ----- - -1. `input_data_placement` specifies the data placement for input. This is -useful for stable sorts, which preserve the relative input order for elements -that compare equal or algorithms that can use a fact that sequences -can be partially sorted. -2. `output_data_placement` specifies the data placement for output. - -Example: -`sort_over_group(g, my_span, properties, -output_data_placement>{});` - -It's specified that data initially in `my_span` satisfies the -`blocked` data placement. After sorting data will be placed to -`my_span` corresponding to the `striped` data placement. - -== Examples - -1.Using `joint_sort` without Sorters. - -[source,c++] ----- -... -namespace my_sycl = sycl::ext::oneapi::experimental; -// calculate required local memory size -size_t temp_memory_size = - my_sycl::default_sorters::joint_sorter<>::memory_required( - d, sycl::memory_scope::work_group, n); - -q.submit([&](sycl::handler& h) { - auto acc = sycl::accessor(buf, h); - auto scratch = sycl::local_accessor( {temp_memory_size}, h ); - - h.parallel_for( - sycl::nd_range<1>{ /*global_size = */ {256}, /*local_size = */ {256} }, - [=](sycl::nd_item<1> id) { - auto ptr = acc.get_pointer() + id.get_group(0) * n; - - my_sycl::joint_sort( - // create group helper using deduction guides - my_sycl::group_with_scratchpad( - id.get_group(), - sycl::span{scratch.get_pointer(), temp_memory_size} - ), - ptr, - ptr + n - ); - }); - }); -... ----- - -2.Using `sort_over_group` and `radix_sorter` - -[source,c++] ----- -... -namespace my_sycl = sycl::ext::oneapi::experimental; - -sycl::range<1> local_range{256}; -// predefine radix_sorter to calculate local memory size -using RSorter = - my_sycl::radix_sorters::group_sorter; -// calculate required local memory size -size_t temp_memory_size = - RSorter::memory_required(sycl::memory_scope::work_group, local_range.size()); - -q.submit([&](sycl::handler& h) { - auto acc = sycl::accessor(buf, h); - auto scratch = sycl::local_accessor( {temp_memory_size}, h); - - h.parallel_for( - sycl::nd_range<1>{ local_range, local_range }, - [=](sycl::nd_item<1> id) { - - acc[id.get_local_id()] = - my_sycl::sort_over_group( - id.get_group(), - acc[id.get_local_id()], - RSorter(sycl::span{scratch.get_pointer(), temp_memory_size}) - ); - }); - }); -... - ----- - -3.Using `joint_sort` for key-value sorting -(keys are compared, but keys and values are reordered both). - -NOTE: `oneapi::dpl::zip_iterator` is used here. -See https://spec.oneapi.com/versions/latest/elements/oneDPL/source/index.html[ - oneDPL Spec] -for details. - -[source,c++] ----- -... -namespace my_sycl = sycl::ext::oneapi::experimental; -using TupleType = - typename std::iterator_traits>::value_type; -// calculate required local memory size -size_t temp_memory_size = - my_sycl::default_sorters::joint_sorter<>::memory_required( - d, sycl::memory_scope::work_group, n); - -q.submit([&](sycl::handler& h) { - auto keys_acc = sycl::accessor(keys_buf, h); - auto vals_acc = sycl::accessor(vals_buf, h); - auto scratch = sycl::local_accessor( {temp_memory_size}, h); - - h.parallel_for( - sycl::nd_range<1>{ /*global_size = */ {1024}, /*local_size = */ {256} }, - [=](sycl::nd_item<1> id) { - size_t group_id = id.get_group(0); - auto keys_ptr = keys_acc.get_pointer() + group_id * n; - auto vals_ptr = vals_acc.get_pointer() + group_id * n; - auto first = oneapi::dpl::make_zip_iterator(keys_ptr, vals_ptr); - - my_sycl::joint_sort( - // create group excutor using deduction guides - my_sycl::group_with_scratchpad( - id.get_group(), - sycl::span{scratch.get_pointer(), temp_memory_size} - ), - first, - first + n, - [](auto x, auto y){ return std::get<0>(x) < std::get<0>(y); } - ); - }); - }); -... ----- - -4.Using `sort_key_value_over_group` and `radix_sorter` with fixed-size arrays. - -[source,c++] ----- -... -namespace my_sycl = sycl::ext::oneapi::experimental; - -sycl::range<1> local_range{256}; -constexpr std::size_t ElementsPerWorkItem = 8; - -// predefine radix_sorter to calculate local memory size -using RSorter = - my_sycl::radix_sorters::group_key_value_sorter; -// calculate required local memory size -size_t temp_memory_size = - RSorter::memory_required(sycl::memory_scope::work_group, local_range.size()); - -q.submit([&](sycl::handler& h) { - auto keys_acc = sycl::accessor(keys_buf, h); - auto vals_acc = sycl::accessor(vals_buf, h); - auto scratch = sycl::local_accessor( {temp_memory_size}, h); - - h.parallel_for( - sycl::nd_range<1>{ local_range, local_range }, - [=](sycl::nd_item<1> id) { - - T keys_private[ElementsPerWorkItem]; - T vals_private[ElementsPerWorkItem]; - auto idx = id.get_global_id(); - for(std::size_t i = 0; i < ElementsPerWorkItem; ++i ) - { - keys_private[i] = keys_acc[idx * ElementsPerWorkItem + i]; - vals_private[i] = vals_acc[idx * ElementsPerWorkItem + i]; - } - - my_sycl::sort_key_value_over_group( - id.get_group(), - sycl::span{keys_private}, - sycl::span{vals_private}, - RSorter(sycl::span{scratch.get_pointer(), temp_memory_size}) - ); - ... - }); - }); -... ----- - -== Issues - -. It can be a separate proposal for key-value sorting basing on Projections. -It needs to be investigated what is the response for that. -. Sorter traits can be useful if there are Finder, Reducer or other objects -will be added to the Spec to be used with other Group algorithms, -e.g. find, reduce. -. Predefined sorters can be revisited to find a better interfaces -for `memory_required` overloads. -. `group_with_scratchpad` can be replaced with concepts that -requires 2 methods. -. Is Sorter needed to be applied to keys only or to keys and -values both in case of key-value sorting? -. Do we need to have separate predefined sorters for -fixed-size array interfaces? -e.g. instead of changing `default_sorter` and `radix_sorter` -to have new sorters `default_span_sorter`, `radix_span_sorter`. -. Will it be better to have an interface with `std::tuple` of `sycl::span` -to generalize key-value sorting? e.g. -`sort_over_group(group, std::make_tuple(sycl::span{keys}, sycl::span{values}), sorter);` -The thing is that tuple is not a span. It's better to have any _zip_span_ -that allows the SoA data pattern. Interfaces without tuple highlights that -we have parameters with different meaning: only keys are comparing, -but keys and values are moving both. However, it can look like -inconsistent comparing to other interfaces of sorting. -. It can be a problem that users need to put the same `Compare` type during -calling `memory_required` function and the `default_sorter` constructor -because it's easy to pass different comparator types. -. Think about reducing overloads for sorting functions. The thing is that -overloads with `Compare` objects seems extra and overloads with sorters, -without sorters are enough. - -== Revision History - -[cols="5,15,15,70"] -[grid="rows"] -[options="header"] -|======================================== -|Rev|Date|Author|Changes -|1|2021-04-28|Andrey Fedorov|Initial public working draft -|2|2021-09-15|Andrey Fedorov|Changes related to additional memory providing -|3|2021-12-16|Andrey Fedorov|Some refactoring, sections reordering, -making the entire extension experimental -|4|2022-11-14|Andrey Fedorov|Fixed size arrays, key-value sorting and properties -|5|2023-11-09|Andrey Fedorov|Changed `memory_required` functions for default sorters -|6|2024-07-17|Artur Gainullin|Align the description of data placement properties with the implementation -|======================================== From 2e7c0800e9429a479e243fa887780228f26ff507 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Wed, 10 Jul 2024 23:58:10 -0700 Subject: [PATCH 3/3] Remove the first version of API and update feature macro value --- .../experimental/group_helpers_sorters.hpp | 152 ------------------ sycl/source/feature_test.hpp.in | 2 +- .../group_sort/group_and_joint_sort.cpp | 79 +-------- 3 files changed, 3 insertions(+), 230 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp index a12cff7ad8eb0..1d90b4318bc65 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp @@ -97,159 +97,8 @@ template class group_with_scratchpad { sycl::span get_memory() const { return scratch; } }; -// Default sorter provided by the first version of the extension specification. -template > class default_sorter { - Compare comp; - sycl::span scratch; - -public: - template - default_sorter(sycl::span scratch_, - Compare comp_ = Compare()) - : comp(comp_), scratch(scratch_) {} - - template - void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first, - [[maybe_unused]] Ptr last) { -#ifdef __SYCL_DEVICE_ONLY__ - using T = typename sycl::detail::GetValueType::type; - size_t n = std::distance(first, last); - T *scratch_begin = sycl::detail::align_scratch(scratch, g, n); - sycl::detail::merge_sort(g, first, n, comp, scratch_begin); -#else - throw sycl::exception( - std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), - "default_sorter constructor is not supported on host device."); -#endif - } - - template - T operator()([[maybe_unused]] Group g, T val) { -#ifdef __SYCL_DEVICE_ONLY__ - std::size_t local_id = g.get_local_linear_id(); - auto range_size = g.get_local_range().size(); - T *scratch_begin = sycl::detail::align_scratch( - scratch, g, /* output storage and temporary storage */ 2 * range_size); - scratch_begin[local_id] = val; - sycl::detail::merge_sort(g, scratch_begin, range_size, comp, - scratch_begin + range_size); - val = scratch_begin[local_id]; -#else - throw sycl::exception( - std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), - "default_sorter operator() is not supported on host device."); -#endif - return val; - } - - template - static constexpr size_t memory_required(sycl::memory_scope, - size_t range_size) { - return range_size * sizeof(T) + alignof(T); - } - - template - static constexpr size_t memory_required(sycl::memory_scope scope, - sycl::range r) { - return 2 * memory_required(scope, r.size()); - } -}; - enum class sorting_order { ascending, descending }; -namespace detail { - -template -struct ConvertToComp { - using Type = std::less; -}; - -template struct ConvertToComp { - using Type = std::greater; -}; -} // namespace detail - -// Radix sorter provided by the first version of the extension specification. -template -class radix_sorter { - - sycl::span scratch; - uint32_t first_bit = 0; - uint32_t last_bit = 0; - - static constexpr uint32_t bits = BitsPerPass; - using bitset_t = std::bitset; - -public: - template - radix_sorter(sycl::span scratch_, - const bitset_t mask = bitset_t{}.set()) - : scratch(scratch_) { - static_assert((std::is_arithmetic::value || - std::is_same::value || - std::is_same::value), - "radix sort is not usable"); - - for (first_bit = 0; first_bit < mask.size() && !mask[first_bit]; - ++first_bit) - ; - for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit]; - ++last_bit) - ; - } - - template - void operator()([[maybe_unused]] GroupT g, [[maybe_unused]] PtrT first, - [[maybe_unused]] PtrT last) { -#ifdef __SYCL_DEVICE_ONLY__ - sycl::detail::privateDynamicSort( - g, first, /*empty*/ first, std::distance(first, last), scratch.data(), - first_bit, last_bit); -#else - throw sycl::exception( - std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), - "radix_sorter is not supported on host device."); -#endif - } - - template - ValT operator()([[maybe_unused]] GroupT g, [[maybe_unused]] ValT val) { -#ifdef __SYCL_DEVICE_ONLY__ - ValT result[]{val}; - sycl::detail::privateStaticSort( - g, result, /*empty*/ result, scratch.data(), first_bit, last_bit); - return result[0]; -#else - throw sycl::exception( - std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), - "radix_sorter is not supported on host device."); -#endif - } - - static constexpr size_t memory_required(sycl::memory_scope, - size_t range_size) { - return range_size * sizeof(ValT) + - (1 << bits) * range_size * sizeof(uint32_t) + alignof(uint32_t); - } - - // memory_helpers - template - static constexpr size_t memory_required(sycl::memory_scope, - sycl::range local_range) { - return (std::max)(local_range.size() * sizeof(ValT), - local_range.size() * (1 << bits) * sizeof(uint32_t)); - } -}; - -// Default sorters provided by the second version of the extension -// specification. namespace default_sorters { template > class joint_sorter { @@ -458,7 +307,6 @@ class group_key_value_sorter { }; } // namespace default_sorters -// Radix sorters provided by the second version of the extension specification. namespace radix_sorters { template #include -#if VERSION == 1 -template struct RadixSorterType; - -template struct RadixSorterType, T> { - using Type = - oneapi_exp::radix_sorter; -}; - -template struct RadixSorterType, T> { - using Type = - oneapi_exp::radix_sorter; -}; - -// Dummy overloads for CustomType which is not supported by radix sorter -template <> struct RadixSorterType, CustomType> { - using Type = - oneapi_exp::radix_sorter; -}; - -template <> struct RadixSorterType, CustomType> { - using Type = - oneapi_exp::radix_sorter; -}; -#endif - template void RunJointSort(sycl::queue &Q, const std::vector &DataToSort, const Compare &Comp) { @@ -76,40 +49,24 @@ void RunJointSort(sycl::queue &Q, const std::vector &DataToSort, constexpr size_t NumSubGroups = WGSize / ReqSubGroupSize; -#if VERSION == 1 - using RadixSorterT = typename RadixSorterType::Type; -#else using RadixSorterT = oneapi_exp::radix_sorters::joint_sorter< typename ConvertToSimpleType::Type, ConvertToSortingOrder::Type>; -#endif std::size_t LocalMemorySizeDefault = 0; std::size_t LocalMemorySizeRadix = 0; if (UseGroup == UseGroupT::SubGroup) { // Each sub-group needs a piece of memory for sorting -#if VERSION == 1 - LocalMemorySizeDefault = - oneapi_exp::default_sorter::template memory_required( - sycl::memory_scope::sub_group, ReqSubGroupSize * ElemsPerWI); -#else LocalMemorySizeDefault = oneapi_exp::default_sorters::joint_sorter< Compare>::template memory_required(sycl::memory_scope::sub_group, ReqSubGroupSize * ElemsPerWI); -#endif LocalMemorySizeRadix = RadixSorterT::memory_required( sycl::memory_scope::sub_group, ReqSubGroupSize * ElemsPerWI); } else { // A single chunk of memory for each work-group -#if VERSION == 1 - LocalMemorySizeDefault = - oneapi_exp::default_sorter::template memory_required( - sycl::memory_scope::work_group, WGSize * ElemsPerWI); -#else LocalMemorySizeDefault = oneapi_exp::default_sorters::joint_sorter< Compare>::template memory_required(sycl::memory_scope::work_group, WGSize * ElemsPerWI); -#endif LocalMemorySizeRadix = RadixSorterT::memory_required( sycl::memory_scope::sub_group, WGSize * ElemsPerWI); } @@ -203,13 +160,8 @@ void RunJointSort(sycl::queue &Q, const std::vector &DataToSort, oneapi_exp::joint_sort( Group, &AccToSort2[StartIdx], &AccToSort2[EndIdx], -#if VERSION == 1 - oneapi_exp::default_sorter(sycl::span{ - &ScratchDefault[LocalPartID], LocalMemorySizeDefault})); -#else oneapi_exp::default_sorters::joint_sorter(sycl::span{ &ScratchDefault[LocalPartID], LocalMemorySizeDefault})); -#endif const size_t LocalPartIDRadix = UseGroup == UseGroupT::SubGroup @@ -280,42 +232,20 @@ void RunSortOVerGroup(sycl::queue &Q, const std::vector &DataToSort, std::size_t LocalMemorySizeRadix = 0; if (UseGroup == UseGroupT::SubGroup) { // Each sub-group needs a piece of memory for sorting -#if VERSION == 1 - LocalMemorySizeDefault = - oneapi_exp::default_sorter::template memory_required( - sycl::memory_scope::sub_group, sycl::range<1>{ReqSubGroupSize}); -#else LocalMemorySizeDefault = oneapi_exp::default_sorters::group_sorter< T, Compare, 1>::memory_required(sycl::memory_scope::sub_group, ReqSubGroupSize); -#endif -#if VERSION == 1 - LocalMemorySizeRadix = RadixSorterT::memory_required( - sycl::memory_scope::sub_group, sycl::range<1>{ReqSubGroupSize}); -#else LocalMemorySizeRadix = RadixSorterT::memory_required( sycl::memory_scope::sub_group, ReqSubGroupSize); -#endif } else { // A single chunk of memory for each work-group -#if VERSION == 1 - LocalMemorySizeDefault = - oneapi_exp::default_sorter::template memory_required( - sycl::memory_scope::work_group, sycl::range<1>{NumOfElements}); -#else LocalMemorySizeDefault = oneapi_exp::default_sorters::group_sorter< T, Compare, 1>::memory_required(sycl::memory_scope::work_group, NumOfElements); -#endif -#if VERSION == 1 - LocalMemorySizeRadix = RadixSorterT::memory_required( - sycl::memory_scope::work_group, sycl::range<1>{NumOfElements}); -#else LocalMemorySizeRadix = RadixSorterT::memory_required( sycl::memory_scope::work_group, NumOfElements); -#endif } std::vector DataToSortCase0 = DataToSort; @@ -388,13 +318,8 @@ void RunSortOVerGroup(sycl::queue &Q, const std::vector &DataToSort, AccToSort2[GlobalLinearID] = oneapi_exp::sort_over_group( Group, AccToSort2[GlobalLinearID], -#if VERSION == 1 - oneapi_exp::default_sorter( - sycl::span{ScratchPtrDefault, LocalMemorySizeDefault})); -#else oneapi_exp::default_sorters::group_sorter( sycl::span{ScratchPtrDefault, LocalMemorySizeDefault})); -#endif // Each sub-group should use it's own part of the scratch pad const size_t ScratchShiftRadix =