From 45491179db862796d0318509e1d5dfed7f91ea5d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 11 Dec 2024 21:25:28 -0600
Subject: [PATCH 1/4] Optimization of custom_reduce_over_group function.

The function used to perform the custom reduction sequentially in a
single work-item (the leader of the work-group). It now performs the
first few folding iterations cooperatively across the work-group, and
the leading work-item then reduces the remaining non-folded elements
sequentially.

This speeds up custom_reduce_over_group by about a factor of 3. Timing
the reduction kernel for the float32 case with

```
unitrace -d -v -i 20 python -c "import dpctl.tensor as dpt; dpt.min(dpt.ones(10**7, dtype=dpt.float32)).sycl_queue.wait()"
```

now shows it on par with (less than 10% slower than) the int32 kernel,
which uses the built-in sycl::reduce_over_group:

```
unitrace -d -v -i 20 python -c "import dpctl.tensor as dpt; dpt.min(dpt.ones(10**7, dtype=dpt.int32)).sycl_queue.wait()"
```
---
 .../libtensor/include/utils/sycl_utils.hpp | 64 +++++++++++++++++--
 1 file changed, 59 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 19be8645c9..53ecbbbe09 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -132,27 +132,81 @@ size_t choose_workgroup_size(const size_t nelems,
     return wg;
 }
 
+namespace
+{
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t cutoff,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < cutoff) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < step) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+} // namespace
+
 template <typename GroupT, typename LocAccT, typename T, typename OpT>
 T custom_reduce_over_group(const GroupT &wg,
                            LocAccT local_mem_acc,
                            const T &local_val,
                            const OpT &op)
 {
-    size_t wgs = wg.get_local_linear_range();
-    local_mem_acc[wg.get_local_linear_id()] = local_val;
+    const std::uint32_t wgs = wg.get_local_linear_range();
+    const std::uint32_t lid = wg.get_local_linear_id();
+    local_mem_acc[lid] = local_val;
 
     sycl::group_barrier(wg, sycl::memory_scope::work_group);
 
+    std::uint32_t n_witems = wgs;
+    if (wgs & (wgs - 1)) {
+        // wgs is not a power of 2
+#pragma unroll
+        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+            if (n_witems >= sz) {
+                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
+                _fold(local_mem_acc, lid, n_witems - n_witems_, n_witems_, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+                n_witems = n_witems_;
+            }
+        }
+    }
+    else {
+        // wgs is a power of 2
+#pragma unroll
+        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+            if (n_witems >= sz) {
+                n_witems = (n_witems + 1) >> 1;
+                _fold(local_mem_acc, lid, n_witems, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+            }
+        }
+    }
+
     T red_val_over_wg = local_mem_acc[0];
     if (wg.leader()) {
-        for (size_t i = 1; i < wgs; ++i) {
+        for (std::uint32_t i = 1; i < n_witems; ++i) {
             red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
         }
     }
 
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+    // sycl::group_barrier(wg, sycl::memory_scope::work_group);
 
-    return sycl::group_broadcast(wg, red_val_over_wg);
+    return sycl::group_broadcast(wg, red_val_over_wg, 0);
 }
 
 template
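The cooperative folding scheme from the patch above can be followed on the
host. Below is a minimal C++ sketch of the same logic, assuming a std::vector
stands in for the work-group's local memory accessor and a serial loop over
`lid` stands in for the work-items executing between group barriers; the
bounds 1024 and 32 match this patch (patch 3 later lowers the lower bound
to 8). The work-group size 100 is an illustrative value only.

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // Assumed work-group size for illustration; 100 is deliberately not a
    // power of two so the cutoff/step variant of _fold is exercised.
    const std::uint32_t wgs = 100;

    // A vector stands in for the work-group's local memory accessor.
    std::vector<float> local_mem(wgs);
    std::iota(local_mem.begin(), local_mem.end(), 1.0f); // 1, 2, ..., wgs

    const auto op = [](float a, float b) { return a + b; };

    std::uint32_t n_witems = wgs;
    for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
        if (n_witems >= sz) {
            // One cooperative step: fold the upper part onto the lower part.
            const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
            const std::uint32_t cutoff = n_witems - n_witems_;
            // The loop over lid models work-items acting in parallel; a
            // group barrier separates steps in the real kernel.
            for (std::uint32_t lid = 0; lid < cutoff; ++lid) {
                local_mem[lid] =
                    op(local_mem[lid], local_mem[n_witems_ + lid]);
            }
            n_witems = n_witems_;
        }
    }

    // The leader then reduces the remaining n_witems elements sequentially.
    float red_val = local_mem[0];
    for (std::uint32_t i = 1; i < n_witems; ++i) {
        red_val = op(red_val, local_mem[i]);
    }

    std::cout << "reduced: " << red_val
              << "  expected: " << (wgs * (wgs + 1)) / 2 << "\n";
    return 0;
}
```

Each fold writes indices below `cutoff` and reads indices at or above
`step`, and since `step >= cutoff` always holds, the reads and writes never
overlap; that is what makes the per-work-item updates between barriers safe.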
From 03910f3cfa9b4e95ef0670c409f64ada793216db Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 11 Dec 2024 22:11:14 -0600
Subject: [PATCH 2/4] Properly set properties of group_load/group_store to
 striped

Doing so exactly recovers the behavior of sub_group::load and
sub_group::store, and eliminates warnings with 2025.1 and SYCLOS.

With this change, use of group_load/group_store is enabled for DPC++
compilers with `__LIBSYCL_MAJOR_VERSION >= 8u`, which includes the
oneAPI DPC++ 2025.0.x compiler and the SYCLOS bundle.
---
 .../tensor/libtensor/include/utils/sycl_utils.hpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 53ecbbbe09..deda9cd810 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -482,7 +482,7 @@ struct Identity::value>>
     SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
 #define USE_GROUP_LOAD_STORE 1
 #else
-#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER > 20250100u)
+#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u)
 #define USE_GROUP_LOAD_STORE 1
 #else
 #define USE_GROUP_LOAD_STORE 0
@@ -504,7 +504,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     sycl::vec x{};
-    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
     return sg.load(m_ptr);
@@ -520,7 +521,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     ValueT x{};
-    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
     return sg.load(m_ptr);
@@ -541,7 +543,8 @@ sub_group_store(const sycl::sub_group &sg,
 {
 #if (USE_GROUP_LOAD_STORE)
     static_assert(std::is_same_v);
-    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
     sg.store(m_ptr, val);
@@ -561,7 +564,8 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
     sg.store(m_ptr, val);
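For context on the data_placement_striped property chosen in the patch above:
with blocked placement each work-item owns a contiguous chunk of `vec_sz`
elements, while with striped placement consecutive elements are dealt out
across the sub-group, which is the pattern sub_group::load and
sub_group::store use. A small stand-alone C++ sketch of the two index
mappings follows; `sg_size = 8` and `vec_sz = 4` are assumed values picked
purely for illustration.

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    // Assumed values, for illustration only: a sub-group of 8 work-items,
    // each loading/storing 4 elements.
    constexpr std::uint32_t sg_size = 8;
    constexpr std::uint32_t vec_sz = 4;

    for (std::uint32_t lid = 0; lid < sg_size; ++lid) {
        std::cout << "work-item " << lid << ":  blocked ->";
        // blocked placement: each work-item owns a contiguous chunk
        for (std::uint32_t v = 0; v < vec_sz; ++v) {
            std::cout << " " << (lid * vec_sz + v);
        }
        std::cout << "   striped ->";
        // striped placement: consecutive elements go to consecutive
        // work-items; this is the layout sub_group::load/store use
        for (std::uint32_t v = 0; v < vec_sz; ++v) {
            std::cout << " " << (lid + v * sg_size);
        }
        std::cout << "\n";
    }
    return 0;
}
```

With the striped mapping, work-item `lid` touches elements `lid`,
`lid + sg_size`, `lid + 2*sg_size`, and so on, so switching the property
keeps every kernel reading and writing exactly the elements it previously
accessed through sub_group::load/store.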
From e15e3aae6e53e045f48d4d10118944d619231766 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 12 Dec 2024 13:44:15 -0600
Subject: [PATCH 3/4] Tweak bounds of cooperative reduction steps

Factor out the bounds as constexpr values reused between the
power-of-two branch and the non-power-of-two branch. Lower the lower
bound from 32 to 8 based on performance testing on PVC and Iris Xe.
---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index deda9cd810..3a7e70e7bc 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -166,6 +166,8 @@ T custom_reduce_over_group(const GroupT &wg,
                            const T &local_val,
                            const OpT &op)
 {
+    constexpr std::uint32_t low_sz = 8u;
+    constexpr std::uint32_t high_sz = 1024u;
+
     const std::uint32_t wgs = wg.get_local_linear_range();
     const std::uint32_t lid = wg.get_local_linear_id();
@@ -176,7 +178,7 @@ T custom_reduce_over_group(const GroupT &wg,
     if (wgs & (wgs - 1)) {
         // wgs is not a power of 2
 #pragma unroll
-        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
             if (n_witems >= sz) {
                 const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
                 _fold(local_mem_acc, lid, n_witems - n_witems_, n_witems_, op);
@@ -188,7 +190,7 @@ T custom_reduce_over_group(const GroupT &wg,
     else {
         // wgs is a power of 2
 #pragma unroll
-        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
             if (n_witems >= sz) {
                 n_witems = (n_witems + 1) >> 1;
                 _fold(local_mem_acc, lid, n_witems, op);
@@ -204,8 +206,6 @@ T custom_reduce_over_group(const GroupT &wg,
         }
     }
 
-    // sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
     return sycl::group_broadcast(wg, red_val_over_wg, 0);
 }
 

From 63c19472a8ec62b3c97143785d51b0f882691f66 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 12 Dec 2024 13:55:04 -0600
Subject: [PATCH 4/4] Add changelog line for performance improvement in
 reductions with custom reduce_over_group function

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae78312038..0957f99e35 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Implement radix sort algorithm to be used in `dpt.sort` and `dpt.argsort` [gh-1867](https://github.com/IntelPython/dpctl/pull/1867)
 * Extended `dpctl.SyclTimer` with `device_timer` keyword, implementing different methods of collecting device times [gh-1872](https://github.com/IntelPython/dpctl/pull/1872)
 * Improved performance of `tensor.cumulative_sum`, `tensor.cumulative_prod`, `tensor.cumulative_logsumexp` as well as performance of boolean indexing [gh-1923](https://github.com/IntelPython/dpctl/pull/1923)
+* Improved performance of `tensor.min`, `tensor.max`, `tensor.logsumexp`, `tensor.reduce_hypot` for floating point type arrays by at least 2x [gh-1932](https://github.com/IntelPython/dpctl/pull/1932)
 
 ### Fixed
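To see what the bound change in patch 3 buys, the split between cooperative
folding steps and the leader's sequential tail can be tabulated with a small
host-side helper, assuming the loop structure above; the work-group size 256
is just an illustrative value.

```cpp
#include <cstdint>
#include <iostream>

// For a given work-group size and lower bound, count how many cooperative
// folding steps run and how many elements are left for the leader's
// sequential tail.
static void count_steps(std::uint32_t wgs, std::uint32_t low_sz)
{
    std::uint32_t n_witems = wgs;
    std::uint32_t steps = 0;
    for (std::uint32_t sz = 1024u; sz >= low_sz; sz >>= 1) {
        if (n_witems >= sz) {
            n_witems = (n_witems + 1) >> 1; // one fold halves the range
            ++steps;
        }
    }
    std::cout << "wgs=" << wgs << " low_sz=" << low_sz << ": " << steps
              << " cooperative steps, sequential tail of " << n_witems
              << " elements\n";
}

int main()
{
    // lower bound of 32 (before patch 3) vs 8 (after patch 3)
    count_steps(256, 32); // 4 steps, tail of 16
    count_steps(256, 8);  // 6 steps, tail of 4
    return 0;
}
```

Lowering the bound from 32 to 8 adds two more folds (and two more barriers)
but shrinks the tail the leader reduces alone from 16 elements to 4, which
is the trade-off the performance testing on PVC and Iris Xe favored.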