From 45491179db862796d0318509e1d5dfed7f91ea5d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 11 Dec 2024 21:25:28 -0600
Subject: [PATCH 1/4] Optimization of custom_reduce_over_group function.

The function used to perform the custom reduction sequentially in a
single work-item (the leader of the work-group). It now performs the
first few folding iterations cooperatively across the work-group, and
the leading work-item then reduces the remaining non-folded elements
sequentially.

This speeds up custom_reduce_over_group by about a factor of 3. Timing
the reduction kernel for the float32 case with

```
unitrace -d -v -i 20 python -c "import dpctl.tensor as dpt; dpt.min(dpt.ones(10**7, dtype=dpt.float32)).sycl_queue.wait()"
```

now shows it on par with (less than 10% slower than) the int32 kernel,
which uses the built-in sycl::reduce_over_group:

```
unitrace -d -v -i 20 python -c "import dpctl.tensor as dpt; dpt.min(dpt.ones(10**7, dtype=dpt.int32)).sycl_queue.wait()"
```
---
 .../libtensor/include/utils/sycl_utils.hpp | 64 +++++++++++++++++--
 1 file changed, 59 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 19be8645c9..53ecbbbe09 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -132,27 +132,81 @@ size_t choose_workgroup_size(const size_t nelems,
     return wg;
 }
 
+namespace
+{
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t cutoff,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < cutoff) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+template <typename LocAccT, typename OpT>
+void _fold(LocAccT &local_mem_acc,
+           const std::uint32_t lid,
+           const std::uint32_t step,
+           const OpT &op)
+{
+    if (lid < step) {
+        local_mem_acc[lid] = op(local_mem_acc[lid], local_mem_acc[step + lid]);
+    }
+}
+
+} // namespace
+
 template <typename GroupT, typename LocAccT, typename T, typename OpT>
 T custom_reduce_over_group(const GroupT &wg,
                            LocAccT local_mem_acc,
                            const T &local_val,
                            const OpT &op)
 {
-    size_t wgs = wg.get_local_linear_range();
-    local_mem_acc[wg.get_local_linear_id()] = local_val;
+    const std::uint32_t wgs = wg.get_local_linear_range();
+    const std::uint32_t lid = wg.get_local_linear_id();
+    local_mem_acc[lid] = local_val;
 
     sycl::group_barrier(wg, sycl::memory_scope::work_group);
 
+    std::uint32_t n_witems = wgs;
+    if (wgs & (wgs - 1)) {
+        // wgs is not a power of 2
+#pragma unroll
+        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+            if (n_witems >= sz) {
+                const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
+                _fold(local_mem_acc, lid, n_witems - n_witems_, n_witems_, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+                n_witems = n_witems_;
+            }
+        }
+    }
+    else {
+        // wgs is a power of 2
+#pragma unroll
+        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+            if (n_witems >= sz) {
+                n_witems = (n_witems + 1) >> 1;
+                _fold(local_mem_acc, lid, n_witems, op);
+                sycl::group_barrier(wg, sycl::memory_scope::work_group);
+            }
+        }
+    }
+
     T red_val_over_wg = local_mem_acc[0];
     if (wg.leader()) {
-        for (size_t i = 1; i < wgs; ++i) {
+        for (std::uint32_t i = 1; i < n_witems; ++i) {
             red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]);
         }
     }
 
-    sycl::group_barrier(wg, sycl::memory_scope::work_group);
+    // sycl::group_barrier(wg, sycl::memory_scope::work_group);
 
-    return sycl::group_broadcast(wg, red_val_over_wg);
+    return sycl::group_broadcast(wg, red_val_over_wg, 0);
 }
 
 template
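The cooperative folding scheme from the patch above can be followed on the
host. Below is a minimal C++ sketch of the same logic, assuming a std::vector
stands in for the work-group's local memory accessor and a serial loop over
`lid` stands in for the work-items executing between group barriers; the
bounds 1024 and 32 match this patch (patch 3 later lowers the lower bound
to 8). The work-group size 100 is an illustrative value only.

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // Assumed work-group size for illustration; 100 is deliberately not a
    // power of two so the cutoff/step variant of _fold is exercised.
    const std::uint32_t wgs = 100;

    // A vector stands in for the work-group's local memory accessor.
    std::vector<float> local_mem(wgs);
    std::iota(local_mem.begin(), local_mem.end(), 1.0f); // 1, 2, ..., wgs

    const auto op = [](float a, float b) { return a + b; };

    std::uint32_t n_witems = wgs;
    for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
        if (n_witems >= sz) {
            // One cooperative step: fold the upper part onto the lower part.
            const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
            const std::uint32_t cutoff = n_witems - n_witems_;
            // The loop over lid models work-items acting in parallel; a
            // group barrier separates steps in the real kernel.
            for (std::uint32_t lid = 0; lid < cutoff; ++lid) {
                local_mem[lid] =
                    op(local_mem[lid], local_mem[n_witems_ + lid]);
            }
            n_witems = n_witems_;
        }
    }

    // The leader then reduces the remaining n_witems elements sequentially.
    float red_val = local_mem[0];
    for (std::uint32_t i = 1; i < n_witems; ++i) {
        red_val = op(red_val, local_mem[i]);
    }

    std::cout << "reduced: " << red_val
              << "  expected: " << (wgs * (wgs + 1)) / 2 << "\n";
    return 0;
}
```

Each fold writes indices below `cutoff` and reads indices at or above
`step`, and since `step >= cutoff` always holds, the reads and writes never
overlap; that is what makes the per-work-item updates between barriers safe.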
From 03910f3cfa9b4e95ef0670c409f64ada793216db Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 11 Dec 2024 22:11:14 -0600
Subject: [PATCH 2/4] Properly set properties of group_load/group_store to
 striped

Doing so exactly recovers the behavior of sub_group::load and
sub_group::store, and eliminates warnings with 2025.1 and SYCLOS.

With this change, use of group_load/group_store is enabled for DPC++
compilers with `__LIBSYCL_MAJOR_VERSION >= 8u`, which includes the
oneAPI DPC++ 2025.0.x compiler and the SYCLOS bundle.
---
 .../tensor/libtensor/include/utils/sycl_utils.hpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 53ecbbbe09..deda9cd810 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -482,7 +482,7 @@ struct Identity::value>>
     SYCL_EXT_ONEAPI_GROUP_LOAD_STORE
 #define USE_GROUP_LOAD_STORE 1
 #else
-#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER > 20250100u)
+#if defined(__LIBSYCL_MAJOR_VERSION) && (__LIBSYCL_MAJOR_VERSION >= 8u)
 #define USE_GROUP_LOAD_STORE 1
 #else
 #define USE_GROUP_LOAD_STORE 0
@@ -504,7 +504,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     sycl::vec x{};
-    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
     return sg.load(m_ptr);
@@ -520,7 +521,8 @@ auto sub_group_load(const sycl::sub_group &sg,
 #if (USE_GROUP_LOAD_STORE)
     using ValueT = typename std::remove_cv_t;
     ValueT x{};
-    ls_ns::group_load(sg, m_ptr, x, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_load(sg, m_ptr, x, striped);
     return x;
 #else
     return sg.load(m_ptr);
@@ -541,7 +543,8 @@ sub_group_store(const sycl::sub_group &sg,
 {
 #if (USE_GROUP_LOAD_STORE)
     static_assert(std::is_same_v);
-    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
     sg.store(m_ptr, val);
@@ -561,7 +564,8 @@ sub_group_store(const sycl::sub_group &sg,
                 sycl::multi_ptr m_ptr)
 {
 #if (USE_GROUP_LOAD_STORE)
-    ls_ns::group_store(sg, val, m_ptr, ls_ns::data_placement_blocked);
+    constexpr auto striped = ls_ns::properties{ls_ns::data_placement_striped};
+    ls_ns::group_store(sg, val, m_ptr, striped);
     return;
 #else
     sg.store(m_ptr, val);
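For context on the data_placement_striped property chosen in the patch above:
with blocked placement each work-item owns a contiguous chunk of `vec_sz`
elements, while with striped placement consecutive elements are dealt out
across the sub-group, which is the pattern sub_group::load and
sub_group::store use. A small stand-alone C++ sketch of the two index
mappings follows; `sg_size = 8` and `vec_sz = 4` are assumed values picked
purely for illustration.

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    // Assumed values, for illustration only: a sub-group of 8 work-items,
    // each loading/storing 4 elements.
    constexpr std::uint32_t sg_size = 8;
    constexpr std::uint32_t vec_sz = 4;

    for (std::uint32_t lid = 0; lid < sg_size; ++lid) {
        std::cout << "work-item " << lid << ":  blocked ->";
        // blocked placement: each work-item owns a contiguous chunk
        for (std::uint32_t v = 0; v < vec_sz; ++v) {
            std::cout << " " << (lid * vec_sz + v);
        }
        std::cout << "   striped ->";
        // striped placement: consecutive elements go to consecutive
        // work-items; this is the layout sub_group::load/store use
        for (std::uint32_t v = 0; v < vec_sz; ++v) {
            std::cout << " " << (lid + v * sg_size);
        }
        std::cout << "\n";
    }
    return 0;
}
```

With the striped mapping, work-item `lid` touches elements `lid`,
`lid + sg_size`, `lid + 2*sg_size`, and so on, so switching the property
keeps every kernel reading and writing exactly the elements it previously
accessed through sub_group::load/store.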
From e15e3aae6e53e045f48d4d10118944d619231766 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 12 Dec 2024 13:44:15 -0600
Subject: [PATCH 3/4] Tweak bounds of cooperative reduction steps

Factor out the bounds as constexpr values reused between the
power-of-two branch and the non-power-of-two branch. Lower the lower
bound from 32 to 8 based on performance testing on PVC and Iris Xe.
---
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index deda9cd810..3a7e70e7bc 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -166,6 +166,8 @@ T custom_reduce_over_group(const GroupT &wg,
                            const T &local_val,
                            const OpT &op)
 {
+    constexpr std::uint32_t low_sz = 8u;
+    constexpr std::uint32_t high_sz = 1024u;
+
     const std::uint32_t wgs = wg.get_local_linear_range();
     const std::uint32_t lid = wg.get_local_linear_id();
@@ -176,7 +178,7 @@ T custom_reduce_over_group(const GroupT &wg,
     if (wgs & (wgs - 1)) {
         // wgs is not a power of 2
 #pragma unroll
-        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
             if (n_witems >= sz) {
                 const std::uint32_t n_witems_ = (n_witems + 1) >> 1;
                 _fold(local_mem_acc, lid, n_witems - n_witems_, n_witems_, op);
@@ -188,7 +190,7 @@ T custom_reduce_over_group(const GroupT &wg,
     else {
         // wgs is a power of 2
 #pragma unroll
-        for (std::uint32_t sz = 1024; sz >= 32; sz >>= 1) {
+        for (std::uint32_t sz = high_sz; sz >= low_sz; sz >>= 1) {
             if (n_witems >= sz) {
                 n_witems = (n_witems + 1) >> 1;
                 _fold(local_mem_acc, lid, n_witems, op);
@@ -204,8 +206,6 @@ T custom_reduce_over_group(const GroupT &wg,
         }
     }
 
-    // sycl::group_barrier(wg, sycl::memory_scope::work_group);
-
     return sycl::group_broadcast(wg, red_val_over_wg, 0);
 }
 

From 63c19472a8ec62b3c97143785d51b0f882691f66 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 12 Dec 2024 13:55:04 -0600
Subject: [PATCH 4/4] Add changelog line for performance improvement in
 reductions with custom reduce_over_group function

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae78312038..0957f99e35 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 * Implement radix sort algorithm to be used in `dpt.sort` and `dpt.argsort` [gh-1867](https://github.com/IntelPython/dpctl/pull/1867)
 * Extended `dpctl.SyclTimer` with `device_timer` keyword, implementing different methods of collecting device times [gh-1872](https://github.com/IntelPython/dpctl/pull/1872)
 * Improved performance of `tensor.cumulative_sum`, `tensor.cumulative_prod`, `tensor.cumulative_logsumexp` as well as performance of boolean indexing [gh-1923](https://github.com/IntelPython/dpctl/pull/1923)
+* Improved performance of `tensor.min`, `tensor.max`, `tensor.logsumexp`, `tensor.reduce_hypot` for floating point type arrays by at least 2x [gh-1932](https://github.com/IntelPython/dpctl/pull/1932)
 
 ### Fixed
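To see what the bound change in patch 3 buys, the split between cooperative
folding steps and the leader's sequential tail can be tabulated with a small
host-side helper, assuming the loop structure above; the work-group size 256
is just an illustrative value.

```cpp
#include <cstdint>
#include <iostream>

// For a given work-group size and lower bound, count how many cooperative
// folding steps run and how many elements are left for the leader's
// sequential tail.
static void count_steps(std::uint32_t wgs, std::uint32_t low_sz)
{
    std::uint32_t n_witems = wgs;
    std::uint32_t steps = 0;
    for (std::uint32_t sz = 1024u; sz >= low_sz; sz >>= 1) {
        if (n_witems >= sz) {
            n_witems = (n_witems + 1) >> 1; // one fold halves the range
            ++steps;
        }
    }
    std::cout << "wgs=" << wgs << " low_sz=" << low_sz << ": " << steps
              << " cooperative steps, sequential tail of " << n_witems
              << " elements\n";
}

int main()
{
    // lower bound of 32 (before patch 3) vs 8 (after patch 3)
    count_steps(256, 32); // 4 steps, tail of 16
    count_steps(256, 8);  // 6 steps, tail of 4
    return 0;
}
```

Lowering the bound from 32 to 8 adds two more folds (and two more barriers)
but shrinks the tail the leader reduces alone from 16 elements to 4, which
is the trade-off the performance testing on PVC and Iris Xe favored.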