From 8be9696c0c2b5d23fbe6a0c2249749eaa6872e80 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Wed, 13 Jul 2022 07:20:14 -0700
Subject: [PATCH 1/4] [SYCL][Reduction] Remove atomic64 check for `float`
 reductions

---
 sycl/include/sycl/ext/oneapi/reduction.hpp | 34 ++++++++++------------
 1 file changed, 15 insertions(+), 19 deletions(-)
diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp
index 08598b2540cd5..bc292b55b260e 100644
--- a/sycl/include/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/sycl/ext/oneapi/reduction.hpp
@@ -56,7 +56,8 @@ using IsReduOptForFastAtomicFetch =
 #ifdef SYCL_REDUCTION_DETERMINISTIC
     bool_constant<false>;
 #else
-    bool_constant<sycl::detail::is_sgeninteger<T>::value &&
+    bool_constant<((sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 4) ||
+                   sycl::detail::is_sgeninteger<T>::value) &&
                   sycl::detail::IsValidAtomicType<T>::value &&
                   (sycl::detail::IsPlus<T, BinaryOperation>::value ||
                    sycl::detail::IsMinimum<T, BinaryOperation>::value ||
@@ -74,18 +75,15 @@ using IsReduOptForFastAtomicFetch =
 // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits
 // using the reduce_over_group() algorithm to produce stable results across same
 // type devices.
-// TODO 32 bit floating point atomics are eventually expected to be supported by
-// the has_fast_atomics specialization. Once the reducer class is updated to
-// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4)
-// case should be removed here and replaced in IsReduOptForFastAtomicFetch.
 template <typename T, class BinaryOperation>
-using IsReduOptForAtomic64Add =
+using IsReduOptForAtomic64Op =
 #ifdef SYCL_REDUCTION_DETERMINISTIC
     bool_constant<false>;
 #else
-    bool_constant<sycl::detail::IsPlus<T, BinaryOperation>::value &&
-                  sycl::detail::is_sgenfloat<T>::value &&
-                  (sizeof(T) == 4 || sizeof(T) == 8)>;
+    bool_constant<(sycl::detail::IsPlus<T, BinaryOperation>::value ||
+                   sycl::detail::IsMinimum<T, BinaryOperation>::value ||
+                   sycl::detail::IsMaximum<T, BinaryOperation>::value) &&
+                  sycl::detail::is_sgenfloat<T>::value && sizeof(T) == 8>;
 #endif
 
 // This type trait is used to detect if the group algorithm reduce() used with
@@ -248,7 +246,7 @@ template <class Reducer> class combiner {
             typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
               (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
-               IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
+               IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
               sycl::detail::IsPlus<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic_combine_impl<Space>(
@@ -294,7 +292,8 @@ template <class Reducer> class combiner {
   template <access::address_space Space = access::address_space::global_space,
             typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
-              IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
+              (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
+               IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
               sycl::detail::IsMinimum<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic_combine_impl<Space>(
@@ -305,7 +304,8 @@ template <class Reducer> class combiner {
   template <access::address_space Space = access::address_space::global_space,
             typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<BasicCheck<_T, Space, _BinaryOperation> &&
-              IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
+              (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
+               IsReduOptForAtomic64Op<T, _BinaryOperation>::value) &&
               sycl::detail::IsMaximum<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic_combine_impl<Space>(
@@ -566,7 +566,7 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
                                     access::target::device, is_placeholder,
                                     ext::oneapi::accessor_property_list<>>;
   static constexpr bool has_atomic_add_float64 =
-      IsReduOptForAtomic64Add<T, BinaryOperation>::value;
+      IsReduOptForAtomic64Op<T, BinaryOperation>::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
   static constexpr bool has_fast_reduce =
@@ -2040,12 +2040,8 @@ template <class KernelName> struct NDRangeAtomic64;
 } // namespace main_krn
 } // namespace reduction
 
-// Specialization for devices with the atomic64 aspect, which guarantees 64 (and
-// temporarily 32) bit floating point support for atomic add.
-// TODO 32 bit floating point atomics are eventually expected to be supported by
-// the has_fast_atomics specialization. Corresponding changes to
-// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also
-// be made.
+// Specialization for devices with the atomic64 aspect, which guarantees 64 bit
+// floating point support for atomic add.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                         const nd_range<Dims> &Range, Reduction &Redu) {

From 79881bfa0cd762e19812f3764622bb4217d02a6b Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Mon, 25 Jul 2022 09:28:29 +0100
Subject: [PATCH 2/4] Fix comment

Co-authored-by: aelovikov-intel <andrei.elovikov@intel.com>
---
 sycl/include/sycl/ext/oneapi/reduction.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp
index bc292b55b260e..07f3fb6b09908 100644
--- a/sycl/include/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/sycl/ext/oneapi/reduction.hpp
@@ -2041,7 +2041,7 @@ template <class KernelName> struct NDRangeAtomic64;
 } // namespace reduction
 
 // Specialization for devices with the atomic64 aspect, which guarantees 64 bit
-// floating point support for atomic add.
+// floating point support for atomic reduction operation.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                         const nd_range<Dims> &Range, Reduction &Redu) {

From d0f45789bacb376152b04724cb439a981cb5c579 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Mon, 25 Jul 2022 01:36:23 -0700
Subject: [PATCH 3/4] Fix comment

---
 sycl/include/sycl/ext/oneapi/reduction.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp
index 07f3fb6b09908..1e5cd45a2cf9a 100644
--- a/sycl/include/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/sycl/ext/oneapi/reduction.hpp
@@ -2047,7 +2047,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                         const nd_range<Dims> &Range, Reduction &Redu) {
   auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
   static_assert(Reduction::has_atomic_add_float64,
-                "Only suitable for reductions that have FP64 atomic add.");
+                "Only suitable for reductions that have FP64 atomic operations.");
   constexpr size_t NElements = Reduction::num_elements;
   using Name =
       __sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;

From 934454b7749102f4c69172b9b97060952e752f49 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Mon, 25 Jul 2022 01:58:10 -0700
Subject: [PATCH 4/4] Fix variable name

---
 sycl/include/sycl/ext/oneapi/reduction.hpp | 9 +++++----
 sycl/include/sycl/handler.hpp              | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp
index 1e5cd45a2cf9a..363faf9176589 100644
--- a/sycl/include/sycl/ext/oneapi/reduction.hpp
+++ b/sycl/include/sycl/ext/oneapi/reduction.hpp
@@ -565,7 +565,7 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
   using rw_accessor_type = accessor<T, accessor_dim, access::mode::read_write,
                                     access::target::device, is_placeholder,
                                     ext::oneapi::accessor_property_list<>>;
-  static constexpr bool has_atomic_add_float64 =
+  static constexpr bool has_float64_atomics =
       IsReduOptForAtomic64Op<T, BinaryOperation>::value;
   static constexpr bool has_fast_atomics =
       IsReduOptForFastAtomicFetch<T, BinaryOperation>::value;
@@ -645,7 +645,7 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
   /// require initialization with identity value, then return user's read-write
   /// accessor. Otherwise, create global buffer with 'num_elements' initialized
   /// with identity value and return an accessor to that buffer.
-  template <bool HasFastAtomics = (has_fast_atomics || has_atomic_add_float64)>
+  template <bool HasFastAtomics = (has_fast_atomics || has_float64_atomics)>
   std::enable_if_t<HasFastAtomics, rw_accessor_type>
   getReadWriteAccessorToInitializedMem(handler &CGH) {
     if constexpr (is_rw_acc) {
@@ -2046,8 +2046,9 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
 void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
                         const nd_range<Dims> &Range, Reduction &Redu) {
   auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
-  static_assert(Reduction::has_atomic_add_float64,
-                "Only suitable for reductions that have FP64 atomic operations.");
+  static_assert(
+      Reduction::has_float64_atomics,
+      "Only suitable for reductions that have FP64 atomic operations.");
   constexpr size_t NElements = Reduction::num_elements;
   using Name =
       __sycl_reduction_kernel<reduction::main_krn::NDRangeAtomic64, KernelName>;
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 6425d59a9b5fe..62704252d2daf 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -1669,13 +1669,13 @@ class __SYCL_EXPORT handler {
   void parallel_for(nd_range<Dims> Range, Reduction Redu,
                     _KERNELFUNCPARAM(KernelFunc)) {
     if constexpr (!Reduction::has_fast_atomics &&
-                  !Reduction::has_atomic_add_float64) {
+                  !Reduction::has_float64_atomics) {
       // The most basic implementation.
       parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
       return;
     } else { // Can't "early" return for "if constexpr".
       std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
-      if constexpr (Reduction::has_atomic_add_float64) {
+      if constexpr (Reduction::has_float64_atomics) {
         /// This version is a specialization for the add
         /// operator. It performs runtime checks for device aspect "atomic64";
         /// if found, fast sycl::atomic_ref operations are used to update the