From 8be9696c0c2b5d23fbe6a0c2249749eaa6872e80 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Wed, 13 Jul 2022 07:20:14 -0700 Subject: [PATCH 1/4] [SYCL][Reduction] Remove atomic64 check for `float` reductions --- sycl/include/sycl/ext/oneapi/reduction.hpp | 34 ++++++++++------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp index 08598b2540cd5..bc292b55b260e 100644 --- a/sycl/include/sycl/ext/oneapi/reduction.hpp +++ b/sycl/include/sycl/ext/oneapi/reduction.hpp @@ -56,7 +56,8 @@ using IsReduOptForFastAtomicFetch = #ifdef SYCL_REDUCTION_DETERMINISTIC bool_constant; #else - bool_constant::value && + bool_constant<((sycl::detail::is_sgenfloat::value && sizeof(T) == 4) || + sycl::detail::is_sgeninteger::value) && sycl::detail::IsValidAtomicType::value && (sycl::detail::IsPlus::value || sycl::detail::IsMinimum::value || @@ -74,18 +75,15 @@ using IsReduOptForFastAtomicFetch = // IsReduOptForFastReduce. The macro SYCL_REDUCTION_DETERMINISTIC prohibits // using the reduce_over_group() algorithm to produce stable results across same // type devices. -// TODO 32 bit floating point atomics are eventually expected to be supported by -// the has_fast_atomics specialization. Once the reducer class is updated to -// replace the deprecated atomic class with atomic_ref, the (sizeof(T) == 4) -// case should be removed here and replaced in IsReduOptForFastAtomicFetch. template -using IsReduOptForAtomic64Add = +using IsReduOptForAtomic64Op = #ifdef SYCL_REDUCTION_DETERMINISTIC bool_constant; #else - bool_constant::value && - sycl::detail::is_sgenfloat::value && - (sizeof(T) == 4 || sizeof(T) == 8)>; + bool_constant<(sycl::detail::IsPlus::value || + sycl::detail::IsMinimum::value || + sycl::detail::IsMaximum::value) && + sycl::detail::is_sgenfloat::value && sizeof(T) == 8>; #endif // This type trait is used to detect if the group algorithm reduce() used with @@ -248,7 +246,7 @@ template class combiner { typename _T = T, class _BinaryOperation = BinaryOperation> enable_if_t && (IsReduOptForFastAtomicFetch::value || - IsReduOptForAtomic64Add::value) && + IsReduOptForAtomic64Op::value) && sycl::detail::IsPlus::value> atomic_combine(_T *ReduVarPtr) const { atomic_combine_impl( @@ -294,7 +292,8 @@ template class combiner { template enable_if_t && - IsReduOptForFastAtomicFetch::value && + (IsReduOptForFastAtomicFetch::value || + IsReduOptForAtomic64Op::value) && sycl::detail::IsMinimum::value> atomic_combine(_T *ReduVarPtr) const { atomic_combine_impl( @@ -305,7 +304,8 @@ template class combiner { template enable_if_t && - IsReduOptForFastAtomicFetch::value && + (IsReduOptForFastAtomicFetch::value || + IsReduOptForAtomic64Op::value) && sycl::detail::IsMaximum::value> atomic_combine(_T *ReduVarPtr) const { atomic_combine_impl( @@ -566,7 +566,7 @@ class reduction_impl_algo : public reduction_impl_common { access::target::device, is_placeholder, ext::oneapi::accessor_property_list<>>; static constexpr bool has_atomic_add_float64 = - IsReduOptForAtomic64Add::value; + IsReduOptForAtomic64Op::value; static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; static constexpr bool has_fast_reduce = @@ -2040,12 +2040,8 @@ template struct NDRangeAtomic64; } // namespace main_krn } // namespace reduction -// Specialization for devices with the atomic64 aspect, which guarantees 64 (and -// temporarily 32) bit floating point support for atomic add. -// TODO 32 bit floating point atomics are eventually expected to be supported by -// the has_fast_atomics specialization. Corresponding changes to -// IsReduOptForAtomic64Add, as prescribed in its documentation, should then also -// be made. +// Specialization for devices with the atomic64 aspect, which guarantees 64 bit +// floating point support for atomic add. template void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { From 79881bfa0cd762e19812f3764622bb4217d02a6b Mon Sep 17 00:00:00 2001 From: pgorlani <92453485+pgorlani@users.noreply.github.com> Date: Mon, 25 Jul 2022 09:28:29 +0100 Subject: [PATCH 2/4] Fix comment Co-authored-by: aelovikov-intel --- sycl/include/sycl/ext/oneapi/reduction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp index bc292b55b260e..07f3fb6b09908 100644 --- a/sycl/include/sycl/ext/oneapi/reduction.hpp +++ b/sycl/include/sycl/ext/oneapi/reduction.hpp @@ -2041,7 +2041,7 @@ template struct NDRangeAtomic64; } // namespace reduction // Specialization for devices with the atomic64 aspect, which guarantees 64 bit -// floating point support for atomic add. +// floating point support for atomic reduction operation. template void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { From d0f45789bacb376152b04724cb439a981cb5c579 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Mon, 25 Jul 2022 01:36:23 -0700 Subject: [PATCH 3/4] Fix comment --- sycl/include/sycl/ext/oneapi/reduction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp index 07f3fb6b09908..1e5cd45a2cf9a 100644 --- a/sycl/include/sycl/ext/oneapi/reduction.hpp +++ b/sycl/include/sycl/ext/oneapi/reduction.hpp @@ -2047,7 +2047,7 @@ void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH); static_assert(Reduction::has_atomic_add_float64, - "Only suitable for reductions that have FP64 atomic add."); + "Only suitable for reductions that have FP64 atomic operations."); constexpr size_t NElements = Reduction::num_elements; using Name = __sycl_reduction_kernel; From 934454b7749102f4c69172b9b97060952e752f49 Mon Sep 17 00:00:00 2001 From: pgorlani Date: Mon, 25 Jul 2022 01:58:10 -0700 Subject: [PATCH 4/4] Fix variable name --- sycl/include/sycl/ext/oneapi/reduction.hpp | 9 +++++---- sycl/include/sycl/handler.hpp | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/reduction.hpp b/sycl/include/sycl/ext/oneapi/reduction.hpp index 1e5cd45a2cf9a..363faf9176589 100644 --- a/sycl/include/sycl/ext/oneapi/reduction.hpp +++ b/sycl/include/sycl/ext/oneapi/reduction.hpp @@ -565,7 +565,7 @@ class reduction_impl_algo : public reduction_impl_common { using rw_accessor_type = accessor>; - static constexpr bool has_atomic_add_float64 = + static constexpr bool has_float64_atomics = IsReduOptForAtomic64Op::value; static constexpr bool has_fast_atomics = IsReduOptForFastAtomicFetch::value; @@ -645,7 +645,7 @@ class reduction_impl_algo : public reduction_impl_common { /// require initialization with identity value, then return user's read-write /// accessor. Otherwise, create global buffer with 'num_elements' initialized /// with identity value and return an accessor to that buffer. - template + template std::enable_if_t getReadWriteAccessorToInitializedMem(handler &CGH) { if constexpr (is_rw_acc) { @@ -2046,8 +2046,9 @@ template void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc, const nd_range &Range, Reduction &Redu) { auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH); - static_assert(Reduction::has_atomic_add_float64, - "Only suitable for reductions that have FP64 atomic operations."); + static_assert( + Reduction::has_float64_atomics, + "Only suitable for reductions that have FP64 atomic operations."); constexpr size_t NElements = Reduction::num_elements; using Name = __sycl_reduction_kernel; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 6425d59a9b5fe..62704252d2daf 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1669,13 +1669,13 @@ class __SYCL_EXPORT handler { void parallel_for(nd_range Range, Reduction Redu, _KERNELFUNCPARAM(KernelFunc)) { if constexpr (!Reduction::has_fast_atomics && - !Reduction::has_atomic_add_float64) { + !Reduction::has_float64_atomics) { // The most basic implementation. parallel_for_impl(Range, Redu, KernelFunc); return; } else { // Can't "early" return for "if constexpr". std::shared_ptr QueueCopy = MQueue; - if constexpr (Reduction::has_atomic_add_float64) { + if constexpr (Reduction::has_float64_atomics) { /// This version is a specialization for the add /// operator. It performs runtime checks for device aspect "atomic64"; /// if found, fast sycl::atomic_ref operations are used to update the