diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 8044a09293c80..602a78cf56c73 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -853,7 +853,7 @@ class __SYCL_EXPORT handler { int Dims, typename Reduction> detail::enable_if_t - parallel_for(nd_range Range, Reduction &Redu, KernelType KernelFunc) { + parallel_for(nd_range Range, Reduction Redu, KernelType KernelFunc) { if (Reduction::is_usm) Redu.associateWithHandler(*this); shared_ptr_class QueueCopy = MQueue; @@ -886,7 +886,7 @@ class __SYCL_EXPORT handler { int Dims, typename Reduction> detail::enable_if_t - parallel_for(nd_range Range, Reduction &Redu, KernelType KernelFunc) { + parallel_for(nd_range Range, Reduction Redu, KernelType KernelFunc) { shared_ptr_class QueueCopy = MQueue; auto RWAcc = Redu.getReadWriteScalarAcc(*this); intel::detail::reduCGFunc(*this, KernelFunc, Range, Redu, @@ -920,7 +920,7 @@ class __SYCL_EXPORT handler { template detail::enable_if_t - parallel_for(nd_range Range, Reduction &Redu, KernelType KernelFunc) { + parallel_for(nd_range Range, Reduction Redu, KernelType KernelFunc) { size_t NWorkGroups = Range.get_group_range().size(); // This parallel_for() is lowered to the following sequence: diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp index 7a996f365915d..4b9bec9829c81 100644 --- a/sycl/include/CL/sycl/intel/reduction.hpp +++ b/sycl/include/CL/sycl/intel/reduction.hpp @@ -26,65 +26,105 @@ using cl::sycl::detail::is_geninteger8bit; using cl::sycl::detail::remove_AS; template -using IsReduOptForFastAtomicFetch = detail::bool_constant< - (is_geninteger32bit::value || is_geninteger64bit::value) && - (std::is_same>::value || - std::is_same>::value || - std::is_same>::value || - std::is_same>::value || - std::is_same>::value || - std::is_same>::value)>; +using IsReduPlus = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduMultiplies = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduMinimum = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduMaximum = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduBitOR = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduBitXOR = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduBitAND = detail::bool_constant< + std::is_same>::value || + std::is_same>::value>; + +template +using IsReduOptForFastAtomicFetch = + detail::bool_constant<(is_geninteger32bit::value || + is_geninteger64bit::value) && + (IsReduPlus::value || + IsReduMinimum::value || + IsReduMaximum::value || + IsReduBitOR::value || + IsReduBitXOR::value || + IsReduBitAND::value)>; template using IsReduOptForFastReduce = detail::bool_constant< (is_geninteger32bit::value || is_geninteger64bit::value || std::is_same::value || std::is_same::value || std::is_same::value) && - (std::is_same>::value || - std::is_same>::value || - std::is_same>::value)>; + (IsReduPlus::value || + IsReduMinimum::value || + IsReduMaximum::value)>; // Identity = 0 template using IsZeroIdentityOp = bool_constant< ((is_geninteger8bit::value || is_geninteger16bit::value || is_geninteger32bit::value || is_geninteger64bit::value) && - (std::is_same>::value || - std::is_same>::value || - std::is_same>::value)) || - ((std::is_same::value || std::is_same::value) && - std::is_same>::value)>; + (IsReduPlus::value || + IsReduBitOR::value || + IsReduBitXOR::value)) || + ((std::is_same::value || std::is_same::value || + std::is_same::value) && + IsReduPlus::value)>; // Identity = 1 template using IsOneIdentityOp = bool_constant< (is_geninteger8bit::value || is_geninteger16bit::value || is_geninteger32bit::value || is_geninteger64bit::value || - std::is_same::value || std::is_same::value) && - std::is_same>::value>; + std::is_same::value || std::is_same::value || + std::is_same::value) && + IsReduMultiplies::value>; // Identity = ~0 template using IsOnesIdentityOp = bool_constant< (is_geninteger8bit::value || is_geninteger16bit::value || is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same>::value>; + IsReduBitAND::value>; // Identity = template using IsMinimumIdentityOp = bool_constant< (is_geninteger8bit::value || is_geninteger16bit::value || is_geninteger32bit::value || is_geninteger64bit::value || - std::is_same::value || std::is_same::value) && - std::is_same>::value>; + std::is_same::value || std::is_same::value || + std::is_same::value) && + IsReduMinimum::value>; // Identity = template using IsMaximumIdentityOp = bool_constant< (is_geninteger8bit::value || is_geninteger16bit::value || is_geninteger32bit::value || is_geninteger64bit::value || - std::is_same::value || std::is_same::value) && - std::is_same>::value>; + std::is_same::value || std::is_same::value || + std::is_same::value) && + IsReduMaximum::value>; template using IsKnownIdentityOp = @@ -169,19 +209,23 @@ class reducer static enable_if_t::value, _T> getIdentity() { - return (std::numeric_limits<_T>::max)(); + return std::numeric_limits<_T>::has_infinity + ? std::numeric_limits<_T>::infinity() + : (std::numeric_limits<_T>::max)(); } /// Returns minimal possible value as identity for MAX operations. template static enable_if_t::value, _T> getIdentity() { - return (std::numeric_limits<_T>::min)(); + return std::numeric_limits<_T>::has_infinity + ? -std::numeric_limits<_T>::infinity() + : std::numeric_limits<_T>::lowest(); } template enable_if_t::value && - std::is_same>::value, + IsReduPlus::value, reducer &> operator+=(const _T &Partial) { combine(Partial); @@ -190,7 +234,7 @@ class reducer enable_if_t::value && - std::is_same>::value, + IsReduMultiplies::value, reducer &> operator*=(const _T &Partial) { combine(Partial); @@ -199,7 +243,7 @@ class reducer enable_if_t::value && - std::is_same>::value, + IsReduBitOR::value, reducer &> operator|=(const _T &Partial) { combine(Partial); @@ -208,7 +252,7 @@ class reducer enable_if_t::value && - std::is_same>::value, + IsReduBitXOR::value, reducer &> operator^=(const _T &Partial) { combine(Partial); @@ -217,7 +261,7 @@ class reducer enable_if_t::value && - std::is_same>::value, + IsReduBitAND::value, reducer &> operator&=(const _T &Partial) { combine(Partial); @@ -228,7 +272,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::plus>::value> + IsReduPlus::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_add(MValue); @@ -238,7 +282,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::bit_or>::value> + IsReduBitOR::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_or(MValue); @@ -248,7 +292,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::bit_xor>::value> + IsReduBitXOR::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_xor(MValue); @@ -258,7 +302,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::bit_and>::value> + IsReduBitAND::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_and(MValue); @@ -268,7 +312,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::minimum>::value> + IsReduMinimum::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_min(MValue); @@ -278,7 +322,7 @@ class reducer enable_if_t::type, T>::value && (is_geninteger32bit::value || is_geninteger64bit::value) && - std::is_same<_BinaryOperation, intel::maximum>::value> + IsReduMaximum::value> atomic_combine(_T *ReduVarPtr) const { atomic(global_ptr(ReduVarPtr)) .fetch_max(MValue); @@ -349,17 +393,20 @@ class reduction_impl { typename _T = T, class _BinaryOperation = BinaryOperation, enable_if_t::value> * = nullptr> reduction_impl(accessor_type &Acc, const T &Identity) - : MAcc(Acc), MIdentity(Identity) { + : MAcc(Acc), MIdentity(getIdentity()) { assert(Acc.get_count() == 1 && "Only scalar/1-element reductions are supported now."); - // For operations with known identity value the operator == is defined. - // It is sort of dilemma here: from one point of view - user may set - // such identity that would be enough for his data, i.e. identity=100 for - // min operation if user knows all data elements are less than 100. - // From another point of view - it is the source of unexpected errors, - // when the input data changes. - // Let's be strict for now and emit an error if identity is not proper. - assert(Identity == getIdentity() && "Unexpected Identity parameter value."); + // For now the implementation ignores the identity value given by user + // when the implementation knows the identity. + // The SPEC could prohibit passing identity parameter to operations with + // known identity, but that could have some bad consequences too. + // For example, at some moment the implementation may NOT know the identity + // for COMPLEX-PLUS reduction. User may create a program that would pass + // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment + // when the implementation starts handling COMPLEX-PLUS as known operation + // the existing user's program remains compilable and working correctly. + // I.e. with this constructor here, adding more reduction operations to the + // list of known operations does not break the existing programs. } /// Constructs reduction_impl when the identity value is unknown. @@ -577,7 +624,8 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, typename Reduction::binary_operation BOp; size_t GID = NDIt.get_global_linear_id(); - auto Val = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity(); + typename Reduction::result_type Val = + (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity(); Reducer.MValue = intel::reduce(NDIt.get_group(), Val, BOp); if (NDIt.get_local_linear_id() == 0) Reducer.atomic_combine(Out.get_pointer().get()); @@ -654,7 +702,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, size_t LID = NDIt.get_local_linear_id(); size_t GID = NDIt.get_global_linear_id(); // Copy the element to local memory to prepare it for tree-reduction. - auto ReduIdentity = Reducer.getIdentity(); + typename Reduction::result_type ReduIdentity = Reducer.getIdentity(); LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity; LocalReds[WGSize] = ReduIdentity; NDIt.barrier(); @@ -715,10 +763,13 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, // Compute the partial sum/reduction for the work-group. typename Reduction::binary_operation BOp; size_t WGID = NDIt.get_group_linear_id(); - auto V = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp); - if (NDIt.get_local_linear_id() == 0) - Out.get_pointer().get()[WGID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + typename Reduction::result_type PSum = + intel::reduce(NDIt.get_group(), Reducer.MValue, BOp); + if (NDIt.get_local_linear_id() == 0) { + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[WGID] = PSum; + } }); } else { // Inefficient case: non-uniform work-group require additional checks. @@ -734,11 +785,14 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, typename Reduction::binary_operation BOp; size_t GID = NDIt.get_global_linear_id(); size_t WGID = NDIt.get_group_linear_id(); - auto V = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity(); - V = intel::reduce(NDIt.get_group(), V, BOp); - if (NDIt.get_local_linear_id() == 0) - Out.get_pointer().get()[WGID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + typename Reduction::result_type PSum = + (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity(); + PSum = intel::reduce(NDIt.get_group(), PSum, BOp); + if (NDIt.get_local_linear_id() == 0) { + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[WGID] = PSum; + } }); } } @@ -777,7 +831,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH); auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH); - auto ReduIdentity = Redu.getIdentity(); + typename Reduction::result_type ReduIdentity = Redu.getIdentity(); if (IsEfficientCase) { // Efficient case: work-groups are uniform and WGSize is is power of two. CGH.parallel_for(Range, [=](nd_item NDIt) { @@ -800,10 +854,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, } // Compute the partial sum/reduction for the work-group. - if (LID == 0) - Out.get_pointer().get()[NDIt.get_group_linear_id()] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) - : LocalReds[0]; + if (LID == 0) { + typename Reduction::result_type PSum = LocalReds[0]; + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[NDIt.get_group_linear_id()] = PSum; + } }); } else { // Inefficient case: work-groups are non uniform or WGSize is not power @@ -844,9 +900,11 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range &Range, // Compute the partial sum/reduction for the work-group. if (LID == 0) { size_t GrID = NDIt.get_group_linear_id(); - auto V = BOp(LocalReds[0], LocalReds[WGSize]); - Out.get_pointer().get()[GrID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + typename Reduction::result_type PSum = + BOp(LocalReds[0], LocalReds[WGSize]); + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[GrID] = PSum; } }); } @@ -884,10 +942,13 @@ reduAuxCGFunc(handler &CGH, const nd_range &Range, size_t NWorkItems, typename Reduction::binary_operation BOp; size_t WGID = NDIt.get_group_linear_id(); size_t GID = NDIt.get_global_linear_id(); - auto V = intel::reduce(NDIt.get_group(), In[GID], BOp); - if (NDIt.get_local_linear_id() == 0) - Out.get_pointer().get()[WGID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + typename Reduction::result_type PSum = + intel::reduce(NDIt.get_group(), In[GID], BOp); + if (NDIt.get_local_linear_id() == 0) { + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[WGID] = PSum; + } }); } else { // Inefficient case: non-uniform work-groups require additional checks. @@ -899,12 +960,14 @@ reduAuxCGFunc(handler &CGH, const nd_range &Range, size_t NWorkItems, size_t WGID = NDIt.get_group_linear_id(); size_t GID = NDIt.get_global_linear_id(); typename Reduction::reducer_type Reducer; - auto V = + typename Reduction::result_type PSum = (GID < NWorkItems) ? In[GID] : Reduction::reducer_type::getIdentity(); - V = intel::reduce(NDIt.get_group(), V, BOp); - if (NDIt.get_local_linear_id() == 0) - Out.get_pointer().get()[WGID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + PSum = intel::reduce(NDIt.get_group(), PSum, BOp); + if (NDIt.get_local_linear_id() == 0) { + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[WGID] = PSum; + } }); } } @@ -969,10 +1032,12 @@ reduAuxCGFunc(handler &CGH, const nd_range &Range, size_t NWorkItems, } // Compute the partial sum/reduction for the work-group. - if (LID == 0) - Out.get_pointer().get()[NDIt.get_group_linear_id()] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) - : LocalReds[0]; + if (LID == 0) { + typename Reduction::result_type PSum = LocalReds[0]; + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[NDIt.get_group_linear_id()] = PSum; + } }); } else { // Inefficient case: work-groups are not fully loaded @@ -1010,9 +1075,11 @@ reduAuxCGFunc(handler &CGH, const nd_range &Range, size_t NWorkItems, // Compute the partial sum/reduction for the work-group. if (LID == 0) { size_t GrID = NDIt.get_group_linear_id(); - auto V = BOp(LocalReds[0], LocalReds[WGSize]); - Out.get_pointer().get()[GrID] = - IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + typename Reduction::result_type PSum = + BOp(LocalReds[0], LocalReds[WGSize]); + if (IsUpdateOfUserAcc) + PSum = BOp(*(Out.get_pointer()), PSum); + Out.get_pointer().get()[GrID] = PSum; } }); } diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp index 9e4fb306a95b9..7f8e8e9726e59 100644 --- a/sycl/test/reduction/reduction_ctor.cpp +++ b/sycl/test/reduction/reduction_ctor.cpp @@ -1,48 +1,15 @@ // RUN: %clangxx -fsycl %s -o %t.out // RUN: env SYCL_DEVICE_TYPE=HOST %t.out -//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// // This performs basic checks such as reduction creation, getIdentity() method, // and the combine() method of the aux class 'reducer'. +#include "reduction_utils.hpp" #include #include using namespace cl::sycl; -template -struct init_data_t { - void initInputData(T IdentityVal, - buffer &InBuf, - T &ExpectedReduValue) { - ExpectedReduValue = IdentityVal; - BinaryOperation Op; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - In[I] = ((I + 1) % 5) + 1; - ExpectedReduValue = Op(ExpectedReduValue, In[I]); - } - } -}; - -template -struct init_data_t, N> { - void initInputData(T IdentityVal, buffer &InBuf, T &ExpectedReduValue) { - ExpectedReduValue = IdentityVal; - std::multiplies Op; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - ExpectedReduValue = Op(ExpectedReduValue, In[I]); - } - } -}; template void test_reducer(Reduction &Redu, T A, T B) { @@ -157,15 +124,13 @@ int main() { testBoth>(0, 1, 8); testBoth>(0, 7, 3); testBoth>(~0, 7, 3); - testBoth>(std::numeric_limits::max(), 7, 3); - testBoth>(std::numeric_limits::min(), 7, 3); + testBoth>((std::numeric_limits::max)(), 7, 3); + testBoth>((std::numeric_limits::min)(), 7, 3); testBoth>(0, 1, 7); testBoth>(1, 1, 7); - testBoth>( - std::numeric_limits::max(), 7, 3); - testBoth>( - std::numeric_limits::min(), 7, 3); + testBoth>(getMaximumFPValue(), 7, 3); + testBoth>(getMinimumFPValue(), 7, 3); testUnknown, 0, PointPlus>(Point(0), Point(1), Point(7)); testUnknown, 1, PointPlus>(Point(0), Point(1), Point(7)); diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp index 4160efcb5afba..61b6e10481546 100644 --- a/sycl/test/reduction/reduction_nd_s0_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -79,17 +79,17 @@ int main() { test>(0, 8, 256); test>(0, 8, 256); test>(~0, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>((std::numeric_limits::max)(), 8, 256); + test>((std::numeric_limits::min)(), 8, 256); // Check with various types. test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); // Check with CUSTOM type. test, 0, CustomVecPlus>(CustomVec(0), 8, 256); diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp index b16eee70656cd..bdcaad8433911 100644 --- a/sycl/test/reduction/reduction_nd_s0_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -81,17 +81,17 @@ int main() { test>(0, 8, 256); test>(0, 8, 256); test>(~0, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>((std::numeric_limits::max)(), 8, 256); + test>((std::numeric_limits::min)(), 8, 256); // Check with various types. test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); // Check with CUSTOM type. test, 0, CustomVecPlus>(CustomVec(0), 8, 256); diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp index eef8413a76a32..981d4e77f5e33 100644 --- a/sycl/test/reduction/reduction_nd_s1_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp @@ -80,17 +80,17 @@ int main() { test>(0, 8, 256); test>(0, 8, 256); test>(~0, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>((std::numeric_limits::max)(), 8, 256); + test>((std::numeric_limits::min)(), 8, 256); // Check with various types. test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); // Check with CUSTOM type. test, 1, CustomVecPlus>(CustomVec(0), 8, 256); diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp index 6c69c4d04f12d..28ec3989273ae 100644 --- a/sycl/test/reduction/reduction_nd_s1_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp @@ -82,17 +82,17 @@ int main() { test>(0, 8, 256); test>(0, 8, 256); test>(~0, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>((std::numeric_limits::max)(), 8, 256); + test>((std::numeric_limits::min)(), 8, 256); // Check with various types. test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); test>(1, 8, 256); - test>(std::numeric_limits::max(), 8, 256); - test>(std::numeric_limits::min(), 8, 256); + test>(getMaximumFPValue(), 8, 256); + test>(getMinimumFPValue(), 8, 256); // Check with CUSTOM type. test, 1, CustomVecPlus>(CustomVec(0), 8, 256); diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp index 3b2ff95fb219b..e972105bbab50 100644 --- a/sycl/test/reduction/reduction_placeholder.cpp +++ b/sycl/test/reduction/reduction_placeholder.cpp @@ -72,8 +72,8 @@ int main() { test>(0, 4, 128); // fast reduce - test>(std::numeric_limits::max(), 5, 5 * 7); - test>(std::numeric_limits::min(), 4, 128); + test>(getMaximumFPValue(), 5, 5 * 7); + test>(getMinimumFPValue(), 4, 128); // generic algorithm test>(1, 7, 7 * 5); diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp index d9392fc2784a4..1363312c812b7 100644 --- a/sycl/test/reduction/reduction_transparent.cpp +++ b/sycl/test/reduction/reduction_transparent.cpp @@ -1,8 +1,17 @@ +// UNSUPPORTED: cuda +// Reductions use work-group builtins not yet supported by CUDA. + // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out -// RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out -// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce() +// for 'cl::sycl::half' type. +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out +// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out +// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out + +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// TODO: Enable the test for HOST when it supports intel::reduce() and barrier() // This test performs basic checks of parallel_for(nd_range, reduction, func) // where func is a transparent functor. @@ -14,10 +23,14 @@ using namespace cl::sycl; template -class SomeClass; +class SomeIdClass; +template +class SomeNoIdClass; +// Checks reductions initialized with transparent functor and explicitly set +// identity value. template -void test(T Identity, size_t WGSize, size_t NWItems) { +void testId(T Identity, size_t WGSize, size_t NWItems) { buffer InBuf(NWItems); buffer OutBuf(1); @@ -32,13 +45,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) { auto In = InBuf.template get_access(CGH); accessor Out(OutBuf, CGH); - auto Redu = intel::reduction(Out, Identity, BOp); range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( - NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + CGH.parallel_for>( + NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); }); @@ -54,12 +66,62 @@ void test(T Identity, size_t WGSize, size_t NWItems) { } } +// Checks reductions initialized with transparent functor and identity +// value not explicitly specified. The parameter 'Identity' is passed here +// only to pre-initialize input data correctly. +template +void testNoId(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + testId(Identity, WGSize, NWItems); + testNoId(Identity, WGSize, NWItems); +} + int main() { #if __cplusplus >= 201402L - test>(std::numeric_limits::min(), 7, 7 * 5); + test>(getMinimumFPValue(), 7, 7 * 5); test>(0, 7, 49); test>(1, 4, 16); -#endif +#ifndef SKIP_FOR_HALF + test>(0, 4, 8); + test>(getMaximumFPValue(), 8, 32); +#endif // SKIP_FOR_HALF +#endif // __cplusplus >= 201402L std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_usm.cpp b/sycl/test/reduction/reduction_usm.cpp index bb99f372818d9..592a36904a8e8 100644 --- a/sycl/test/reduction/reduction_usm.cpp +++ b/sycl/test/reduction/reduction_usm.cpp @@ -113,10 +113,8 @@ int main() { testUSM>(0, 4, 128); // fast reduce - testUSM>( - (std::numeric_limits::max)(), 5, 5 * 7); - testUSM>( - (std::numeric_limits::min)(), 4, 128); + testUSM>(getMaximumFPValue(), 5, 5 * 7); + testUSM>(getMinimumFPValue(), 4, 128); // generic algorithm testUSM>(1, 7, 7 * 5); diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp index c7a1699298f02..d09bae11e6e6e 100644 --- a/sycl/test/reduction/reduction_utils.hpp +++ b/sycl/test/reduction/reduction_utils.hpp @@ -52,3 +52,17 @@ struct CustomVecPlus { return CV(A.X + B.X, A.Y + B.Y); } }; + +template +T getMinimumFPValue() { + return std::numeric_limits::has_infinity + ? -std::numeric_limits::infinity() + : std::numeric_limits::lowest(); +} + +template +T getMaximumFPValue() { + return std::numeric_limits::has_infinity + ? std::numeric_limits::infinity() + : (std::numeric_limits::max)(); +}