diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index a11d3d89fb51..a0c415373b44 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -619,24 +619,19 @@ struct get_reduction_aux_kernel_name_t {
 ///
 /// Briefly: calls user's lambda, ONEAPI::reduce() + atomic, INT + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's function. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
     KernelFunc(NDIt, Reducer);
 
     typename Reduction::binary_operation BOp;
-    typename Reduction::result_type Val =
-        (UniformWG || NDIt.get_global_linear_id() < NWorkItems)
-            ? Reducer.MValue
-            : Reducer.getIdentity();
-    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Val, BOp);
+    Reducer.MValue = ONEAPI::reduce(NDIt.get_group(), Reducer.MValue, BOp);
     if (NDIt.get_local_linear_id() == 0)
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
   });
@@ -651,22 +646,21 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: calls user's lambda, tree-reduction + atomic, INT + AND/OR/XOR.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
 
   // Use local memory to reduce elements in work-groups into zero-th element.
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch reduce elements that could
   // otherwise be lost in the tree-reduction algorithm used in the kernel.
-  size_t NLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NLocalElements, CGH);
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
@@ -676,12 +670,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     size_t LID = NDIt.get_local_linear_id();
 
     // Copy the element to local memory to prepare it for tree-reduction.
-    typename Reduction::result_type ReduIdentity = Reducer.getIdentity();
-    LocalReds[LID] = (UniformPow2WG || NDIt.get_global_linear_id() < NWorkItems)
-                         ? Reducer.MValue
-                         : ReduIdentity;
-    if (!UniformPow2WG)
-      LocalReds[WGSize] = ReduIdentity;
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
+      LocalReds[WGSize] = Reducer.getIdentity();
     NDIt.barrier();
 
     // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
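As background for the tree-reduction hunks above (and the loop that follows in
the next hunk): the sketch below is a host-side model of the algorithm, not
part of the patch, and treeReduce is a hypothetical name. The extra trailing
slot plays the role of LocalReds[WGSize]: whenever a reduction step has odd
width, the element at index PrevStep - 1 has no partner, so it is folded into
that slot instead of being dropped.

#include <cstddef>
#include <vector>

// Host-side model of the kernel's tree reduction over one work-group.
// 'Vals' plays the role of LocalReds[0..WGSize-1]; one extra trailing slot
// catches the element that an odd-width step would otherwise lose.
template <typename T, typename BinOp>
T treeReduce(std::vector<T> Vals, T Identity, BinOp BOp) {
  size_t WGSize = Vals.size();
  Vals.push_back(Identity); // the LocalReds[WGSize] slot

  size_t PrevStep = WGSize;
  for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
    for (size_t LID = 0; LID < PrevStep; ++LID) { // work-items in lockstep
      if (LID < CurStep)
        Vals[LID] = BOp(Vals[LID], Vals[LID + CurStep]);
      else if (LID == CurStep && (PrevStep & 0x1))
        // Odd PrevStep: element PrevStep - 1 has no partner; park it.
        Vals[WGSize] = BOp(Vals[WGSize], Vals[PrevStep - 1]);
    }
    PrevStep = CurStep;
  }
  return BOp(Vals[0], Vals[WGSize]);
}

For a power-of-two work-group size every step has even width, the extra slot
never moves off the identity, and the IsPow2WG = true specialization can skip
it entirely.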
@@ -692,7 +683,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
@@ -700,7 +691,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     if (LID == 0) {
       Reducer.MValue =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       Reducer.atomic_combine(Reduction::getOutPointer(Out));
     }
   });
@@ -712,14 +703,14 @@
 enable_if_t<Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
-  size_t NWorkGroups = Range.get_group_range().size();
 
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0;
-  if (HasUniformWG)
+  // If the work-group size is not a power of two, then the kernel must run
+  // additional code and checks.
+  // If the reduction has a fast reduce operation, the kernel does not care
+  // whether the work-group size is a power of two; assume true in that case.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
+  if (IsPow2WG)
     reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
         CGH, KernelFunc, Range, Redu, Out);
   else
@@ -736,14 +727,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, ONEAPI:reduce(), FP + ADD/MIN/MAX.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformWG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
-
   // This additional check is needed for 'read_write' accessor case only.
   // It does not slow-down the kernel writing to 'discard_write' accessor as
   // the condition seems to be resolved at compile time for 'discard_write'.
@@ -751,7 +740,7 @@
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
 
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
     typename Reduction::reducer_type Reducer;
@@ -759,10 +748,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     KernelFunc(NDIt, Reducer);
     // Compute the partial sum/reduction for the work-group.
     size_t WGID = NDIt.get_group_linear_id();
-    typename Reduction::result_type PSum =
-        (UniformWG || (NDIt.get_group_linear_id() < NWorkItems))
-            ? Reducer.MValue
-            : Reducer.getIdentity();
+    typename Reduction::result_type PSum = Reducer.MValue;
     typename Reduction::binary_operation BOp;
     PSum = ONEAPI::reduce(NDIt.get_group(), PSum, BOp);
     if (NDIt.get_local_linear_id() == 0) {
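The IsPow2WG dispatch above relies on a standard bit trick. A minimal
illustration (isPow2 is a hypothetical name, not from the patch):

#include <cassert>
#include <cstddef>

// A power of two has exactly one set bit, so clearing the lowest set bit
// via X & (X - 1) yields zero precisely for powers of two. Note that the
// expression also reports true for X == 0.
constexpr bool isPow2(size_t X) { return (X & (X - 1)) == 0; }

int main() {
  assert(isPow2(1) && isPow2(2) && isPow2(64) && isPow2(1024));
  assert(!isPow2(3) && !isPow2(6) && !isPow2(1000));
  return 0;
}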
@@ -782,11 +768,10 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 ///
 /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
 template <typename KernelName, typename KernelType, int Dims, class Reduction,
-          bool UniformPow2WG, typename OutputT>
+          bool IsPow2WG, typename OutputT>
 enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
 reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
                Reduction &Redu, OutputT Out) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
@@ -797,11 +782,11 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   // If WGSize is not power of two, then WGSize+1 elements are allocated.
   // The additional last element is used to catch elements that could
   // otherwise be lost in the tree-reduction algorithm.
-  size_t NumLocalElements = WGSize + (UniformPow2WG ? 0 : 1);
+  size_t NumLocalElements = WGSize + (IsPow2WG ? 0 : 1);
   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
   typename Reduction::result_type ReduIdentity = Redu.getIdentity();
   using Name = typename get_reduction_main_kernel_name_t<
-      KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
+      KernelName, KernelType, Reduction::is_usm, IsPow2WG, OutputT>::name;
   auto BOp = Redu.getBinaryOperation();
   CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
     // Call user's functions. Reducer.MValue gets initialized there.
@@ -810,10 +795,9 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
-    size_t GID = NDIt.get_global_linear_id();
 
     // Copy the element to local memory to prepare it for tree-reduction.
-    LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-    if (!UniformPow2WG)
+    LocalReds[LID] = Reducer.MValue;
+    if (!IsPow2WG)
       LocalReds[WGSize] = ReduIdentity;
     NDIt.barrier();
@@ -824,7 +808,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
       if (LID < CurStep)
         LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-      else if (!UniformPow2WG && LID == CurStep && (PrevStep & 0x1))
+      else if (!IsPow2WG && LID == CurStep && (PrevStep & 0x1))
         LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
       NDIt.barrier();
       PrevStep = CurStep;
@@ -834,7 +818,7 @@ reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
     if (LID == 0) {
       size_t GrID = NDIt.get_group_linear_id();
       typename Reduction::result_type PSum =
-          UniformPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
+          IsPow2WG ? LocalReds[0] : BOp(LocalReds[0], LocalReds[WGSize]);
       if (IsUpdateOfUserVar)
         PSum = BOp(*(Reduction::getOutPointer(Out)), PSum);
       Reduction::getOutPointer(Out)[GrID] = PSum;
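Unlike the fast-atomics kernels, this family writes one partial sum per
work-group, so the runtime launches auxiliary kernels until a single value
remains. The sketch below is a hypothetical host-side model of that
multi-pass scheme, not the actual SYCL implementation:

#include <cstddef>
#include <utility>
#include <vector>

// Each pass reduces every chunk of WGSize values (one "work-group") to a
// single partial sum; passes repeat until one value remains, mirroring the
// main kernel followed by auxiliary-kernel launches.
template <typename T, typename BinOp>
T multiPassReduce(std::vector<T> Vals, T Identity, size_t WGSize, BinOp BOp) {
  while (Vals.size() > 1) {
    size_t NWorkGroups = (Vals.size() + WGSize - 1) / WGSize;
    std::vector<T> Partial(NWorkGroups, Identity);
    for (size_t I = 0; I < Vals.size(); ++I)
      Partial[I / WGSize] = BOp(Partial[I / WGSize], Vals[I]);
    Vals = std::move(Partial); // input of the next (auxiliary) pass
  }
  return Vals.empty() ? Identity : Vals[0];
}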
@@ -846,19 +830,17 @@ template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<!Reduction::has_fast_atomics>
 reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
            Reduction &Redu) {
-  size_t NWorkItems = Range.get_global_range().size();
   size_t WGSize = Range.get_local_range().size();
   size_t NWorkGroups = Range.get_group_range().size();
 
-  // The last work-group may be not fully loaded with work, or the work group
-  // size may be not power of two. Those two cases considered inefficient
-  // as they require additional code and checks in the kernel.
-  bool HasUniformWG = NWorkGroups * WGSize == NWorkItems;
-  if (!Reduction::has_fast_reduce)
-    HasUniformWG = HasUniformWG && ((WGSize & (WGSize - 1)) == 0);
+  // If the work-group size is not a power of two, then the kernel must run
+  // additional code and checks.
+  // If the reduction has a fast reduce operation, the kernel does not care
+  // whether the work-group size is a power of two; assume true in that case.
+  bool IsPow2WG = Reduction::has_fast_reduce || ((WGSize & (WGSize - 1)) == 0);
 
   if (Reduction::is_usm && NWorkGroups == 1) {
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
           CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
     else
@@ -866,7 +848,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           CGH, KernelFunc, Range, Redu, Redu.getUSMPointer());
   } else {
     auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
-    if (HasUniformWG)
+    if (IsPow2WG)
       reduCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
           CGH, KernelFunc, Range, Redu, Out);
     else
@@ -889,10 +871,10 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
                   size_t WGSize, Reduction &, InputT In, OutputT Out) {
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
-
   bool IsUpdateOfUserVar =
       Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformWG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     typename Reduction::binary_operation BOp;
     size_t WGID = NDIt.get_group_linear_id();
@@ -936,7 +918,8 @@ reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
   auto BOp = Redu.getBinaryOperation();
   using Name = typename get_reduction_aux_kernel_name_t<
       KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
-  nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
+  range<1> GlobalRange = {UniformPow2WG ? NWorkItems : NWorkGroups * WGSize};
+  nd_range<1> Range{GlobalRange, range<1>(WGSize)};
   CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
     size_t WGSize = NDIt.get_local_range().size();
     size_t LID = NDIt.get_local_linear_id();
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index fa678f28593f..50169dbd116e 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -1193,7 +1193,9 @@ class __SYCL_EXPORT handler {
     size_t MaxWGSize = ONEAPI::detail::reduGetMaxWGSize(MQueue, OneElemSize);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
-                                " reduction requires smaller work group size.",
+                                " reduction requires work group size not bigger"
+                                " than " +
+                                    std::to_string(MaxWGSize),
                                 PI_INVALID_WORK_GROUP_SIZE);
 
     // 1. Call the kernel that includes user's lambda function.
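The aux-kernel hunks above stop launching a possibly non-uniform range and
instead pad the global size up to a multiple of WGSize, so every work-group is
fully populated; work-items past NWorkItems are then masked inside the kernel.
A small model of the arithmetic (paddedGlobalSize is a hypothetical helper,
not from the patch):

#include <cassert>
#include <cstddef>

// Round the global size up to a multiple of WGSize. The items in the
// padded tail exist only to keep work-groups uniform and must be guarded
// against in the kernel body.
size_t paddedGlobalSize(size_t NWorkItems, size_t WGSize) {
  size_t NWorkGroups = (NWorkItems + WGSize - 1) / WGSize; // ceil division
  return NWorkGroups * WGSize;
}

int main() {
  assert(paddedGlobalSize(100, 32) == 128); // 4 groups of 32, 28 idle items
  assert(paddedGlobalSize(128, 32) == 128); // already uniform
  return 0;
}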
diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp
index 910342977b13..db24ced823a0 100644
--- a/sycl/source/detail/reduction.cpp
+++ b/sycl/source/detail/reduction.cpp
@@ -52,18 +52,45 @@ __SYCL_EXPORT size_t
 reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
                  size_t LocalMemBytesPerWorkItem) {
   device Dev = Queue->get_device();
-  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t MaxWGSize = Dev.get_info<info::device::max_work_group_size>();
+  size_t WGSizePerMem = MaxWGSize * 2;
+  size_t WGSize = MaxWGSize;
   if (LocalMemBytesPerWorkItem != 0) {
     size_t MemSize = Dev.get_info<info::device::local_mem_size>();
-    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
+    WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;
 
-    // If the work group size is not pow of two, then an additional element
+    // If the work group size is NOT power of two, then an additional element
     // in local memory is needed for the reduction algorithm and thus the real
     // work-group size requirement per available memory is stricter.
-    if ((WGSize & (WGSize - 1)) == 0)
+    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
       WGSizePerMem--;
     WGSize = (std::min)(WGSizePerMem, WGSize);
   }
+  // TODO: This is a temporary workaround for a big problem of detecting
+  // the maximal usable work-group size. The detection method used above,
+  // based on the maximal work-group size possible on the device, is too
+  // risky as it may return too big a value. Even though it also tries to
+  // take the memory factor into consideration, it is too rough an estimate.
+  // For example, if (WGSize * LocalMemBytesPerWorkItem) equals
+  // local_mem_size, then the reduction local accessor takes all available
+  // local memory for its own needs, leaving no local memory for other kernel
+  // needs (barriers, builtin calls, etc), which often leads to crashes with
+  // a CL_OUT_OF_RESOURCES error, or in even worse cases to silent
+  // writes/clobbers of the local memory assigned to one work-group by code
+  // in another work-group. It seems the only good solution for this problem
+  // is kernel precompilation and querying the kernel properties.
+  if (WGSize >= 4) {
+    // Return a halved size, but do that only if the kernel is limited by
+    // memory, or uses the opencl:cpu backend, which surprisingly needs lots
+    // of resources to run kernels with reductions and often fails with a
+    // CL_OUT_OF_RESOURCES error even when the reduction does not use local
+    // accessors.
+    if (WGSizePerMem < MaxWGSize * 2 ||
+        (Queue->get_device().is_cpu() &&
+         Queue->get_device().get_platform().get_backend() == backend::opencl))
+      WGSize /= 2;
+  }
+
   return WGSize;
 }
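Putting the pieces of reduGetMaxWGSize together, the heuristic can be modeled
as a standalone function; this is an illustration only (the function name and
parameters are hypothetical), with the device queries replaced by plain
arguments:

#include <algorithm>
#include <cstddef>

// Cap the device maximum by what local memory permits, reserve one extra
// local element when the memory-derived size is not a power of two, and
// halve the result when the kernel is memory-bound or runs on opencl:cpu.
size_t maxReductionWGSize(size_t MaxWGSize, size_t LocalMemSize,
                          size_t LocalMemBytesPerWorkItem, bool IsOpenCLCpu) {
  size_t WGSizePerMem = MaxWGSize * 2; // sentinel: "not limited by memory"
  size_t WGSize = MaxWGSize;
  if (LocalMemBytesPerWorkItem != 0) {
    WGSizePerMem = LocalMemSize / LocalMemBytesPerWorkItem;
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      WGSizePerMem--; // non-pow-2 sizes need one extra local element
    WGSize = std::min(WGSizePerMem, WGSize);
  }
  if (WGSize >= 4 && (WGSizePerMem < MaxWGSize * 2 || IsOpenCLCpu))
    WGSize /= 2; // leave headroom for the runtime's own local-memory needs
  return WGSize;
}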
diff --git a/sycl/test/on-device/reduction/reduction_big_data.cpp b/sycl/test/on-device/reduction/reduction_big_data.cpp
new file mode 100644
index 000000000000..cfd6711f36ae
--- /dev/null
+++ b/sycl/test/on-device/reduction/reduction_big_data.cpp
@@ -0,0 +1,109 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+//
+// RUNx: %RUN_ON_HOST %t.out
+// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
+// barrier()
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// where the bigger data size and/or non-uniform work-group sizes may cause
+// errors.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+#include <cstdlib>
+
+using namespace cl::sycl;
+
+template <typename... Ts> class KernelNameGroup;
+
+size_t getSafeMaxWGSize(size_t MaxWGSize, size_t MemSize, size_t OneElemSize) {
+  size_t MaxNumElems = MemSize / OneElemSize;
+  if ((MaxNumElems & (MaxNumElems - 1)) != 0)
+    MaxNumElems--; // Need 1 additional element in mem if not pow of 2
+  return std::min(MaxNumElems / 2, MaxWGSize);
+}
+
+template <typename Name, typename T, class BinaryOperation>
+void test(T Identity) {
+  queue Q;
+  device Device = Q.get_device();
+
+  std::size_t MaxWGSize = Device.get_info<info::device::max_work_group_size>();
+  std::size_t LocalMemSize = Device.get_info<info::device::local_mem_size>();
+  std::cout << "Detected device::max_work_group_size = " << MaxWGSize << "\n";
+  std::cout << "Detected device::local_mem_size = " << LocalMemSize << "\n";
+
+  size_t WGSize = getSafeMaxWGSize(MaxWGSize, LocalMemSize, sizeof(T));
+
+  size_t MaxGlobalMem = 2LL * 1024 * 1024 * 1024; // Don't use more than 2 GB
+  // Limit max global range by mem and also subtract 1 to make it non-uniform.
+  size_t MaxGlobalRange = MaxGlobalMem / sizeof(T) - 1;
+  size_t NWorkItems = std::min(WGSize * MaxWGSize + 1, MaxGlobalRange);
+
+  size_t NWorkGroups = (NWorkItems - 1) / WGSize + 1;
+  range<1> GlobalRange(NWorkGroups * WGSize);
+  range<1> LocalRange(WGSize);
+  nd_range<1> NDRange(GlobalRange, LocalRange);
+  std::cout << "Running the test with: GlobalRange = " << (NWorkGroups * WGSize)
+            << ", LocalRange = " << WGSize << ", NWorkItems = " << NWorkItems
+            << "\n";
+
+  buffer<T, 1> InBuf(NWorkItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWorkItems);
+
+  // Compute.
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, 1, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    CGH.parallel_for<Name>(NDRange, ONEAPI::reduction(Out, Identity, BOp),
+                           [=](nd_item<1> NDIt, auto &Sum) {
+                             if (NDIt.get_global_linear_id() < NWorkItems)
+                               Sum.combine(
+                                   In[NDIt.get_global_linear_id()]);
+                           });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+  std::cout << "Test case passed\n\n";
+}
+
+template <typename T> struct BigCustomVec : public CustomVec<T> {
+  BigCustomVec() : CustomVec<T>() {}
+  BigCustomVec(T X, T Y) : CustomVec<T>(X, Y) {}
+  BigCustomVec(T V) : CustomVec<T>(V) {}
+  unsigned char OtherData[512 - sizeof(CustomVec<T>)];
+};
+
+template <class T> struct BigCustomVecPlus {
+  using CV = BigCustomVec<T>;
+  CV operator()(const CV &A, const CV &B) const {
+    return CV(A.X + B.X, A.Y + B.Y);
+  }
+};
+
+int main() {
+  test<KernelNameGroup<class Float>, float, ONEAPI::minimum<float>>(
+      getMinimumFPValue<float>());
+
+  using BCV = BigCustomVec<long long>;
+  test<KernelNameGroup<class BigVec>, BCV, BigCustomVecPlus<long long>>(
+      BCV(0));
+
+  std::cout << "Test passed\n";
+  return 0;
+}
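A note on the test's BigCustomVec: the OtherData member pads each element so
that sizeof(BigCustomVec<long long>) is exactly 512 bytes, which makes the
per-work-item local-memory cost large enough to exercise the memory-limited
path of reduGetMaxWGSize. A minimal standalone model of the padding trick
(Vec2 and Padded are hypothetical names, not from the test):

#include <cstddef>

// Tacking a byte array of size (512 - sizeof(Base)) onto a derived struct
// pads every element to exactly 512 bytes.
template <typename T> struct Vec2 {
  T X, Y;
};

template <typename T> struct Padded : Vec2<T> {
  unsigned char OtherData[512 - sizeof(Vec2<T>)];
};

static_assert(sizeof(Padded<long long>) == 512,
              "each reduction element occupies 512 bytes of local memory");

int main() { return 0; }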