diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 941e878ebb591..49767539bc683 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -107,32 +107,6 @@ template <typename Name, typename Type> struct get_kernel_name_t {
 
 __SYCL_EXPORT device getDeviceFromHandler(handler &);
 
-/// These are the forward declaration for the classes that help to create
-/// names for additional kernels. It is used only when there are
-/// more then 1 kernels in one parallel_for() implementing SYCL reduction.
-template <typename Type> class __sycl_reduction_main_2nd_kernel;
-template <typename Type> class __sycl_reduction_aux_1st_kernel;
-template <typename Type> class __sycl_reduction_aux_2nd_kernel;
-
-/// Helper structs to get additional kernel name types based on given
-/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
-/// \c Type becomes the \c Name.
-template <typename Name, typename Type>
-struct get_reduction_main_2nd_kernel_name_t {
-  using name = __sycl_reduction_main_2nd_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-template <typename Name, typename Type>
-struct get_reduction_aux_1st_kernel_name_t {
-  using name = __sycl_reduction_aux_1st_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-template <typename Name, typename Type>
-struct get_reduction_aux_2nd_kernel_name_t {
-  using name = __sycl_reduction_aux_2nd_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-
 device getDeviceFromHandler(handler &);
 
 } // namespace detail
@@ -142,6 +116,14 @@ namespace detail {
 
 template class reduction_impl;
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+                   size_t KernelRun, Reduction &Redu);
 
 } // namespace detail
 } // namespace intel
@@ -231,13 +213,6 @@ class __SYCL_EXPORT handler {
   /// usage in finalize() method.
   void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
 
-  /// Stores the given \param Event to the \param Queue.
-  /// Even though MQueue is a field of handler, the method addEvent() of
-  /// queue_impl class cannot be called inside this handler.hpp file
-  /// as queue_impl is incomplete class for handler.
-  static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event);
-
   /// Constructs CG object of specific type, passes it to Scheduler and
   /// returns sycl::event object representing the command group.
   /// It's expected that the method is the latest method executed before
@@ -288,30 +263,6 @@ class __SYCL_EXPORT handler {
                  /*index*/ 0);
   }
 
-  template
-  void dissociateWithHandler(accessor
-                                 Acc) {
-    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
-    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
-    detail::Requirement *Req = AccImpl.get();
-
-    // Remove accessor from the list of requirements, accessors storage,
-    // and from the list of associated accessors.
-    auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
-    auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
-    auto It =
-        std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
-                     [Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
-    assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
-            It != MAssociatedAccesors.end()) &&
-           "Cannot dissociate accessor.");
-    MRequirements.erase(ReqIt);
-    MAccStorage.erase(AccIt);
-    MAssociatedAccesors.erase(It);
-  }
-
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
@@ -810,219 +761,6 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
-  /// Implements a command group function that enqueues a kernel that calls
-  /// user's lambda function \param KernelFunc and does one iteration of
-  /// reduction of elements in each of work-groups.
-  /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-group the partial sum is written
-  /// to a global buffer.
-  ///
-  /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
-  template <typename KernelName, typename KernelType, int Dims, class Reduction>
-  void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
-                  Reduction &Redu) {
-
-    size_t NWorkItems = Range.get_global_range().size();
-    size_t WGSize = Range.get_local_range().size();
-    size_t NWorkGroups = Range.get_group_range().size();
-
-    bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
-    bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
-
-    bool IsUpdateOfUserAcc =
-        Reduction::accessor_mode == access::mode::read_write &&
-        NWorkGroups == 1;
-
-    // Use local memory to reduce elements in work-groups into 0-th element.
-    // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch elements that could
-    // otherwise be lost in the tree-reduction algorithm.
-    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
-    auto ReduIdentity = Redu.getIdentity();
-    if (IsEfficientCase) {
-      // Efficient case: work-groups are fully loaded and work-group size
-      // is power of two.
-      parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
-        // Call user's functions. Reducer.MValue gets initialized there.
-        typename Reduction::reducer_type Reducer(ReduIdentity);
-        KernelFunc(NDIt, Reducer);
-
-        // Copy the element to local memory to prepare it for tree-reduction.
-        size_t LID = NDIt.get_local_linear_id();
-        LocalReds[LID] = Reducer.MValue;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-        typename Reduction::binary_operation BOp;
-        size_t WGSize = NDIt.get_local_range().size();
-        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          NDIt.barrier();
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0)
-          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                                : LocalReds[0];
-      });
-    } else {
-      // Inefficient case: work-groups are not fully loaded
-      // or WGSize is not power of two.
-      // These two inefficient cases are handled by one kernel, which
-      // can be split later into two separate kernels, if there are users who
-      // really need more efficient code for them.
-      using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
-          KernelName, KernelType>::name;
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        // Call user's functions. Reducer.MValue gets initialized there.
-        typename Reduction::reducer_type Reducer(ReduIdentity);
-        KernelFunc(NDIt, Reducer);
-
-        size_t WGSize = NDIt.get_local_range().size();
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        // Copy the element to local memory to prepare it for tree-reduction.
-        LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-        LocalReds[WGSize] = ReduIdentity;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        // LocalReds[WGSize] accumulates last/odd elements when the step
-        // of tree-reduction loop is not even.
-        typename Reduction::binary_operation BOp;
-        size_t PrevStep = WGSize;
-        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          else if (LID == CurStep && (PrevStep & 0x1))
-            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-          NDIt.barrier();
-          PrevStep = CurStep;
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0) {
-          auto GrID = NDIt.get_group_linear_id();
-          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-          Out.get_pointer().get()[GrID] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-        }
-      });
-    }
-  }
-
-  /// Implements a command group function that enqueues a kernel that does one
-  /// iteration of reduction of elements in each of work-groups.
-  /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-group the partial sum is written
-  /// to a global buffer.
-  ///
-  /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
-  template <typename KernelName, typename KernelType, int Dims, class Reduction>
-  void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
-                     size_t KernelRun, Reduction &Redu) {
-    size_t WGSize = Range.get_local_range().size();
-    size_t NWorkGroups = Range.get_group_range().size();
-
-    // The last work-group may be not fully loaded with work, or the work group
-    // size may be not power of those. Those two cases considered inefficient
-    // as they require additional code and checks in the kernel.
-    bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
-    bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
-
-    bool IsUpdateOfUserAcc =
-        Reduction::accessor_mode == access::mode::read_write &&
-        NWorkGroups == 1;
-
-    // Use local memory to reduce elements in work-groups into 0-th element.
-    // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch elements that could
-    // otherwise be lost in the tree-reduction algorithm.
-    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-    // Get read accessor to the buffer that was used as output
-    // in the previous kernel. After that create new output buffer if needed
-    // and get accessor to it (or use reduction's accessor if the kernel
-    // is the last one).
-    auto In = Redu.getReadAccToPreviousPartialReds(*this);
-    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
-
-    if (IsEfficientCase) {
-      // Efficient case: work-groups are fully loaded and work-group size
-      // is power of two.
-      using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
-          KernelName, KernelType>::name;
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        // Copy the element to local memory to prepare it for tree-reduction.
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        LocalReds[LID] = In[GID];
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        typename Reduction::binary_operation BOp;
-        size_t WGSize = NDIt.get_local_range().size();
-        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          NDIt.barrier();
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0)
-          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                                : LocalReds[0];
-      });
-    } else {
-      // Inefficient case: work-groups are not fully loaded
-      // or WGSize is not power of two.
-      // These two inefficient cases are handled by one kernel, which
-      // can be split later into two separate kernels, if there are users
-      // who really need more efficient code for them.
-      using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
-          KernelName, KernelType>::name;
-      auto ReduIdentity = Redu.getIdentity();
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        size_t WGSize = NDIt.get_local_range().size();
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        // Copy the element to local memory to prepare it for tree-reduction
-        LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
-        LocalReds[WGSize] = ReduIdentity;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        // LocalReds[WGSize] accumulates last/odd elements when the step
-        // of tree-reduction loop is not even.
-        typename Reduction::binary_operation BOp;
-        size_t PrevStep = WGSize;
-        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          else if (LID == CurStep && (PrevStep & 0x1))
-            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-          NDIt.barrier();
-          PrevStep = CurStep;
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0) {
-          auto GrID = NDIt.get_group_linear_id();
-          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-          Out.get_pointer().get()[GrID] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-        }
-      });
-    }
-  }
-
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   /// Performs reduction operation specified in \param Redu.
   ///
@@ -1063,30 +801,23 @@ class __SYCL_EXPORT handler {
     // necessary to reduce all partial sums into one final sum.
 
     // 1. Call the kernel that includes user's lambda function.
-    // If this kernel is going to be now last one, i.e. it does not write
-    // to user's accessor, then detach user's accessor from this kernel
-    // to make the dependencies between accessors and kernels more clean and
-    // correct.
-    if (NWorkGroups > 1)
-      dissociateWithHandler(Redu.MAcc);
-
-    reduCGFunc<KernelName>(KernelFunc, Range, Redu);
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     auto QueueCopy = MQueue;
     MLastEvent = this->finalize();
 
    // 2. Run the additional aux kernel as many times as needed to reduce
    // all partial sums into one scalar.
+
+    // TODO: user's nd_range and the work-group size specified there must
+    // be honored only for the main kernel that calls user's lambda functions.
+    // There is no need to use the same work-group size in these additional
+    // kernels. Thus, the better strategy here is to make the work-group size
+    // as big as possible to converge/reduce the partial sums into the last
+    // sum faster.
     size_t WGSize = Range.get_local_range().size();
     size_t NWorkItems = NWorkGroups;
     size_t KernelRun = 1;
     while (NWorkItems > 1) {
-      // Before creating another kernel, add the event from the previous kernel
-      // to queue.
-      addEventToQueue(QueueCopy, MLastEvent);
-
-      // TODO: here the work-group size is not limited by user's needs,
-      // the better strategy here is to make the work-group-size as big
-      // as possible.
      WGSize = std::min(WGSize, NWorkItems);
      NWorkGroups = NWorkItems / WGSize;
      // The last group may be not fully loaded. Still register it as a group.
@@ -1102,8 +833,8 @@ class __SYCL_EXPORT handler {
      // Associate it with handler manually.
      if (NWorkGroups == 1)
        AuxHandler.associateWithHandler(Redu.MAcc);
-      AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
-                                                       KernelRun, Redu);
+      intel::detail::reduAuxCGFunc<KernelName, KernelType>(
+          AuxHandler, Range, NWorkItems, KernelRun, Redu);
      MLastEvent = AuxHandler.finalize();
 
      NWorkItems = NWorkGroups;
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index 3b6289cb03ecc..a22f7f38249d7 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -377,6 +377,248 @@ class reduction_impl {
   shared_ptr_class> MOutBufPtr;
 };
 
+/// These are the forward declarations for the classes that help to create
+/// names for additional kernels. It is used only when there is
+/// more than one kernel in one parallel_for() implementing SYCL reduction.
+template <typename Type> class __sycl_reduction_main_2nd_kernel;
+template <typename Type> class __sycl_reduction_aux_1st_kernel;
+template <typename Type> class __sycl_reduction_aux_2nd_kernel;
+
+/// Helper structs to get additional kernel name types based on given
+/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
+/// \c Type becomes the \c Name.
+template <typename Name, typename Type>
+struct get_reduction_main_2nd_kernel_name_t {
+  using name = __sycl_reduction_main_2nd_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_1st_kernel_name_t {
+  using name = __sycl_reduction_aux_1st_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_2nd_kernel_name_t {
+  using name = __sycl_reduction_aux_2nd_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function \param KernelFunc and does one iteration of
+/// reduction of elements in each of work-groups.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // The last work-group may be not fully loaded with work, or the work group
+  // size may be not power of two. Those two cases are considered inefficient
+  // as they require additional code and checks in the kernel.
+  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Use local memory to reduce elements in work-groups into 0-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm.
+  size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
+  auto ReduIdentity = Redu.getIdentity();
+  if (IsEfficientCase) {
+    // Efficient case: work-groups are uniform and WGSize is a power of two.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer(ReduIdentity);
+      KernelFunc(NDIt, Reducer);
+
+      // Copy the element to local memory to prepare it for tree-reduction.
+      size_t LID = NDIt.get_local_linear_id();
+      LocalReds[LID] = Reducer.MValue;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0)
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                              : LocalReds[0];
+    });
+  } else {
+    // Inefficient case: work-groups are non-uniform or WGSize is not power
+    // of two, which requires more conditional, read and write operations.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users who
+    // really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName, KernelType>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer(ReduIdentity);
+      KernelFunc(NDIt, Reducer);
+
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0) {
+        size_t GrID = NDIt.get_group_linear_id();
+        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+        Out.get_pointer().get()[GrID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      }
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that does one
+/// iteration of reduction of elements in each of work-groups.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+                   size_t KernelRun, Reduction &Redu) {
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // The last work-group may be not fully loaded with work, or the work group
+  // size may be not power of two. Those two cases are considered inefficient
+  // as they require additional code and checks in the kernel.
+  bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
+  bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Use local memory to reduce elements in work-groups into 0-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm.
+  size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+
+  // Get read accessor to the buffer that was used as output
+  // in the previous kernel. After that create new output buffer if needed
+  // and get accessor to it (or use reduction's accessor if the kernel
+  // is the last one).
+  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, CGH);
+
+  if (IsEfficientCase) {
+    // Efficient case: work-groups are fully loaded and work-group size
+    // is power of two.
+    using AuxName =
+        typename get_reduction_aux_1st_kernel_name_t<KernelName, KernelType>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Copy the element to local memory to prepare it for tree-reduction.
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      LocalReds[LID] = In[GID];
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0)
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                              : LocalReds[0];
+    });
+  } else {
+    // Inefficient case: work-groups are not fully loaded
+    // or WGSize is not power of two.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users
+    // who really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_aux_2nd_kernel_name_t<KernelName, KernelType>::name;
+    auto ReduIdentity = Redu.getIdentity();
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction
+      LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0) {
+        size_t GrID = NDIt.get_group_linear_id();
+        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+        Out.get_pointer().get()[GrID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      }
+    });
+  }
+}
+
 } // namespace detail
 
 /// Creates and returns an object implementing the reduction functionality.
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 9b65ead92fcde..a2dba553923a7 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -351,11 +351,6 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative() const;
 
-  /// Stores an event that should be associated with the queue
-  ///
-  /// \param Event is the event to be stored
-  void addEvent(event Event);
-
 private:
   /// Performs command group submission to the queue.
   ///
@@ -388,6 +383,11 @@ class queue_impl {
   /// \param Event is the event to be stored
   void addUSMEvent(event Event);
 
+  /// Stores an event that should be associated with the queue
+  ///
+  /// \param Event is the event to be stored
+  void addEvent(event Event);
+
   /// Protects all the fields that can be changed by class' methods.
   mutex_class MMutex;
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 8685b1060dcdc..6d018f6d3b122 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -19,11 +19,6 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 
-void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event) {
-  Queue->addEvent(std::move(Event));
-}
-
 event handler::finalize() {
   // This block of code is needed only for reduction implementation.
   // It is harmless (does nothing) for everything else.
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 73e9c1ebb4dec..cac45ced92b59 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3231,7 +3231,6 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
 _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
 _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
-_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
 _ZN2cl4sycl7handler8finalizeEv
 _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
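Background note on the moved kernels: both reduCGFunc and reduAuxCGFunc reduce each work-group with the same tree algorithm, and when WGSize is not a power of two they use one extra LocalReds[WGSize] slot that catches the odd element each time a halving step has an odd size. Below is a minimal host-side sketch of that per-work-group reduction in plain C++, with the device lock-step emulated by a sequential loop; the function and variable names here (work_group_tree_reduce and friends) are illustrative only and are not part of the patch.

  #include <cstddef>
  #include <functional>
  #include <iostream>
  #include <vector>

  // Reduce WGSize values the way one work-group does on device.
  // LocalReds has WGSize + 1 slots; the last slot collects the odd element
  // that a halving step would otherwise drop when the step size is odd.
  template <typename T, typename BinOp>
  T work_group_tree_reduce(std::vector<T> LocalReds, size_t WGSize, BinOp BOp) {
    size_t PrevStep = WGSize;
    for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
      // Emulate the work-items of one step sequentially; on device a barrier
      // separates the steps, so the order within a step does not matter.
      for (size_t LID = 0; LID < WGSize; ++LID) {
        if (LID < CurStep)
          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
        else if (LID == CurStep && (PrevStep & 0x1))
          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
      }
      PrevStep = CurStep;
    }
    // The extra slot holds the leftovers; fold it into the final result.
    return BOp(LocalReds[0], LocalReds[WGSize]);
  }

  int main() {
    const size_t WGSize = 7;                   // deliberately not a power of two
    std::vector<int> LocalReds(WGSize + 1, 0); // extra slot starts at the identity
    for (size_t I = 0; I < WGSize; ++I)
      LocalReds[I] = static_cast<int>(I + 1);  // 1 + 2 + ... + 7 = 28
    std::cout << work_group_tree_reduce(LocalReds, WGSize, std::plus<int>())
              << "\n"; // prints 28
  }

In the patched kernels, work-items whose global id falls past NWorkItems contribute the reduction identity instead of a loaded value, which is why the non-uniform case initializes LocalReds entries to ReduIdentity.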
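The TODO added in handler.hpp concerns the driver loop that re-launches the aux kernel until a single value remains: after the main kernel leaves one partial sum per work-group, every aux run shrinks NWorkItems down to the new NWorkGroups. A self-contained sketch of that loop's arithmetic follows; the concrete numbers (16, 1000) and the NWorkGroups rounding expression are illustrative stand-ins, not code from the patch.

  #include <algorithm>
  #include <cstddef>
  #include <iostream>

  int main() {
    size_t MaxWGSize = 16;    // stands in for the user's nd_range local size
    size_t NWorkItems = 1000; // partial sums produced by the main kernel

    size_t KernelRun = 1;
    while (NWorkItems > 1) {
      size_t WGSize = std::min(MaxWGSize, NWorkItems);
      size_t NWorkGroups = NWorkItems / WGSize;
      // The last group may be under-loaded; it still counts as a group.
      if (NWorkGroups * WGSize < NWorkItems)
        ++NWorkGroups;
      std::cout << "aux run " << KernelRun << ": reduce " << NWorkItems
                << " values with " << NWorkGroups << " group(s) of " << WGSize
                << "\n";
      NWorkItems = NWorkGroups; // each group leaves exactly one partial sum
      ++KernelRun;
    }
    // With 1000 partial sums and a work-group size of 16, the remaining
    // counts go 1000 -> 63 -> 4 -> 1, i.e. three aux runs.
  }

This also shows why the TODO matters: the number of aux runs depends only on the work-group size used for them, so a larger size than the user's original local range would converge in fewer passes.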
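The get_reduction_*_kernel_name_t helpers moved into reduction.hpp follow the usual "fall back to the functor type when no kernel name was given" naming pattern. A compile-time sketch of that pattern is below; auto_name_sketch, kernel_name_or_functor, and main_2nd_kernel_name are hypothetical stand-ins written for illustration, not the real SYCL types.

  #include <type_traits>

  struct auto_name_sketch {};                  // plays the role of detail::auto_name
  template <typename T> class main_2nd_kernel; // the extra kernel needs its own name

  // If no kernel name was supplied (Name == auto_name_sketch), use the functor
  // type itself, mirroring what detail::get_kernel_name_t does.
  template <typename Name, typename Type> struct kernel_name_or_functor {
    using name = Name;
  };
  template <typename Type> struct kernel_name_or_functor<auto_name_sketch, Type> {
    using name = Type;
  };

  // The additional kernel gets a distinct name derived from the resolved one,
  // mirroring get_reduction_main_2nd_kernel_name_t.
  template <typename Name, typename Type> struct main_2nd_kernel_name {
    using name =
        main_2nd_kernel<typename kernel_name_or_functor<Name, Type>::name>;
  };

  struct MyKernel;  // user-supplied kernel name
  struct MyFunctor; // type of the user's lambda/functor

  static_assert(std::is_same<main_2nd_kernel_name<MyKernel, MyFunctor>::name,
                             main_2nd_kernel<MyKernel>>::value,
                "explicit kernel name is kept");
  static_assert(
      std::is_same<main_2nd_kernel_name<auto_name_sketch, MyFunctor>::name,
                   main_2nd_kernel<MyFunctor>>::value,
      "unnamed kernels fall back to the functor type");

  int main() {}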