From 29cdc5e72673b27f945ff40adec83c18dc58ab94 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Fri, 24 Apr 2020 12:56:11 -0700 Subject: [PATCH 1/6] [SYCL] Fix potential errors caused by new sycl::intel::detail namespace Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/builtins.hpp | 7 ++----- sycl/include/CL/sycl/intel/function_pointer.hpp | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 400fd080cef1f..784dcfbac20df 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -24,11 +24,7 @@ namespace sycl { #else namespace __sycl_std = __host_std; #endif -} // namespace sycl -} // __SYCL_INLINE_NAMESPACE(cl) -__SYCL_INLINE_NAMESPACE(cl) { -namespace sycl { /* ----------------- 4.13.3 Math functions. ---------------------------------*/ // genfloat acos (genfloat x) template @@ -731,7 +727,8 @@ detail::enable_if_t::value, T> clz(T x) __NOEXC { namespace intel { // geninteger ctz (geninteger x) template -detail::enable_if_t::value, T> ctz(T x) __NOEXC { +sycl::detail::enable_if_t::value, T> +ctz(T x) __NOEXC { return __sycl_std::__invoke_ctz(x); } } // namespace intel diff --git a/sycl/include/CL/sycl/intel/function_pointer.hpp b/sycl/include/CL/sycl/intel/function_pointer.hpp index 2aa64cdd2e580..f812be911b788 100644 --- a/sycl/include/CL/sycl/intel/function_pointer.hpp +++ b/sycl/include/CL/sycl/intel/function_pointer.hpp @@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName, PI_INVALID_OPERATION); } - return detail::getDeviceFunctionPointerImpl(D, P, FuncName); + return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName); } } // namespace intel } // namespace sycl From 0596743d63a552719c5096cfe016c833ab28ab97 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Fri, 24 Apr 2020 16:45:18 -0700 Subject: [PATCH 2/6] [SYCL] Implement basic reduction for parallel_for accepting nd_range This patch adds the algorithm that implements 1 reduction in parallel_for(). It handles all types and operations, including user's custom ones. The more efficient variants are on the way. What is NOT supported by this patch: - parallel_for(range, ...) // i.e. simple range without work-group sizes - parallel_for(nd_range, reduction1, reduction1, ...) // i.e. more than 1 reductions in paralell_for - USM - vector reductions (dims > 1 & #elements > 1) - HOST. The implmentation used in this patch uses barrier(), which is not supported on HOST yet. Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/detail/cg.hpp | 8 +- sycl/include/CL/sycl/handler.hpp | 394 ++++++++++++++++++++- sycl/include/CL/sycl/intel/reduction.hpp | 26 ++ sycl/source/detail/queue_impl.hpp | 13 +- sycl/source/handler.cpp | 38 +- sycl/test/reduction/reduction_nd_s0_dw.cpp | 149 ++++++++ sycl/test/reduction/reduction_nd_s0_rw.cpp | 151 ++++++++ sycl/test/reduction/reduction_nd_s1_dw.cpp | 150 ++++++++ sycl/test/reduction/reduction_nd_s1_rw.cpp | 152 ++++++++ 9 files changed, 1057 insertions(+), 24 deletions(-) create mode 100644 sycl/test/reduction/reduction_nd_s0_dw.cpp create mode 100644 sycl/test/reduction/reduction_nd_s0_rw.cpp create mode 100644 sycl/test/reduction/reduction_nd_s1_dw.cpp create mode 100644 sycl/test/reduction/reduction_nd_s1_rw.cpp diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index fa1b992624644..c35f68e143b33 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -46,7 +46,7 @@ class interop_handler { public: using QueueImplPtr = std::shared_ptr; - using ReqToMem = std::pair; + using ReqToMem = std::pair; interop_handler(std::vector MemObjs, QueueImplPtr Queue) : MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {} @@ -456,6 +456,7 @@ class CGExecKernel : public CG { string_class MKernelName; detail::OSModuleHandle MOSModuleHandle; vector_class> MStreams; + vector_class> MReductions; CGExecKernel(NDRDescT NDRDesc, unique_ptr_class HKernel, shared_ptr_class SyclKernel, @@ -467,14 +468,15 @@ class CGExecKernel : public CG { vector_class Args, string_class KernelName, detail::OSModuleHandle OSModuleHandle, vector_class> Streams, - CGTYPE Type, detail::code_location loc = {}) + vector_class> Reductions, CGTYPE Type, + detail::code_location loc = {}) : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), - MStreams(std::move(Streams)) { + MStreams(std::move(Streams)), MReductions(std::move(Reductions)) { assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && "Wrong type of exec kernel CG."); } diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index dc3c49745b43f..a0e999bcf0da0 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -106,8 +106,45 @@ template struct get_kernel_name_t { }; __SYCL_EXPORT device getDeviceFromHandler(handler &); + +/// These are the forward declaration for the classes that help to create +/// names for additional kernels. It is used only when there are +/// more then 1 kernels in one parallel_for() implementing SYCL reduction. +template class __sycl_reduction_main_2nd_kernel; +template class __sycl_reduction_aux_1st_kernel; +template class __sycl_reduction_aux_2nd_kernel; + +/// Helper structs to get additional kernel name types based on given +/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then +/// \c Type becomes the \c Name. +template +struct get_reduction_main_2nd_kernel_name_t { + using name = __sycl_reduction_main_2nd_kernel< + typename get_kernel_name_t::name>; +}; +template +struct get_reduction_aux_1st_kernel_name_t { + using name = __sycl_reduction_aux_1st_kernel< + typename get_kernel_name_t::name>; +}; +template +struct get_reduction_aux_2nd_kernel_name_t { + using name = __sycl_reduction_aux_2nd_kernel< + typename get_kernel_name_t::name>; +}; + +device getDeviceFromHandler(handler &); + } // namespace detail +namespace intel { +namespace detail { +template +class reduction_impl; +} // namespace detail +} // namespace intel + /// 4.8.3 Command group handler class /// /// Objects of the handler class collect information about command group, such @@ -190,14 +227,24 @@ class __SYCL_EXPORT handler { return LambdaName == KernelName; } + /// Saves the location of user's code passed in \param CodeLoc for future + /// usage in finalize() method. + void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; } + + /// Stores the given \param Event to the \param Queue. + /// Even thought MQueue is a field of handler, the method addEvent() of + /// queue_impl class cannot be called inside this handler.hpp file + /// as queue_impl is incomplete class for handler. + static void addEventToQueue(shared_ptr_class Queue, + cl::sycl::event Event); + /// Constructs CG object of specific type, passes it to Scheduler and /// returns sycl::event object representing the command group. /// It's expected that the method is the latest method executed before /// object destruction. /// - /// \param Payload contains the code location of user code /// \return a SYCL event object representing the command group - event finalize(const cl::sycl::detail::code_location &Payload = {}); + event finalize(); /// Saves streams associated with this handler. /// @@ -208,6 +255,16 @@ class __SYCL_EXPORT handler { MStreamStorage.push_back(std::move(Stream)); } + /// Saves buffers and scalars associated with reduction to handler. + /// They are then forwarded to command group later and destroyed + /// only after the command group finishes the work on device/host. + /// + /// @param ReduObj is a pointer to object that must be preserved + /// for reduction until the . + void addReduction(shared_ptr_class ReduObj) { + MReductionStorage.push_back(std::move(ReduObj)); + } + ~handler() = default; bool is_host() { return MIsHost; } @@ -231,6 +288,30 @@ class __SYCL_EXPORT handler { /*index*/ 0); } + template + void dissociateWithHandler(accessor + Acc) { + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + detail::Requirement *Req = AccImpl.get(); + + // Remove accessor from the list of requirements, accessors storage, + // and from the list of associated accessors. + auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req); + auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl); + auto It = + std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(), + [Req](const detail::ArgDesc &D) { return D.MPtr == Req; }); + assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() && + It != MAssociatedAccesors.end()) && + "Cannot dissociate accessor."); + MRequirements.erase(ReqIt); + MAccStorage.erase(AccIt); + MAssociatedAccesors.erase(It); + } + // Recursively calls itself until arguments pack is fully processed. // The version for regular(standard layout) argument. template @@ -729,6 +810,305 @@ class __SYCL_EXPORT handler { #endif } + /// Implements a command group function that enqueues a kernel that calls + /// user's lambda function \param KernelFunc and does one iteration of + /// reduction of elements in each of work-groups. + /// This version uses tree-reduction algorithm to reduce elements in each + /// of work-groups. At the end of each work-groups the partial sum is written + /// to a global buffer. + /// + /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops. + template + void reduCGFunc(KernelType KernelFunc, const nd_range &Range, + Reduction &Redu) { + + size_t NWorkItems = Range.get_global_range().size(); + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + + bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0; + size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0; + + bool IsUpdateOfUserAcc = + Reduction::accessor_mode == access::mode::read_write && + NWorkGroups == 1; + + // Use local memory to reduce elements in work-groups into 0-th element. + // If WGSize is not power of two, then WGSize+1 elements are allocated. + // The additional last element is used to catch reduce elements that could + // otherwise be lost in the tree-reduction algorithm used in the kernel. + auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this); + + auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this); + auto ReduIdentity = Redu.getIdentity(); + if (!InefficientCase) { + // Efficient case: work-groups are fully loaded and work-group size + // is power of two. + parallel_for(Range, [=](nd_item NDIt) { + // Call user's functions. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer(ReduIdentity); + KernelFunc(NDIt, Reducer); + + // Copy the element to local memory to prepare it for tree-reduction. + size_t LID = NDIt.get_local_linear_id(); + LocalReds[LID] = Reducer.MValue; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]. + typename Reduction::binary_operation BOp; + size_t WGSize = NDIt.get_local_range().size(); + for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + NDIt.barrier(); + } + + // Compute the the partial sum/reduction for the work-group. + if (LID == 0) + Out.get_pointer().get()[NDIt.get_group_linear_id()] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) + : LocalReds[0]; + }); + } else { + // Inefficient case: work-groups are not fully loaded + // or WGSize is not power of two. + // These two inefficient cases are handled by one kernel, which + // can be split later into two separate kernels, if there are users who + // really need more efficient code for them. + using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t< + KernelName, KernelType>::name; + parallel_for(Range, [=](nd_item NDIt) { + // Call user's functions. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer(ReduIdentity); + KernelFunc(NDIt, Reducer); + + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + // Copy the element to local memory to prepare it for tree-reduction. + LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity; + LocalReds[WGSize] = ReduIdentity; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + // LocalReds[WGSize] accumulates last/odd elements when the step + // of tree-reduction loop is not even. + typename Reduction::binary_operation BOp; + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + else if (LID == CurStep && (PrevStep & 0x1)) + LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the the partial sum/reduction for the work-group. + if (LID == 0) { + auto GrID = NDIt.get_group_linear_id(); + auto V = BOp(LocalReds[0], LocalReds[WGSize]); + Out.get_pointer().get()[GrID] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + } + }); + } + } + + /// Implements a command group function that enqueues a kernel that does one + /// iteration of reduction of elements in each of work-groups. + /// This version uses tree-reduction algorithm to reduce elements in each + /// of work-groups. At the end of each work-groups the partial sum is written + /// to a global buffer. + /// + /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops. + template + void reduAuxCGFunc(const nd_range &Range, size_t NWorkItems, + size_t KernelRun, Reduction &Redu) { + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + + // The last work-group may be not fully loaded with work, or the work group + // size may be not power of those. Those two cases considered inefficient + // as they require additional code and checks in the kernel. + bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems; + size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0; + + bool IsUpdateOfUserAcc = + Reduction::accessor_mode == access::mode::read_write && + NWorkGroups == 1; + + // Use local memory to reduce elements in work-groups into 0-th element. + // If WGSize is not power of two, then WGSize+1 elements are allocated. + // The additional last element is used to catch reduce elements that + // could otherwise be lost in the tree-reduction algorithm. + auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this); + + // Get read accessor to the buffer that was used as output + // in the previous kernel. After that create new output buffer if needed + // and get accessor to it (or use reduction's accessor if the kernel + // is the last one). + auto In = Redu.getReadAccToPreviousPartialReds(*this); + auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this); + + if (!InefficientCase) { + // Efficient case: work-groups are fully loaded and work-group size + // is power of two. + using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t< + KernelName, KernelType>::name; + parallel_for(Range, [=](nd_item NDIt) { + // Copy the element to local memory to prepare it for tree-reduction. + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + LocalReds[LID] = In[GID]; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + typename Reduction::binary_operation BOp; + size_t WGSize = NDIt.get_local_range().size(); + for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + NDIt.barrier(); + } + + // Compute the the partial sum/reduction for the work-group. + if (LID == 0) + Out.get_pointer().get()[NDIt.get_group_linear_id()] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) + : LocalReds[0]; + }); + } else { + // Inefficient case: work-groups are not fully loaded + // or WGSize is not power of two. + // These two inefficient cases are handled by one kernel, which + // can be split later into two separate kernels, if there are users + // who really need more efficient code for them. + using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t< + KernelName, KernelType>::name; + auto ReduIdentity = Redu.getIdentity(); + parallel_for(Range, [=](nd_item NDIt) { + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + // Copy the element to local memory to prepare it for tree-reduction + LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity; + LocalReds[WGSize] = ReduIdentity; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + // LocalReds[WGSize] accumulates last/odd elements when the step + // of tree-reduction loop is not even. + typename Reduction::binary_operation BOp; + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + else if (LID == CurStep && (PrevStep & 0x1)) + LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the the partial sum/reduction for the work-group. + if (LID == 0) { + auto GrID = NDIt.get_group_linear_id(); + auto V = BOp(LocalReds[0], LocalReds[WGSize]); + Out.get_pointer().get()[GrID] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + } + }); + } + } + + /// Defines and invokes a SYCL kernel function for the specified nd_range. + /// Performs reduction operation specified in \param Redu. + /// + /// The SYCL kernel function is defined as a lambda function or a named + /// function object type and given an id or item for indexing in the indexing + /// space defined by range. + /// If it is a named function object and the function object type is + /// globally visible, there is no need for the developer to provide + /// a kernel name for it. + /// + /// TODO: currently it calls only those versions of kernels that can handle + /// custom types and operations. Some of types and operations may use faster + /// implementations that use intel::reduce() and/or sycl::atomic.fetch_() + /// functions and thus provide much better performance. Those variants exist, + /// are fully functional. They just wait for their time for code-review. + /// TODO: Need to handle more than 1 reduction in parallel_for(). + /// TODO: Support HOST. The kernels called by this parallel_for() may use + /// some functionality that is not yet supported on HOST such as: + /// barrier(), and intel::reduce() that also may be used in more + /// optimized implementations waiting for their turn of code-review. + template + void parallel_for(nd_range Range, Reduction &Redu, + KernelType KernelFunc) { + size_t NWorkGroups = Range.get_group_range().size(); + + // This parallel_for() is lowered to the following sequence: + // 1) Call a kernel that a) call user's lambda function and b) performs + // one iteration of reduction, storing the partial reductions/sums + // to either a newly created global buffer or to user's reduction + // accessor. So, if the original 'Range' has totally + // N1 elements and work-group size is W, then after the first iteration + // there will be N2 partial sums where N2 = N1 / W. + // If (N2 == 1) then the partial sum is written to user's accessor. + // Otherwise, a new global buffer is created and partial sums are written + // to it. + // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as + // necessary to reduce all partial sums into one final sum. + + // 1. Call the kernel that includes user's lambda function. + // If this kernel is going to be now last one, i.e. it does not write + // to user's accessor, then detach user's accessor from this kernel + // to make the dependencies between accessors and kernels more clean and + // correct. + if (NWorkGroups > 1) + dissociateWithHandler(Redu.MAcc); + + reduCGFunc(KernelFunc, Range, Redu); + auto QueueCopy = MQueue; + MLastEvent = this->finalize(); + + // 2. Run the additional aux kernel as many times as needed to reduce + // all partial sums into one scalar. + size_t WGSize = Range.get_local_range().size(); + size_t NWorkItems = NWorkGroups; + size_t KernelRun = 1; + while (NWorkItems > 1) { + // Before creating another kernel, add the event from the previous kernel + // to queue. + addEventToQueue(QueueCopy, MLastEvent); + + // TODO: here the work-group size is not limited by user's needs, + // the better strategy here is to make the work-group-size as big + // as possible. + WGSize = std::min(WGSize, NWorkItems); + NWorkGroups = NWorkItems / WGSize; + // The last group may be not fully loaded. Still register it as a group. + if ((NWorkItems % WGSize) != 0) + ++NWorkGroups; + auto Range = + nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize)); + + handler AuxHandler(QueueCopy, MIsHost); + AuxHandler.saveCodeLoc(MCodeLoc); + + // The last kernel DOES write to reductions's accessor. + // Associate it with handler manually. + if (NWorkGroups == 1) + AuxHandler.associateWithHandler(Redu.MAcc); + AuxHandler.reduAuxCGFunc(Range, NWorkItems, + KernelRun, Redu); + MLastEvent = AuxHandler.finalize(); + + NWorkItems = NWorkGroups; + ++KernelRun; + } // end while (NWorkItems > 1) + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// @@ -1334,6 +1714,7 @@ class __SYCL_EXPORT handler { vector_class MAccStorage; vector_class MLocalAccStorage; vector_class> MStreamStorage; + vector_class> MReductionStorage; vector_class> MSharedPtrStorage; /// The list of arguments for the kernel. vector_class MArgs; @@ -1368,6 +1749,10 @@ class __SYCL_EXPORT handler { bool MIsHost = false; + detail::code_location MCodeLoc = {}; + bool MIsFinalized = false; + event MLastEvent; + // Make queue_impl class friend to be able to call finalize method. friend class detail::queue_impl; // Make accessor class friend to keep the list of associated accessors. @@ -1382,6 +1767,11 @@ class __SYCL_EXPORT handler { // Make stream class friend to be able to keep the list of associated streams friend class stream; friend class detail::stream_impl; + // Make reduction_impl friend to store buffers and arrays created for it + // in handler from reduction_impl methods. + template + friend class intel::detail::reduction_impl; }; } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp index 2b0fb264bc94f..3b6289cb03ecc 100644 --- a/sycl/include/CL/sycl/intel/reduction.hpp +++ b/sycl/include/CL/sycl/intel/reduction.hpp @@ -342,6 +342,31 @@ class reduction_impl { "Only scalar/1-element reductions are supported now."); } + accessor + getReadWriteLocalAcc(size_t Size, handler &CGH) { + return accessor(Size, CGH); + } + + accessor + getReadAccToPreviousPartialReds(handler &CGH) const { + CGH.addReduction(MOutBufPtr); + return accessor(*MOutBufPtr, CGH); + } + + accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber, + handler &CGH) { + if (Size == 1) { + if (RunNumber > 0) + CGH.associateWithHandler(this->MAcc); + return this->MAcc; + } + // Create a new output buffer and return an accessor to it. + MOutBufPtr = std::make_shared>(range<1>(Size)); + CGH.addReduction(MOutBufPtr); + return accessor_type(*MOutBufPtr, CGH); + } /// User's accessor to where the reduction must be written. accessor_type MAcc; @@ -349,6 +374,7 @@ class reduction_impl { /// Identity of the BinaryOperation. /// The result of BinaryOperation(X, MIdentity) is equal to X for any X. const T MIdentity; + shared_ptr_class> MOutBufPtr; }; } // namespace detail diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 937223f114c5f..9b65ead92fcde 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -351,6 +351,11 @@ class queue_impl { /// \return a native handle. pi_native_handle getNative() const; + /// Stores an event that should be associated with the queue + /// + /// \param Event is the event to be stored + void addEvent(event Event); + private: /// Performs command group submission to the queue. /// @@ -362,8 +367,9 @@ class queue_impl { shared_ptr_class Self, const detail::code_location &Loc) { handler Handler(std::move(Self), MHostQueue); + Handler.saveCodeLoc(Loc); CGF(Handler); - event Event = Handler.finalize(Loc); + event Event = Handler.finalize(); addEvent(Event); return Event; } @@ -377,11 +383,6 @@ class queue_impl { void instrumentationEpilog(void *TelementryEvent, string_class &Name, int32_t StreamID, uint64_t IId); - /// Stores an event that should be associated with the queue - /// - /// \param Event is the event to be stored - void addEvent(event Event); - /// Stores a USM operation event that should be associated with the queue /// /// \param Event is the event to be stored diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ac6baf67bfd25..a97f5005795df 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -13,12 +13,24 @@ #include #include #include +#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -event handler::finalize(const cl::sycl::detail::code_location &Payload) { - sycl::event EventRet; + +void handler::addEventToQueue(shared_ptr_class Queue, + cl::sycl::event Event) { + Queue->addEvent(std::move(Event)); +} + +event handler::finalize() { + // This block of code is needed only to 5th/default reduction implementation. + // It is harmless (does nothing) for other implementations. + if (MIsFinalized) + return MLastEvent; + MIsFinalized = true; + unique_ptr_class CommandGroup; switch (MCGType) { case detail::CG::KERNEL: @@ -28,15 +40,15 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), - std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, - Payload)); + std::move(MOSModuleHandle), std::move(MStreamStorage), + std::move(MReductionStorage), MCGType, MCodeLoc)); break; } case detail::CG::INTEROP_TASK_CODEPLAY: CommandGroup.reset(new detail::CGInteropTask( std::move(MInteropTask), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), MCGType, Payload)); + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); break; case detail::CG::COPY_ACC_TO_PTR: case detail::CG::COPY_PTR_TO_ACC: @@ -44,37 +56,37 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { CommandGroup.reset(new detail::CGCopy( MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::FILL: CommandGroup.reset(new detail::CGFill( std::move(MPattern), MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::UPDATE_HOST: CommandGroup.reset(new detail::CGUpdateHost( MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents), Payload)); + std::move(MEvents), MCodeLoc)); break; case detail::CG::COPY_USM: CommandGroup.reset(new detail::CGCopyUSM( MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::FILL_USM: CommandGroup.reset(new detail::CGFillUSM( std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::PREFETCH_USM: CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents), Payload)); + std::move(MEvents), MCodeLoc)); break; case detail::CG::NONE: throw runtime_error("Command group submitted without a kernel or a " @@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), std::move(MQueue)); - EventRet = detail::createSyclObjFromImpl(Event); - return EventRet; + MLastEvent = detail::createSyclObjFromImpl(Event); + return MLastEvent; } void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp new file mode 100644 index 0000000000000..fe764eb2e00cb --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -0,0 +1,149 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 0-dimensional discard_write accessor. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +template +class Known; +template +class Unknown; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 0, VecPlus>(Vec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp new file mode 100644 index 0000000000000..7ecee96832e4c --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -0,0 +1,151 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 0-dimensional read_write accessor. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +template +class Known; +template +class Unknown; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + (OutBuf.template get_access())[0] = Identity; + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 0, VecPlus>(Vec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp new file mode 100644 index 0000000000000..27df7b4647771 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp @@ -0,0 +1,150 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +template +class Known; +template +class Unknown; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 1, VecPlus>(Vec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp new file mode 100644 index 0000000000000..f416f473a69a0 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp @@ -0,0 +1,152 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 1-dimensional read_write accessor +// accessing 1 element buffer. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +template +class Known; +template +class Unknown; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + (OutBuf.template get_access())[0] = Identity; + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 1, VecPlus>(Vec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} From f974f85c91790192a4affe68b91cc5dc483291a4 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Fri, 24 Apr 2020 17:27:53 -0700 Subject: [PATCH 3/6] [SYCL][LIT] Add a new LIT test for reduction + conditional statement Signed-off-by: Vyacheslav N Klochkov --- .../reduction/reduction_nd_conditional.cpp | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 sycl/test/reduction/reduction_nd_conditional.cpp diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp new file mode 100644 index 0000000000000..57029e4654415 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_conditional.cpp @@ -0,0 +1,131 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +//==---reduction_nd_conditional.cpp - SYCL reduction + condition test ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reduction and conditional increment of the reduction variable. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = I + 1 + 1.1; + + if (I < 2) + ExpectedOut = BOp(ExpectedOut, 99); + else if (I % 3) + ExpectedOut = BOp(ExpectedOut, In[I]); + else + ; // do nothing. + } +}; + +template +class Known; +template +class Unknown; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + size_t I = NDIt.get_global_linear_id(); + if (I < 2) + Sum.combine(T(99)); + else if (I % 3) + Sum.combine(In[I]); + else + ; // do nothing. + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 2, 64); + test>(0, 16, 256); + + std::cout << "Test passed\n"; + return 0; +} From 3b4af65bd5d5e0e2b42c99736ace118f075f200e Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Fri, 24 Apr 2020 23:47:50 -0700 Subject: [PATCH 4/6] [SYCL] Fix abi issues caused by reduction implementation The fix also removes the field handler::MReductionsStorage and re-uses the existing MSharedPtrStorage to keep reductions buffers alive until the execution on device/host code using those buffers finishes. Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/detail/cg.hpp | 6 ++---- sycl/include/CL/sycl/handler.hpp | 15 +++++++-------- sycl/source/handler.cpp | 4 ++-- sycl/test/abi/sycl_symbols_linux.dump | 3 ++- sycl/test/abi/symbol_size.cpp | 2 +- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index c35f68e143b33..100c29e8ebe38 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -456,7 +456,6 @@ class CGExecKernel : public CG { string_class MKernelName; detail::OSModuleHandle MOSModuleHandle; vector_class> MStreams; - vector_class> MReductions; CGExecKernel(NDRDescT NDRDesc, unique_ptr_class HKernel, shared_ptr_class SyclKernel, @@ -468,15 +467,14 @@ class CGExecKernel : public CG { vector_class Args, string_class KernelName, detail::OSModuleHandle OSModuleHandle, vector_class> Streams, - vector_class> Reductions, CGTYPE Type, - detail::code_location loc = {}) + CGTYPE Type, detail::code_location loc = {}) : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), - MStreams(std::move(Streams)), MReductions(std::move(Reductions)) { + MStreams(std::move(Streams)) { assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) && "Wrong type of exec kernel CG."); } diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index a0e999bcf0da0..a76bcfe692b39 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -255,14 +255,14 @@ class __SYCL_EXPORT handler { MStreamStorage.push_back(std::move(Stream)); } - /// Saves buffers and scalars associated with reduction to handler. - /// They are then forwarded to command group later and destroyed - /// only after the command group finishes the work on device/host. + /// Saves buffers created by handling reduction feature in handler. + /// They are then forwarded to command group and destroyed only after + /// the command group finishes the work on device/host. + /// The 'MSharedPtrStorage' suits that need. /// - /// @param ReduObj is a pointer to object that must be preserved - /// for reduction until the . - void addReduction(shared_ptr_class ReduObj) { - MReductionStorage.push_back(std::move(ReduObj)); + /// @param ReduObj is a pointer to object that must be stored. + void addReduction(shared_ptr_class ReduObj) { + MSharedPtrStorage.push_back(std::move(ReduObj)); } ~handler() = default; @@ -1714,7 +1714,6 @@ class __SYCL_EXPORT handler { vector_class MAccStorage; vector_class MLocalAccStorage; vector_class> MStreamStorage; - vector_class> MReductionStorage; vector_class> MSharedPtrStorage; /// The list of arguments for the kernel. vector_class MArgs; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a97f5005795df..bb0e221995f54 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -40,8 +40,8 @@ event handler::finalize() { std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), - std::move(MOSModuleHandle), std::move(MStreamStorage), - std::move(MReductionStorage), MCGType, MCodeLoc)); + std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, + MCodeLoc)); break; } case detail::CG::INTEROP_TASK_CODEPLAY: diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index ccab7147a68c7..700716600350d 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3232,7 +3232,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev _ZN2cl4sycl7handler18extractArgsAndReqsEv _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE -_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE +_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE +_ZN2cl4sycl7handler8finalizeEv _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp index bfdeaf19df24f..9d2b8906e9c7f 100644 --- a/sycl/test/abi/symbol_size.cpp +++ b/sycl/test/abi/symbol_size.cpp @@ -43,7 +43,7 @@ int main() { check_size(); check_size(); check_size(); - check_size(); + check_size(); check_size, 16>(); check_size(); check_size(); From a4097523ce31adb5ca9bdbdd28697831d56efd9c Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 27 Apr 2020 12:57:33 -0700 Subject: [PATCH 5/6] [SYCL] Do additional changes per reviewer's comments, fix regressed LIT tests Signed-off-by: Vyacheslav N Klochkov --- sycl/include/CL/sycl/handler.hpp | 38 +++++------ sycl/source/handler.cpp | 4 +- sycl/test/abi/symbol_size.cpp | 4 ++ .../reduction/reduction_nd_conditional.cpp | 16 +---- sycl/test/reduction/reduction_nd_s0_dw.cpp | 63 ++----------------- sycl/test/reduction/reduction_nd_s0_rw.cpp | 63 ++----------------- sycl/test/reduction/reduction_nd_s1_dw.cpp | 63 ++----------------- sycl/test/reduction/reduction_nd_s1_rw.cpp | 63 ++----------------- sycl/test/reduction/reduction_utils.hpp | 54 ++++++++++++++++ 9 files changed, 103 insertions(+), 265 deletions(-) create mode 100644 sycl/test/reduction/reduction_utils.hpp diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index a76bcfe692b39..941e878ebb591 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -232,7 +232,7 @@ class __SYCL_EXPORT handler { void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; } /// Stores the given \param Event to the \param Queue. - /// Even thought MQueue is a field of handler, the method addEvent() of + /// Even though MQueue is a field of handler, the method addEvent() of /// queue_impl class cannot be called inside this handler.hpp file /// as queue_impl is incomplete class for handler. static void addEventToQueue(shared_ptr_class Queue, @@ -814,7 +814,7 @@ class __SYCL_EXPORT handler { /// user's lambda function \param KernelFunc and does one iteration of /// reduction of elements in each of work-groups. /// This version uses tree-reduction algorithm to reduce elements in each - /// of work-groups. At the end of each work-groups the partial sum is written + /// of work-groups. At the end of each work-group the partial sum is written /// to a global buffer. /// /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops. @@ -827,7 +827,7 @@ class __SYCL_EXPORT handler { size_t NWorkGroups = Range.get_group_range().size(); bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0; - size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0; + bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0); bool IsUpdateOfUserAcc = Reduction::accessor_mode == access::mode::read_write && @@ -835,13 +835,14 @@ class __SYCL_EXPORT handler { // Use local memory to reduce elements in work-groups into 0-th element. // If WGSize is not power of two, then WGSize+1 elements are allocated. - // The additional last element is used to catch reduce elements that could - // otherwise be lost in the tree-reduction algorithm used in the kernel. - auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this); + // The additional last element is used to catch elements that could + // otherwise be lost in the tree-reduction algorithm. + size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1); + auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this); auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this); auto ReduIdentity = Redu.getIdentity(); - if (!InefficientCase) { + if (IsEfficientCase) { // Efficient case: work-groups are fully loaded and work-group size // is power of two. parallel_for(Range, [=](nd_item NDIt) { @@ -863,7 +864,7 @@ class __SYCL_EXPORT handler { NDIt.barrier(); } - // Compute the the partial sum/reduction for the work-group. + // Compute the partial sum/reduction for the work-group. if (LID == 0) Out.get_pointer().get()[NDIt.get_group_linear_id()] = IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) @@ -904,7 +905,7 @@ class __SYCL_EXPORT handler { PrevStep = CurStep; } - // Compute the the partial sum/reduction for the work-group. + // Compute the partial sum/reduction for the work-group. if (LID == 0) { auto GrID = NDIt.get_group_linear_id(); auto V = BOp(LocalReds[0], LocalReds[WGSize]); @@ -918,7 +919,7 @@ class __SYCL_EXPORT handler { /// Implements a command group function that enqueues a kernel that does one /// iteration of reduction of elements in each of work-groups. /// This version uses tree-reduction algorithm to reduce elements in each - /// of work-groups. At the end of each work-groups the partial sum is written + /// of work-groups. At the end of each work-group the partial sum is written /// to a global buffer. /// /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops. @@ -932,7 +933,7 @@ class __SYCL_EXPORT handler { // size may be not power of those. Those two cases considered inefficient // as they require additional code and checks in the kernel. bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems; - size_t InefficientCase = (IsUnderLoaded || (WGSize & (WGSize - 1))) ? 1 : 0; + bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0; bool IsUpdateOfUserAcc = Reduction::accessor_mode == access::mode::read_write && @@ -940,9 +941,10 @@ class __SYCL_EXPORT handler { // Use local memory to reduce elements in work-groups into 0-th element. // If WGSize is not power of two, then WGSize+1 elements are allocated. - // The additional last element is used to catch reduce elements that - // could otherwise be lost in the tree-reduction algorithm. - auto LocalReds = Redu.getReadWriteLocalAcc(WGSize + InefficientCase, *this); + // The additional last element is used to catch elements that could + // otherwise be lost in the tree-reduction algorithm. + size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1); + auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this); // Get read accessor to the buffer that was used as output // in the previous kernel. After that create new output buffer if needed @@ -951,7 +953,7 @@ class __SYCL_EXPORT handler { auto In = Redu.getReadAccToPreviousPartialReds(*this); auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this); - if (!InefficientCase) { + if (IsEfficientCase) { // Efficient case: work-groups are fully loaded and work-group size // is power of two. using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t< @@ -972,7 +974,7 @@ class __SYCL_EXPORT handler { NDIt.barrier(); } - // Compute the the partial sum/reduction for the work-group. + // Compute the partial sum/reduction for the work-group. if (LID == 0) Out.get_pointer().get()[NDIt.get_group_linear_id()] = IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) @@ -1010,7 +1012,7 @@ class __SYCL_EXPORT handler { PrevStep = CurStep; } - // Compute the the partial sum/reduction for the work-group. + // Compute the partial sum/reduction for the work-group. if (LID == 0) { auto GrID = NDIt.get_group_linear_id(); auto V = BOp(LocalReds[0], LocalReds[WGSize]); @@ -1096,7 +1098,7 @@ class __SYCL_EXPORT handler { handler AuxHandler(QueueCopy, MIsHost); AuxHandler.saveCodeLoc(MCodeLoc); - // The last kernel DOES write to reductions's accessor. + // The last kernel DOES write to reduction's accessor. // Associate it with handler manually. if (NWorkGroups == 1) AuxHandler.associateWithHandler(Redu.MAcc); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index bb0e221995f54..8685b1060dcdc 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -25,8 +25,8 @@ void handler::addEventToQueue(shared_ptr_class Queue, } event handler::finalize() { - // This block of code is needed only to 5th/default reduction implementation. - // It is harmless (does nothing) for other implementations. + // This block of code is needed only for reduction implementation. + // It is harmless (does nothing) for everything else. if (MIsFinalized) return MLastEvent; MIsFinalized = true; diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp index 9d2b8906e9c7f..5b595a86978ea 100644 --- a/sycl/test/abi/symbol_size.cpp +++ b/sycl/test/abi/symbol_size.cpp @@ -43,7 +43,11 @@ int main() { check_size(); check_size(); check_size(); +#ifdef _MSC_VER + check_size(); +#else check_size(); +#endif check_size, 16>(); check_size(); check_size(); diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp index 57029e4654415..2c67fda514b04 100644 --- a/sycl/test/reduction/reduction_nd_conditional.cpp +++ b/sycl/test/reduction/reduction_nd_conditional.cpp @@ -1,17 +1,9 @@ -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==---reduction_nd_conditional.cpp - SYCL reduction + condition test ------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reduction and conditional increment of the reduction variable. @@ -41,9 +33,7 @@ void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, }; template -class Known; -template -class Unknown; +class SomeClass; template struct Vec { @@ -97,7 +87,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) { range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { size_t I = NDIt.get_global_linear_id(); if (I < 2) diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp index fe764eb2e00cb..a3a9a8095e0ad 100644 --- a/sycl/test/reduction/reduction_nd_s0_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -1,73 +1,20 @@ -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 0-dimensional discard_write accessor. +#include "reduction_utils.hpp" #include #include using namespace cl::sycl; -template -void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, - BinaryOperation BOp, size_t N) { - ExpectedOut = Identity; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - else - In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = BOp(ExpectedOut, In[I]); - } -}; - -template -class Known; template -class Unknown; - -template -struct Vec { - Vec() : X(0), Y(0) {} - Vec(T X, T Y) : X(X), Y(Y) {} - Vec(T V) : X(V), Y(V) {} - bool operator==(const Vec &P) const { - return P.X == X && P.Y == Y; - } - bool operator!=(const Vec &P) const { - return !(*this == P); - } - T X; - T Y; -}; -template -bool operator==(const Vec &A, const Vec &B) { - return A.X == B.X && A.Y == B.Y; -} -template -std::ostream &operator<<(std::ostream &OS, const Vec &P) { - return OS << "(" << P.X << ", " << P.Y << ")"; -} - -template -struct VecPlus { - using P = Vec; - P operator()(const P &A, const P &B) const { - return P(A.X + B.X, A.Y + B.Y); - } -}; +class SomeClass; template void test(T Identity, size_t WGSize, size_t NWItems) { @@ -90,7 +37,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) { range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -142,7 +89,7 @@ int main() { test>(std::numeric_limits::min(), 8, 256); // Check with CUSTOM type. - test, 0, VecPlus>(Vec(0), 8, 256); + test, 0, CustomVecPlus>(CustomVec(0), 8, 256); std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp index 7ecee96832e4c..7f58d311424e8 100644 --- a/sycl/test/reduction/reduction_nd_s0_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -1,73 +1,20 @@ -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 0-dimensional read_write accessor. +#include "reduction_utils.hpp" #include #include using namespace cl::sycl; -template -void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, - BinaryOperation BOp, size_t N) { - ExpectedOut = Identity; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - else - In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = BOp(ExpectedOut, In[I]); - } -}; - -template -class Known; template -class Unknown; - -template -struct Vec { - Vec() : X(0), Y(0) {} - Vec(T X, T Y) : X(X), Y(Y) {} - Vec(T V) : X(V), Y(V) {} - bool operator==(const Vec &P) const { - return P.X == X && P.Y == Y; - } - bool operator!=(const Vec &P) const { - return !(*this == P); - } - T X; - T Y; -}; -template -bool operator==(const Vec &A, const Vec &B) { - return A.X == B.X && A.Y == B.Y; -} -template -std::ostream &operator<<(std::ostream &OS, const Vec &P) { - return OS << "(" << P.X << ", " << P.Y << ")"; -} - -template -struct VecPlus { - using P = Vec; - P operator()(const P &A, const P &B) const { - return P(A.X + B.X, A.Y + B.Y); - } -}; +class SomeClass; template void test(T Identity, size_t WGSize, size_t NWItems) { @@ -92,7 +39,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) { range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -144,7 +91,7 @@ int main() { test>(std::numeric_limits::min(), 8, 256); // Check with CUSTOM type. - test, 0, VecPlus>(Vec(0), 8, 256); + test, 0, CustomVecPlus>(CustomVec(0), 8, 256); std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp index 27df7b4647771..7b0ca8aeebfe5 100644 --- a/sycl/test/reduction/reduction_nd_s1_dw.cpp +++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp @@ -1,74 +1,21 @@ -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 1-dimensional discard_write accessor // accessing 1 element buffer. +#include "reduction_utils.hpp" #include #include using namespace cl::sycl; -template -void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, - BinaryOperation BOp, size_t N) { - ExpectedOut = Identity; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - else - In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = BOp(ExpectedOut, In[I]); - } -}; - -template -class Known; template -class Unknown; - -template -struct Vec { - Vec() : X(0), Y(0) {} - Vec(T X, T Y) : X(X), Y(Y) {} - Vec(T V) : X(V), Y(V) {} - bool operator==(const Vec &P) const { - return P.X == X && P.Y == Y; - } - bool operator!=(const Vec &P) const { - return !(*this == P); - } - T X; - T Y; -}; -template -bool operator==(const Vec &A, const Vec &B) { - return A.X == B.X && A.Y == B.Y; -} -template -std::ostream &operator<<(std::ostream &OS, const Vec &P) { - return OS << "(" << P.X << ", " << P.Y << ")"; -} - -template -struct VecPlus { - using P = Vec; - P operator()(const P &A, const P &B) const { - return P(A.X + B.X, A.Y + B.Y); - } -}; +class SomeClass; template void test(T Identity, size_t WGSize, size_t NWItems) { @@ -91,7 +38,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) { range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -143,7 +90,7 @@ int main() { test>(std::numeric_limits::min(), 8, 256); // Check with CUSTOM type. - test, 1, VecPlus>(Vec(0), 8, 256); + test, 1, CustomVecPlus>(CustomVec(0), 8, 256); std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp index f416f473a69a0..333b03bf40cb0 100644 --- a/sycl/test/reduction/reduction_nd_s1_rw.cpp +++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp @@ -1,74 +1,21 @@ -// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out // RUN: %ACC_RUN_PLACEHOLDER %t.out -//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with 1-dimensional read_write accessor // accessing 1 element buffer. +#include "reduction_utils.hpp" #include #include using namespace cl::sycl; -template -void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, - BinaryOperation BOp, size_t N) { - ExpectedOut = Identity; - auto In = InBuf.template get_access(); - for (int I = 0; I < N; ++I) { - if (std::is_same>::value) - In[I] = 1 + (((I % 37) == 0) ? 1 : 0); - else - In[I] = ((I + 1) % 5) + 1.1; - ExpectedOut = BOp(ExpectedOut, In[I]); - } -}; - -template -class Known; template -class Unknown; - -template -struct Vec { - Vec() : X(0), Y(0) {} - Vec(T X, T Y) : X(X), Y(Y) {} - Vec(T V) : X(V), Y(V) {} - bool operator==(const Vec &P) const { - return P.X == X && P.Y == Y; - } - bool operator!=(const Vec &P) const { - return !(*this == P); - } - T X; - T Y; -}; -template -bool operator==(const Vec &A, const Vec &B) { - return A.X == B.X && A.Y == B.Y; -} -template -std::ostream &operator<<(std::ostream &OS, const Vec &P) { - return OS << "(" << P.X << ", " << P.Y << ")"; -} - -template -struct VecPlus { - using P = Vec; - P operator()(const P &A, const P &B) const { - return P(A.X + B.X, A.Y + B.Y); - } -}; +class SomeClass; template void test(T Identity, size_t WGSize, size_t NWItems) { @@ -93,7 +40,7 @@ void test(T Identity, size_t WGSize, size_t NWItems) { range<1> GlobalRange(NWItems); range<1> LocalRange(WGSize); nd_range<1> NDRange(GlobalRange, LocalRange); - CGH.parallel_for>( + CGH.parallel_for>( NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { Sum.combine(In[NDIt.get_global_linear_id()]); }); @@ -145,7 +92,7 @@ int main() { test>(std::numeric_limits::min(), 8, 256); // Check with CUSTOM type. - test, 1, VecPlus>(Vec(0), 8, 256); + test, 1, CustomVecPlus>(CustomVec(0), 8, 256); std::cout << "Test passed\n"; return 0; diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp new file mode 100644 index 0000000000000..c7a1699298f02 --- /dev/null +++ b/sycl/test/reduction/reduction_utils.hpp @@ -0,0 +1,54 @@ +#include + +using namespace cl::sycl; + +// Initializes 'InBuf' buffer with pseudo-random values, computes the reduction +// value for the buffer and writes it to 'ExpectedOut'. +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +// This type is needed only to check that custom types are properly handled +// in parallel_for() with reduction. For simplicity it needs a default +// constructor, a constructor with one argument, operators ==, != and +// printing to a stream. +template +struct CustomVec { + CustomVec() : X(0), Y(0) {} + CustomVec(T X, T Y) : X(X), Y(Y) {} + CustomVec(T V) : X(V), Y(V) {} + bool operator==(const CustomVec &V) const { + return V.X == X && V.Y == Y; + } + bool operator!=(const CustomVec &V) const { + return !(*this == V); + } + T X; + T Y; +}; +template +bool operator==(const CustomVec &A, const CustomVec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const CustomVec &V) { + return OS << "(" << V.X << ", " << V.Y << ")"; +} + +template +struct CustomVecPlus { + using CV = CustomVec; + CV operator()(const CV &A, const CV &B) const { + return CV(A.X + B.X, A.Y + B.Y); + } +}; From a6350ba37957d91d592a7c2c794e9cdd248b7cc3 Mon Sep 17 00:00:00 2001 From: Vyacheslav N Klochkov Date: Mon, 27 Apr 2020 14:41:44 -0700 Subject: [PATCH 6/6] [SYCL] Add a test to check reductions using transparent operators Signed-off-by: Vyacheslav N Klochkov --- sycl/test/reduction/reduction_transparent.cpp | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 sycl/test/reduction/reduction_transparent.cpp diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp new file mode 100644 index 0000000000000..d9392fc2784a4 --- /dev/null +++ b/sycl/test/reduction/reduction_transparent.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// where func is a transparent functor. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { +#if __cplusplus >= 201402L + test>(std::numeric_limits::min(), 7, 7 * 5); + test>(0, 7, 49); + test>(1, 4, 16); +#endif + + std::cout << "Test passed\n"; + return 0; +}