diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index 400fd080cef1f..784dcfbac20df 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -24,11 +24,7 @@ namespace sycl { #else namespace __sycl_std = __host_std; #endif -} // namespace sycl -} // __SYCL_INLINE_NAMESPACE(cl) -__SYCL_INLINE_NAMESPACE(cl) { -namespace sycl { /* ----------------- 4.13.3 Math functions. ---------------------------------*/ // genfloat acos (genfloat x) template @@ -731,7 +727,8 @@ detail::enable_if_t::value, T> clz(T x) __NOEXC { namespace intel { // geninteger ctz (geninteger x) template -detail::enable_if_t::value, T> ctz(T x) __NOEXC { +sycl::detail::enable_if_t::value, T> +ctz(T x) __NOEXC { return __sycl_std::__invoke_ctz(x); } } // namespace intel diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index fa1b992624644..100c29e8ebe38 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -46,7 +46,7 @@ class interop_handler { public: using QueueImplPtr = std::shared_ptr; - using ReqToMem = std::pair; + using ReqToMem = std::pair; interop_handler(std::vector MemObjs, QueueImplPtr Queue) : MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {} diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index dc3c49745b43f..941e878ebb591 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -106,8 +106,45 @@ template struct get_kernel_name_t { }; __SYCL_EXPORT device getDeviceFromHandler(handler &); + +/// These are the forward declaration for the classes that help to create +/// names for additional kernels. It is used only when there are +/// more then 1 kernels in one parallel_for() implementing SYCL reduction. +template class __sycl_reduction_main_2nd_kernel; +template class __sycl_reduction_aux_1st_kernel; +template class __sycl_reduction_aux_2nd_kernel; + +/// Helper structs to get additional kernel name types based on given +/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then +/// \c Type becomes the \c Name. +template +struct get_reduction_main_2nd_kernel_name_t { + using name = __sycl_reduction_main_2nd_kernel< + typename get_kernel_name_t::name>; +}; +template +struct get_reduction_aux_1st_kernel_name_t { + using name = __sycl_reduction_aux_1st_kernel< + typename get_kernel_name_t::name>; +}; +template +struct get_reduction_aux_2nd_kernel_name_t { + using name = __sycl_reduction_aux_2nd_kernel< + typename get_kernel_name_t::name>; +}; + +device getDeviceFromHandler(handler &); + } // namespace detail +namespace intel { +namespace detail { +template +class reduction_impl; +} // namespace detail +} // namespace intel + /// 4.8.3 Command group handler class /// /// Objects of the handler class collect information about command group, such @@ -190,14 +227,24 @@ class __SYCL_EXPORT handler { return LambdaName == KernelName; } + /// Saves the location of user's code passed in \param CodeLoc for future + /// usage in finalize() method. + void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; } + + /// Stores the given \param Event to the \param Queue. + /// Even though MQueue is a field of handler, the method addEvent() of + /// queue_impl class cannot be called inside this handler.hpp file + /// as queue_impl is incomplete class for handler. + static void addEventToQueue(shared_ptr_class Queue, + cl::sycl::event Event); + /// Constructs CG object of specific type, passes it to Scheduler and /// returns sycl::event object representing the command group. /// It's expected that the method is the latest method executed before /// object destruction. /// - /// \param Payload contains the code location of user code /// \return a SYCL event object representing the command group - event finalize(const cl::sycl::detail::code_location &Payload = {}); + event finalize(); /// Saves streams associated with this handler. /// @@ -208,6 +255,16 @@ class __SYCL_EXPORT handler { MStreamStorage.push_back(std::move(Stream)); } + /// Saves buffers created by handling reduction feature in handler. + /// They are then forwarded to command group and destroyed only after + /// the command group finishes the work on device/host. + /// The 'MSharedPtrStorage' suits that need. + /// + /// @param ReduObj is a pointer to object that must be stored. + void addReduction(shared_ptr_class ReduObj) { + MSharedPtrStorage.push_back(std::move(ReduObj)); + } + ~handler() = default; bool is_host() { return MIsHost; } @@ -231,6 +288,30 @@ class __SYCL_EXPORT handler { /*index*/ 0); } + template + void dissociateWithHandler(accessor + Acc) { + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + detail::Requirement *Req = AccImpl.get(); + + // Remove accessor from the list of requirements, accessors storage, + // and from the list of associated accessors. + auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req); + auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl); + auto It = + std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(), + [Req](const detail::ArgDesc &D) { return D.MPtr == Req; }); + assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() && + It != MAssociatedAccesors.end()) && + "Cannot dissociate accessor."); + MRequirements.erase(ReqIt); + MAccStorage.erase(AccIt); + MAssociatedAccesors.erase(It); + } + // Recursively calls itself until arguments pack is fully processed. // The version for regular(standard layout) argument. template @@ -729,6 +810,307 @@ class __SYCL_EXPORT handler { #endif } + /// Implements a command group function that enqueues a kernel that calls + /// user's lambda function \param KernelFunc and does one iteration of + /// reduction of elements in each of work-groups. + /// This version uses tree-reduction algorithm to reduce elements in each + /// of work-groups. At the end of each work-group the partial sum is written + /// to a global buffer. + /// + /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops. + template + void reduCGFunc(KernelType KernelFunc, const nd_range &Range, + Reduction &Redu) { + + size_t NWorkItems = Range.get_global_range().size(); + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + + bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0; + bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0); + + bool IsUpdateOfUserAcc = + Reduction::accessor_mode == access::mode::read_write && + NWorkGroups == 1; + + // Use local memory to reduce elements in work-groups into 0-th element. + // If WGSize is not power of two, then WGSize+1 elements are allocated. + // The additional last element is used to catch elements that could + // otherwise be lost in the tree-reduction algorithm. + size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1); + auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this); + + auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this); + auto ReduIdentity = Redu.getIdentity(); + if (IsEfficientCase) { + // Efficient case: work-groups are fully loaded and work-group size + // is power of two. + parallel_for(Range, [=](nd_item NDIt) { + // Call user's functions. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer(ReduIdentity); + KernelFunc(NDIt, Reducer); + + // Copy the element to local memory to prepare it for tree-reduction. + size_t LID = NDIt.get_local_linear_id(); + LocalReds[LID] = Reducer.MValue; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]. + typename Reduction::binary_operation BOp; + size_t WGSize = NDIt.get_local_range().size(); + for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + NDIt.barrier(); + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) + Out.get_pointer().get()[NDIt.get_group_linear_id()] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) + : LocalReds[0]; + }); + } else { + // Inefficient case: work-groups are not fully loaded + // or WGSize is not power of two. + // These two inefficient cases are handled by one kernel, which + // can be split later into two separate kernels, if there are users who + // really need more efficient code for them. + using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t< + KernelName, KernelType>::name; + parallel_for(Range, [=](nd_item NDIt) { + // Call user's functions. Reducer.MValue gets initialized there. + typename Reduction::reducer_type Reducer(ReduIdentity); + KernelFunc(NDIt, Reducer); + + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + // Copy the element to local memory to prepare it for tree-reduction. + LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity; + LocalReds[WGSize] = ReduIdentity; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + // LocalReds[WGSize] accumulates last/odd elements when the step + // of tree-reduction loop is not even. + typename Reduction::binary_operation BOp; + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + else if (LID == CurStep && (PrevStep & 0x1)) + LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + auto GrID = NDIt.get_group_linear_id(); + auto V = BOp(LocalReds[0], LocalReds[WGSize]); + Out.get_pointer().get()[GrID] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + } + }); + } + } + + /// Implements a command group function that enqueues a kernel that does one + /// iteration of reduction of elements in each of work-groups. + /// This version uses tree-reduction algorithm to reduce elements in each + /// of work-groups. At the end of each work-group the partial sum is written + /// to a global buffer. + /// + /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops. + template + void reduAuxCGFunc(const nd_range &Range, size_t NWorkItems, + size_t KernelRun, Reduction &Redu) { + size_t WGSize = Range.get_local_range().size(); + size_t NWorkGroups = Range.get_group_range().size(); + + // The last work-group may be not fully loaded with work, or the work group + // size may be not power of those. Those two cases considered inefficient + // as they require additional code and checks in the kernel. + bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems; + bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0; + + bool IsUpdateOfUserAcc = + Reduction::accessor_mode == access::mode::read_write && + NWorkGroups == 1; + + // Use local memory to reduce elements in work-groups into 0-th element. + // If WGSize is not power of two, then WGSize+1 elements are allocated. + // The additional last element is used to catch elements that could + // otherwise be lost in the tree-reduction algorithm. + size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1); + auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this); + + // Get read accessor to the buffer that was used as output + // in the previous kernel. After that create new output buffer if needed + // and get accessor to it (or use reduction's accessor if the kernel + // is the last one). + auto In = Redu.getReadAccToPreviousPartialReds(*this); + auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this); + + if (IsEfficientCase) { + // Efficient case: work-groups are fully loaded and work-group size + // is power of two. + using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t< + KernelName, KernelType>::name; + parallel_for(Range, [=](nd_item NDIt) { + // Copy the element to local memory to prepare it for tree-reduction. + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + LocalReds[LID] = In[GID]; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + typename Reduction::binary_operation BOp; + size_t WGSize = NDIt.get_local_range().size(); + for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + NDIt.barrier(); + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) + Out.get_pointer().get()[NDIt.get_group_linear_id()] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0]) + : LocalReds[0]; + }); + } else { + // Inefficient case: work-groups are not fully loaded + // or WGSize is not power of two. + // These two inefficient cases are handled by one kernel, which + // can be split later into two separate kernels, if there are users + // who really need more efficient code for them. + using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t< + KernelName, KernelType>::name; + auto ReduIdentity = Redu.getIdentity(); + parallel_for(Range, [=](nd_item NDIt) { + size_t WGSize = NDIt.get_local_range().size(); + size_t LID = NDIt.get_local_linear_id(); + size_t GID = NDIt.get_global_linear_id(); + // Copy the element to local memory to prepare it for tree-reduction + LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity; + LocalReds[WGSize] = ReduIdentity; + NDIt.barrier(); + + // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0] + // LocalReds[WGSize] accumulates last/odd elements when the step + // of tree-reduction loop is not even. + typename Reduction::binary_operation BOp; + size_t PrevStep = WGSize; + for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) { + if (LID < CurStep) + LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]); + else if (LID == CurStep && (PrevStep & 0x1)) + LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]); + NDIt.barrier(); + PrevStep = CurStep; + } + + // Compute the partial sum/reduction for the work-group. + if (LID == 0) { + auto GrID = NDIt.get_group_linear_id(); + auto V = BOp(LocalReds[0], LocalReds[WGSize]); + Out.get_pointer().get()[GrID] = + IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V; + } + }); + } + } + + /// Defines and invokes a SYCL kernel function for the specified nd_range. + /// Performs reduction operation specified in \param Redu. + /// + /// The SYCL kernel function is defined as a lambda function or a named + /// function object type and given an id or item for indexing in the indexing + /// space defined by range. + /// If it is a named function object and the function object type is + /// globally visible, there is no need for the developer to provide + /// a kernel name for it. + /// + /// TODO: currently it calls only those versions of kernels that can handle + /// custom types and operations. Some of types and operations may use faster + /// implementations that use intel::reduce() and/or sycl::atomic.fetch_() + /// functions and thus provide much better performance. Those variants exist, + /// are fully functional. They just wait for their time for code-review. + /// TODO: Need to handle more than 1 reduction in parallel_for(). + /// TODO: Support HOST. The kernels called by this parallel_for() may use + /// some functionality that is not yet supported on HOST such as: + /// barrier(), and intel::reduce() that also may be used in more + /// optimized implementations waiting for their turn of code-review. + template + void parallel_for(nd_range Range, Reduction &Redu, + KernelType KernelFunc) { + size_t NWorkGroups = Range.get_group_range().size(); + + // This parallel_for() is lowered to the following sequence: + // 1) Call a kernel that a) call user's lambda function and b) performs + // one iteration of reduction, storing the partial reductions/sums + // to either a newly created global buffer or to user's reduction + // accessor. So, if the original 'Range' has totally + // N1 elements and work-group size is W, then after the first iteration + // there will be N2 partial sums where N2 = N1 / W. + // If (N2 == 1) then the partial sum is written to user's accessor. + // Otherwise, a new global buffer is created and partial sums are written + // to it. + // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as + // necessary to reduce all partial sums into one final sum. + + // 1. Call the kernel that includes user's lambda function. + // If this kernel is going to be now last one, i.e. it does not write + // to user's accessor, then detach user's accessor from this kernel + // to make the dependencies between accessors and kernels more clean and + // correct. + if (NWorkGroups > 1) + dissociateWithHandler(Redu.MAcc); + + reduCGFunc(KernelFunc, Range, Redu); + auto QueueCopy = MQueue; + MLastEvent = this->finalize(); + + // 2. Run the additional aux kernel as many times as needed to reduce + // all partial sums into one scalar. + size_t WGSize = Range.get_local_range().size(); + size_t NWorkItems = NWorkGroups; + size_t KernelRun = 1; + while (NWorkItems > 1) { + // Before creating another kernel, add the event from the previous kernel + // to queue. + addEventToQueue(QueueCopy, MLastEvent); + + // TODO: here the work-group size is not limited by user's needs, + // the better strategy here is to make the work-group-size as big + // as possible. + WGSize = std::min(WGSize, NWorkItems); + NWorkGroups = NWorkItems / WGSize; + // The last group may be not fully loaded. Still register it as a group. + if ((NWorkItems % WGSize) != 0) + ++NWorkGroups; + auto Range = + nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize)); + + handler AuxHandler(QueueCopy, MIsHost); + AuxHandler.saveCodeLoc(MCodeLoc); + + // The last kernel DOES write to reduction's accessor. + // Associate it with handler manually. + if (NWorkGroups == 1) + AuxHandler.associateWithHandler(Redu.MAcc); + AuxHandler.reduAuxCGFunc(Range, NWorkItems, + KernelRun, Redu); + MLastEvent = AuxHandler.finalize(); + + NWorkItems = NWorkGroups; + ++KernelRun; + } // end while (NWorkItems > 1) + } + /// Hierarchical kernel invocation method of a kernel defined as a lambda /// encoding the body of each work-group to launch. /// @@ -1368,6 +1750,10 @@ class __SYCL_EXPORT handler { bool MIsHost = false; + detail::code_location MCodeLoc = {}; + bool MIsFinalized = false; + event MLastEvent; + // Make queue_impl class friend to be able to call finalize method. friend class detail::queue_impl; // Make accessor class friend to keep the list of associated accessors. @@ -1382,6 +1768,11 @@ class __SYCL_EXPORT handler { // Make stream class friend to be able to keep the list of associated streams friend class stream; friend class detail::stream_impl; + // Make reduction_impl friend to store buffers and arrays created for it + // in handler from reduction_impl methods. + template + friend class intel::detail::reduction_impl; }; } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/intel/function_pointer.hpp b/sycl/include/CL/sycl/intel/function_pointer.hpp index 2aa64cdd2e580..f812be911b788 100644 --- a/sycl/include/CL/sycl/intel/function_pointer.hpp +++ b/sycl/include/CL/sycl/intel/function_pointer.hpp @@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName, PI_INVALID_OPERATION); } - return detail::getDeviceFunctionPointerImpl(D, P, FuncName); + return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName); } } // namespace intel } // namespace sycl diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp index 2b0fb264bc94f..3b6289cb03ecc 100644 --- a/sycl/include/CL/sycl/intel/reduction.hpp +++ b/sycl/include/CL/sycl/intel/reduction.hpp @@ -342,6 +342,31 @@ class reduction_impl { "Only scalar/1-element reductions are supported now."); } + accessor + getReadWriteLocalAcc(size_t Size, handler &CGH) { + return accessor(Size, CGH); + } + + accessor + getReadAccToPreviousPartialReds(handler &CGH) const { + CGH.addReduction(MOutBufPtr); + return accessor(*MOutBufPtr, CGH); + } + + accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber, + handler &CGH) { + if (Size == 1) { + if (RunNumber > 0) + CGH.associateWithHandler(this->MAcc); + return this->MAcc; + } + // Create a new output buffer and return an accessor to it. + MOutBufPtr = std::make_shared>(range<1>(Size)); + CGH.addReduction(MOutBufPtr); + return accessor_type(*MOutBufPtr, CGH); + } /// User's accessor to where the reduction must be written. accessor_type MAcc; @@ -349,6 +374,7 @@ class reduction_impl { /// Identity of the BinaryOperation. /// The result of BinaryOperation(X, MIdentity) is equal to X for any X. const T MIdentity; + shared_ptr_class> MOutBufPtr; }; } // namespace detail diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 937223f114c5f..9b65ead92fcde 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -351,6 +351,11 @@ class queue_impl { /// \return a native handle. pi_native_handle getNative() const; + /// Stores an event that should be associated with the queue + /// + /// \param Event is the event to be stored + void addEvent(event Event); + private: /// Performs command group submission to the queue. /// @@ -362,8 +367,9 @@ class queue_impl { shared_ptr_class Self, const detail::code_location &Loc) { handler Handler(std::move(Self), MHostQueue); + Handler.saveCodeLoc(Loc); CGF(Handler); - event Event = Handler.finalize(Loc); + event Event = Handler.finalize(); addEvent(Event); return Event; } @@ -377,11 +383,6 @@ class queue_impl { void instrumentationEpilog(void *TelementryEvent, string_class &Name, int32_t StreamID, uint64_t IId); - /// Stores an event that should be associated with the queue - /// - /// \param Event is the event to be stored - void addEvent(event Event); - /// Stores a USM operation event that should be associated with the queue /// /// \param Event is the event to be stored diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ac6baf67bfd25..8685b1060dcdc 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -13,12 +13,24 @@ #include #include #include +#include #include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -event handler::finalize(const cl::sycl::detail::code_location &Payload) { - sycl::event EventRet; + +void handler::addEventToQueue(shared_ptr_class Queue, + cl::sycl::event Event) { + Queue->addEvent(std::move(Event)); +} + +event handler::finalize() { + // This block of code is needed only for reduction implementation. + // It is harmless (does nothing) for everything else. + if (MIsFinalized) + return MLastEvent; + MIsFinalized = true; + unique_ptr_class CommandGroup; switch (MCGType) { case detail::CG::KERNEL: @@ -29,14 +41,14 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, - Payload)); + MCodeLoc)); break; } case detail::CG::INTEROP_TASK_CODEPLAY: CommandGroup.reset(new detail::CGInteropTask( std::move(MInteropTask), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), MCGType, Payload)); + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); break; case detail::CG::COPY_ACC_TO_PTR: case detail::CG::COPY_PTR_TO_ACC: @@ -44,37 +56,37 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { CommandGroup.reset(new detail::CGCopy( MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::FILL: CommandGroup.reset(new detail::CGFill( std::move(MPattern), MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::UPDATE_HOST: CommandGroup.reset(new detail::CGUpdateHost( MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents), Payload)); + std::move(MEvents), MCodeLoc)); break; case detail::CG::COPY_USM: CommandGroup.reset(new detail::CGCopyUSM( MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::FILL_USM: CommandGroup.reset(new detail::CGFillUSM( std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), Payload)); + std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; case detail::CG::PREFETCH_USM: CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents), Payload)); + std::move(MEvents), MCodeLoc)); break; case detail::CG::NONE: throw runtime_error("Command group submitted without a kernel or a " @@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), std::move(MQueue)); - EventRet = detail::createSyclObjFromImpl(Event); - return EventRet; + MLastEvent = detail::createSyclObjFromImpl(Event); + return MLastEvent; } void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index ccab7147a68c7..700716600350d 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3232,7 +3232,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev _ZN2cl4sycl7handler18extractArgsAndReqsEv _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE -_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE +_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE +_ZN2cl4sycl7handler8finalizeEv _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_ _ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp index bfdeaf19df24f..5b595a86978ea 100644 --- a/sycl/test/abi/symbol_size.cpp +++ b/sycl/test/abi/symbol_size.cpp @@ -43,7 +43,11 @@ int main() { check_size(); check_size(); check_size(); - check_size(); +#ifdef _MSC_VER + check_size(); +#else + check_size(); +#endif check_size, 16>(); check_size(); check_size(); diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp new file mode 100644 index 0000000000000..2c67fda514b04 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_conditional.cpp @@ -0,0 +1,121 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reduction and conditional increment of the reduction variable. + +#include +#include + +using namespace cl::sycl; + +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = I + 1 + 1.1; + + if (I < 2) + ExpectedOut = BOp(ExpectedOut, 99); + else if (I % 3) + ExpectedOut = BOp(ExpectedOut, In[I]); + else + ; // do nothing. + } +}; + +template +class SomeClass; + +template +struct Vec { + Vec() : X(0), Y(0) {} + Vec(T X, T Y) : X(X), Y(Y) {} + Vec(T V) : X(V), Y(V) {} + bool operator==(const Vec &P) const { + return P.X == X && P.Y == Y; + } + bool operator!=(const Vec &P) const { + return !(*this == P); + } + T X; + T Y; +}; +template +bool operator==(const Vec &A, const Vec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const Vec &P) { + return OS << "(" << P.X << ", " << P.Y << ")"; +} + +template +struct VecPlus { + using P = Vec; + P operator()(const P &A, const P &B) const { + return P(A.X + B.X, A.Y + B.Y); + } +}; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + size_t I = NDIt.get_global_linear_id(); + if (I < 2) + Sum.combine(T(99)); + else if (I % 3) + Sum.combine(In[I]); + else + ; // do nothing. + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 2, 64); + test>(0, 16, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp new file mode 100644 index 0000000000000..a3a9a8095e0ad --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp @@ -0,0 +1,96 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 0-dimensional discard_write accessor. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 0, CustomVecPlus>(CustomVec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp new file mode 100644 index 0000000000000..7f58d311424e8 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp @@ -0,0 +1,98 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 0-dimensional read_write accessor. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + (OutBuf.template get_access())[0] = Identity; + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 0, CustomVecPlus>(CustomVec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp new file mode 100644 index 0000000000000..7b0ca8aeebfe5 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp @@ -0,0 +1,97 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 1-dimensional discard_write accessor +// accessing 1 element buffer. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 1, CustomVecPlus>(CustomVec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp new file mode 100644 index 0000000000000..333b03bf40cb0 --- /dev/null +++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp @@ -0,0 +1,99 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// with reductions initialized with 1-dimensional read_write accessor +// accessing 1 element buffer. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + (OutBuf.template get_access())[0] = Identity; + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { + // Check some less standards WG sizes and corner cases first. + test>(0, 2, 2); + test>(0, 7, 7); + test>(0, 9, 18); + test>(0, 49, 49 * 5); + + // Try some power-of-two work-group sizes. + test>(0, 2, 64); + test>(0, 4, 64); + test>(0, 8, 128); + test>(0, 16, 256); + test>(0, 32, 256); + test>(0, 64, 256); + test>(0, 128, 256); + test>(0, 256, 256); + + // Check with various operations. + test>(1, 8, 256); + test>(0, 8, 256); + test>(0, 8, 256); + test>(~0, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with various types. + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + test>(1, 8, 256); + test>(std::numeric_limits::max(), 8, 256); + test>(std::numeric_limits::min(), 8, 256); + + // Check with CUSTOM type. + test, 1, CustomVecPlus>(CustomVec(0), 8, 256); + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp new file mode 100644 index 0000000000000..d9392fc2784a4 --- /dev/null +++ b/sycl/test/reduction/reduction_transparent.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test performs basic checks of parallel_for(nd_range, reduction, func) +// where func is a transparent functor. + +#include "reduction_utils.hpp" +#include +#include + +using namespace cl::sycl; + +template +class SomeClass; + +template +void test(T Identity, size_t WGSize, size_t NWItems) { + buffer InBuf(NWItems); + buffer OutBuf(1); + + // Initialize. + BinaryOperation BOp; + T CorrectOut; + initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); + + // Compute. + queue Q; + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + accessor + Out(OutBuf, CGH); + auto Redu = intel::reduction(Out, Identity, BOp); + + range<1> GlobalRange(NWItems); + range<1> LocalRange(WGSize); + nd_range<1> NDRange(GlobalRange, LocalRange); + CGH.parallel_for>( + NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { + Sum.combine(In[NDIt.get_global_linear_id()]); + }); + }); + + // Check correctness. + auto Out = OutBuf.template get_access(); + T ComputedOut = *(Out.get_pointer()); + if (ComputedOut != CorrectOut) { + std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; + std::cout << "Computed value: " << ComputedOut + << ", Expected value: " << CorrectOut << "\n"; + assert(0 && "Wrong value."); + } +} + +int main() { +#if __cplusplus >= 201402L + test>(std::numeric_limits::min(), 7, 7 * 5); + test>(0, 7, 49); + test>(1, 4, 16); +#endif + + std::cout << "Test passed\n"; + return 0; +} diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp new file mode 100644 index 0000000000000..c7a1699298f02 --- /dev/null +++ b/sycl/test/reduction/reduction_utils.hpp @@ -0,0 +1,54 @@ +#include + +using namespace cl::sycl; + +// Initializes 'InBuf' buffer with pseudo-random values, computes the reduction +// value for the buffer and writes it to 'ExpectedOut'. +template +void initInputData(buffer &InBuf, T &ExpectedOut, T Identity, + BinaryOperation BOp, size_t N) { + ExpectedOut = Identity; + auto In = InBuf.template get_access(); + for (int I = 0; I < N; ++I) { + if (std::is_same>::value) + In[I] = 1 + (((I % 37) == 0) ? 1 : 0); + else + In[I] = ((I + 1) % 5) + 1.1; + ExpectedOut = BOp(ExpectedOut, In[I]); + } +}; + +// This type is needed only to check that custom types are properly handled +// in parallel_for() with reduction. For simplicity it needs a default +// constructor, a constructor with one argument, operators ==, != and +// printing to a stream. +template +struct CustomVec { + CustomVec() : X(0), Y(0) {} + CustomVec(T X, T Y) : X(X), Y(Y) {} + CustomVec(T V) : X(V), Y(V) {} + bool operator==(const CustomVec &V) const { + return V.X == X && V.Y == Y; + } + bool operator!=(const CustomVec &V) const { + return !(*this == V); + } + T X; + T Y; +}; +template +bool operator==(const CustomVec &A, const CustomVec &B) { + return A.X == B.X && A.Y == B.Y; +} +template +std::ostream &operator<<(std::ostream &OS, const CustomVec &V) { + return OS << "(" << V.X << ", " << V.Y << ")"; +} + +template +struct CustomVecPlus { + using CV = CustomVec; + CV operator()(const CV &A, const CV &B) const { + return CV(A.X + B.X, A.Y + B.Y); + } +};