diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 91b2b0e22eb1c..318a0b9827fe2 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -23,6 +23,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_THROW_ON_BLOCK | Any(\*) | Throw an exception on attempt to wait for a blocked command. | | SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | | SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. | +| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. | `(*) Note: Any means this environment variable is effective when set to any non-null value.` ### SYCL_PRINT_EXECUTION_GRAPH Options diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index b2c2ec7af60d0..97feaa283182c 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -778,7 +778,7 @@ class accessor : template + (!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))> > accessor(buffer &BufferRef, handler &CommandGroupHandler) @@ -817,9 +817,9 @@ class accessor : #endif template 0) && (Dims == Dimensions) && - (!IsPlaceH && - (IsGlobalBuf || IsConstantBuf))>> + typename = detail::enable_if_t< + (Dims > 0) && (Dims == Dimensions) && + (!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>> accessor(buffer &BufferRef, handler &CommandGroupHandler) #ifdef __SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 100c29e8ebe38..26ee7b6f89837 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -215,6 +215,16 @@ class InteropTask { void call(cl::sycl::interop_handler &h) { MFunc(h); } }; +class HostTask { + std::function MHostTask; + +public: + HostTask() : MHostTask([]() {}) {} + HostTask(std::function &&Func) : MHostTask(Func) {} + + void call() { MHostTask(); } +}; + // Class which stores specific lambda object. template class HostKernel : public HostKernelBase { @@ -391,7 +401,8 @@ class CG { COPY_USM, FILL_USM, PREFETCH_USM, - INTEROP_TASK_CODEPLAY + CODEPLAY_INTEROP_TASK, + CODEPLAY_HOST_TASK }; CG(CGTYPE Type, vector_class> ArgsStorage, @@ -631,6 +642,24 @@ class CGInteropTask : public CG { MInteropTask(std::move(InteropTask)) {} }; +class CGHostTask : public CG { +public: + std::unique_ptr MHostTask; + vector_class MArgs; + + CGHostTask(std::unique_ptr HostTask, vector_class Args, + std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements, + std::vector Events, CGTYPE Type, + detail::code_location loc = {}) + : CG(Type, std::move(ArgsStorage), std::move(AccStorage), + std::move(SharedPtrStorage), std::move(Requirements), + std::move(Events), std::move(loc)), + MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {} +}; + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index a2c24ee63febb..d08816bd56206 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -105,6 +105,27 @@ template struct get_kernel_name_t { using name = Type; }; +template struct check_fn_signature { + static_assert(std::integral_constant::value, + "Second template parameter is required to be of function type"); +}; + +template +struct check_fn_signature { +private: + template + static constexpr auto check(T *) -> typename std::is_same< + decltype(std::declval().operator()(std::declval()...)), + RetT>::type; + + template static constexpr std::false_type check(...); + + using type = decltype(check(0)); + +public: + static constexpr bool value = type::value; +}; + __SYCL_EXPORT device getDeviceFromHandler(handler &); } // namespace detail @@ -802,6 +823,20 @@ class __SYCL_EXPORT handler { MCGType = detail::CG::RUN_ON_HOST_INTEL; } + template + typename std::enable_if::type, void()>::value>::type + codeplay_host_task(FuncT Func) { + throwIfActionIsCreated(); + + MNDRDesc.set(range<1>(1)); + MArgs = std::move(MAssociatedAccesors); + + MHostTask.reset(new detail::HostTask(std::move(Func))); + + MCGType = detail::CG::CODEPLAY_HOST_TASK; + } + /// Defines and invokes a SYCL kernel function for the specified range and /// offset. /// @@ -1128,7 +1163,7 @@ class __SYCL_EXPORT handler { template void interop_task(FuncT Func) { MInteropTask.reset(new detail::InteropTask(std::move(Func))); - MCGType = detail::CG::INTEROP_TASK_CODEPLAY; + MCGType = detail::CG::CODEPLAY_INTEROP_TASK; } /// Defines and invokes a SYCL kernel function for the specified range. @@ -1586,6 +1621,8 @@ class __SYCL_EXPORT handler { vector_class MPattern; /// Storage for a lambda or function object. unique_ptr_class MHostKernel; + /// Storage for lambda/function when using HostTask + unique_ptr_class MHostTask; detail::OSModuleHandle MOSModuleHandle; // Storage for a lambda or function when using InteropTasks std::unique_ptr MInteropTask; diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f3263720a2bb4..313ef01472d98 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -50,11 +50,31 @@ event_impl::~event_impl() { } void event_impl::waitInternal() const { - if (!MHostEvent) { + if (!MHostEvent && MEvent) { getPlugin().call(1, &MEvent); + return; } - // Waiting of host events is NOP so far as all operations on host device - // are blocking. + + while (MState != HES_Complete) + ; +} + +void event_impl::setComplete() { + if (MHostEvent || !MEvent) { +#ifndef NDEBUG + int Expected = HES_NotComplete; + int Desired = HES_Complete; + + bool Succeeded = MState.compare_exchange_strong(Expected, Desired); + + assert(Succeeded && "Unexpected state of event"); +#else + MState.store(static_cast(HES_Complete)); +#endif + return; + } + + assert(false && "setComplete is not supported for non-host event"); } const RT::PiEvent &event_impl::getHandleRef() const { return MEvent; } @@ -68,11 +88,15 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { MHostEvent = Context->is_host(); MOpenCLInterop = !MHostEvent; MContext = Context; + + MState = HES_NotComplete; } +event_impl::event_impl() : MState(HES_Complete) {} + event_impl::event_impl(RT::PiEvent Event, const context &SyclContext) : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)), - MOpenCLInterop(true), MHostEvent(false) { + MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) { if (MContext->is_host()) { throw cl::sycl::invalid_parameter_error( @@ -96,12 +120,19 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext) } event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) { - if (Queue->is_host() && - Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY); + if (Queue->is_host()) { + MState.store(HES_NotComplete); + + if (Queue->has_property()) { + MHostProfilingInfo.reset(new HostProfilingInfo()); + if (!MHostProfilingInfo) + throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY); + } + + return; } + + MState.store(HES_Complete); } void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID, diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 94600f5eb6f9f..58616e13a12d2 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -14,6 +14,7 @@ #include #include +#include #include __SYCL_INLINE_NAMESPACE(cl) { @@ -32,7 +33,7 @@ class event_impl { /// Constructs a ready SYCL event. /// /// If the constructed SYCL event is waited on it will complete immediately. - event_impl() = default; + event_impl(); /// Constructs an event instance from a plug-in event handle. /// /// The SyclContext must match the plug-in context associated with the @@ -166,6 +167,13 @@ class event_impl { bool MHostEvent = true; std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; + + enum HostEventState : int { HES_NotComplete = 0, HES_Complete }; + + // State of host event. Employed only for host events and event with no + // backend's representation (e.g. alloca). Used values are listed in + // HostEventState enum. + std::atomic MState; }; } // namespace detail diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index da07df6b949db..f17807f36c687 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -195,6 +195,30 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { #endif } +void queue_impl::initHostTaskAndEventCallbackThreadPool() { + if (MHostTaskThreadPool) + return; + + int Size = 1; + + if (const char *val = std::getenv("SYCL_QUEUE_THREAD_POOL_SIZE")) + try { + Size = std::stoi(val); + } catch (...) { + throw invalid_parameter_error( + "Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable", + PI_INVALID_VALUE); + } + + if (Size < 1) + throw invalid_parameter_error( + "Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable", + PI_INVALID_VALUE); + + MHostTaskThreadPool.reset(new ThreadPool(Size)); + MHostTaskThreadPool->start(); +} + pi_native_handle queue_impl::getNative() const { auto Plugin = getPlugin(); pi_native_handle Handle; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 64667cc9c5569..367bd95746614 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -21,6 +21,7 @@ #include #include #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -348,6 +349,13 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } + ThreadPool &getThreadPool() { + if (!MHostTaskThreadPool) + initHostTaskAndEventCallbackThreadPool(); + + return *MHostTaskThreadPool; + } + /// Gets the native handle of the SYCL queue. /// /// \return a native handle. @@ -380,6 +388,8 @@ class queue_impl { void instrumentationEpilog(void *TelementryEvent, string_class &Name, int32_t StreamID, uint64_t IId); + void initHostTaskAndEventCallbackThreadPool(); + /// Stores a USM operation event that should be associated with the queue /// /// \param Event is the event to be stored @@ -414,6 +424,10 @@ class queue_impl { const bool MOpenCLInterop = false; // Assume OOO support by default. bool MSupportOOO = true; + + // Thread pool for host task and event callbacks execution. + // The thread pool is instantiated upon the very first call to getThreadPool() + std::unique_ptr MHostTaskThreadPool; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1e5f1c8e4835a..eb40bcdf82044 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -41,6 +41,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { + #ifdef XPTI_ENABLE_INSTRUMENTATION // Global graph for the application extern xpti::trace_event_data_t *GSYCLGraphEvent; @@ -156,58 +157,79 @@ getPiEvents(const std::vector &EventImpls) { return RetPiEvents; } -void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) { - // TODO: Handle return values. Store errors to async handler. - EventImplPtr *Event = (reinterpret_cast(data)); - RT::PiEvent &EventHandle = (*Event)->getHandleRef(); - const detail::plugin &Plugin = (*Event)->getPlugin(); - Plugin.call(EventHandle, PI_EVENT_COMPLETE); - delete (Event); -} +class DispatchHostTask { + ExecCGCommand *MThisCmd; -// Method prepares PI event's from list sycl::event's -std::vector Command::prepareEvents(ContextImplPtr Context) { - std::vector Result; - std::vector GlueEvents; - for (EventImplPtr &DepEvent : MDepsEvents) { - // Async work is not supported for host device. - if (DepEvent->is_host()) { - DepEvent->waitInternal(); - continue; + void waitForEvents() const { + std::map> + RequiredEventsPerPlugin; + + for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) { + const detail::plugin &Plugin = Event->getPlugin(); + RequiredEventsPerPlugin[&Plugin].push_back(Event); } - // The event handle can be null in case of, for example, alloca command, - // which is currently synchrounious, so don't generate OpenCL event. - if (DepEvent->getHandleRef() == nullptr) { - continue; + + // wait for dependency device events + // FIXME Current implementation of waiting for events will make the thread + // 'sleep' until all of dependency events are complete. We need a bit more + // sophisticated waiting mechanism to allow to utilize this thread for any + // other available job and resume once all required events are ready. + for (auto &PluginWithEvents : RequiredEventsPerPlugin) { + std::vector RawEvents = getPiEvents(PluginWithEvents.second); + PluginWithEvents.first->call(RawEvents.size(), + RawEvents.data()); } - ContextImplPtr DepEventContext = DepEvent->getContextImpl(); - - // If contexts don't match - connect them using user event - if (DepEventContext != Context && !Context->is_host()) { - EventImplPtr GlueEvent(new detail::event_impl()); - GlueEvent->setContextImpl(Context); - EventImplPtr *GlueEventCopy = - new EventImplPtr(GlueEvent); // To increase the reference count by 1. - - RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef(); - auto Plugin = Context->getPlugin(); - auto DepPlugin = DepEventContext->getPlugin(); - // Add an event on the current context that - // is triggered when the DepEvent is complete - Plugin.call(Context->getHandleRef(), - &GlueEventHandle); - - DepPlugin.call( - DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk, - /*void *data=*/(GlueEventCopy)); - GlueEvents.push_back(GlueEvent); - Result.push_back(std::move(GlueEvent)); - continue; + + // wait for dependency host events + for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) { + Event->waitInternal(); } - Result.push_back(DepEvent); } - MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end()); - return Result; + +public: + DispatchHostTask(ExecCGCommand *ThisCmd) : MThisCmd{ThisCmd} {} + + void operator()() const { + waitForEvents(); + + assert(MThisCmd->getCG().getType() == CG::CGTYPE::CODEPLAY_HOST_TASK); + + CGHostTask &HostTask = static_cast(MThisCmd->getCG()); + + // we're ready to call the user-defined lambda now + HostTask.MHostTask->call(); + HostTask.MHostTask.reset(); + + // unblock user empty command here + EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd; + assert(EmptyCmd && "No empty command found"); + + // Completing command's event along with unblocking enqueue readiness of + // empty command may lead to quick deallocation of MThisCmd by some cleanup + // process. Thus we'll copy deps prior to completing of event and unblocking + // of empty command. + // Also, it's possible to have record deallocated prior to enqueue process. + // Thus we employ read-lock of graph. + { + Scheduler &Sched = Scheduler::getInstance(); + std::shared_lock Lock(Sched.MGraphLock); + + std::vector Deps = MThisCmd->MDeps; + + // update self-event status + MThisCmd->MEvent->setComplete(); + + EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + + for (const DepDesc &Dep : Deps) + Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement); + } + } +}; + +void Command::waitForPreparedHostEvents() const { + for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents) + HostEvent->waitInternal(); } void Command::waitForEvents(QueueImplPtr Queue, @@ -403,9 +425,37 @@ void Command::makeTraceEventEpilog() { #endif } +void Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) { + const ContextImplPtr &Context = getContext(); + + // 1. Async work is not supported for host device. + // 2. The event handle can be null in case of, for example, alloca command, + // which is currently synchrounious, so don't generate OpenCL event. + // Though, this event isn't host one as it's context isn't host one. + if (DepEvent->is_host() || DepEvent->getHandleRef() == nullptr) { + // call to waitInternal() is in waitForPreparedHostEvents() as it's called + // from enqueue process functions + MPreparedHostDepsEvents.push_back(DepEvent); + return; + } + + ContextImplPtr DepEventContext = DepEvent->getContextImpl(); + // If contexts don't match we'll connect them using host task + if (DepEventContext != Context && !Context->is_host()) { + Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; + GB.connectDepEvent(this, DepEvent, Dep); + } else + MPreparedDepsEvents.push_back(std::move(DepEvent)); +} + +ContextImplPtr Command::getContext() const { + return detail::getSyclObjImpl(MQueue->get_context()); +} + void Command::addDep(DepDesc NewDep) { - if (NewDep.MDepCommand) - MDepsEvents.push_back(NewDep.MDepCommand->getEvent()); + if (NewDep.MDepCommand) { + processDepEvent(NewDep.MDepCommand->getEvent(), NewDep); + } MDeps.push_back(NewDep); #ifdef XPTI_ENABLE_INSTRUMENTATION emitEdgeEventForCommandDependence( @@ -424,7 +474,7 @@ void Command::addDep(EventImplPtr Event) { emitEdgeEventForEventDependence(Cmd, PiEventAddr); #endif - MDepsEvents.push_back(std::move(Event)); + processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr}); } void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) { @@ -466,7 +516,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { if (ThrowOnBlock) throw sycl::runtime_error( std::string("Waiting for blocked command. Block reason: ") + - std::string(MBlockReason), + std::string(getBlockReason()), PI_INVALID_OPERATION); #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -474,7 +524,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { // event, which models the barrier while enqueuing along with the blocked // reason, as determined by the scheduler std::string Info = "enqueue.barrier["; - Info += std::string(MBlockReason) + "]"; + Info += std::string(getBlockReason()) + "]"; emitInstrumentation(xpti::trace_barrier_begin, Info.c_str()); #endif @@ -505,15 +555,21 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { // has already been marked as "failed" if enqueueImp throws an exception. // This will avoid execution of the same failed command twice. MEnqueueStatus = EnqueueResultT::SyclEnqueueFailed; + MShouldCompleteEventIfPossible = true; cl_int Res = enqueueImp(); if (CL_SUCCESS != Res) EnqueueResult = EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res); - else + else { + if (MShouldCompleteEventIfPossible && + (MEvent->is_host() || MEvent->getHandleRef() == nullptr)) + MEvent->setComplete(); + // Consider the command is successfully enqueued if return code is // CL_SUCCESS MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess; + } // Emit this correlation signal before the task end emitEnqueuedEventSignal(MEvent->getHandleRef()); @@ -564,6 +620,17 @@ void Command::resolveReleaseDependencies(std::set &DepList) { #endif } +const char *Command::getBlockReason() const { + switch (MBlockReason) { + case BlockReason::HostAccessor: + return "A Buffer is locked by the host accessor"; + case BlockReason::HostTask: + return "Blocked by host task"; + } + + return "Unknown block reason"; +} + AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd) @@ -617,8 +684,8 @@ void AllocaCommand::emitInstrumentationData() { } cl_int AllocaCommand::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; RT::PiEvent &Event = MEvent->getHandleRef(); @@ -628,6 +695,7 @@ cl_int AllocaCommand::enqueueImp() { if (MQueue->is_host()) { // Do not need to make allocation if we have a linked device allocation Command::waitForEvents(MQueue, EventImpls, Event); + return CL_SUCCESS; } HostPtr = MLinkedAllocaCmd->getMemAllocation(); @@ -637,6 +705,7 @@ cl_int AllocaCommand::enqueueImp() { MMemAllocation = MemoryManager::allocate( detail::getSyclObjImpl(MQueue->get_context()), getSYCLMemObj(), MInitFromUserData, HostPtr, std::move(EventImpls), Event); + return CL_SUCCESS; } @@ -707,8 +776,8 @@ void *AllocaSubBufCommand::getMemAllocation() const { } cl_int AllocaSubBufCommand::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; RT::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( @@ -716,6 +785,7 @@ cl_int AllocaSubBufCommand::enqueueImp() { MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); + return CL_SUCCESS; } @@ -767,8 +837,8 @@ void ReleaseCommand::emitInstrumentationData() { } cl_int ReleaseCommand::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); bool SkipRelease = false; @@ -878,8 +948,8 @@ void MapMemObject::emitInstrumentationData() { } cl_int MapMemObject::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); RT::PiEvent &Event = MEvent->getHandleRef(); @@ -887,6 +957,7 @@ cl_int MapMemObject::enqueueImp() { MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue, MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange, MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; } @@ -934,14 +1005,15 @@ void UnMapMemObject::emitInstrumentationData() { } cl_int UnMapMemObject::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); RT::PiEvent &Event = MEvent->getHandleRef(); MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(), MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr, std::move(RawEvents), Event); + return CL_SUCCESS; } @@ -1000,11 +1072,15 @@ void MemCpyCommand::emitInstrumentationData() { #endif } +ContextImplPtr MemCpyCommand::getContext() const { + const QueueImplPtr &Queue = MQueue->is_host() ? MSrcQueue : MQueue; + return detail::getSyclObjImpl(Queue->get_context()); +} + cl_int MemCpyCommand::enqueueImp() { - std::vector EventImpls; QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue; - EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; RT::PiEvent &Event = MEvent->getHandleRef(); @@ -1067,9 +1143,8 @@ void ExecCGCommand::flushStreams() { } cl_int UpdateHostRequirementCommand::enqueueImp() { - std::vector EventImpls; - EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; RT::PiEvent &Event = MEvent->getHandleRef(); Command::waitForEvents(MQueue, EventImpls, Event); @@ -1077,6 +1152,7 @@ cl_int UpdateHostRequirementCommand::enqueueImp() { assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer"); assert(MDstPtr && "Expected valid target pointer"); *MDstPtr = MSrcAllocaCmd->getMemAllocation(); + return CL_SUCCESS; } @@ -1140,10 +1216,15 @@ void MemCpyCommandHost::emitInstrumentationData() { #endif } +ContextImplPtr MemCpyCommandHost::getContext() const { + const QueueImplPtr &Queue = MQueue->is_host() ? MSrcQueue : MQueue; + return detail::getSyclObjImpl(Queue->get_context()); +} + cl_int MemCpyCommandHost::enqueueImp() { QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue; - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context())); + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); RT::PiEvent &Event = MEvent->getHandleRef(); @@ -1153,6 +1234,7 @@ cl_int MemCpyCommandHost::enqueueImp() { if (MDstReq.MAccessMode == access::mode::discard_read_write || MDstReq.MAccessMode == access::mode::discard_write) { Command::waitForEvents(Queue, EventImpls, Event); + return CL_SUCCESS; } @@ -1162,23 +1244,43 @@ cl_int MemCpyCommandHost::enqueueImp() { MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims, MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; } -EmptyCommand::EmptyCommand(QueueImplPtr Queue, Requirement Req) - : Command(CommandType::EMPTY_TASK, std::move(Queue)), - MRequirement(std::move(Req)) { - +EmptyCommand::EmptyCommand(QueueImplPtr Queue) + : Command(CommandType::EMPTY_TASK, std::move(Queue)) { emitInstrumentationDataProxy(); } +cl_int EmptyCommand::enqueueImp() { + waitForPreparedHostEvents(); + waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef()); + + return CL_SUCCESS; +} + +void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, + const Requirement *Req) { + const Requirement &ReqRef = *Req; + MRequirements.emplace_back(ReqRef); + const Requirement *const StoredReq = &MRequirements.back(); + + addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}); +} + void EmptyCommand::emitInstrumentationData() { #ifdef XPTI_ENABLE_INSTRUMENTATION if (!xptiTraceEnabled()) return; // Create a payload with the command name and an event using this payload to // emit a node_create - MAddress = MRequirement.MSYCLMemObj; + if (MRequirements.empty()) + return; + + Requirement &Req = *MRequirements.begin(); + + MAddress = Req.MSYCLMemObj; makeTraceEventProlog(MAddress); if (MFirstInstance) { @@ -1283,6 +1385,9 @@ static std::string cgTypeToString(detail::CG::CGTYPE Type) { case detail::CG::PREFETCH_USM: return "prefetch usm"; break; + case detail::CG::CODEPLAY_HOST_TASK: + return "host task"; + break; default: return "unknown"; break; @@ -1494,9 +1599,8 @@ void DispatchNativeKernel(void *Blob) { } cl_int ExecCGCommand::enqueueImp() { - std::vector EventImpls = - Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); - + waitForPreparedHostEvents(); + std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); RT::PiEvent &Event = MEvent->getHandleRef(); @@ -1520,6 +1624,7 @@ cl_int ExecCGCommand::enqueueImp() { Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; } case CG::CGTYPE::COPY_PTR_TO_ACC: { @@ -1553,6 +1658,7 @@ cl_int ExecCGCommand::enqueueImp() { ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(), MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange, ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; } case CG::CGTYPE::FILL: { @@ -1565,6 +1671,7 @@ cl_int ExecCGCommand::enqueueImp() { Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; } case CG::CGTYPE::RUN_ON_HOST_INTEL: { @@ -1594,6 +1701,7 @@ cl_int ExecCGCommand::enqueueImp() { Plugin.call(RawEvents.size(), &RawEvents[0]); } DispatchNativeKernel((void *)ArgsBlob.data()); + return CL_SUCCESS; } @@ -1651,6 +1759,7 @@ cl_int ExecCGCommand::enqueueImp() { } ExecKernel->MHostKernel->call(NDRDesc, getEvent()->getHostProfilingInfo()); + return CL_SUCCESS; } @@ -1732,18 +1841,21 @@ cl_int ExecCGCommand::enqueueImp() { return detail::enqueue_kernel_launch::handleError(Error, DeviceImpl, Kernel, NDRDesc); } + return PI_SUCCESS; } case CG::CGTYPE::COPY_USM: { CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get(); MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(), Copy->getDst(), std::move(RawEvents), Event); + return CL_SUCCESS; } case CG::CGTYPE::FILL_USM: { CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get(); MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(), Fill->getFill(), std::move(RawEvents), Event); + return CL_SUCCESS; } case CG::CGTYPE::PREFETCH_USM: { @@ -1751,9 +1863,10 @@ cl_int ExecCGCommand::enqueueImp() { MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue, Prefetch->getLength(), std::move(RawEvents), Event); + return CL_SUCCESS; } - case CG::CGTYPE::INTEROP_TASK_CODEPLAY: { + case CG::CGTYPE::CODEPLAY_INTEROP_TASK: { const detail::plugin &Plugin = MQueue->getPlugin(); CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get(); // Wait for dependencies to complete before dispatching work on the host @@ -1780,6 +1893,31 @@ cl_int ExecCGCommand::enqueueImp() { nullptr, &Event); Plugin.call( reinterpret_cast(MQueue->get())); + + return CL_SUCCESS; + } + case CG::CGTYPE::CODEPLAY_HOST_TASK: { + CGHostTask *HostTask = static_cast(MCommandGroup.get()); + + for (ArgDesc &Arg : HostTask->MArgs) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = static_cast(Arg.MPtr); + AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); + + Req->MData = AllocaCmd->getMemAllocation(); + break; + } + default: + throw runtime_error("Unsupported arg type", PI_INVALID_VALUE); + } + } + + MQueue->getThreadPool().submit( + std::move(DispatchHostTask(this))); + + MShouldCompleteEventIfPossible = false; + return CL_SUCCESS; } case CG::CGTYPE::NONE: diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 80feb92ce9560..37880f41d1623 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,7 @@ namespace detail { class queue_impl; class event_impl; class context_impl; +class DispatchHostTask; using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; @@ -35,6 +37,8 @@ class Command; class AllocaCommand; class AllocaCommandBase; class ReleaseCommand; +class ExecCGCommand; +class EmptyCommand; enum BlockingT { NON_BLOCKING = 0, BLOCKING }; @@ -95,7 +99,8 @@ class Command { MAP_MEM_OBJ, UNMAP_MEM_OBJ, UPDATE_REQUIREMENT, - EMPTY_TASK + EMPTY_TASK, + HOST_TASK }; Command(CommandType Type, QueueImplPtr Queue); @@ -166,14 +171,34 @@ class Command { virtual ~Command() = default; + const char *getBlockReason() const; + + virtual ContextImplPtr getContext() const; + protected: EventImplPtr MEvent; QueueImplPtr MQueue; - std::vector MDepsEvents; + + /// Dependency events prepared for waiting by backend. + /// See processDepEvent for details. + std::vector MPreparedDepsEvents; + std::vector MPreparedHostDepsEvents; void waitForEvents(QueueImplPtr Queue, std::vector &RawEvents, RT::PiEvent &Event); - std::vector prepareEvents(ContextImplPtr Context); + + void waitForPreparedHostEvents() const; + + /// Perform glueing of events from different contexts + /// \param DepEvent event this commands should depend on + /// \param Dep optional DepDesc to perform connection of events properly + /// + /// Glueing (i.e. connecting) will be performed if and only if DepEvent is + /// not from host context and its context doesn't match to context of this + /// command. Context of this command is fetched via getContext(). + /// + /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr. + void processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep); /// Private interface. Derived classes should implement this method. virtual cl_int enqueueImp() = 0; @@ -183,6 +208,8 @@ class Command { /// Mutex used to protect enqueueing from race conditions std::mutex MEnqueueMtx; + friend class DispatchHostTask; + public: /// Contains list of dependencies(edges) std::vector MDeps; @@ -193,7 +220,10 @@ class Command { /// Counts the number of memory objects this command is a leaf for. unsigned MLeafCounter = 0; - const char *MBlockReason = "Unknown"; + enum class BlockReason : int { HostAccessor = 0, HostTask }; + + // Only have reasonable value while MIsBlockable is true + BlockReason MBlockReason; /// Describes the status of the command. std::atomic MEnqueueStatus; @@ -224,23 +254,36 @@ class Command { bool MFirstInstance = false; /// Instance ID tracked for the command. uint64_t MInstanceID = 0; + + // This flag allows to control whether host event should be set complete + // after successfull enqueue of command. Event is considered as host event if + // either it's is_host() return true or there is no backend representation + // of event (i.e. getHandleRef() return reference to nullptr value). + // By default the flag is set to true due to most of host operations are + // synchronous. The only asynchronous operation currently is host-task. + bool MShouldCompleteEventIfPossible = true; }; /// The empty command does nothing during enqueue. The task can be used to /// implement lock in the graph, or to merge several nodes into one. class EmptyCommand : public Command { public: - EmptyCommand(QueueImplPtr Queue, Requirement Req); + EmptyCommand(QueueImplPtr Queue); void printDot(std::ostream &Stream) const final; - const Requirement *getRequirement() const final { return &MRequirement; } + const Requirement *getRequirement() const final { return &MRequirements[0]; } + void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd, + const Requirement *Req); void emitInstrumentationData(); private: - cl_int enqueueImp() final { return CL_SUCCESS; } + cl_int enqueueImp() final; - Requirement MRequirement; + // Employing deque here as it allows to push_back/emplace_back without + // invalidation of pointer or reference to stored data item regardless of + // iterator invalidation. + std::deque MRequirements; }; /// The release command enqueues release of a memory object instance allocated @@ -380,6 +423,7 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData(); + ContextImplPtr getContext() const override final; private: cl_int enqueueImp() final; @@ -402,6 +446,7 @@ class MemCpyCommandHost : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData(); + ContextImplPtr getContext() const override final; private: cl_int enqueueImp() final; @@ -424,12 +469,22 @@ class ExecCGCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData(); + detail::CG &getCG() const { return *MCommandGroup; } + + // MEmptyCmd one is only employed if this command refers to host-task. + // MEmptyCmd due to unreliable mechanism of lookup for single EmptyCommand + // amongst users of host-task-representing command. This unreliability roots + // in cleanup process. + EmptyCommand *MEmptyCmd = nullptr; + private: cl_int enqueueImp() final; AllocaCommandBase *getAllocaForReq(Requirement *Req); std::unique_ptr MCommandGroup; + + friend class Command; }; class UpdateHostRequirementCommand : public Command { diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 4d749d92e9952..684c9069f3ce1 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -129,7 +129,7 @@ MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) { MemObjRecord * Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, - Requirement *Req) { + const Requirement *Req) { SYCLMemObjI *MemObject = Req->MSYCLMemObj; MemObjRecord *Record = getMemObjRecord(MemObject); @@ -416,17 +416,8 @@ Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, Command *UpdateHostAccCmd = insertUpdateHostReqCmd(Record, Req, HostQueue); // Need empty command to be blocked until host accessor is destructed - EmptyCommand *EmptyCmd = new EmptyCommand(HostQueue, *Req); - EmptyCmd->addDep( - DepDesc{UpdateHostAccCmd, EmptyCmd->getRequirement(), HostAllocaCmd}); - UpdateHostAccCmd->addUser(EmptyCmd); - - EmptyCmd->MIsBlockable = true; - EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked; - EmptyCmd->MBlockReason = "A Buffer is locked by the host accessor"; - - updateLeaves({UpdateHostAccCmd}, Record, Req->MAccessMode); - addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode); + EmptyCommand *EmptyCmd = addEmptyCmd( + UpdateHostAccCmd, {Req}, HostQueue, Command::BlockReason::HostAccessor); Req->MBlockedCmd = EmptyCmd; @@ -455,7 +446,8 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( /// 2. New and examined commands has non-overlapping requirements -> can bypass /// 3. New and examined commands have different contexts -> cannot bypass std::set -Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req, +Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, + const Requirement *Req, const ContextImplPtr &Context) { std::set RetDeps; std::set Visited; @@ -522,8 +514,10 @@ DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd, // The function searches for the alloca command matching context and // requirement. -AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( - MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) { +AllocaCommandBase * +Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record, + const Requirement *Req, + const ContextImplPtr &Context) { auto IsSuitableAlloca = [&Context, Req](AllocaCommandBase *AllocaCmd) { bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); if (IsSuitableSubReq(Req)) { @@ -544,7 +538,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( // Note, creation of new allocation command can lead to the current context // (Record->MCurContext) change. AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( - MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue) { + MemObjRecord *Record, const Requirement *Req, QueueImplPtr Queue) { AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Queue->getContextImplPtr()); @@ -614,6 +608,15 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( } else { LinkedAllocaCmd->MIsActive = false; Record->MCurContext = Queue->getContextImplPtr(); + + std::set Deps = + findDepsForReq(Record, Req, Queue->getContextImplPtr()); + for (Command *Dep : Deps) { + AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd}); + Dep->addUser(AllocaCmd); + } + updateLeaves(Deps, Record, Req->MAccessMode); + addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode); } } } @@ -640,11 +643,50 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, } } +template +typename std::enable_if< + std::is_same::type, Requirement>::value, + EmptyCommand *>::type +Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector &Reqs, + const QueueImplPtr &Queue, + Command::BlockReason Reason) { + EmptyCommand *EmptyCmd = + new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue()); + + if (!EmptyCmd) + throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY); + + EmptyCmd->MIsBlockable = true; + EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked; + EmptyCmd->MBlockReason = Reason; + + for (T *Req : Reqs) { + MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req); + AllocaCommandBase *AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue); + EmptyCmd->addRequirement(Cmd, AllocaCmd, Req); + } + + Cmd->addUser(EmptyCmd); + + const std::vector &Deps = Cmd->MDeps; + for (const DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + + updateLeaves({Cmd}, Record, Req->MAccessMode); + addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode); + } + + return EmptyCmd; +} + Command * Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue) { const std::vector &Reqs = CommandGroup->MRequirements; const std::vector &Events = CommandGroup->MEvents; + const CG::CGTYPE CGType = CommandGroup->getType(); + std::unique_ptr NewCmd( new ExecCGCommand(std::move(CommandGroup), Queue)); if (!NewCmd) @@ -684,6 +726,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, // Set new command as user for dependencies and update leaves. // Node dependencies can be modified further when adding the node to leaves, // iterate over their copy. + // FIXME employ a reference here to eliminate copying of a vector std::vector Deps = NewCmd->MDeps; for (DepDesc &Dep : Deps) { Dep.MDepCommand->addUser(NewCmd.get()); @@ -698,8 +741,13 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, NewCmd->addDep(e); } + if (CGType == CG::CGTYPE::CODEPLAY_HOST_TASK) + NewCmd->MEmptyCmd = addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, + Queue, Command::BlockReason::HostTask); + if (MPrintOptionsArray[AfterAddCG]) printGraphAsDot("after_addCG"); + return NewCmd.release(); } @@ -725,14 +773,35 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { // Dependencies of the users will be cleaned up during the traversal for (Command *AllocaCmd : AllocaCommands) { Visited.insert(AllocaCmd); + for (Command *UserCmd : AllocaCmd->MUsers) - ToVisit.push(UserCmd); + // Linked alloca cmd may be in users of this alloca. We're not going to + // visit it. + if (UserCmd->getType() != Command::CommandType::ALLOCA) + ToVisit.push(UserCmd); + else + Visited.insert(UserCmd); + CmdsToDelete.push_back(AllocaCmd); // These commands will be deleted later, clear users now to avoid // updating them during edge removal AllocaCmd->MUsers.clear(); } + // Linked alloca's share dependencies. Unchain from deps linked alloca's. + // Any cmd of the alloca - linked_alloca may be used later on. + for (AllocaCommandBase *AllocaCmd : AllocaCommands) { + AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd; + + if (LinkedCmd) { + assert(Visited.count(LinkedCmd)); + + for (DepDesc &Dep : AllocaCmd->MDeps) + if (Dep.MDepCommand) + Dep.MDepCommand->MUsers.erase(AllocaCmd); + } + } + // Traverse the graph using BFS while (!ToVisit.empty()) { Command *Cmd = ToVisit.front(); @@ -742,7 +811,8 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { continue; for (Command *UserCmd : Cmd->MUsers) - ToVisit.push(UserCmd); + if (UserCmd->getType() != Command::CommandType::ALLOCA) + ToVisit.push(UserCmd); // Delete all dependencies on any allocations being removed // Track which commands should have their users updated @@ -820,6 +890,7 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) { DepCmd->MUsers.erase(Cmd); } Cmd->getEvent()->setCommand(nullptr); + delete Cmd; } } @@ -833,6 +904,104 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) { MemObject->MRecord.reset(); } +// Make Cmd depend on DepEvent from different context. Connection is performed +// via distinct ConnectCmd with host task command group on host queue. Cmd will +// depend on ConnectCmd's host event. +// DepEvent may not have a command associated with it in at least two cases: +// - the command was deleted upon cleanup process; +// - DepEvent is user event. +// In both of these cases the only thing we can do is to make ConnectCmd depend +// on DepEvent. +// Otherwise, when there is a command associated with DepEvent, we make +// ConnectCmd depend on on this command. If there is valid, i.e. non-nil, +// requirement in Dep we make ConnectCmd depend on DepEvent's command with this +// requirement. +// Optionality of Dep is set by Dep.MDepCommand equal to nullptr. +void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, + EventImplPtr DepEvent, + const DepDesc &Dep) { + const ContextImplPtr &Context = Cmd->getContext(); + const ContextImplPtr &DepEventContext = DepEvent->getContextImpl(); + + assert(Context != DepEventContext); + + // construct Host Task type command manually and make it depend on DepEvent + ExecCGCommand *ConnectCmd = nullptr; + + { + std::unique_ptr HT(new detail::HostTask); + std::unique_ptr ConnectCG(new detail::CGHostTask( + std::move(HT), /* Args = */ {}, /* ArgsStorage = */ {}, + /* AccStorage = */ {}, /* SharedPtrStorage = */ {}, + /* Requirements = */ {}, /* DepEvents = */ {DepEvent}, + CG::CODEPLAY_HOST_TASK, /* Payload */ {})); + ConnectCmd = new ExecCGCommand( + std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue()); + } + + if (!ConnectCmd) + throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY); + + if (Command *DepCmd = reinterpret_cast(DepEvent->getCommand())) + DepCmd->addUser(ConnectCmd); + + EmptyCommand *EmptyCmd = nullptr; + + if (Dep.MDepRequirement) { + // make ConnectCmd depend on requirement + ConnectCmd->addDep(Dep); + assert(reinterpret_cast(DepEvent->getCommand()) == + Dep.MDepCommand); + // add user to Dep.MDepCommand is already performed beyond this if branch + + MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj); + + updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode); + addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode); + + const std::vector Reqs(1, Dep.MDepRequirement); + EmptyCmd = addEmptyCmd(ConnectCmd, Reqs, + Scheduler::getInstance().getDefaultHostQueue(), + Command::BlockReason::HostTask); + // Dependencies for EmptyCmd are set in addEmptyCmd for provided Reqs. + + // Depend Cmd on empty command + { + DepDesc CmdDep = Dep; + CmdDep.MDepCommand = EmptyCmd; + + Cmd->addDep(CmdDep); + } + } else { + EmptyCmd = addEmptyCmd( + ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(), + Command::BlockReason::HostTask); + + // There is no requirement thus, empty command will only depend on + // ConnectCmd via its event. + EmptyCmd->addDep(ConnectCmd->getEvent()); + ConnectCmd->addDep(DepEvent); + + // Depend Cmd on empty command + Cmd->addDep(EmptyCmd->getEvent()); + } + + EmptyCmd->addUser(Cmd); + + ConnectCmd->MEmptyCmd = EmptyCmd; + + // FIXME graph builder shouldn't really enqueue commands. We're in the middle + // of enqueue process for some command Cmd. We're going to add a dependency + // for it. Need some nice and cute solution to enqueue ConnectCmd via standard + // scheduler/graph processor mechanisms. + // Though, we need this call to enqueue to launch ConnectCmd. + EnqueueResultT Res; + bool Enqueued = Scheduler::GraphProcessor::enqueueCommand(ConnectCmd, Res); + if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) + throw runtime_error("Failed to enqueue a sync event between two contexts", + PI_INVALID_OPERATION); +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index 08c8a67431376..aa72388a70a12 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -45,11 +45,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event) { // TODO: Reschedule commands. throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); - RT::PiEvent &CLEvent = Cmd->getEvent()->getHandleRef(); - if (CLEvent) { - const detail::plugin &Plugin = Event->getPlugin(); - Plugin.call(1, &CLEvent); - } + Cmd->getEvent()->waitInternal(); } bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd, diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 9c16eafb26ace..2cac4499e16d8 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -73,9 +73,16 @@ EventImplPtr Scheduler::addCG(std::unique_ptr CommandGroup, NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), DefaultHostQueue); break; + case CG::CODEPLAY_HOST_TASK: + NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue); + break; default: NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue)); } + } + + { + std::shared_lock Lock(MGraphLock); // TODO: Check if lazy mode. EnqueueResultT Res; @@ -153,6 +160,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { if (!Record) // No operations were performed on the mem object return; + waitForRecordToFinish(Record); MGraphBuilder.decrementLeafCountersForRecord(Record); MGraphBuilder.cleanupCommandsForRecord(Record); @@ -175,8 +183,19 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req, } void Scheduler::releaseHostAccessor(Requirement *Req) { - Req->MBlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + Command *const BlockedCmd = Req->MBlockedCmd; + std::shared_lock Lock(MGraphLock); + + assert(BlockedCmd && "Can't find appropriate command to unblock"); + + BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; + + enqueueLeavesOfReqUnlocked(Req); +} + +// static +void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) { MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get(); auto EnqueueLeaves = [](CircularBuffer &Leaves) { for (Command *Cmd : Leaves) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index a86755a8406e8..b9bb2e8ef02b9 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include /// \defgroup sycl_graph DPC++ Execution Graph @@ -172,6 +173,7 @@ namespace detail { class queue_impl; class event_impl; class context_impl; +class DispatchHostTask; using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; @@ -428,6 +430,8 @@ class Scheduler { Scheduler(); static Scheduler instance; + static void enqueueLeavesOfReqUnlocked(const Requirement *const Req); + /// Graph builder class. /// /// The graph builder provides means to change an existing graph (e.g. add @@ -489,7 +493,7 @@ class Scheduler { /// \return a pointer to MemObjRecord for pointer to memory object. If the /// record is not found, nullptr is returned. MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue, - Requirement *Req); + const Requirement *Req); /// Decrements leaf counters for all leaves of the record. void decrementLeafCountersForRecord(MemObjRecord *Record); @@ -508,6 +512,15 @@ class Scheduler { void updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode); + /// Perform connection of events in multiple contexts + /// \param Cmd dependant command + /// \param DepEvent event to depend on + /// \param Dep optional DepDesc to perform connection properly + /// + /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr. + void connectDepEvent(Command *const Cmd, EventImplPtr DepEvent, + const DepDesc &Dep); + std::vector MMemObjs; private: @@ -533,21 +546,34 @@ class Scheduler { const QueueImplPtr &Queue); /// Finds dependencies for the requirement. - std::set findDepsForReq(MemObjRecord *Record, Requirement *Req, + std::set findDepsForReq(MemObjRecord *Record, + const Requirement *Req, const ContextImplPtr &Context); + template + typename std::enable_if< + std::is_same::type, Requirement>::value, + EmptyCommand *>::type + addEmptyCmd(Command *Cmd, const std::vector &Req, + const QueueImplPtr &Queue, Command::BlockReason Reason); + + protected: /// Finds a command dependency corresponding to the record. DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record); /// Searches for suitable alloca in memory record. - AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req, + AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, + const Requirement *Req, const ContextImplPtr &Context); + friend class Command; + + private: /// Searches for suitable alloca in memory record. /// /// If none found, creates new one. AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record, - Requirement *Req, + const Requirement *Req, QueueImplPtr Queue); void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req); @@ -664,6 +690,9 @@ class Scheduler { std::shared_timed_mutex MGraphLock; QueueImplPtr DefaultHostQueue; + + friend class Command; + friend class DispatchHostTask; }; } // namespace detail diff --git a/sycl/source/detail/thread_pool.hpp b/sycl/source/detail/thread_pool.hpp new file mode 100644 index 0000000000000..2d7873748bcc9 --- /dev/null +++ b/sycl/source/detail/thread_pool.hpp @@ -0,0 +1,99 @@ +//===-- thread_pool.hpp - Simple thread pool --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { + +class ThreadPool { + std::vector MLaunchedThreads; + + size_t MThreadCount; + std::queue> MJobQueue; + std::mutex MJobQueueMutex; + std::condition_variable MDoSmthOrStop; + std::atomic_bool MStop; + + void worker() { + std::unique_lock Lock(MJobQueueMutex); + + while (true) { + MDoSmthOrStop.wait( + Lock, [this]() { return !MJobQueue.empty() || MStop.load(); }); + + if (MStop.load()) + break; + + std::function Job = std::move(MJobQueue.front()); + MJobQueue.pop(); + Lock.unlock(); + + Job(); + + Lock.lock(); + } + } + +public: + ThreadPool(unsigned int ThreadCount = 1) : MThreadCount(ThreadCount) {} + + ~ThreadPool() { finishAndWait(); } + + void start() { + MLaunchedThreads.reserve(MThreadCount); + + MStop.store(false); + + for (size_t Idx = 0; Idx < MThreadCount; ++Idx) + MLaunchedThreads.emplace_back([this] { worker(); }); + } + + void finishAndWait() { + MStop.store(true); + + MDoSmthOrStop.notify_all(); + + for (std::thread &Thread : MLaunchedThreads) + if (Thread.joinable()) + Thread.join(); + } + + template void submit(T &&Func) { + { + std::lock_guard Lock(MJobQueueMutex); + MJobQueue.emplace([F = std::move(Func)]() { F(); }); + } + + MDoSmthOrStop.notify_one(); + } + + void submit(std::function &&Func) { + { + std::lock_guard Lock(MJobQueueMutex); + MJobQueue.emplace(Func); + } + + MDoSmthOrStop.notify_one(); + } +}; + +} // namespace detail +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 6d018f6d3b122..195cc9c1fc072 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -39,7 +39,7 @@ event handler::finalize() { MCodeLoc)); break; } - case detail::CG::INTEROP_TASK_CODEPLAY: + case detail::CG::CODEPLAY_INTEROP_TASK: CommandGroup.reset(new detail::CGInteropTask( std::move(MInteropTask), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), @@ -83,6 +83,12 @@ event handler::finalize() { std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), MCodeLoc)); break; + case detail::CG::CODEPLAY_HOST_TASK: + CommandGroup.reset(new detail::CGHostTask( + std::move(MHostTask), std::move(MArgs), std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); + break; case detail::CG::NONE: throw runtime_error("Command group submitted without a kernel or a " "explicit memory operation.", diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index bdb730b2c2152..ffbbbe6e38be3 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -10,6 +10,9 @@ set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/") set(RT_TEST_ARGS ${RT_TEST_ARGS} "-v") set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v -D SYCL_TOOLS_DIR=${CMAKE_INSTALL_PREFIX}/bin -D SYCL_LIBS_DIR=${CMAKE_INSTALL_PREFIX}/lib${LLVM_LIBDIR_SUFFIX} -D SYCL_INCLUDE=${SYCL_INCLUDE_DEPLOY_DIR}") +find_package(Threads REQUIRED) +set(SYCL_THREADS_LIB ${CMAKE_THREAD_LIBS_INIT}) + configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index 49887124183a4..65ad8670db1fc 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -31,6 +31,7 @@ // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0 // CHECK-NEXT: FieldDecl {{.*}} MPattern 'vector_class':'std::vector>' // CHECK-NEXT: FieldDecl {{.*}} MHostKernel 'unique_ptr_class':'std::unique_ptr>' +// CHECK-NEXT: FieldDecl {{.*}} MHostTask 'unique_ptr_class':'std::unique_ptr>' // CHECK-NEXT: FieldDecl {{.*}} MOSModuleHandle 'detail::OSModuleHandle':'long' // CHECK-NEXT: FieldDecl {{.*}} MInteropTask 'std::unique_ptr':'std::unique_ptr>' // CHECK-NEXT: FieldDecl {{.*}} MEvents 'vector_class':'std::vector, std::allocator>>' diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp index 5b595a86978ea..9380fd1526de1 100644 --- a/sycl/test/abi/symbol_size.cpp +++ b/sycl/test/abi/symbol_size.cpp @@ -44,9 +44,9 @@ int main() { check_size(); check_size(); #ifdef _MSC_VER - check_size(); -#else check_size(); +#else + check_size(); #endif check_size, 16>(); check_size(); diff --git a/sycl/test/host-interop-task/host-task-dependency.cpp b/sycl/test/host-interop-task/host-task-dependency.cpp new file mode 100644 index 0000000000000..942b79165aebd --- /dev/null +++ b/sycl/test/host-interop-task/host-task-dependency.cpp @@ -0,0 +1,196 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib +// RUN: %CPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: %ACC_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %ACC_CHECK_PLACEHOLDER + +#include +#include +#include +#include +#include + +#include + +namespace S = cl::sycl; + +struct Context { + std::atomic_bool Flag; + S::queue &Queue; + S::buffer Buf1; + S::buffer Buf2; + S::buffer Buf3; + std::mutex Mutex; + std::condition_variable CV; +}; + +void Thread1Fn(Context *Ctx) { + // 0. initialize resulting buffer with apriori wrong result + { + S::accessor + Acc(Ctx->Buf1); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -1; + } + + { + S::accessor + Acc(Ctx->Buf2); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -2; + } + + { + S::accessor + Acc(Ctx->Buf3); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) + Acc[Idx] = -3; + } + + // 1. submit task writing to buffer 1 + Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + GeneratorAcc(Ctx->Buf1, CGH); + + auto GeneratorKernel = [GeneratorAcc] { + for (size_t Idx = 0; Idx < GeneratorAcc.get_count(); ++Idx) + GeneratorAcc[Idx] = Idx; + }; + + CGH.single_task(GeneratorKernel); + }); + + // 2. submit host task writing from buf 1 to buf 2 + auto HostTaskEvent = Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + CopierSrcAcc(Ctx->Buf1, CGH); + S::accessor + CopierDstAcc(Ctx->Buf2, CGH); + + auto CopierHostTask = [CopierSrcAcc, CopierDstAcc, &Ctx] { + for (size_t Idx = 0; Idx < CopierDstAcc.get_count(); ++Idx) + CopierDstAcc[Idx] = CopierSrcAcc[Idx]; + + bool Expected = false; + bool Desired = true; + assert(Ctx->Flag.compare_exchange_strong(Expected, Desired)); + + { + std::lock_guard Lock(Ctx->Mutex); + Ctx->CV.notify_all(); + } + }; + + CGH.codeplay_host_task(CopierHostTask); + }); + + // 3. submit simple task to move data between two buffers + Ctx->Queue.submit([&](S::handler &CGH) { + S::accessor + SrcAcc(Ctx->Buf2, CGH); + S::accessor + DstAcc(Ctx->Buf3, CGH); + + CGH.depends_on(HostTaskEvent); + + auto CopierKernel = [SrcAcc, DstAcc] { + for (size_t Idx = 0; Idx < DstAcc.get_count(); ++Idx) + DstAcc[Idx] = SrcAcc[Idx]; + }; + + CGH.single_task(CopierKernel); + }); + + // 4. check data in buffer #3 + { + S::accessor + Acc(Ctx->Buf3); + + bool Failure = false; + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + fprintf(stderr, "Third buffer [%3zu] = %i\n", Idx, Acc[Idx]); + + Failure |= (Acc[Idx] != Idx); + } + + assert(!Failure && "Invalid data in third buffer"); + } +} + +void Thread2Fn(Context *Ctx) { + std::unique_lock Lock(Ctx->Mutex); + + // T2.1. Wait until flag F is set eq true. + Ctx->CV.wait(Lock, [Ctx] { return Ctx->Flag.load(); }); + + assert(Ctx->Flag.load()); +} + +void test() { + auto EH = [](S::exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } + }; + + S::queue Queue(EH); + + Context Ctx{{false}, Queue, {10}, {10}, {10}, {}, {}}; + + // 0. setup: thread 1 T1: exec smth; thread 2 T2: waits; init flag F = false + auto A1 = std::async(std::launch::async, Thread1Fn, &Ctx); + auto A2 = std::async(std::launch::async, Thread2Fn, &Ctx); + + A1.get(); + A2.get(); + + assert(Ctx.Flag.load()); + + // 3. check via host accessor that buf 2 contains valid data + { + S::accessor + ResultAcc(Ctx.Buf2); + + bool Failure = false; + for (size_t Idx = 0; Idx < ResultAcc.get_count(); ++Idx) { + fprintf(stderr, "Second buffer [%3zu] = %i\n", Idx, ResultAcc[Idx]); + + Failure |= (ResultAcc[Idx] != Idx); + } + + assert(!Failure && "Invalid data in result buffer"); + } +} + +int main() { + test(); + + return 0; +} + +// launch of GeneratorTask kernel +// CHECK:---> piKernelCreate( +// CHECK: GeneratorTask +// CHECK:---> piEnqueueKernelLaunch( +// prepare for host task +// CHECK:---> piEnqueueMemBufferMap( +// launch of CopierTask kernel +// CHECK:---> piKernelCreate( +// CHECK: CopierTask +// CHECK:---> piEnqueueKernelLaunch( +// TODO need to check for piEventsWait as "wait on dependencies of host task". +// At the same time this piEventsWait may occur anywhere after +// piEnqueueMemBufferMap ("prepare for host task"). diff --git a/sycl/test/host-interop-task/host-task-two-queues.cpp b/sycl/test/host-interop-task/host-task-two-queues.cpp new file mode 100644 index 0000000000000..55134149be36b --- /dev/null +++ b/sycl/test/host-interop-task/host-task-two-queues.cpp @@ -0,0 +1,78 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include +#include + +namespace S = cl::sycl; + +#define WIDTH 5 +#define HEIGHT 5 + +void test() { + auto EH = [](S::exception_list EL) { + for (const std::exception_ptr &E : EL) { + throw E; + } + }; + + S::queue Q1(EH); + S::queue Q2(EH); + + std::vector DataA(WIDTH * HEIGHT, 2); + std::vector DataB(WIDTH * HEIGHT, 3); + std::vector DataC(WIDTH * HEIGHT, 1); + + S::buffer BufA{DataA.data(), S::range<2>{WIDTH, HEIGHT}}; + S::buffer BufB{DataB.data(), S::range<2>{WIDTH, HEIGHT}}; + S::buffer BufC{DataC.data(), S::range<2>{WIDTH, HEIGHT}}; + + auto CG1 = [&](S::handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + auto AccC = BufC.get_access(CGH); + auto Kernel = [=](S::nd_item<2> Item) { + size_t W = Item.get_global_id(0); + size_t H = Item.get_global_id(1); + AccC[W][H] += AccA[W][H] * AccB[W][H]; + }; + CGH.parallel_for(S::nd_range<2>({WIDTH, HEIGHT}, {1, 1}), Kernel); + }; + + auto CG2 = [&](S::handler &CGH) { + auto AccA = BufA.get_access(CGH); + auto AccB = BufB.get_access(CGH); + auto AccC = BufC.get_access(CGH); + + CGH.codeplay_host_task([=] { + for (size_t I = 0; I < WIDTH; ++I) + for (size_t J = 0; J < HEIGHT; ++J) { + std::cout << "C[" << I << "][" << J << "] = " << AccC[I][J] + << std::endl; + } + }); + }; + + static const size_t NTIMES = 100; + + for (size_t Idx = 0; Idx < NTIMES; ++Idx) { + Q1.submit(CG1); + Q2.submit(CG2); + Q2.submit(CG1); + Q1.submit(CG2); + } + + Q1.wait_and_throw(); + Q2.wait_and_throw(); + + for (size_t I = 0; I < WIDTH; ++I) + for (size_t J = 0; J < HEIGHT; ++J) + assert(DataC[I * HEIGHT + J] == (1 + 2 * 3 * NTIMES * 2)); +} + +int main(void) { + test(); + return 0; +} diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index c63efd551473b..4baaeac0f1dd3 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -58,6 +58,7 @@ llvm_config.with_environment('PATH', config.sycl_tools_dir, append_path=True) +config.substitutions.append( ('%threads_lib', config.sycl_threads_lib) ) config.substitutions.append( ('%sycl_libs_dir', config.sycl_libs_dir ) ) config.substitutions.append( ('%sycl_include', config.sycl_include ) ) config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) ) diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in index 82281dd5cb654..a8d456eddd3b5 100644 --- a/sycl/test/lit.site.cfg.py.in +++ b/sycl/test/lit.site.cfg.py.in @@ -20,6 +20,7 @@ config.llvm_build_bin_dir = "@LLVM_BUILD_BINARY_DIRS@" config.llvm_enable_projects = "@LLVM_ENABLE_PROJECTS@" +config.sycl_threads_lib = '@SYCL_THREADS_LIB@' import lit.llvm lit.llvm.initialize(lit_config, config)