diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 23008e75b80fb..6ecf34d005bd3 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -356,11 +356,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; - auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); - return discard_or_return(Event); + return submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); #else - auto Event = submit_impl(CGF, CodeLoc); - return discard_or_return(Event); + return submit_impl(CGF, CodeLoc); #endif // __SYCL_USE_FALLBACK_ASSERT } @@ -395,12 +393,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; - auto Event = - submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, PostProcess); - return discard_or_return(Event); + return submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, + PostProcess); #else - auto Event = submit_impl(CGF, SecondaryQueue, CodeLoc); - return discard_or_return(Event); + return submit_impl(CGF, SecondaryQueue, CodeLoc); #endif // __SYCL_USE_FALLBACK_ASSERT } @@ -2814,6 +2810,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// Checks if the event needs to be discarded and if so, discards it and /// returns a discarded event. Otherwise, it returns input event. + /// TODO: move to impl class in the next ABI Breaking window event discard_or_return(const event &Event); // Function to postprocess submitted command diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 4103653d236a7..222b06127207d 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -21,6 +21,8 @@ namespace sycl { inline namespace _V1 { using ContextImplPtr = std::shared_ptr; namespace detail { +// TODO: remove from public header files and implementation during the next ABI +// Breaking window. Not used any more. std::vector getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { std::vector Events; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b8785e12f0535..6fdfdd7a4d11c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,9 +26,19 @@ namespace sycl { inline namespace _V1 { namespace detail { - std::atomic queue_impl::MNextAvailableQueueID = 0; +static std::vector +getPIEvents(const std::vector &DepEvents) { + std::vector RetPiEvents; + for (const sycl::event &Event : DepEvents) { + const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event); + if (EventImpl->getHandleRef() != nullptr) + RetPiEvents.push_back(EventImpl->getHandleRef()); + } + return RetPiEvents; +} + template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; @@ -63,16 +73,25 @@ static event createDiscardedEvent() { const std::vector & queue_impl::getExtendDependencyList(const std::vector &DepEvents, - std::vector &MutableVec) { - if (isInOrder()) { - std::optional ExternalEvent = popExternalEvent(); - if (ExternalEvent) { - MutableVec = DepEvents; - MutableVec.push_back(*ExternalEvent); - return MutableVec; - } - } - return DepEvents; + std::vector &MutableVec, + std::unique_lock &QueueLock) { + if (!isInOrder()) + return DepEvents; + + QueueLock.lock(); + EventImplPtr ExtraEvent = + MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; + std::optional ExternalEvent = popExternalEvent(); + + if (!ExternalEvent && !ExtraEvent) + return DepEvents; + + MutableVec = DepEvents; + if (ExternalEvent) + MutableVec.push_back(*ExternalEvent); + if (ExtraEvent) + MutableVec.push_back(detail::createSyclObjFromImpl(ExtraEvent)); + return MutableVec; } event queue_impl::memset(const std::shared_ptr &Self, @@ -106,7 +125,7 @@ event queue_impl::memset(const std::shared_ptr &Self, } return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self, Count, Value); } @@ -152,21 +171,17 @@ event queue_impl::memcpy(const std::shared_ptr &Self, #endif // If we have a command graph set we need to capture the copy through normal // queue submission rather than execute the copy directly. - if (MGraph.lock()) { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.memcpy(Dest, Src, Count); - }, - Self, {}); - } + auto HandlerFunc = [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }; + if (MGraph.lock()) + return submitWithHandler(Self, DepEvents, HandlerFunc); + if ((!Src || !Dest) && Count != 0) { report(CodeLoc); throw runtime_error("NULL pointer argument in memory copy operation.", PI_ERROR_INVALID_VALUE); } return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, HandlerFunc, [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self, Count, Dest); } @@ -177,17 +192,12 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const std::vector &DepEvents) { // If we have a command graph set we need to capture the advise through normal // queue submission. - if (MGraph.lock()) { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.mem_advise(Ptr, Length, Advice); - }, - Self, {}); - } + auto HandlerFunc = [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); }; + if (MGraph.lock()) + return submitWithHandler(Self, DepEvents, HandlerFunc); return submitMemOpHelper( - Self, DepEvents, + Self, DepEvents, HandlerFunc, [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr, Self, Length, Advice); } @@ -198,6 +208,10 @@ event queue_impl::memcpyToDeviceGlobal( const std::vector &DepEvents) { return submitMemOpHelper( Self, DepEvents, + [&](handler &CGH) { + CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, + NumBytes, Offset); + }, [](const auto &...Args) { MemoryManager::copy_to_device_global(Args...); }, @@ -210,15 +224,25 @@ event queue_impl::memcpyFromDeviceGlobal( size_t Offset, const std::vector &DepEvents) { return submitMemOpHelper( Self, DepEvents, + [&](handler &CGH) { + CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, + NumBytes, Offset); + }, [](const auto &...Args) { MemoryManager::copy_from_device_global(Args...); }, DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest); } -event queue_impl::getLastEvent() const { - std::lock_guard Lock{MLastEventMtx}; - return MDiscardEvents ? createDiscardedEvent() : MLastEvent; +event queue_impl::getLastEvent() { + std::lock_guard Lock{MMutex}; + if (MDiscardEvents) + return createDiscardedEvent(); + if (!MGraph.expired() && MGraphLastEventPtr) + return detail::createSyclObjFromImpl(MGraphLastEventPtr); + if (!MLastEventPtr) + MLastEventPtr = std::make_shared(std::nullopt); + return detail::createSyclObjFromImpl(MLastEventPtr); } void queue_impl::addEvent(const event &Event) { @@ -273,55 +297,90 @@ void queue_impl::addSharedEvent(const event &Event) { MEventsShared.push_back(Event); } -template +static bool +areEventsSafeForSchedulerBypass(const std::vector &DepEvents, + ContextImplPtr Context) { + auto CheckEvent = [&Context](const sycl::event &Event) { + const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event); + // Events that don't have an initialized context are throwaway events that + // don't represent actual dependencies. Calling getContextImpl() would set + // their context, which we wish to avoid as it is expensive. + if (!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) { + return true; + } + if (SyclEventImplPtr->is_host()) { + return SyclEventImplPtr->isCompleted(); + } + // Cross-context dependencies can't be passed to the backend directly. + if (SyclEventImplPtr->getContextImpl() != Context) + return false; + + // A nullptr here means that the commmand does not produce a PI event or it + // hasn't been enqueued yet. + return SyclEventImplPtr->getHandleRef() != nullptr; + }; + + return std::all_of(DepEvents.begin(), DepEvents.end(), + [&Context, &CheckEvent](const sycl::event &Event) { + return CheckEvent(Event); + }); +} + +template +event queue_impl::submitWithHandler(const std::shared_ptr &Self, + const std::vector &DepEvents, + HandlerFuncT HandlerFunc) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + HandlerFunc(CGH); + }, + Self, {}); +} + +template event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, + HandlerFuncT HandlerFunc, MemOpFuncT MemOpFunc, MemOpArgTs... MemOpArgs) { - if (MHasDiscardEventsSupport) { - MemOpFunc(MemOpArgs..., getOrWaitEvents(DepEvents, MContext), - /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + // We need to submit command and update the last event under same lock if we + // have in-order queue. { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); - - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemOpFunc(MemOpArgs..., getOrWaitEvents(ExpandedDepEvents, MContext), - &EventImpl->getHandleRef(), EventImpl); - - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - - // When a queue is recorded by a graph, the dependencies are managed in the - // graph implementaton. Additionally, CG recorded for a graph are outside of - // the in-order queue execution sequence. Therefore, these CG must not - // update MLastEvent. - if (isInOrder() && (getCommandGraph() == nullptr)) { - MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); + + if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) { + if (MHasDiscardEventsSupport) { + MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), + /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); + return createDiscardedEvent(); + } + + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), + &EventImpl->getHandleRef(), EventImpl); + + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + + if (isInOrder()) { + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return discard_or_return(ResEvent); } } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + return submitWithHandler(Self, DepEvents, HandlerFunc); } void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, @@ -495,9 +554,10 @@ bool queue_impl::ext_oneapi_empty() const { // If we have in-order queue where events are not discarded then just check // the status of the last event. if (isInOrder() && !MDiscardEvents) { - std::lock_guard Lock(MLastEventMtx); - return MLastEvent.get_info() == - info::event_command_status::complete; + std::lock_guard Lock(MMutex); + return !MLastEventPtr || + MLastEventPtr->get_info() == + info::event_command_status::complete; } // Check the status of the backend queue if this is not a host queue. @@ -533,6 +593,12 @@ bool queue_impl::ext_oneapi_empty() const { return true; } +event queue_impl::discard_or_return(const event &Event) { + if (!(MDiscardEvents)) + return Event; + return createDiscardedEvent(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 5aab3fe1598ff..722acf925cf8d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -211,7 +211,7 @@ class queue_impl { #endif } - event getLastEvent() const; + event getLastEvent(); private: void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) { @@ -396,12 +396,14 @@ class queue_impl { const std::shared_ptr &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess = nullptr) { + event ResEvent; try { - return submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess); + ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess); } catch (...) { - return SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, Loc, - PostProcess); + ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, + Loc, PostProcess); } + return discard_or_return(ResEvent); } /// Submits a command group function object to the queue, in order to be @@ -416,7 +418,8 @@ class queue_impl { const std::shared_ptr &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess = nullptr) { - return submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess); + auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess); + return discard_or_return(ResEvent); } /// Performs a blocking wait for the completion of all enqueued tasks in the @@ -707,6 +710,7 @@ class queue_impl { std::shared_ptr Graph) { std::lock_guard Lock(MMutex); MGraph = Graph; + MGraphLastEventPtr = nullptr; } std::shared_ptr @@ -730,9 +734,11 @@ class queue_impl { const std::vector & getExtendDependencyList(const std::vector &DepEvents, - std::vector &MutableVec); + std::vector &MutableVec, + std::unique_lock &QueueLock); protected: + event discard_or_return(const event &Event); // Hook to the scheduler to clean up any fusion command held on destruction. void cleanup_fusion_cmd(); @@ -740,29 +746,22 @@ class queue_impl { template void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type, event &EventRet) { - // When a queue is recorded by a graph, the dependencies are managed in the - // graph implementaton. Additionally, CG recorded for a graph are outside of - // the in-order queue execution sequence. Therefore, these CG must not - // update MLastEvent. - if (MIsInorder && (getCommandGraph() == nullptr)) { - - auto IsExpDepManaged = [](const CG::CGTYPE &Type) { - return Type == CG::CGTYPE::CodeplayHostTask; - }; - + if (MIsInorder) { // Accessing and changing of an event isn't atomic operation. - // Hence, here is are locks for thread-safety. - std::lock_guard LastEventLock{MLastEventMtx}; - - if (MLastCGType == CG::CGTYPE::None) - MLastCGType = Type; - // Also handles case when sync model changes. E.g. Last is host, new is - // kernel. - bool NeedSeparateDependencyMgmt = - IsExpDepManaged(Type) || IsExpDepManaged(MLastCGType); - - if (NeedSeparateDependencyMgmt) - Handler.depends_on(MLastEvent); + // Hence, here is the lock for thread-safety. + std::lock_guard Lock{MMutex}; + // This dependency is needed for the following purposes: + // - host tasks are handled by the runtime and cannot be implicitly + // synchronized by the backend. + // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked + // by a host task. This dependency allows to build the enqueue order in + // the RT but will not be passed to the backend. See getPIEvents in + // Command. + auto &EventToBuildDeps = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + if (EventToBuildDeps) + Handler.depends_on( + createSyclObjFromImpl(EventToBuildDeps)); // If there is an external event set, add it as a dependency and clear it. // We do not need to hold the lock as MLastEventMtx will ensure the last @@ -772,9 +771,7 @@ class queue_impl { Handler.depends_on(*ExternalEvent); EventRet = Handler.finalize(); - - MLastEvent = EventRet; - MLastCGType = Type; + EventToBuildDeps = getSyclObjImpl(EventRet); } else EventRet = Handler.finalize(); } @@ -844,20 +841,35 @@ class queue_impl { return Event; } - /// Performs direct submission of a memory operation. + /// Helper function for submitting a memory operation with a handler. + /// \param Self is a shared_ptr to this queue. + /// \param DepEvents is a vector of dependencies of the operation. + /// \param HandlerFunc is a function that submits the operation with a + /// handler. + template + event submitWithHandler(const std::shared_ptr &Self, + const std::vector &DepEvents, + HandlerFuncT HandlerFunc); + + /// Performs submission of a memory operation directly if scheduler can be + /// bypassed, or with a handler otherwise. /// /// \param Self is a shared_ptr to this queue. /// \param DepEvents is a vector of dependencies of the operation. - /// \param MemOpFunc is a function that forwards its arguments to the + /// \param HandlerFunc is a function that submits the operation with a + /// handler. + /// \param MemMngrFunc is a function that forwards its arguments to the /// appropriate memory manager function. - /// \param MemOpArgs are all the arguments that need to be passed to memory + /// \param MemMngrArgs are all the arguments that need to be passed to memory /// manager except the last three: dependencies, PI event and /// EventImplPtr are filled out by this helper. /// \return an event representing the submitted operation. - template + template event submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, - MemOpFuncT MemOpFunc, MemOpArgTs... MemOpArgs); + HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, + MemMngrArgTs... MemOpArgs); // When instrumentation is enabled emits trace event for wait begin and // returns the telemetry event generated for the wait @@ -911,13 +923,12 @@ class queue_impl { buffer MAssertHappenedBuffer; // This event is employed for enhanced dependency tracking with in-order queue - // Access to the event should be guarded with MLastEventMtx - event MLastEvent; - mutable std::mutex MLastEventMtx; - // Used for in-order queues in pair with MLastEvent - // Host tasks are explicitly synchronized in RT, pi tasks - implicitly by - // backend. Using type to setup explicit sync between host and pi tasks. - CG::CGTYPE MLastCGType = CG::CGTYPE::None; + // Access to the event should be guarded with MMutex + EventImplPtr MLastEventPtr; + // Same as above but for graph begin-end recording cycle. + // Track deps within graph commands separately. + // Protected by common queue object mutex MMutex. + EventImplPtr MGraphLastEventPtr; const bool MIsInorder; diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index f22913f990e9c..5ff184be64948 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -53,16 +53,6 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, BlockingT Blocking) { if (Cmd == RootCommand || Blocking) return true; - // Async kernel enqueue depending on host task is not compatible with in order - // queue. If we have host_task_1, kernel_2 depending on host_task_1 and - // kernel_3 without explicit dependencies submitted to in order queue: host - // task blocks kernel_2 from being enqueued while kernel_3 has no such - // dependencies so in current impl it could be enqueued earlier that kernel_2. - // That makes it impossible to use this path with blocking users for in order - // queue. - if (QueueImplPtr Queue = RootCommand->getEvent()->getSubmittedQueue(); - Queue && Queue->isInOrder()) - return true; { std::lock_guard Guard(Cmd->MBlockedUsersMutex); if (Cmd->isBlocking()) { diff --git a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp index d46fe6c80cc33..91171d6ce0dd1 100644 --- a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp +++ b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp @@ -54,7 +54,7 @@ int main() { // CHECK: Test2 // CHECK: ---> piEnqueueEventsWaitWithBarrier( - // CHECK: ZE ---> zeEventCreate + // CHECK: ZE ---> {{zeEventCreate|zeEventHostReset}} // CHECK: ZE ---> zeCommandListAppendWaitOnEvents // CHECK: ZE ---> zeCommandListAppendSignalEvent // CHECK: ) ---> pi_result : PI_SUCCESS diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index b8d45404c627d..ce7bb02e278b2 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -1199,9 +1199,11 @@ TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) { [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast); auto EventLastWaitList = EventLastImpl->getWaitList(); - // Previous task is not a host task. In Order queue dependency are managed by - // the backend for non-host kernels. - ASSERT_EQ(EventLastWaitList.size(), 0lu); + // Previous task is not a host task. Explicit dependency is still needed + // to properly handle blocked tasks (the event will be filtered out before + // submission to the backend). + ASSERT_EQ(EventLastWaitList.size(), 1lu); + ASSERT_EQ(EventLastWaitList[0], EventGraphImpl); } TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) { diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp index 0cce7dcd2ea92..a77995a203da3 100644 --- a/sycl/unittests/scheduler/AllocaLinking.cpp +++ b/sycl/unittests/scheduler/AllocaLinking.cpp @@ -46,6 +46,7 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device, } TEST_F(SchedulerTest, AllocaLinking) { + HostUnifiedMemory = false; // This host device constructor should be placed before Mock.redefine // because it overrides the real implementation of get_device_info // which is needed when creating a host device. diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 020a956537ddd..37bb5106d75b2 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -11,13 +11,14 @@ #include #include +#include #include #include #include -#include +#include using namespace sycl; @@ -32,6 +33,7 @@ inline pi_result redefinedEventsWait(pi_uint32 num_events, } TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { + GEventsWaitCounter = 0; sycl::unittest::PiMock Mock; sycl::platform Plt = Mock.getPlatform(); Mock.redefineBefore(redefinedEventsWait); @@ -39,16 +41,131 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + auto buf = sycl::malloc_shared(1, InOrderQueue); + event Evt = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); }); + InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) + .wait(); + + EXPECT_EQ(GEventsWaitCounter, 1u); +} + +enum class CommandType { KERNEL = 1, MEMSET = 2 }; +std::vector> ExecutedCommands; + +inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, + const size_t *, const size_t *, + const size_t *, + pi_uint32 EventsCount, + const pi_event *, pi_event *) { + ExecutedCommands.push_back({CommandType::KERNEL, EventsCount}); + return PI_SUCCESS; +} +inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, + pi_uint32 EventsCount, + const pi_event *, pi_event *) { + ExecutedCommands.push_back({CommandType::MEMSET, EventsCount}); + return PI_SUCCESS; +} + +TEST_F(SchedulerTest, InOrderQueueCrossDeps) { + ExecutedCommands.clear(); + sycl::unittest::PiMock Mock; + Mock.redefineBefore( + customEnqueueKernelLaunch); + Mock.redefineBefore( + customextUSMEnqueueMemset); + + sycl::platform Plt = Mock.getPlatform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + kernel_bundle KernelBundle = sycl::get_kernel_bundle(Ctx); auto ExecBundle = sycl::build(KernelBundle); - event Evt = InOrderQueue.submit([&](sycl::handler &CGH) { + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + + auto buf = sycl::malloc_shared(1, InOrderQueue); + + event Ev1 = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); }); + + event Ev2 = InOrderQueue.submit([&](sycl::handler &CGH) { CGH.use_kernel_bundle(ExecBundle); CGH.single_task>([] {}); }); - InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) - .wait(); - EXPECT_TRUE(GEventsWaitCounter == 1); + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { + ExecutedCommands.clear(); + sycl::unittest::PiMock Mock; + Mock.redefineBefore( + customEnqueueKernelLaunch); + Mock.redefineBefore( + customextUSMEnqueueMemset); + + sycl::platform Plt = Mock.getPlatform(); + if (Plt.is_host()) { + std::cout << "Not run due to host-only environment\n"; + GTEST_SKIP(); + } + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + + auto buf = sycl::malloc_shared(1, InOrderQueue); + + event Ev1 = InOrderQueue.memset(buf, 0, sizeof(buf[0])); + + event Ev2 = InOrderQueue.single_task>([] {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +} \ No newline at end of file diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index ca1e7d664bf58..4b50791981258 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -61,14 +61,17 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // host | yes - always, separate sync management // host | yes - always, separate sync management // kernel | yes - change of sync approach - // kernel | no - sync between pi calls must be done by backend + // kernel | yes - sync between pi calls must be done by backend, but we + // still add dependency to handle the right order due to host task. This + // dependency will not be sent to backend. It is checked in + // SchedulerTest.InOrderQueueCrossDeps // host | yes - always, separate sync management sycl::event Event; // host task { LimitedHandlerSimulation MockCGH; - EXPECT_CALL(MockCGH, depends_on).Times(1); + EXPECT_CALL(MockCGH, depends_on).Times(0); Queue->finalizeHandler( MockCGH, detail::CG::CGTYPE::CodeplayHostTask, Event); } @@ -89,7 +92,7 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // kernel task { LimitedHandlerSimulation MockCGH; - EXPECT_CALL(MockCGH, depends_on).Times(0); + EXPECT_CALL(MockCGH, depends_on).Times(1); Queue->finalizeHandler( MockCGH, detail::CG::CGTYPE::Kernel, Event); }