diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 21dc5c4923b4a..89e2f59db0ba3 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -322,37 +322,6 @@ void queue_impl::addSharedEvent(const event &Event) {
   MEventsShared.push_back(Event);
 }

-static bool
-areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
-                                ContextImplPtr Context) {
-  auto CheckEvent = [&Context](const sycl::event &Event) {
-    const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
-    // Events that don't have an initialized context are throwaway events that
-    // don't represent actual dependencies. Calling getContextImpl() would set
-    // their context, which we wish to avoid as it is expensive.
-    // NOP events also don't represent actual dependencies.
-    if ((!SyclEventImplPtr->isContextInitialized() &&
-         !SyclEventImplPtr->is_host()) ||
-        SyclEventImplPtr->isNOP()) {
-      return true;
-    }
-    if (SyclEventImplPtr->is_host()) {
-      return SyclEventImplPtr->isCompleted();
-    }
-    // Cross-context dependencies can't be passed to the backend directly.
-    if (SyclEventImplPtr->getContextImpl() != Context)
-      return false;
-
-    // A nullptr here means that the commmand does not produce a PI event or it
-    // hasn't been enqueued yet.
-    return SyclEventImplPtr->getHandleRef() != nullptr;
-  };
-
-  return std::all_of(
-      DepEvents.begin(), DepEvents.end(),
-      [&CheckEvent](const sycl::event &Event) { return CheckEvent(Event); });
-}
-
 template <typename HandlerFuncT>
 event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
                                     const std::vector<event> &DepEvents,
@@ -382,8 +351,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,

   // If we have a command graph set we need to capture the op through the
   // handler rather than by-passing the scheduler.
-  if (MGraph.expired() &&
-      areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
+  if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
+                              ExpandedDepEvents, MContext)) {
     if (MSupportsDiscardingPiEvents) {
       MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
                 /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 4dfe4bd39d4d1..231437cd67f19 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -783,7 +783,6 @@ class queue_impl {
     EventRet = Handler.finalize();
   }

-protected:
   /// Performs command group submission to the queue.
   ///
   /// \param CGF is a function object containing command group.
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 9f25108db6d52..4bf64cfcccbaf 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -716,6 +716,48 @@ EventImplPtr Scheduler::addCommandGraphUpdate(
   return NewCmdEvent;
 }

+bool CheckEventReadiness(const ContextImplPtr &Context,
+                         const EventImplPtr &SyclEventImplPtr) {
+  // Events that don't have an initialized context are throwaway events that
+  // don't represent actual dependencies. Calling getContextImpl() would set
+  // their context, which we wish to avoid as it is expensive.
+  // NOP events also don't represent actual dependencies.
+  if ((!SyclEventImplPtr->isContextInitialized() &&
+       !SyclEventImplPtr->is_host()) ||
+      SyclEventImplPtr->isNOP()) {
+    return true;
+  }
+  if (SyclEventImplPtr->is_host()) {
+    return SyclEventImplPtr->isCompleted();
+  }
+  // Cross-context dependencies can't be passed to the backend directly.
+  if (SyclEventImplPtr->getContextImpl() != Context)
+    return false;
+
+  // A nullptr here means that the command does not produce a PI event or it
+  // hasn't been enqueued yet.
+  return SyclEventImplPtr->getHandleRef() != nullptr;
+}
+
+bool Scheduler::areEventsSafeForSchedulerBypass(
+    const std::vector<sycl::event> &DepEvents, ContextImplPtr Context) {
+
+  return std::all_of(
+      DepEvents.begin(), DepEvents.end(), [&Context](const sycl::event &Event) {
+        const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
+        return CheckEventReadiness(Context, SyclEventImplPtr);
+      });
+}
+
+bool Scheduler::areEventsSafeForSchedulerBypass(
+    const std::vector<EventImplPtr> &DepEvents, ContextImplPtr Context) {
+
+  return std::all_of(DepEvents.begin(), DepEvents.end(),
+                     [&Context](const EventImplPtr &SyclEventImplPtr) {
+                       return CheckEventReadiness(Context, SyclEventImplPtr);
+                     });
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 8e619971fbc04..09437928f1d32 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -491,6 +491,13 @@ class Scheduler {
       const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
       std::vector<detail::EventImplPtr> &Events);

+  static bool
+  areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
+                                  ContextImplPtr Context);
+  static bool
+  areEventsSafeForSchedulerBypass(const std::vector<EventImplPtr> &DepEvents,
+                                  ContextImplPtr Context);
+
 protected:
   using RWLockT = std::shared_timed_mutex;
   using ReadLockT = std::shared_lock<RWLockT>;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 17cde0994b314..c2171f53ac999 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -241,10 +241,12 @@ event handler::finalize() {
   }

   if (MQueue && !MGraph && !MSubgraphNode && !MQueue->getCommandGraph() &&
-      !MQueue->is_in_fusion_mode() &&
-      CGData.MRequirements.size() + CGData.MEvents.size() +
-              MStreamStorage.size() ==
-          0) {
+      !MQueue->is_in_fusion_mode() && !CGData.MRequirements.size() &&
+      !MStreamStorage.size() &&
+      (!CGData.MEvents.size() ||
+       (MQueue->isInOrder() &&
+        detail::Scheduler::areEventsSafeForSchedulerBypass(
+            CGData.MEvents, MQueue->getContextImplPtr())))) {
     // if user does not add a new dependency to the dependency graph, i.e.
     // the graph is not changed, and the queue is not in fusion mode, then
     // this faster path is used to submit kernel bypassing scheduler and
diff --git a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
index 6c8953c9e5d0c..59d11d861dd2b 100644
--- a/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
@@ -312,74 +312,95 @@ TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) {
 }

 TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
-  sycl::property_list Properties{sycl::property::queue::in_order()};
-  sycl::queue InOrderQueue{Dev, Properties};
-  experimental::command_graph<experimental::graph_state::modifiable>
-      InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
-
-  auto EventInitial =
-      InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
-  auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
-
-  // Record in-order queue with three nodes.
-  InOrderGraph.begin_recording(InOrderQueue);
-  auto Node1Graph = InOrderQueue.submit(
-      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
-
-  auto PtrNode1 =
-      sycl::detail::getSyclObjImpl(InOrderGraph)
-          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
-  ASSERT_NE(PtrNode1, nullptr);
-  ASSERT_TRUE(PtrNode1->MPredecessors.empty());
-
-  auto Node2Graph = InOrderQueue.submit(
-      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
-
-  auto PtrNode2 =
-      sycl::detail::getSyclObjImpl(InOrderGraph)
-          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
-  ASSERT_NE(PtrNode2, nullptr);
-  ASSERT_NE(PtrNode2, PtrNode1);
-  ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
-  ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
-  ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
-  ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
-
-  auto Node3Graph = InOrderQueue.submit(
-      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
-
-  auto PtrNode3 =
-      sycl::detail::getSyclObjImpl(InOrderGraph)
-          ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
-  ASSERT_NE(PtrNode3, nullptr);
-  ASSERT_NE(PtrNode3, PtrNode2);
-  ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
-  ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
-  ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
-  ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
-
-  InOrderGraph.end_recording(InOrderQueue);
-
-  auto InOrderGraphExec = InOrderGraph.finalize();
-  auto EventGraph = InOrderQueue.submit(
-      [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
-
-  auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
-  auto EventGraphWaitList = EventGraphImpl->getWaitList();
-  // Previous task is a host task. Explicit dependency is needed to enforce the
-  // execution order.
-  ASSERT_EQ(EventGraphWaitList.size(), 1lu);
-  ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
-
-  auto EventLast = InOrderQueue.submit(
-      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
-  auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
-  auto EventLastWaitList = EventLastImpl->getWaitList();
-  // Previous task is not a host task. Explicit dependency is still needed
-  // to properly handle blocked tasks (the event will be filtered out before
-  // submission to the backend).
-  ASSERT_EQ(EventLastWaitList.size(), 1lu);
-  ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
+  auto TestBody = [&](bool BlockHostTask) {
+    sycl::property_list Properties{sycl::property::queue::in_order()};
+    sycl::queue InOrderQueue{Dev, Properties};
+    experimental::command_graph<experimental::graph_state::modifiable>
+        InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
+    // Which event dependencies get built depends on whether the host task has
+    // completed. Use a mutex to make the host task's completion deterministic.
+    std::mutex HostTaskMutex;
+    std::unique_lock<std::mutex> Lock(HostTaskMutex, std::defer_lock);
+    if (BlockHostTask)
+      Lock.lock();
+    auto EventInitial = InOrderQueue.submit([&](handler &CGH) {
+      CGH.host_task([&HostTaskMutex]() {
+        std::lock_guard<std::mutex> HostTaskLock(HostTaskMutex);
+      });
+    });
+    auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
+
+    // Record in-order queue with three nodes.
+    InOrderGraph.begin_recording(InOrderQueue);
+    auto Node1Graph = InOrderQueue.submit(
+        [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+    auto PtrNode1 =
+        sycl::detail::getSyclObjImpl(InOrderGraph)
+            ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+    ASSERT_NE(PtrNode1, nullptr);
+    ASSERT_TRUE(PtrNode1->MPredecessors.empty());
+
+    auto Node2Graph = InOrderQueue.submit(
+        [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+    auto PtrNode2 =
+        sycl::detail::getSyclObjImpl(InOrderGraph)
+            ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+    ASSERT_NE(PtrNode2, nullptr);
+    ASSERT_NE(PtrNode2, PtrNode1);
+    ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
+    ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
+    ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
+    ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
+
+    auto Node3Graph = InOrderQueue.submit(
+        [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+
+    auto PtrNode3 =
+        sycl::detail::getSyclObjImpl(InOrderGraph)
+            ->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
+    ASSERT_NE(PtrNode3, nullptr);
+    ASSERT_NE(PtrNode3, PtrNode2);
+    ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
+    ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
+    ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
+    ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
+
+    InOrderGraph.end_recording(InOrderQueue);
+
+    auto InOrderGraphExec = InOrderGraph.finalize();
+
+    if (!BlockHostTask)
+      EventInitial.wait();
+    auto EventGraph = InOrderQueue.submit(
+        [&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
+
+    auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
+    auto EventGraphWaitList = EventGraphImpl->getWaitList();
+    // Previous task is a host task. An explicit dependency is needed to
+    // enforce the execution order.
+    ASSERT_EQ(EventGraphWaitList.size(), 1lu);
+    ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
+
+    auto EventLast = InOrderQueue.submit(
+        [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+    auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
+    auto EventLastWaitList = EventLastImpl->getWaitList();
+    // The previous task is not a host task. An explicit dependency is only
+    // needed while the graph event is still blocked by the host task;
+    // otherwise the event is filtered out before submission to the backend.
+    if (BlockHostTask)
+      Lock.unlock();
+    ASSERT_EQ(EventLastWaitList.size(), size_t(BlockHostTask));
+    if (EventLastWaitList.size()) {
+      ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
+    }
+    EventLast.wait();
+  };
+
+  TestBody(false);
+  TestBody(true);
 }

 TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {
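Note (not part of the patch): the standalone sketch below illustrates the fast path this change enables from the user's point of view. It is a minimal example under the assumption that the patch is applied; the bypass is an internal optimization, so the observable behavior of this program is identical either way.

// sketch.cpp -- illustrative only; build with a DPC++ compiler (icpx -fsycl).
#include <sycl/sycl.hpp>

int main() {
  // In-order queue: submissions execute in submission order.
  sycl::queue Q{sycl::property::queue::in_order()};

  // No accessors, streams, or events: handler::finalize() can already take
  // the no-scheduler fast path for this submission.
  sycl::event E1 = Q.submit([&](sycl::handler &CGH) { CGH.single_task([] {}); });

  // With this patch, a dependency no longer forces the scheduler path: E1
  // belongs to the same context and, once enqueued, carries a backend handle,
  // so Scheduler::areEventsSafeForSchedulerBypass() returns true and this
  // submission is also handed straight to the backend.
  Q.submit([&](sycl::handler &CGH) {
    CGH.depends_on(E1);
    CGH.single_task([] {});
  });

  Q.wait();
  return 0;
}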