From ef0adffb713701ce7d0d0254e520b0979400167d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 26 Oct 2023 10:26:21 -0700 Subject: [PATCH 01/40] draft beginning Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/detail/helpers.hpp | 2 + sycl/source/detail/helpers.cpp | 1 + sycl/source/detail/queue_impl.cpp | 105 +++++++++++++----- sycl/source/detail/queue_impl.hpp | 25 +---- .../scheduler/InOrderQueueHostTaskDeps.cpp | 77 ++++++++++++- 5 files changed, 157 insertions(+), 53 deletions(-) diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp index 7e1fcb00a8aed..431750e4f7744 100644 --- a/sycl/include/sycl/detail/helpers.hpp +++ b/sycl/include/sycl/detail/helpers.hpp @@ -44,6 +44,8 @@ class buffer_impl; class context_impl; // The function returns list of events that can be passed to OpenCL API as // dependency list and waits for others. +__SYCL_EXPORT bool isEventsReady(const std::vector& DepEvents, const sycl::event* const ExtraDepEventPtr, ContextImplPtr Context); + __SYCL_EXPORT std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 4103653d236a7..f639e09898ce5 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -21,6 +21,7 @@ namespace sycl { inline namespace _V1 { using ContextImplPtr = std::shared_ptr; namespace detail { + std::vector getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { std::vector Events; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 8e3b9070bd486..7ae4d2b82b2bd 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,6 +26,60 @@ namespace sycl { inline namespace _V1 { namespace detail { + + +std::vector +getPIEvents(const std::vector& DepEvents, sycl::event const * const ExtraDepEvent) { + std::vector RetPiEvents; + auto AddEvent = [&RetPiEvents](const sycl::event& Event) + { + auto EventImpl = detail::getSyclObjImpl(Event); + if (EventImpl->getHandleRef() == nullptr) + return; + RetPiEvents.push_back(EventImpl->getHandleRef()); + }; + if (ExtraDepEvent) + AddEvent(*ExtraDepEvent); + for_each(DepEvents.begin(), DepEvents.end(), AddEvent); + return RetPiEvents; +} + +bool isEventsReady(const std::vector& DepEvents, const sycl::event* const ExtraDepEventPtr, ContextImplPtr Context) +{ + auto CheckEvent = [&Context](sycl::event& Event) + { + auto SyclEventImplPtr = detail::getSyclObjImpl(Event); + // throwaway events created with empty constructor will not have a context + // (which is set lazily) calling getContextImpl() would set that + // context, which we wish to avoid as it is expensive. + if (!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) { + return true; + } + // The fusion command and its event are associated with a non-host context, + // but still does not produce a PI event. + to add field to event with producesPiEvent value + bool NoPiEvent = + SyclEventImplPtr->MCommand && + !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); + if (SyclEventImplPtr->is_host() || + SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { + // Call wait, because the command for the event might not have been + // enqueued when kernel fusion is happening. + return false; + } else { + // In this path nullptr native event means that the command has not been + // enqueued. It may happen if async enqueue in a host task is involved. + if (SyclEventImplPtr->getHandleRef() == nullptr) { + return false; + } + } + return true; + } + + return (!ExtraDepEventPtr || CheckEvent(*ExtraDepEventPtr)) && std::all_of(DepEvents.begin(), DepEvents.end(), CheckEvent); +} + template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; @@ -81,43 +135,40 @@ event queue_impl::memset(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif - if (MHasDiscardEventsSupport) { - MemoryManager::fill_usm(Ptr, Self, Count, Value, - getOrWaitEvents(DepEvents, MContext), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + // We need to submit command and update the last event under same lock if we + // have in-order queue. + if (std::unique_lock(isInOrder() ? MLastEventMtx : {}) && isEventsReady(DepEvents, isInOrder() ? &MLastEvent: nullptr, MContext)) { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); + if (MHasDiscardEventsSupport) { + MemoryManager::fill_usm(Ptr, Self, Count, Value, + getPIEvents(DepEvents, MLastEvent), nullptr); + return createDiscardedEvent(); + } + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::fill_usm(Ptr, Self, Count, Value, - getOrWaitEvents(DepEvents, MContext), - &EventImpl->getHandleRef(), EventImpl); - + getPIEvents(DepEvents, MLastEvent), + &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; + } + else + { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memset(Ptr, Value, Count); + }, + Self, {}); } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; } void report(const code_location &CodeLoc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index bf37c764ead66..04fef581c611a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -704,29 +704,16 @@ class queue_impl { void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type, event &EventRet) { if (MIsInorder) { - - auto IsExpDepManaged = [](const CG::CGTYPE &Type) { - return Type == CG::CGTYPE::CodeplayHostTask; - }; - // Accessing and changing of an event isn't atomic operation. // Hence, here is the lock for thread-safety. std::lock_guard Lock{MLastEventMtx}; - - if (MLastCGType == CG::CGTYPE::None) - MLastCGType = Type; - // Also handles case when sync model changes. E.g. Last is host, new is - // kernel. - bool NeedSeparateDependencyMgmt = - IsExpDepManaged(Type) || IsExpDepManaged(MLastCGType); - - if (NeedSeparateDependencyMgmt) - Handler.depends_on(MLastEvent); + // This dependency is needed for the following purposes: + // - host tasks is handled by runtime and could not be implicitly synchronized by backend. + // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host task. This dependency allows to build enqueue order in RT but will be not passed to backend. Look at getPIEvents in Command. + Handler.depends_on(MLastEvent); EventRet = Handler.finalize(); - MLastEvent = EventRet; - MLastCGType = Type; } else EventRet = Handler.finalize(); } @@ -851,10 +838,6 @@ class queue_impl { // Access to the event should be guarded with MLastEventMtx event MLastEvent; mutable std::mutex MLastEventMtx; - // Used for in-order queues in pair with MLastEvent - // Host tasks are explicitly synchronized in RT, pi tasks - implicitly by - // backend. Using type to setup explicit sync between host and pi tasks. - CG::CGTYPE MLastCGType = CG::CGTYPE::None; const bool MIsInorder; diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 020a956537ddd..1a878cef721fb 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -11,13 +11,14 @@ #include #include +#include #include #include #include -#include +#include using namespace sycl; @@ -39,16 +40,82 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + auto buf = sycl::malloc_shared(1, InOrderQueue); + event Evt = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); }); + InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) + .wait(); + + EXPECT_TRUE(GEventsWaitCounter == 1); +} + +enum CommandType { KERNEL = 1, MEMSET = 2 }; +std::vector ExecutedCommands; + +inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, + const size_t *, const size_t *, + const size_t *, pi_uint32, + const pi_event *, pi_event *) { + ExecutedCommands.push_back(CommandType::KERNEL); + return PI_SUCCESS; +} +inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, + pi_uint32, const pi_event *, + pi_event *) { + ExecutedCommands.push_back(CommandType::MEMSET); + return PI_SUCCESS; +} + +TEST_F(SchedulerTest, InOrderQueueCrossDeps) { + sycl::unittest::PiMock Mock; + Mock.redefineBefore( + customEnqueueKernelLaunch); + Mock.redefineBefore( + customextUSMEnqueueMemset); + + sycl::platform Plt = Mock.getPlatform(); + if (Plt.is_host()) { + std::cout << "Not run due to host-only environment\n"; + GTEST_SKIP(); + } + MockScheduler *MockSchedulerPtr = new MockScheduler(); + sycl::detail::GlobalHandler::instance().attachScheduler( + dynamic_cast(MockSchedulerPtr)); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + kernel_bundle KernelBundle = sycl::get_kernel_bundle(Ctx); auto ExecBundle = sycl::build(KernelBundle); - event Evt = InOrderQueue.submit([&](sycl::handler &CGH) { + std::mutex CvMutex; + std::condition_variable Cv; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk); + }); + }); + + auto buf = sycl::malloc_shared(1, InOrderQueue); + + event Ev1 = InOrderQueue.submit( + [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); }); + + event Ev2 = InOrderQueue.submit([&](sycl::handler &CGH) { CGH.use_kernel_bundle(ExecBundle); CGH.single_task>([] {}); }); - InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) - .wait(); - EXPECT_TRUE(GEventsWaitCounter == 1); + Cv.notify_one(); + + InOrderQueue.wait(); + + sycl::detail::GlobalHandler::instance().attachScheduler(NULL); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0], MEMSET); + EXPECT_EQ(ExecutedCommands[1], KERNEL); } From 26322d0ccbcc8366f07abbba0a78464b1b76fdbd Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 05:25:38 -0700 Subject: [PATCH 02/40] Impl Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/detail/helpers.hpp | 2 - sycl/source/detail/event_impl.hpp | 5 + sycl/source/detail/queue_impl.cpp | 356 +++++++++++----------- sycl/source/detail/scheduler/commands.cpp | 11 + 4 files changed, 199 insertions(+), 175 deletions(-) diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp index 431750e4f7744..7e1fcb00a8aed 100644 --- a/sycl/include/sycl/detail/helpers.hpp +++ b/sycl/include/sycl/detail/helpers.hpp @@ -44,8 +44,6 @@ class buffer_impl; class context_impl; // The function returns list of events that can be passed to OpenCL API as // dependency list and waits for others. -__SYCL_EXPORT bool isEventsReady(const std::vector& DepEvents, const sycl::event* const ExtraDepEventPtr, ContextImplPtr Context); - __SYCL_EXPORT std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 067218f5a8459..61c1df76ff445 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -290,6 +290,9 @@ class event_impl { return MEventFromSubmitedExecCommandBuffer; } + void setProducesPiEvent(bool Value) { MProducesPiEvent = Value; } + bool producesPiEvent() const { return MProducesPiEvent; } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -348,6 +351,8 @@ class event_impl { // stored here. sycl::detail::pi::PiExtSyncPoint MSyncPoint; + bool MProducesPiEvent{false }; + friend std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7ae4d2b82b2bd..869132ce75f3b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -46,7 +46,7 @@ getPIEvents(const std::vector& DepEvents, sycl::event const * const bool isEventsReady(const std::vector& DepEvents, const sycl::event* const ExtraDepEventPtr, ContextImplPtr Context) { - auto CheckEvent = [&Context](sycl::event& Event) + auto CheckEvent = [&Context](const sycl::event& Event) { auto SyclEventImplPtr = detail::getSyclObjImpl(Event); // throwaway events created with empty constructor will not have a context @@ -58,14 +58,8 @@ bool isEventsReady(const std::vector& DepEvents, const sycl::event* } // The fusion command and its event are associated with a non-host context, // but still does not produce a PI event. - to add field to event with producesPiEvent value - bool NoPiEvent = - SyclEventImplPtr->MCommand && - !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); if (SyclEventImplPtr->is_host() || - SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { - // Call wait, because the command for the event might not have been - // enqueued when kernel fusion is happening. + SyclEventImplPtr->getContextImpl() != Context || !SyclEventImplPtr->producesPiEvent()) { return false; } else { // In this path nullptr native event means that the command has not been @@ -75,7 +69,7 @@ bool isEventsReady(const std::vector& DepEvents, const sycl::event* } } return true; - } + }; return (!ExtraDepEventPtr || CheckEvent(*ExtraDepEventPtr)) && std::all_of(DepEvents.begin(), DepEvents.end(), CheckEvent); } @@ -137,38 +131,45 @@ event queue_impl::memset(const std::shared_ptr &Self, #endif // We need to submit command and update the last event under same lock if we // have in-order queue. - if (std::unique_lock(isInOrder() ? MLastEventMtx : {}) && isEventsReady(DepEvents, isInOrder() ? &MLastEvent: nullptr, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(DepEvents, MLastEvent), nullptr); - return createDiscardedEvent(); + std::unique_lock quard(MLastEventMtx, std::defer_lock); + sycl::event* ExtraEventToWait = nullptr; + if (isInOrder()) + { + quard.lock(); + ExtraEventToWait = &MLastEvent; } + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) + { + if (MHasDiscardEventsSupport) { + MemoryManager::fill_usm(Ptr, Self, Count, Value, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); + return createDiscardedEvent(); + } - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(DepEvents, MLastEvent), - &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { - MLastEvent = ResEvent; + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemoryManager::fill_usm(Ptr, Self, Count, Value, + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + MLastEvent = ResEvent; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; - } - else - { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.memset(Ptr, Value, Count); - }, - Self, {}); } + + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memset(Ptr, Value, Count); + }, + Self, {}); } void report(const code_location &CodeLoc) { @@ -224,176 +225,185 @@ event queue_impl::memcpy(const std::shared_ptr &Self, throw runtime_error("NULL pointer argument in memory copy operation.", PI_ERROR_INVALID_VALUE); } - if (MHasDiscardEventsSupport) { - MemoryManager::copy_usm(Src, Self, Count, Dest, - getOrWaitEvents(DepEvents, MContext), nullptr); - return createDiscardedEvent(); - } - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); - - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_usm(Src, Self, Count, Dest, - getOrWaitEvents(DepEvents, MContext), - &EventImpl->getHandleRef(), EventImpl); - - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); + std::unique_lock quard(MLastEventMtx, std::defer_lock); + sycl::event* ExtraEventToWait = nullptr; + if (isInOrder()) + { + quard.lock(); + ExtraEventToWait = &MLastEvent; + } + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) + { + if (MHasDiscardEventsSupport) { + MemoryManager::copy_usm(Src, Self, Count, Dest, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); + return createDiscardedEvent(); + } - if (isInOrder()) { - MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemoryManager::copy_usm(Src, Self, Count, Dest, + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + MLastEvent = ResEvent; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; } } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memcpy(Dest, Src, Count); + }, + Self, {}); } event queue_impl::mem_advise(const std::shared_ptr &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector &DepEvents) { - if (MHasDiscardEventsSupport) { - MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getOrWaitEvents(DepEvents, MContext), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); - - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getOrWaitEvents(DepEvents, MContext), - &EventImpl->getHandleRef(), EventImpl); - - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - - if (isInOrder()) { - MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; + std::unique_lock quard(MLastEventMtx, std::defer_lock); + sycl::event* ExtraEventToWait = nullptr; + if (isInOrder()) + { + quard.lock(); + ExtraEventToWait = &MLastEvent; + } + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) + { + if (MHasDiscardEventsSupport) { + MemoryManager::advise_usm(Ptr, Self, Length, Advice, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); + return createDiscardedEvent(); + } + + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemoryManager::advise_usm(Ptr, Self, Length, Advice, + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + MLastEvent = ResEvent; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; } } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.mem_advise(Ptr, Length, Advice); + }, + Self, {}); } event queue_impl::memcpyToDeviceGlobal( const std::shared_ptr &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { - if (MHasDiscardEventsSupport) { - MemoryManager::copy_to_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getOrWaitEvents(DepEvents, MContext), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); - - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, + std::unique_lock quard(MLastEventMtx, std::defer_lock); + sycl::event* ExtraEventToWait = nullptr; + if (isInOrder()) + { + quard.lock(); + ExtraEventToWait = &MLastEvent; + } + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) + { + if (MHasDiscardEventsSupport) { + MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getOrWaitEvents(DepEvents, MContext), - &EventImpl->getHandleRef(), EventImpl); - - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - - if (isInOrder()) { - MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; + getPIEvents(DepEvents, ExtraEventToWait), nullptr); + return createDiscardedEvent(); + } + + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Src, + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + MLastEvent = ResEvent; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; } } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, NumBytes, Offset); + }, + Self, {}); } event queue_impl::memcpyFromDeviceGlobal( const std::shared_ptr &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { - if (MHasDiscardEventsSupport) { - MemoryManager::copy_from_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getOrWaitEvents(DepEvents, MContext), nullptr); - return createDiscardedEvent(); - } + { + std::unique_lock quard(MLastEventMtx, std::defer_lock); + sycl::event* ExtraEventToWait = nullptr; + if (isInOrder()) + { + quard.lock(); + ExtraEventToWait = &MLastEvent; + } + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) + { + if (MHasDiscardEventsSupport) { + MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Dest, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); + return createDiscardedEvent(); + } - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - { - // We need to submit command and update the last event under same lock if we - // have in-order queue. - auto ScopeLock = isInOrder() ? std::unique_lock(MLastEventMtx) - : std::unique_lock(); - // If the last submitted command in the in-order queue is host_task then - // wait for it before submitting usm command. - if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask) - MLastEvent.wait(); - - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_from_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getOrWaitEvents(DepEvents, MContext), &EventImpl->getHandleRef(), - EventImpl); - - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - - if (isInOrder()) { - MLastEvent = ResEvent; - // We don't create a command group for usm commands, so set it to None. - // This variable is used to perform explicit dependency management when - // required. - MLastCGType = CG::CGTYPE::None; + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Dest, + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + MLastEvent = ResEvent; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return MDiscardEvents ? createDiscardedEvent() : ResEvent; } } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, NumBytes, Offset); + }, + Self, {}); } void queue_impl::addEvent(const event &Event) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6d779ed7a4cd0..d9266353a017a 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -502,6 +502,7 @@ Command::Command( MEvent->setCommand(this); MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); + MEvent->setProducesPiEvent(producesPiEvent()); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -970,6 +971,7 @@ AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst), MRequirement(std::move(Req)), MReleaseCmd(Queue, this) { MRequirement.MAccessMode = access::mode::read_write; + MEvent->setProducesPiEvent(producesPiEvent()); emitInstrumentationDataProxy(); } @@ -1088,6 +1090,7 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, // is added to this node, so this call must be before // the addDep() call. emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); Command *ConnectionCmd = addDep( DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp); if (ConnectionCmd) @@ -1165,6 +1168,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void ReleaseCommand::emitInstrumentationData() { @@ -1285,6 +1289,7 @@ MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr), MMapMode(MapMode) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MapMemObject::emitInstrumentationData() { @@ -1346,6 +1351,7 @@ UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void UnMapMemObject::emitInstrumentationData() { @@ -1439,6 +1445,7 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommand::emitInstrumentationData() { @@ -1613,6 +1620,7 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommandHost::emitInstrumentationData() { @@ -1680,6 +1688,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { EmptyCommand::EmptyCommand(QueueImplPtr Queue) : Command(CommandType::EMPTY_TASK, std::move(Queue)) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } pi_int32 EmptyCommand::enqueueImp() { @@ -1870,6 +1879,7 @@ ExecCGCommand::ExecCGCommand( } emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -3158,6 +3168,7 @@ KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue) : Command(Command::CommandType::FUSION, Queue), MStatus(FusionStatus::ACTIVE) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } std::vector &KernelFusionCommand::auxiliaryCommands() { From 2bd4b1fd48524293025715eafadff9fc62501098 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 05:29:47 -0700 Subject: [PATCH 03/40] fix clang-format Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.hpp | 2 +- sycl/source/detail/helpers.cpp | 3 +- sycl/source/detail/queue_impl.cpp | 118 +++++++++++++++--------------- sycl/source/detail/queue_impl.hpp | 7 +- 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 61c1df76ff445..b37952cd742d3 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -351,7 +351,7 @@ class event_impl { // stored here. sycl::detail::pi::PiExtSyncPoint MSyncPoint; - bool MProducesPiEvent{false }; + bool MProducesPiEvent{false}; friend std::vector getOrWaitEvents(std::vector DepEvents, diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index f639e09898ce5..222b06127207d 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -21,7 +21,8 @@ namespace sycl { inline namespace _V1 { using ContextImplPtr = std::shared_ptr; namespace detail { - +// TODO: remove from public header files and implementation during the next ABI +// Breaking window. Not used any more. std::vector getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { std::vector Events; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 869132ce75f3b..32f6f57eabb3a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -27,12 +27,11 @@ namespace sycl { inline namespace _V1 { namespace detail { - std::vector -getPIEvents(const std::vector& DepEvents, sycl::event const * const ExtraDepEvent) { +getPIEvents(const std::vector &DepEvents, + sycl::event const *const ExtraDepEvent) { std::vector RetPiEvents; - auto AddEvent = [&RetPiEvents](const sycl::event& Event) - { + auto AddEvent = [&RetPiEvents](const sycl::event &Event) { auto EventImpl = detail::getSyclObjImpl(Event); if (EventImpl->getHandleRef() == nullptr) return; @@ -44,10 +43,10 @@ getPIEvents(const std::vector& DepEvents, sycl::event const * const return RetPiEvents; } -bool isEventsReady(const std::vector& DepEvents, const sycl::event* const ExtraDepEventPtr, ContextImplPtr Context) -{ - auto CheckEvent = [&Context](const sycl::event& Event) - { +bool isEventsReady(const std::vector &DepEvents, + const sycl::event *const ExtraDepEventPtr, + ContextImplPtr Context) { + auto CheckEvent = [&Context](const sycl::event &Event) { auto SyclEventImplPtr = detail::getSyclObjImpl(Event); // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that @@ -59,7 +58,8 @@ bool isEventsReady(const std::vector& DepEvents, const sycl::event* // The fusion command and its event are associated with a non-host context, // but still does not produce a PI event. if (SyclEventImplPtr->is_host() || - SyclEventImplPtr->getContextImpl() != Context || !SyclEventImplPtr->producesPiEvent()) { + SyclEventImplPtr->getContextImpl() != Context || + !SyclEventImplPtr->producesPiEvent()) { return false; } else { // In this path nullptr native event means that the command has not been @@ -71,7 +71,8 @@ bool isEventsReady(const std::vector& DepEvents, const sycl::event* return true; }; - return (!ExtraDepEventPtr || CheckEvent(*ExtraDepEventPtr)) && std::all_of(DepEvents.begin(), DepEvents.end(), CheckEvent); + return (!ExtraDepEventPtr || CheckEvent(*ExtraDepEventPtr)) && + std::all_of(DepEvents.begin(), DepEvents.end(), CheckEvent); } template <> @@ -133,25 +134,24 @@ event queue_impl::memset(const std::shared_ptr &Self, // have in-order queue. { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event* ExtraEventToWait = nullptr; - if (isInOrder()) - { + sycl::event *ExtraEventToWait = nullptr; + if (isInOrder()) { quard.lock(); ExtraEventToWait = &MLastEvent; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) - { + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(DepEvents, ExtraEventToWait), nullptr); + getPIEvents(DepEvents, ExtraEventToWait), + nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(DepEvents, ExtraEventToWait), - &EventImpl->getHandleRef(), EventImpl); + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { @@ -228,25 +228,24 @@ event queue_impl::memcpy(const std::shared_ptr &Self, { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event* ExtraEventToWait = nullptr; - if (isInOrder()) - { + sycl::event *ExtraEventToWait = nullptr; + if (isInOrder()) { quard.lock(); ExtraEventToWait = &MLastEvent; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) - { + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(DepEvents, ExtraEventToWait), nullptr); + getPIEvents(DepEvents, ExtraEventToWait), + nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(DepEvents, ExtraEventToWait), - &EventImpl->getHandleRef(), EventImpl); + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { @@ -273,25 +272,24 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const std::vector &DepEvents) { { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event* ExtraEventToWait = nullptr; - if (isInOrder()) - { + sycl::event *ExtraEventToWait = nullptr; + if (isInOrder()) { quard.lock(); ExtraEventToWait = &MLastEvent; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) - { + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(DepEvents, ExtraEventToWait), nullptr); + getPIEvents(DepEvents, ExtraEventToWait), + nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(DepEvents, ExtraEventToWait), - &EventImpl->getHandleRef(), EventImpl); + getPIEvents(DepEvents, ExtraEventToWait), + &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { @@ -318,27 +316,25 @@ event queue_impl::memcpyToDeviceGlobal( const std::vector &DepEvents) { { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event* ExtraEventToWait = nullptr; - if (isInOrder()) - { + sycl::event *ExtraEventToWait = nullptr; + if (isInOrder()) { quard.lock(); ExtraEventToWait = &MLastEvent; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) - { + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { - MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, - Self, NumBytes, Offset, Src, - getPIEvents(DepEvents, ExtraEventToWait), nullptr); + MemoryManager::copy_to_device_global( + DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, - Self, NumBytes, Offset, Src, - getPIEvents(DepEvents, ExtraEventToWait), - &EventImpl->getHandleRef(), EventImpl); + MemoryManager::copy_to_device_global( + DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, + getPIEvents(DepEvents, ExtraEventToWait), &EventImpl->getHandleRef(), + EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { @@ -354,7 +350,8 @@ event queue_impl::memcpyToDeviceGlobal( return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); - CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, NumBytes, Offset); + CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, + NumBytes, Offset); }, Self, {}); } @@ -363,29 +360,27 @@ event queue_impl::memcpyFromDeviceGlobal( const std::shared_ptr &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { - { + { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event* ExtraEventToWait = nullptr; - if (isInOrder()) - { + sycl::event *ExtraEventToWait = nullptr; + if (isInOrder()) { quard.lock(); ExtraEventToWait = &MLastEvent; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) - { + if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { - MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, - Self, NumBytes, Offset, Dest, - getPIEvents(DepEvents, ExtraEventToWait), nullptr); + MemoryManager::copy_from_device_global( + DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, + getPIEvents(DepEvents, ExtraEventToWait), nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, - Self, NumBytes, Offset, Dest, - getPIEvents(DepEvents, ExtraEventToWait), - &EventImpl->getHandleRef(), EventImpl); + MemoryManager::copy_from_device_global( + DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, + getPIEvents(DepEvents, ExtraEventToWait), &EventImpl->getHandleRef(), + EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { @@ -401,7 +396,8 @@ event queue_impl::memcpyFromDeviceGlobal( return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); - CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, NumBytes, Offset); + CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, + NumBytes, Offset); }, Self, {}); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 04fef581c611a..e1290f8a4bbb2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -708,8 +708,11 @@ class queue_impl { // Hence, here is the lock for thread-safety. std::lock_guard Lock{MLastEventMtx}; // This dependency is needed for the following purposes: - // - host tasks is handled by runtime and could not be implicitly synchronized by backend. - // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host task. This dependency allows to build enqueue order in RT but will be not passed to backend. Look at getPIEvents in Command. + // - host tasks is handled by runtime and could not be implicitly + // synchronized by backend. + // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host + // task. This dependency allows to build enqueue order in RT but will + // be not passed to backend. Look at getPIEvents in Command. Handler.depends_on(MLastEvent); EventRet = Handler.finalize(); From 9dbeffaf2982bc64f9a80843946eee0b3453573c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 06:00:11 -0700 Subject: [PATCH 04/40] last changes Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/graph_processor.cpp | 10 ---------- sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp | 4 +++- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp index f22913f990e9c..5ff184be64948 100644 --- a/sycl/source/detail/scheduler/graph_processor.cpp +++ b/sycl/source/detail/scheduler/graph_processor.cpp @@ -53,16 +53,6 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd, BlockingT Blocking) { if (Cmd == RootCommand || Blocking) return true; - // Async kernel enqueue depending on host task is not compatible with in order - // queue. If we have host_task_1, kernel_2 depending on host_task_1 and - // kernel_3 without explicit dependencies submitted to in order queue: host - // task blocks kernel_2 from being enqueued while kernel_3 has no such - // dependencies so in current impl it could be enqueued earlier that kernel_2. - // That makes it impossible to use this path with blocking users for in order - // queue. - if (QueueImplPtr Queue = RootCommand->getEvent()->getSubmittedQueue(); - Queue && Queue->isInOrder()) - return true; { std::lock_guard Guard(Cmd->MBlockedUsersMutex); if (Cmd->isBlocking()) { diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 1a878cef721fb..03bfc23da80a1 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -91,11 +91,12 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { std::mutex CvMutex; std::condition_variable Cv; + bool ready = false; InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([&] { std::unique_lock lk(CvMutex); - Cv.wait(lk); + Cv.wait(lk, [&ready] { return ready; }); }); }); @@ -109,6 +110,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { CGH.single_task>([] {}); }); + ready = true; Cv.notify_one(); InOrderQueue.wait(); From 78b9801e050ca943ad58e390d4620523f304e1d4 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 06:12:00 -0700 Subject: [PATCH 05/40] update UT Signed-off-by: Tikhomirova, Kseniya --- .../scheduler/InOrderQueueHostTaskDeps.cpp | 19 +++++++++++-------- .../scheduler/InOrderQueueSyncCheck.cpp | 7 +++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 03bfc23da80a1..b61ea137609e4 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -50,19 +50,20 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { } enum CommandType { KERNEL = 1, MEMSET = 2 }; -std::vector ExecutedCommands; +std::vector> ExecutedCommands; inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, const size_t *, const size_t *, - const size_t *, pi_uint32, + const size_t *, + pi_uint32 EventsCount, const pi_event *, pi_event *) { - ExecutedCommands.push_back(CommandType::KERNEL); + ExecutedCommands.push_back({CommandType::KERNEL, EventsCount}); return PI_SUCCESS; } inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, - pi_uint32, const pi_event *, - pi_event *) { - ExecutedCommands.push_back(CommandType::MEMSET); + pi_uint32 EventsCount, + const pi_event *, pi_event *) { + ExecutedCommands.push_back({CommandType::MEMSET, EventsCount}); return PI_SUCCESS; } @@ -118,6 +119,8 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { sycl::detail::GlobalHandler::instance().attachScheduler(NULL); ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0], MEMSET); - EXPECT_EQ(ExecutedCommands[1], KERNEL); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index ca1e7d664bf58..5800cd9a0415d 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -61,7 +61,10 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // host | yes - always, separate sync management // host | yes - always, separate sync management // kernel | yes - change of sync approach - // kernel | no - sync between pi calls must be done by backend + // kernel | yes - sync between pi calls must be done by backend, but we + // still add dependency to handle the right order due to host task. This + // dependency will not be sent to backend. It is checked in + // SchedulerTest.InOrderQueueCrossDeps // host | yes - always, separate sync management sycl::event Event; @@ -89,7 +92,7 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // kernel task { LimitedHandlerSimulation MockCGH; - EXPECT_CALL(MockCGH, depends_on).Times(0); + EXPECT_CALL(MockCGH, depends_on).Times(1); Queue->finalizeHandler( MockCGH, detail::CG::CGTYPE::Kernel, Event); } From f6e0ca609b1569f78bb6c22e07a6705783c721db Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 07:30:12 -0700 Subject: [PATCH 06/40] add UT Signed-off-by: Tikhomirova, Kseniya --- .../scheduler/InOrderQueueHostTaskDeps.cpp | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index b61ea137609e4..4610525ca303c 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -68,6 +68,7 @@ inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t, } TEST_F(SchedulerTest, InOrderQueueCrossDeps) { + ExecutedCommands.clear(); sycl::unittest::PiMock Mock; Mock.redefineBefore( customEnqueueKernelLaunch); @@ -124,3 +125,54 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } + +TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { + ExecutedCommands.clear(); + sycl::unittest::PiMock Mock; + Mock.redefineBefore( + customEnqueueKernelLaunch); + Mock.redefineBefore( + customextUSMEnqueueMemset); + + sycl::platform Plt = Mock.getPlatform(); + if (Plt.is_host()) { + std::cout << "Not run due to host-only environment\n"; + GTEST_SKIP(); + } + MockScheduler *MockSchedulerPtr = new MockScheduler(); + sycl::detail::GlobalHandler::instance().attachScheduler( + dynamic_cast(MockSchedulerPtr)); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + + auto buf = sycl::malloc_shared(1, InOrderQueue); + + event Ev1 = InOrderQueue.memset(buf, 0, sizeof(buf[0])); + + event Ev2 = InOrderQueue.single_task>([] {}); + + ready = true; + Cv.notify_one(); + + InOrderQueue.wait(); + + sycl::detail::GlobalHandler::instance().attachScheduler(NULL); + + ASSERT_EQ(ExecutedCommands.size(), 2u); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); + EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, KERNEL); + EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); +} \ No newline at end of file From 96652bc9b95c571ce9c3807b64a5bdbd5f3df2f2 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 2 Nov 2023 08:01:26 -0700 Subject: [PATCH 07/40] add test for producesPiEVent Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/Commands.cpp | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sycl/unittests/scheduler/Commands.cpp b/sycl/unittests/scheduler/Commands.cpp index a995800643421..6221203bc04dc 100644 --- a/sycl/unittests/scheduler/Commands.cpp +++ b/sycl/unittests/scheduler/Commands.cpp @@ -8,6 +8,7 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" +#include #include #include @@ -84,3 +85,40 @@ TEST_F(SchedulerTest, WaitEmptyEventWithBarrier) { MS.Scheduler::addCG(std::move(CommandGroup), QueueImpl); } } + +TEST_F(SchedulerTest, CommandsPiEventExpectation) { + sycl::unittest::PiMock Mock; + sycl::platform Plt = Mock.getPlatform(); + context Ctx{Plt}; + queue Queue{Ctx, default_selector_v}; + detail::QueueImplPtr QueueImpl = detail::getSyclObjImpl(Queue); + MockScheduler MS; + + buffer Buf{range<1>(1)}; + std::shared_ptr BufImpl = detail::getSyclObjImpl(Buf); + detail::Requirement MockReq = getMockRequirement(Buf); + MockReq.MDims = 1; + MockReq.MSYCLMemObj = BufImpl.get(); + + std::vector AuxCmds; + detail::MemObjRecord *Record = + MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds); + detail::AllocaCommandBase *AllocaCmd = + MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds); + EXPECT_EQ(AllocaCmd->producesPiEvent(), + AllocaCmd->getEvent()->producesPiEvent()); + EXPECT_EQ(AllocaCmd->producesPiEvent(), false); + + std::unique_ptr CG{ + new detail::CGFill(/*Pattern*/ {}, &MockReq, + detail::CG::StorageInitHelper( + /*ArgsStorage*/ {}, + /*AccStorage*/ {}, + /*SharedPtrStorage*/ {}, + /*Requirements*/ {&MockReq}, + /*Events*/ {}))}; + detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl); + auto *Cmd = static_cast(Event->getCommand()); + EXPECT_EQ(Cmd->producesPiEvent(), Event->producesPiEvent()); + EXPECT_EQ(Cmd->producesPiEvent(), true); +} \ No newline at end of file From b776832e1446473e74d40c82850e192b222f64ea Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 6 Nov 2023 05:22:07 -0800 Subject: [PATCH 08/40] improve Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 66 +++++++++++-------- sycl/source/detail/queue_impl.hpp | 7 +- sycl/source/detail/scheduler/scheduler.cpp | 5 +- .../scheduler/InOrderQueueSyncCheck.cpp | 2 +- 4 files changed, 46 insertions(+), 34 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index a8a0b16f95a9f..0628581d808f7 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -30,25 +30,27 @@ std::atomic queue_impl::MNextAvailableQueueID = 0; std::vector getPIEvents(const std::vector &DepEvents, - sycl::event const *const ExtraDepEvent) { + const EventImplPtr &ExtraDepEvent) { std::vector RetPiEvents; - auto AddEvent = [&RetPiEvents](const sycl::event &Event) { - auto EventImpl = detail::getSyclObjImpl(Event); + auto AddEvent = [&RetPiEvents](const EventImplPtr &EventImpl) { if (EventImpl->getHandleRef() == nullptr) return; RetPiEvents.push_back(EventImpl->getHandleRef()); }; if (ExtraDepEvent) - AddEvent(*ExtraDepEvent); - for_each(DepEvents.begin(), DepEvents.end(), AddEvent); + AddEvent(ExtraDepEvent); + for_each(DepEvents.begin(), DepEvents.end(), + [&RetPiEvents, &AddEvent](const sycl::event &Event) { + auto EventImpl = detail::getSyclObjImpl(Event); + return AddEvent(EventImpl); + }); return RetPiEvents; } bool isEventsReady(const std::vector &DepEvents, - const sycl::event *const ExtraDepEventPtr, + const EventImplPtr &ExtraDepEventPtr, ContextImplPtr Context) { - auto CheckEvent = [&Context](const sycl::event &Event) { - auto SyclEventImplPtr = detail::getSyclObjImpl(Event); + auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. @@ -56,10 +58,11 @@ bool isEventsReady(const std::vector &DepEvents, !SyclEventImplPtr->is_host()) { return true; } + if (SyclEventImplPtr->is_host()) + return SyclEventImplPtr->isCompleted(); // The fusion command and its event are associated with a non-host context, // but still does not produce a PI event. - if (SyclEventImplPtr->is_host() || - SyclEventImplPtr->getContextImpl() != Context || + if (SyclEventImplPtr->getContextImpl() != Context || !SyclEventImplPtr->producesPiEvent()) { return false; } else { @@ -72,8 +75,12 @@ bool isEventsReady(const std::vector &DepEvents, return true; }; - return (!ExtraDepEventPtr || CheckEvent(*ExtraDepEventPtr)) && - std::all_of(DepEvents.begin(), DepEvents.end(), CheckEvent); + return (!ExtraDepEventPtr || CheckEvent(ExtraDepEventPtr)) && + std::all_of(DepEvents.begin(), DepEvents.end(), + [&Context, &CheckEvent](const sycl::event &Event) { + auto SyclEventImplPtr = detail::getSyclObjImpl(Event); + return CheckEvent(SyclEventImplPtr); + }); } template <> @@ -136,10 +143,10 @@ event queue_impl::memset(const std::shared_ptr &Self, // have in-order queue. { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event *ExtraEventToWait = nullptr; + EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { quard.lock(); - ExtraEventToWait = &MLastEvent; + ExtraEventToWait = MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -157,7 +164,7 @@ event queue_impl::memset(const std::shared_ptr &Self, if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEvent = ResEvent; + MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -231,10 +238,10 @@ event queue_impl::memcpy(const std::shared_ptr &Self, { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event *ExtraEventToWait = nullptr; + EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { quard.lock(); - ExtraEventToWait = &MLastEvent; + ExtraEventToWait = MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -252,7 +259,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEvent = ResEvent; + MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -275,10 +282,10 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const std::vector &DepEvents) { { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event *ExtraEventToWait = nullptr; + EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { quard.lock(); - ExtraEventToWait = &MLastEvent; + ExtraEventToWait = MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -296,7 +303,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEvent = ResEvent; + MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -319,10 +326,10 @@ event queue_impl::memcpyToDeviceGlobal( const std::vector &DepEvents) { { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event *ExtraEventToWait = nullptr; + EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { quard.lock(); - ExtraEventToWait = &MLastEvent; + ExtraEventToWait = MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -341,7 +348,7 @@ event queue_impl::memcpyToDeviceGlobal( if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEvent = ResEvent; + MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -365,10 +372,10 @@ event queue_impl::memcpyFromDeviceGlobal( size_t Offset, const std::vector &DepEvents) { { std::unique_lock quard(MLastEventMtx, std::defer_lock); - sycl::event *ExtraEventToWait = nullptr; + EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { quard.lock(); - ExtraEventToWait = &MLastEvent; + ExtraEventToWait = MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -387,7 +394,7 @@ event queue_impl::memcpyFromDeviceGlobal( if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEvent = ResEvent; + MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -629,8 +636,9 @@ bool queue_impl::ext_oneapi_empty() const { // the status of the last event. if (isInOrder() && !MDiscardEvents) { std::lock_guard Lock(MLastEventMtx); - return MLastEvent.get_info() == - info::event_command_status::complete; + return !MLastEventPtr || + MLastEventPtr->get_info() == + info::event_command_status::complete; } // Check the status of the backend queue if this is not a host queue. diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 43011615b4a4b..2ebe6b8c701bd 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -723,10 +723,11 @@ class queue_impl { // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host // task. This dependency allows to build enqueue order in RT but will // be not passed to backend. Look at getPIEvents in Command. - Handler.depends_on(MLastEvent); + if (MLastEventPtr) + Handler.depends_on(createSyclObjFromImpl(MLastEventPtr)); EventRet = Handler.finalize(); - MLastEvent = EventRet; + MLastEventPtr = getSyclObjImpl(EventRet); } else EventRet = Handler.finalize(); } @@ -849,7 +850,7 @@ class queue_impl { // This event is employed for enhanced dependency tracking with in-order queue // Access to the event should be guarded with MLastEventMtx - event MLastEvent; + EventImplPtr MLastEventPtr; mutable std::mutex MLastEventMtx; const bool MIsInorder; diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index e2a2286f677e0..e66639ba38d12 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -419,8 +419,9 @@ void Scheduler::releaseResources(BlockingT Blocking) { // queue_impl, ~queue_impl is called and buffer for assert (which is created // with size only so all confitions for deferred release are satisfied) is // added to deferred mem obj storage. So we may end up with leak. - while (!isDeferredMemObjectsEmpty()) + do { cleanupDeferredMemObjects(Blocking); + } while (Blocking == BlockingT::BLOCKING && !isDeferredMemObjectsEmpty()); } MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { @@ -517,6 +518,8 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { std::vector> ObjsReadyToRelease; { + static size_t count = 0; + // Lock is needed for checkLeavesCompletion - if walks through Record leaves ReadLockT Lock = ReadLockT(MGraphLock, std::try_to_lock); if (Lock.owns_lock()) { diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index 5800cd9a0415d..4b50791981258 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -71,7 +71,7 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) { // host task { LimitedHandlerSimulation MockCGH; - EXPECT_CALL(MockCGH, depends_on).Times(1); + EXPECT_CALL(MockCGH, depends_on).Times(0); Queue->finalizeHandler( MockCGH, detail::CG::CGTYPE::CodeplayHostTask, Event); } From 5e978748de264f47338c6308a79dff9e7db2fc45 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 6 Nov 2023 05:33:16 -0800 Subject: [PATCH 09/40] cleanup Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/scheduler.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index e66639ba38d12..dde8333b971c1 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -518,8 +518,6 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { std::vector> ObjsReadyToRelease; { - static size_t count = 0; - // Lock is needed for checkLeavesCompletion - if walks through Record leaves ReadLockT Lock = ReadLockT(MGraphLock, std::try_to_lock); if (Lock.owns_lock()) { From ee25f0a4032efe3d0497bd0b464f79ea7882e22d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 10 Nov 2023 05:21:12 -0800 Subject: [PATCH 10/40] fix discard events usage Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/queue.hpp | 15 +++++------- sycl/source/detail/queue_impl.cpp | 24 +++++++++++++------ sycl/source/detail/queue_impl.hpp | 12 ++++++---- sycl/unittests/scheduler/AllocaLinking.cpp | 1 + .../scheduler/InOrderQueueHostTaskDeps.cpp | 3 ++- 5 files changed, 34 insertions(+), 21 deletions(-) diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index cdeb8d02369f6..655143d75dcfb 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -356,11 +356,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; - auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); - return discard_or_return(Event); + return submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); #else - auto Event = submit_impl(CGF, CodeLoc); - return discard_or_return(Event); + return submit_impl(CGF, CodeLoc); #endif // __SYCL_USE_FALLBACK_ASSERT } @@ -395,12 +393,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { } }; - auto Event = - submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, PostProcess); - return discard_or_return(Event); + return submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, + PostProcess); #else - auto Event = submit_impl(CGF, SecondaryQueue, CodeLoc); - return discard_or_return(Event); + return submit_impl(CGF, SecondaryQueue, CodeLoc); #endif // __SYCL_USE_FALLBACK_ASSERT } @@ -2815,6 +2811,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// Checks if the event needs to be discarded and if so, discards it and /// returns a discarded event. Otherwise, it returns input event. + /// TODO: move to impl class in the next ABI Breaking window event discard_or_return(const event &Event); // Function to postprocess submitted command diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0628581d808f7..4c4c00ed0e583 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -58,8 +58,9 @@ bool isEventsReady(const std::vector &DepEvents, !SyclEventImplPtr->is_host()) { return true; } - if (SyclEventImplPtr->is_host()) + if (SyclEventImplPtr->is_host()) { return SyclEventImplPtr->isCompleted(); + } // The fusion command and its event are associated with a non-host context, // but still does not produce a PI event. if (SyclEventImplPtr->getContextImpl() != Context || @@ -110,6 +111,7 @@ static event prepareSYCLEventAssociatedWithQueue( } static event createDiscardedEvent() { + EventImplPtr EventImpl = std::make_shared(event_impl::HES_Discarded); return createSyclObjFromImpl(EventImpl); @@ -169,7 +171,7 @@ event queue_impl::memset(const std::shared_ptr &Self, // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + return discard_or_return(ResEvent); } } @@ -264,7 +266,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + return discard_or_return(ResEvent); } } @@ -300,15 +302,17 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, MemoryManager::advise_usm(Ptr, Self, Length, Advice, getPIEvents(DepEvents, ExtraEventToWait), &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) + if (MContext->is_host()) { return MDiscardEvents ? createDiscardedEvent() : event(); + } if (isInOrder()) { MLastEventPtr = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + + return discard_or_return(ResEvent); } } @@ -353,7 +357,7 @@ event queue_impl::memcpyToDeviceGlobal( // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + return discard_or_return(ResEvent); } } @@ -399,7 +403,7 @@ event queue_impl::memcpyFromDeviceGlobal( // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) addSharedEvent(ResEvent); - return MDiscardEvents ? createDiscardedEvent() : ResEvent; + return discard_or_return(ResEvent); } } @@ -674,6 +678,12 @@ bool queue_impl::ext_oneapi_empty() const { return true; } +event queue_impl::discard_or_return(const event &Event) { + if (!(MDiscardEvents)) + return Event; + return createDiscardedEvent(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 2ebe6b8c701bd..b9997637f4760 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -385,12 +385,14 @@ class queue_impl { const std::shared_ptr &SecondQueue, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess = nullptr) { + event ResEvent; try { - return submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess); + ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess); } catch (...) { - return SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, Loc, - PostProcess); + ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, + Loc, PostProcess); } + return discard_or_return(ResEvent); } /// Submits a command group function object to the queue, in order to be @@ -405,7 +407,8 @@ class queue_impl { const std::shared_ptr &Self, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess = nullptr) { - return submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess); + auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess); + return discard_or_return(ResEvent); } /// Performs a blocking wait for the completion of all enqueued tasks in the @@ -706,6 +709,7 @@ class queue_impl { unsigned long long getQueueID() { return MQueueID; } protected: + event discard_or_return(const event &Event); // Hook to the scheduler to clean up any fusion command held on destruction. void cleanup_fusion_cmd(); diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp index 0cce7dcd2ea92..a77995a203da3 100644 --- a/sycl/unittests/scheduler/AllocaLinking.cpp +++ b/sycl/unittests/scheduler/AllocaLinking.cpp @@ -46,6 +46,7 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device, } TEST_F(SchedulerTest, AllocaLinking) { + HostUnifiedMemory = false; // This host device constructor should be placed before Mock.redefine // because it overrides the real implementation of get_device_info // which is needed when creating a host device. diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 4610525ca303c..afdcb38702a8e 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -33,6 +33,7 @@ inline pi_result redefinedEventsWait(pi_uint32 num_events, } TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { + GEventsWaitCounter = 0; sycl::unittest::PiMock Mock; sycl::platform Plt = Mock.getPlatform(); Mock.redefineBefore(redefinedEventsWait); @@ -46,7 +47,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) .wait(); - EXPECT_TRUE(GEventsWaitCounter == 1); + EXPECT_EQ(GEventsWaitCounter, 1); } enum CommandType { KERNEL = 1, MEMSET = 2 }; From 5ada53c5863e209accd7250de10b542c87ca9e2a Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 10 Nov 2023 05:33:42 -0800 Subject: [PATCH 11/40] fix comparison Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index afdcb38702a8e..795ee3a636a5f 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -47,7 +47,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); }) .wait(); - EXPECT_EQ(GEventsWaitCounter, 1); + EXPECT_EQ(GEventsWaitCounter, 1u); } enum CommandType { KERNEL = 1, MEMSET = 2 }; From cd184bb21c192d5e01611bc5689553901adfcf73 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 Nov 2023 05:39:57 -0800 Subject: [PATCH 12/40] support for graph Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 50 ++++++++++++++++++------------- sycl/source/detail/queue_impl.hpp | 14 +++++++-- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4c4c00ed0e583..b9d2013cb8dcc 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -144,11 +144,11 @@ event queue_impl::memset(const std::shared_ptr &Self, // We need to submit command and update the last event under same lock if we // have in-order queue. { - std::unique_lock quard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MLastEventMtx, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { - quard.lock(); - ExtraEventToWait = MLastEventPtr; + guard.lock(); + ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -166,7 +166,9 @@ event queue_impl::memset(const std::shared_ptr &Self, if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEventPtr = EventImpl; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -239,11 +241,11 @@ event queue_impl::memcpy(const std::shared_ptr &Self, } { - std::unique_lock quard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MLastEventMtx, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { - quard.lock(); - ExtraEventToWait = MLastEventPtr; + guard.lock(); + ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -261,7 +263,9 @@ event queue_impl::memcpy(const std::shared_ptr &Self, if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEventPtr = EventImpl; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -283,11 +287,11 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, pi_mem_advice Advice, const std::vector &DepEvents) { { - std::unique_lock quard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MLastEventMtx, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { - quard.lock(); - ExtraEventToWait = MLastEventPtr; + guard.lock(); + ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -306,7 +310,9 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, return MDiscardEvents ? createDiscardedEvent() : event(); } if (isInOrder()) { - MLastEventPtr = EventImpl; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -329,11 +335,11 @@ event queue_impl::memcpyToDeviceGlobal( const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock quard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MLastEventMtx, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { - quard.lock(); - ExtraEventToWait = MLastEventPtr; + guard.lock(); + ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -352,7 +358,9 @@ event queue_impl::memcpyToDeviceGlobal( if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEventPtr = EventImpl; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) @@ -375,11 +383,11 @@ event queue_impl::memcpyFromDeviceGlobal( const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock quard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MLastEventMtx, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { - quard.lock(); - ExtraEventToWait = MLastEventPtr; + guard.lock(); + ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -398,7 +406,9 @@ event queue_impl::memcpyFromDeviceGlobal( if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - MLastEventPtr = EventImpl; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. if (MEmulateOOO) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index b9997637f4760..451b99d6563c5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -699,6 +699,7 @@ class queue_impl { std::shared_ptr Graph) { std::lock_guard Lock(MMutex); MGraph = Graph; + MGraphLastEventPtr = nullptr; } std::shared_ptr @@ -727,11 +728,14 @@ class queue_impl { // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host // task. This dependency allows to build enqueue order in RT but will // be not passed to backend. Look at getPIEvents in Command. - if (MLastEventPtr) - Handler.depends_on(createSyclObjFromImpl(MLastEventPtr)); + auto &EventToBuildDeps = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + if (EventToBuildDeps) + Handler.depends_on( + createSyclObjFromImpl(EventToBuildDeps)); EventRet = Handler.finalize(); - MLastEventPtr = getSyclObjImpl(EventRet); + EventToBuildDeps = getSyclObjImpl(EventRet); } else EventRet = Handler.finalize(); } @@ -856,6 +860,10 @@ class queue_impl { // Access to the event should be guarded with MLastEventMtx EventImplPtr MLastEventPtr; mutable std::mutex MLastEventMtx; + // Same as above but for graph begin-end recording cycle. + // Track deps within graph commands separately. + // Protected by common queue object mutex MMutex. + EventImplPtr MGraphLastEventPtr; const bool MIsInorder; From 654ada2dcd40b3a8607f2b785750e9c56d7fd0ce Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 Nov 2023 07:55:50 -0800 Subject: [PATCH 13/40] fix barrier submission Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/commands.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1d439607e71f3..85e753f109106 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1771,8 +1771,7 @@ void EmptyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EMPTY NODE" - << "\\n"; + Stream << "EMPTY NODE" << "\\n"; Stream << "\"];" << std::endl; @@ -3087,7 +3086,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = getPiEventsBlocking(Events); - if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) { + if (MQueue->getDeviceImplPtr()->is_host()) { // NOP for host device. // If Events is empty, then the barrier has no effect. return PI_SUCCESS; @@ -3095,8 +3094,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); + // This should not be skipped in case of in order queue. So do call even if + // PiEvents are empty. Plugin->call( - MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event); + MQueue->getHandleRef(), PiEvents.size(), + PiEvents.empty() ? nullptr : &PiEvents[0], Event); return PI_SUCCESS; } From cc0db2cfa3ef51246b61e4b7b56a7b8a06e63e39 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 Nov 2023 08:12:55 -0800 Subject: [PATCH 14/40] use common Mutex for last event usage Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 12 ++++++------ sycl/source/detail/queue_impl.hpp | 5 ++--- sycl/source/detail/scheduler/commands.cpp | 5 ++--- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b9d2013cb8dcc..af4e339ba127c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -144,7 +144,7 @@ event queue_impl::memset(const std::shared_ptr &Self, // We need to submit command and update the last event under same lock if we // have in-order queue. { - std::unique_lock guard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MMutex, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); @@ -241,7 +241,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, } { - std::unique_lock guard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MMutex, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); @@ -287,7 +287,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, pi_mem_advice Advice, const std::vector &DepEvents) { { - std::unique_lock guard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MMutex, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); @@ -335,7 +335,7 @@ event queue_impl::memcpyToDeviceGlobal( const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock guard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MMutex, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); @@ -383,7 +383,7 @@ event queue_impl::memcpyFromDeviceGlobal( const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock guard(MLastEventMtx, std::defer_lock); + std::unique_lock guard(MMutex, std::defer_lock); EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); @@ -649,7 +649,7 @@ bool queue_impl::ext_oneapi_empty() const { // If we have in-order queue where events are not discarded then just check // the status of the last event. if (isInOrder() && !MDiscardEvents) { - std::lock_guard Lock(MLastEventMtx); + std::lock_guard Lock(MMutex); return !MLastEventPtr || MLastEventPtr->get_info() == info::event_command_status::complete; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 451b99d6563c5..6cd32a590a7a5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -721,7 +721,7 @@ class queue_impl { if (MIsInorder) { // Accessing and changing of an event isn't atomic operation. // Hence, here is the lock for thread-safety. - std::lock_guard Lock{MLastEventMtx}; + std::lock_guard Lock{MMutex}; // This dependency is needed for the following purposes: // - host tasks is handled by runtime and could not be implicitly // synchronized by backend. @@ -857,9 +857,8 @@ class queue_impl { buffer MAssertHappenedBuffer; // This event is employed for enhanced dependency tracking with in-order queue - // Access to the event should be guarded with MLastEventMtx + // Access to the event should be guarded with MMutex EventImplPtr MLastEventPtr; - mutable std::mutex MLastEventMtx; // Same as above but for graph begin-end recording cycle. // Track deps within graph commands separately. // Protected by common queue object mutex MMutex. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 85e753f109106..d1f6173906757 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3088,14 +3088,13 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { getPiEventsBlocking(Events); if (MQueue->getDeviceImplPtr()->is_host()) { // NOP for host device. - // If Events is empty, then the barrier has no effect. return PI_SUCCESS; } const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); - // This should not be skipped in case of in order queue. So do call even if - // PiEvents are empty. + // This should not be skipped even for in order queue, we need a proper + // event to wait for. Plugin->call( MQueue->getHandleRef(), PiEvents.size(), PiEvents.empty() ? nullptr : &PiEvents[0], Event); From 072121d302e8ac1e7bceccaac9f4f27506329546 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 24 Nov 2023 10:42:23 -0800 Subject: [PATCH 15/40] fix test Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 795ee3a636a5f..7939c5d311bda 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -81,9 +81,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { std::cout << "Not run due to host-only environment\n"; GTEST_SKIP(); } - MockScheduler *MockSchedulerPtr = new MockScheduler(); - sycl::detail::GlobalHandler::instance().attachScheduler( - dynamic_cast(MockSchedulerPtr)); context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; @@ -118,8 +115,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { InOrderQueue.wait(); - sycl::detail::GlobalHandler::instance().attachScheduler(NULL); - ASSERT_EQ(ExecutedCommands.size(), 2u); EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); @@ -140,9 +135,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { std::cout << "Not run due to host-only environment\n"; GTEST_SKIP(); } - MockScheduler *MockSchedulerPtr = new MockScheduler(); - sycl::detail::GlobalHandler::instance().attachScheduler( - dynamic_cast(MockSchedulerPtr)); context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; @@ -169,8 +161,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { InOrderQueue.wait(); - sycl::detail::GlobalHandler::instance().attachScheduler(NULL); - ASSERT_EQ(ExecutedCommands.size(), 2u); EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); From a2351929113d7880e254a748d300bd7c48185b90 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 6 Dec 2023 03:33:48 -0800 Subject: [PATCH 16/40] [SYCL] Remove WA for L0 for not immediate context usage Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/commands.cpp | 46 ++--------------------- 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 15741a11d5a2b..46617b14909ce 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1388,26 +1388,7 @@ void UnMapMemObject::emitInstrumentationData() { #endif } -bool UnMapMemObject::producesPiEvent() const { - // TODO remove this workaround once the batching issue is addressed in Level - // Zero plugin. - // Consider the following scenario on Level Zero: - // 1. Kernel A, which uses buffer A, is submitted to queue A. - // 2. Kernel B, which uses buffer B, is submitted to queue B. - // 3. queueA.wait(). - // 4. queueB.wait(). - // DPCPP runtime used to treat unmap/write commands for buffer A/B as host - // dependencies (i.e. they were waited for prior to enqueueing any command - // that's dependent on them). This allowed Level Zero plugin to detect that - // each queue is idle on steps 1/2 and submit the command list right away. - // This is no longer the case since we started passing these dependencies in - // an event waitlist and Level Zero plugin attempts to batch these commands, - // so the execution of kernel B starts only on step 4. This workaround - // restores the old behavior in this case until this is resolved. - return MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr; -} +bool UnMapMemObject::producesPiEvent() const { return true; } pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); @@ -1495,27 +1476,7 @@ const ContextImplPtr &MemCpyCommand::getWorkerContext() const { return getWorkerQueue()->getContextImplPtr(); } -bool MemCpyCommand::producesPiEvent() const { - // TODO remove this workaround once the batching issue is addressed in Level - // Zero plugin. - // Consider the following scenario on Level Zero: - // 1. Kernel A, which uses buffer A, is submitted to queue A. - // 2. Kernel B, which uses buffer B, is submitted to queue B. - // 3. queueA.wait(). - // 4. queueB.wait(). - // DPCPP runtime used to treat unmap/write commands for buffer A/B as host - // dependencies (i.e. they were waited for prior to enqueueing any command - // that's dependent on them). This allowed Level Zero plugin to detect that - // each queue is idle on steps 1/2 and submit the command list right away. - // This is no longer the case since we started passing these dependencies in - // an event waitlist and Level Zero plugin attempts to batch these commands, - // so the execution of kernel B starts only on step 4. This workaround - // restores the old behavior in this case until this is resolved. - return MQueue->is_host() || - MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr; -} +bool MemCpyCommand::producesPiEvent() const { return true; } pi_int32 MemCpyCommand::enqueueImp() { waitForPreparedHostEvents(); @@ -1762,8 +1723,7 @@ void EmptyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EMPTY NODE" - << "\\n"; + Stream << "EMPTY NODE" << "\\n"; Stream << "\"];" << std::endl; From 6560246ecd6788fca97d99c5436ab0a2b0914dc0 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 6 Dec 2023 05:55:14 -0800 Subject: [PATCH 17/40] fix comments Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/commands.cpp | 4 ---- sycl/source/detail/scheduler/commands.hpp | 2 -- 2 files changed, 6 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 46617b14909ce..662a8516bd4ed 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1388,8 +1388,6 @@ void UnMapMemObject::emitInstrumentationData() { #endif } -bool UnMapMemObject::producesPiEvent() const { return true; } - pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; @@ -1476,8 +1474,6 @@ const ContextImplPtr &MemCpyCommand::getWorkerContext() const { return getWorkerQueue()->getContextImplPtr(); } -bool MemCpyCommand::producesPiEvent() const { return true; } - pi_int32 MemCpyCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index f2147c4bb7226..eb5883905e62a 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -558,7 +558,6 @@ class UnMapMemObject : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() override; - bool producesPiEvent() const final; private: pi_int32 enqueueImp() final; @@ -580,7 +579,6 @@ class MemCpyCommand : public Command { const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; const ContextImplPtr &getWorkerContext() const final; - bool producesPiEvent() const final; private: pi_int32 enqueueImp() final; From 44eabe55c1b6557652c58c8d16cb44aaa6dc2bac Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 6 Dec 2023 06:07:57 -0800 Subject: [PATCH 18/40] fix code-review comments Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 72 +++++++++---------- sycl/source/detail/queue_impl.hpp | 11 +-- .../scheduler/InOrderQueueHostTaskDeps.cpp | 14 ++-- 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index e7354fdf4b6b1..3dcbff467f6da 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -28,7 +28,7 @@ inline namespace _V1 { namespace detail { std::atomic queue_impl::MNextAvailableQueueID = 0; -std::vector +static std::vector getPIEvents(const std::vector &DepEvents, const EventImplPtr &ExtraDepEvent) { std::vector RetPiEvents; @@ -47,12 +47,12 @@ getPIEvents(const std::vector &DepEvents, return RetPiEvents; } -bool isEventsReady(const std::vector &DepEvents, - const EventImplPtr &ExtraDepEventPtr, - ContextImplPtr Context) { +static bool isEventsReady(const std::vector &DepEvents, + const EventImplPtr &ExtraDepEventPtr, + ContextImplPtr Context) { auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { - // throwaway events created with empty constructor will not have a context - // (which is set lazily) calling getContextImpl() would set that + // throwaway events created with empty constructor will not have a + // context (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. if (!SyclEventImplPtr->isContextInitialized() && !SyclEventImplPtr->is_host()) { @@ -61,19 +61,16 @@ bool isEventsReady(const std::vector &DepEvents, if (SyclEventImplPtr->is_host()) { return SyclEventImplPtr->isCompleted(); } - // The fusion command and its event are associated with a non-host context, - // but still does not produce a PI event. + // The fusion command and its event are associated with a non-host + // context, but still do not produce a PI event. if (SyclEventImplPtr->getContextImpl() != Context || - !SyclEventImplPtr->producesPiEvent()) { + !SyclEventImplPtr->producesPiEvent()) return false; - } else { - // In this path nullptr native event means that the command has not been - // enqueued. It may happen if async enqueue in a host task is involved. - if (SyclEventImplPtr->getHandleRef() == nullptr) { - return false; - } - } - return true; + + // In this path nullptr native event means that the command has not + // been enqueued. It may happen if async enqueue in a host task is + // involved. + return SyclEventImplPtr->getHandleRef() != nullptr; }; return (!ExtraDepEventPtr || CheckEvent(ExtraDepEventPtr)) && @@ -111,7 +108,6 @@ static event prepareSYCLEventAssociatedWithQueue( } static event createDiscardedEvent() { - EventImplPtr EventImpl = std::make_shared(event_impl::HES_Discarded); return createSyclObjFromImpl(EventImpl); @@ -155,7 +151,7 @@ event queue_impl::memset(const std::shared_ptr &Self, EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); - ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -252,7 +248,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); - ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -298,7 +294,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); - ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -346,7 +342,7 @@ event queue_impl::memcpyToDeviceGlobal( EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); - ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -394,7 +390,7 @@ event queue_impl::memcpyFromDeviceGlobal( EventImplPtr ExtraEventToWait = nullptr; if (isInOrder()) { guard.lock(); - ExtraEventToWait = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { @@ -438,9 +434,9 @@ void queue_impl::addEvent(const event &Event) { assert(EImpl && "Event implementation is missing"); auto *Cmd = static_cast(EImpl->getCommand()); if (!Cmd) { - // if there is no command on the event, we cannot track it with MEventsWeak - // as that will leave it with no owner. Track in MEventsShared only if we're - // unable to call piQueueFinish during wait. + // if there is no command on the event, we cannot track it with + // MEventsWeak as that will leave it with no owner. Track in MEventsShared + // only if we're unable to call piQueueFinish during wait. if (is_host() || MEmulateOOO) addSharedEvent(Event); } @@ -462,18 +458,18 @@ void queue_impl::addSharedEvent(const event &Event) { // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not // make, and ~queue_impl(). If the number of events grows large enough, - // there's a good chance that most of them are already completed and ownership - // of them can be released. + // there's a good chance that most of them are already completed and + // ownership of them can be released. const size_t EventThreshold = 128; if (MEventsShared.size() >= EventThreshold) { // Generally, the vector is ordered so that the oldest events are in the - // front and the newer events are in the end. So, search to find the first - // event that isn't yet complete. All the events prior to that can be - // erased. This could leave some few events further on that have completed - // not yet erased, but that is OK. This cleanup doesn't have to be perfect. - // This also keeps the algorithm linear rather than quadratic because it - // doesn't continually recheck things towards the back of the list that - // really haven't had time to complete. + // front and the newer events are in the end. So, search to find the + // first event that isn't yet complete. All the events prior to that can + // be erased. This could leave some few events further on that have + // completed not yet erased, but that is OK. This cleanup doesn't have to + // be perfect. This also keeps the algorithm linear rather than quadratic + // because it doesn't continually recheck things towards the back of the + // list that really haven't had time to complete. MEventsShared.erase( MEventsShared.begin(), std::find_if( @@ -598,9 +594,9 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { } // If the queue is either a host one or does not support OOO (and we use // multiple in-order queues as a result of that), wait for each event - // directly. Otherwise, only wait for unenqueued or host task events, starting - // from the latest submitted task in order to minimize total amount of calls, - // then handle the rest with piQueueFinish. + // directly. Otherwise, only wait for unenqueued or host task events, + // starting from the latest submitted task in order to minimize total amount + // of calls, then handle the rest with piQueueFinish. const bool SupportsPiFinish = !is_host() && !MEmulateOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 6cd32a590a7a5..d5016875b3d76 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -723,11 +723,12 @@ class queue_impl { // Hence, here is the lock for thread-safety. std::lock_guard Lock{MMutex}; // This dependency is needed for the following purposes: - // - host tasks is handled by runtime and could not be implicitly - // synchronized by backend. - // - to prevent 2nd kernel enqueue when 1st kernel is blocked by host - // task. This dependency allows to build enqueue order in RT but will - // be not passed to backend. Look at getPIEvents in Command. + // - host tasks are handled by the runtime and cannot be implicitly + // synchronized by the backend. + // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked + // by a host task. This dependency allows to build the enqueue order in + // the RT but will not be passed to the backend. See getPIEvents in + // Command. auto &EventToBuildDeps = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; if (EventToBuildDeps) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 7939c5d311bda..6a57637fe859d 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -50,7 +50,7 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { EXPECT_EQ(GEventsWaitCounter, 1u); } -enum CommandType { KERNEL = 1, MEMSET = 2 }; +enum class CommandType { KERNEL = 1, MEMSET = 2 }; std::vector> ExecutedCommands; inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32, @@ -77,10 +77,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; @@ -116,9 +112,9 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { InOrderQueue.wait(); ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); - EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, KERNEL); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } @@ -162,8 +158,8 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { InOrderQueue.wait(); ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, MEMSET); + EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); - EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, KERNEL); + EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); } \ No newline at end of file From bfcd9b63884c3316dfff8ed3c55835039a28ddd4 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 6 Dec 2023 06:18:47 -0800 Subject: [PATCH 19/40] fix tests Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 6a57637fe859d..37bb5106d75b2 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -106,7 +106,10 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { CGH.single_task>([] {}); }); - ready = true; + { + std::unique_lock lk(CvMutex); + ready = true; + } Cv.notify_one(); InOrderQueue.wait(); @@ -152,7 +155,10 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { event Ev2 = InOrderQueue.single_task>([] {}); - ready = true; + { + std::unique_lock lk(CvMutex); + ready = true; + } Cv.notify_one(); InOrderQueue.wait(); From 93bee37c6747a6ae6295107f18204b7c85b81482 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 6 Dec 2023 08:11:00 -0800 Subject: [PATCH 20/40] make producesPiEvent non virtual Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/commands.cpp | 47 +++++++---------------- sycl/source/detail/scheduler/commands.hpp | 17 +++----- 2 files changed, 20 insertions(+), 44 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f57ec11c60b7d..2d8873fc7631f 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -501,19 +501,21 @@ void Command::waitForEvents(QueueImplPtr Queue, Command::Command( CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, - const std::vector &SyncPoints) + const std::vector &SyncPoints, + bool ProducesPiEvent) : MQueue(std::move(Queue)), MEvent(std::make_shared(MQueue)), MPreparedDepsEvents(MEvent->getPreparedDepsEvents()), MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type), - MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) { + MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints), + MProducesPiEvent(ProducesPiEvent) { MWorkerQueue = MQueue; MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); - MEvent->setProducesPiEvent(producesPiEvent()); + MEvent->setProducesPiEvent(MProducesPiEvent); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -738,8 +740,6 @@ const QueueImplPtr &Command::getWorkerQueue() const { return MWorkerQueue; } -bool Command::producesPiEvent() const { return true; } - bool Command::supportsPostEnqueueCleanup() const { return true; } bool Command::readyForCleanup() const { @@ -977,11 +977,11 @@ AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst) - : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd), + : Command(Type, Queue, nullptr, {}, false), + MLinkedAllocaCmd(LinkedAllocaCmd), MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst), MRequirement(std::move(Req)), MReleaseCmd(Queue, this) { MRequirement.MAccessMode = access::mode::read_write; - MEvent->setProducesPiEvent(producesPiEvent()); emitInstrumentationDataProxy(); } @@ -1007,8 +1007,6 @@ void AllocaCommandBase::emitInstrumentationData() { #endif } -bool AllocaCommandBase::producesPiEvent() const { return false; } - bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; } bool AllocaCommandBase::readyForCleanup() const { return false; } @@ -1101,7 +1099,6 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, // is added to this node, so this call must be before // the addDep() call. emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); Command *ConnectionCmd = addDep( DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp); if (ConnectionCmd) @@ -1178,9 +1175,9 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { } ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) - : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { + : Command(CommandType::RELEASE, std::move(Queue), nullptr, {}, false), + MAllocaCmd(AllocaCmd) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void ReleaseCommand::emitInstrumentationData() { @@ -1290,8 +1287,6 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { } } -bool ReleaseCommand::producesPiEvent() const { return false; } - bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; } bool ReleaseCommand::readyForCleanup() const { return false; } @@ -1303,7 +1298,6 @@ MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr), MMapMode(MapMode) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MapMemObject::emitInstrumentationData() { @@ -1367,7 +1361,6 @@ UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void UnMapMemObject::emitInstrumentationData() { @@ -1442,7 +1435,6 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommand::emitInstrumentationData() { @@ -1597,7 +1589,6 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommandHost::emitInstrumentationData() { @@ -1665,9 +1656,8 @@ pi_int32 MemCpyCommandHost::enqueueImp() { } EmptyCommand::EmptyCommand(QueueImplPtr Queue) - : Command(CommandType::EMPTY_TASK, std::move(Queue)) { + : Command(CommandType::EMPTY_TASK, std::move(Queue), nullptr, {}, false) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } pi_int32 EmptyCommand::enqueueImp() { @@ -1741,8 +1731,6 @@ void EmptyCommand::printDot(std::ostream &Stream) const { } } -bool EmptyCommand::producesPiEvent() const { return false; } - void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; @@ -1853,7 +1841,9 @@ ExecCGCommand::ExecCGCommand( sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const std::vector &Dependencies) : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer, - Dependencies), + Dependencies, + !CommandBuffer && + CommandGroup->getType() != CG::CGTYPE::CodeplayHostTask), MCommandGroup(std::move(CommandGroup)) { if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) { MEvent->setSubmittedQueue( @@ -1861,7 +1851,6 @@ ExecCGCommand::ExecCGCommand( } emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -3140,11 +3129,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_ERROR_INVALID_OPERATION; } -bool ExecCGCommand::producesPiEvent() const { - return !MCommandBuffer && - MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask; -} - bool ExecCGCommand::supportsPostEnqueueCleanup() const { // Host tasks are cleaned up upon completion instead. return Command::supportsPostEnqueueCleanup() && @@ -3158,10 +3142,9 @@ bool ExecCGCommand::readyForCleanup() const { } KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue) - : Command(Command::CommandType::FUSION, Queue), + : Command(Command::CommandType::FUSION, Queue, nullptr, {}, false), MStatus(FusionStatus::ACTIVE) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } std::vector &KernelFusionCommand::auxiliaryCommands() { @@ -3176,8 +3159,6 @@ std::vector &KernelFusionCommand::getFusionList() { return MFusionList; } -bool KernelFusionCommand::producesPiEvent() const { return false; } - pi_int32 KernelFusionCommand::enqueueImp() { waitForPreparedHostEvents(); waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef()); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index eb5883905e62a..59b5e7dc192cd 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -118,7 +118,8 @@ class Command { Command(CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, - const std::vector &SyncPoints = {}); + const std::vector &SyncPoints = {}, + bool ProducesPiEvent = true); /// \param NewDep dependency to be added /// \param ToCleanUp container for commands that can be cleaned up. @@ -223,8 +224,8 @@ class Command { /// for memory copy commands. const QueueImplPtr &getWorkerQueue() const; - /// Returns true iff the command produces a PI event on non-host devices. - virtual bool producesPiEvent() const; + /// Returns true if the command produces a PI event on non-host devices. + bool producesPiEvent() const { return MProducesPiEvent; } /// Returns true iff this command can be freed by post enqueue cleanup. virtual bool supportsPostEnqueueCleanup() const; @@ -401,6 +402,8 @@ class Command { sycl::detail::pi::PiExtCommandBuffer MCommandBuffer; /// List of sync points for submissions to a command buffer. std::vector MSyncPointDeps; + + bool MProducesPiEvent; }; /// The empty command does nothing during enqueue. The task can be used to @@ -416,8 +419,6 @@ class EmptyCommand : public Command { void emitInstrumentationData() override; - bool producesPiEvent() const final; - private: pi_int32 enqueueImp() final; @@ -435,7 +436,6 @@ class ReleaseCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; - bool producesPiEvent() const final; bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -462,8 +462,6 @@ class AllocaCommandBase : public Command { void emitInstrumentationData() override; - bool producesPiEvent() const final; - bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -660,8 +658,6 @@ class ExecCGCommand : public Command { // necessary. KernelFusionCommand *MFusionCmd = nullptr; - bool producesPiEvent() const final; - bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -717,7 +713,6 @@ class KernelFusionCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() final; - bool producesPiEvent() const final; std::vector &auxiliaryCommands(); From db0654720609f6946c276741f9d9de6e70f26774 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 04:49:55 -0800 Subject: [PATCH 21/40] Revert "make producesPiEvent non virtual" This reverts commit 93bee37c6747a6ae6295107f18204b7c85b81482. --- sycl/source/detail/scheduler/commands.cpp | 47 ++++++++++++++++------- sycl/source/detail/scheduler/commands.hpp | 17 +++++--- 2 files changed, 44 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 8d2954c793a6e..7b0540ba82ef4 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -500,21 +500,19 @@ void Command::waitForEvents(QueueImplPtr Queue, Command::Command( CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer, - const std::vector &SyncPoints, - bool ProducesPiEvent) + const std::vector &SyncPoints) : MQueue(std::move(Queue)), MEvent(std::make_shared(MQueue)), MPreparedDepsEvents(MEvent->getPreparedDepsEvents()), MPreparedHostDepsEvents(MEvent->getPreparedHostDepsEvents()), MType(Type), - MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints), - MProducesPiEvent(ProducesPiEvent) { + MCommandBuffer(CommandBuffer), MSyncPointDeps(SyncPoints) { MWorkerQueue = MQueue; MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); - MEvent->setProducesPiEvent(MProducesPiEvent); + MEvent->setProducesPiEvent(producesPiEvent()); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -739,6 +737,8 @@ const QueueImplPtr &Command::getWorkerQueue() const { return MWorkerQueue; } +bool Command::producesPiEvent() const { return true; } + bool Command::supportsPostEnqueueCleanup() const { return true; } bool Command::readyForCleanup() const { @@ -972,11 +972,11 @@ AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, AllocaCommandBase *LinkedAllocaCmd, bool IsConst) - : Command(Type, Queue, nullptr, {}, false), - MLinkedAllocaCmd(LinkedAllocaCmd), + : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd), MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst), MRequirement(std::move(Req)), MReleaseCmd(Queue, this) { MRequirement.MAccessMode = access::mode::read_write; + MEvent->setProducesPiEvent(producesPiEvent()); emitInstrumentationDataProxy(); } @@ -1002,6 +1002,8 @@ void AllocaCommandBase::emitInstrumentationData() { #endif } +bool AllocaCommandBase::producesPiEvent() const { return false; } + bool AllocaCommandBase::supportsPostEnqueueCleanup() const { return false; } bool AllocaCommandBase::readyForCleanup() const { return false; } @@ -1094,6 +1096,7 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, // is added to this node, so this call must be before // the addDep() call. emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); Command *ConnectionCmd = addDep( DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp); if (ConnectionCmd) @@ -1170,9 +1173,9 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { } ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) - : Command(CommandType::RELEASE, std::move(Queue), nullptr, {}, false), - MAllocaCmd(AllocaCmd) { + : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void ReleaseCommand::emitInstrumentationData() { @@ -1282,6 +1285,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { } } +bool ReleaseCommand::producesPiEvent() const { return false; } + bool ReleaseCommand::supportsPostEnqueueCleanup() const { return false; } bool ReleaseCommand::readyForCleanup() const { return false; } @@ -1293,6 +1298,7 @@ MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr), MMapMode(MapMode) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MapMemObject::emitInstrumentationData() { @@ -1356,6 +1362,7 @@ UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void UnMapMemObject::emitInstrumentationData() { @@ -1430,6 +1437,7 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommand::emitInstrumentationData() { @@ -1584,6 +1592,7 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommandHost::emitInstrumentationData() { @@ -1651,8 +1660,9 @@ pi_int32 MemCpyCommandHost::enqueueImp() { } EmptyCommand::EmptyCommand(QueueImplPtr Queue) - : Command(CommandType::EMPTY_TASK, std::move(Queue), nullptr, {}, false) { + : Command(CommandType::EMPTY_TASK, std::move(Queue)) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } pi_int32 EmptyCommand::enqueueImp() { @@ -1726,6 +1736,8 @@ void EmptyCommand::printDot(std::ostream &Stream) const { } } +bool EmptyCommand::producesPiEvent() const { return false; } + void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; @@ -1836,9 +1848,7 @@ ExecCGCommand::ExecCGCommand( sycl::detail::pi::PiExtCommandBuffer CommandBuffer, const std::vector &Dependencies) : Command(CommandType::RUN_CG, std::move(Queue), CommandBuffer, - Dependencies, - !CommandBuffer && - CommandGroup->getType() != CG::CGTYPE::CodeplayHostTask), + Dependencies), MCommandGroup(std::move(CommandGroup)) { if (MCommandGroup->getType() == detail::CG::CodeplayHostTask) { MEvent->setSubmittedQueue( @@ -1846,6 +1856,7 @@ ExecCGCommand::ExecCGCommand( } emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -3125,6 +3136,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_ERROR_INVALID_OPERATION; } +bool ExecCGCommand::producesPiEvent() const { + return !MCommandBuffer && + MCommandGroup->getType() != CG::CGTYPE::CodeplayHostTask; +} + bool ExecCGCommand::supportsPostEnqueueCleanup() const { // Host tasks are cleaned up upon completion instead. return Command::supportsPostEnqueueCleanup() && @@ -3138,9 +3154,10 @@ bool ExecCGCommand::readyForCleanup() const { } KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue) - : Command(Command::CommandType::FUSION, Queue, nullptr, {}, false), + : Command(Command::CommandType::FUSION, Queue), MStatus(FusionStatus::ACTIVE) { emitInstrumentationDataProxy(); + MEvent->setProducesPiEvent(producesPiEvent()); } std::vector &KernelFusionCommand::auxiliaryCommands() { @@ -3155,6 +3172,8 @@ std::vector &KernelFusionCommand::getFusionList() { return MFusionList; } +bool KernelFusionCommand::producesPiEvent() const { return false; } + pi_int32 KernelFusionCommand::enqueueImp() { waitForPreparedHostEvents(); waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef()); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index b404687081ce8..b8e839bd0bee9 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -118,8 +118,7 @@ class Command { Command(CommandType Type, QueueImplPtr Queue, sycl::detail::pi::PiExtCommandBuffer CommandBuffer = nullptr, - const std::vector &SyncPoints = {}, - bool ProducesPiEvent = true); + const std::vector &SyncPoints = {}); /// \param NewDep dependency to be added /// \param ToCleanUp container for commands that can be cleaned up. @@ -224,8 +223,8 @@ class Command { /// for memory copy commands. const QueueImplPtr &getWorkerQueue() const; - /// Returns true if the command produces a PI event on non-host devices. - bool producesPiEvent() const { return MProducesPiEvent; } + /// Returns true iff the command produces a PI event on non-host devices. + virtual bool producesPiEvent() const; /// Returns true iff this command can be freed by post enqueue cleanup. virtual bool supportsPostEnqueueCleanup() const; @@ -402,8 +401,6 @@ class Command { sycl::detail::pi::PiExtCommandBuffer MCommandBuffer; /// List of sync points for submissions to a command buffer. std::vector MSyncPointDeps; - - bool MProducesPiEvent; }; /// The empty command does nothing during enqueue. The task can be used to @@ -419,6 +416,8 @@ class EmptyCommand : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + private: pi_int32 enqueueImp() final; @@ -436,6 +435,7 @@ class ReleaseCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() override; + bool producesPiEvent() const final; bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -462,6 +462,8 @@ class AllocaCommandBase : public Command { void emitInstrumentationData() override; + bool producesPiEvent() const final; + bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -658,6 +660,8 @@ class ExecCGCommand : public Command { // necessary. KernelFusionCommand *MFusionCmd = nullptr; + bool producesPiEvent() const final; + bool supportsPostEnqueueCleanup() const final; bool readyForCleanup() const final; @@ -713,6 +717,7 @@ class KernelFusionCommand : public Command { void printDot(std::ostream &Stream) const final; void emitInstrumentationData() final; + bool producesPiEvent() const final; std::vector &auxiliaryCommands(); From d9bb40a5f0420cf53fe69d9d3d1c6c6ffcdcc574 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 04:51:55 -0800 Subject: [PATCH 22/40] Revert "Merge branch 'remove_L0_WA' into inorder_enqueue_issues" This reverts commit 17793309d110fd7529439b299d237fd9dd4590e7, reversing changes made to bfcd9b63884c3316dfff8ed3c55835039a28ddd4. --- sycl/source/detail/scheduler/commands.cpp | 43 +++++++++++++++++++++++ sycl/source/detail/scheduler/commands.hpp | 2 ++ 2 files changed, 45 insertions(+) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 7b0540ba82ef4..9073669e0b82c 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1389,6 +1389,27 @@ void UnMapMemObject::emitInstrumentationData() { #endif } +bool UnMapMemObject::producesPiEvent() const { + // TODO remove this workaround once the batching issue is addressed in Level + // Zero plugin. + // Consider the following scenario on Level Zero: + // 1. Kernel A, which uses buffer A, is submitted to queue A. + // 2. Kernel B, which uses buffer B, is submitted to queue B. + // 3. queueA.wait(). + // 4. queueB.wait(). + // DPCPP runtime used to treat unmap/write commands for buffer A/B as host + // dependencies (i.e. they were waited for prior to enqueueing any command + // that's dependent on them). This allowed Level Zero plugin to detect that + // each queue is idle on steps 1/2 and submit the command list right away. + // This is no longer the case since we started passing these dependencies in + // an event waitlist and Level Zero plugin attempts to batch these commands, + // so the execution of kernel B starts only on step 4. This workaround + // restores the old behavior in this case until this is resolved. + return MQueue->getDeviceImplPtr()->getBackend() != + backend::ext_oneapi_level_zero || + MEvent->getHandleRef() != nullptr; +} + pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; @@ -1476,6 +1497,28 @@ const ContextImplPtr &MemCpyCommand::getWorkerContext() const { return getWorkerQueue()->getContextImplPtr(); } +bool MemCpyCommand::producesPiEvent() const { + // TODO remove this workaround once the batching issue is addressed in Level + // Zero plugin. + // Consider the following scenario on Level Zero: + // 1. Kernel A, which uses buffer A, is submitted to queue A. + // 2. Kernel B, which uses buffer B, is submitted to queue B. + // 3. queueA.wait(). + // 4. queueB.wait(). + // DPCPP runtime used to treat unmap/write commands for buffer A/B as host + // dependencies (i.e. they were waited for prior to enqueueing any command + // that's dependent on them). This allowed Level Zero plugin to detect that + // each queue is idle on steps 1/2 and submit the command list right away. + // This is no longer the case since we started passing these dependencies in + // an event waitlist and Level Zero plugin attempts to batch these commands, + // so the execution of kernel B starts only on step 4. This workaround + // restores the old behavior in this case until this is resolved. + return MQueue->is_host() || + MQueue->getDeviceImplPtr()->getBackend() != + backend::ext_oneapi_level_zero || + MEvent->getHandleRef() != nullptr; +} + pi_int32 MemCpyCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index b8e839bd0bee9..b8983be03d808 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -558,6 +558,7 @@ class UnMapMemObject : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() override; + bool producesPiEvent() const final; private: pi_int32 enqueueImp() final; @@ -579,6 +580,7 @@ class MemCpyCommand : public Command { const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; const ContextImplPtr &getWorkerContext() const final; + bool producesPiEvent() const final; private: pi_int32 enqueueImp() final; From 8abee620eadb5fdfd132b34e4abcb3dad2e4ccd9 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 07:39:55 -0800 Subject: [PATCH 23/40] Check event handle instead of using producesPiEvent --- sycl/source/detail/event_impl.hpp | 5 --- sycl/source/detail/queue_impl.cpp | 17 +++++----- sycl/source/detail/scheduler/commands.cpp | 11 ------- sycl/unittests/scheduler/Commands.cpp | 38 ----------------------- 4 files changed, 8 insertions(+), 63 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index b37952cd742d3..067218f5a8459 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -290,9 +290,6 @@ class event_impl { return MEventFromSubmitedExecCommandBuffer; } - void setProducesPiEvent(bool Value) { MProducesPiEvent = Value; } - bool producesPiEvent() const { return MProducesPiEvent; } - protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -351,8 +348,6 @@ class event_impl { // stored here. sycl::detail::pi::PiExtSyncPoint MSyncPoint; - bool MProducesPiEvent{false}; - friend std::vector getOrWaitEvents(std::vector DepEvents, std::shared_ptr Context); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2cb364510ec2c..dd17493b6f8b3 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -61,15 +61,13 @@ static bool isEventsReady(const std::vector &DepEvents, if (SyclEventImplPtr->is_host()) { return SyclEventImplPtr->isCompleted(); } - // The fusion command and its event are associated with a non-host - // context, but still do not produce a PI event. - if (SyclEventImplPtr->getContextImpl() != Context || - !SyclEventImplPtr->producesPiEvent()) + // Cross-context dependencies can't be passed directly. + if (SyclEventImplPtr->getContextImpl() != Context) return false; - // In this path nullptr native event means that the command has not - // been enqueued. It may happen if async enqueue in a host task is - // involved. + // A nullptr here means that the commmand does not produce a PI event or it + // hasn't been enqueued yet. Either way, this dependency needs to be handled + // by the scheduler. return SyclEventImplPtr->getHandleRef() != nullptr; }; @@ -430,8 +428,9 @@ event queue_impl::memcpyFromDeviceGlobal( } event queue_impl::getLastEvent() const { - std::lock_guard Lock{MLastEventMtx}; - return MDiscardEvents ? createDiscardedEvent() : MLastEvent; + std::lock_guard Lock{MMutex}; + return MDiscardEvents ? createDiscardedEvent() + : detail::createSyclObjFromImpl(MLastEventPtr); } void queue_impl::addEvent(const event &Event) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9073669e0b82c..cbeb4e7569fa5 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -512,7 +512,6 @@ Command::Command( MEvent->setCommand(this); MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); - MEvent->setProducesPiEvent(producesPiEvent()); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -976,7 +975,6 @@ AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MIsConst(IsConst), MRequirement(std::move(Req)), MReleaseCmd(Queue, this) { MRequirement.MAccessMode = access::mode::read_write; - MEvent->setProducesPiEvent(producesPiEvent()); emitInstrumentationDataProxy(); } @@ -1096,7 +1094,6 @@ AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, // is added to this node, so this call must be before // the addDep() call. emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); Command *ConnectionCmd = addDep( DepDesc(MParentAlloca, getRequirement(), MParentAlloca), ToCleanUp); if (ConnectionCmd) @@ -1175,7 +1172,6 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void ReleaseCommand::emitInstrumentationData() { @@ -1298,7 +1294,6 @@ MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr), MMapMode(MapMode) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MapMemObject::emitInstrumentationData() { @@ -1362,7 +1357,6 @@ UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void UnMapMemObject::emitInstrumentationData() { @@ -1458,7 +1452,6 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommand::emitInstrumentationData() { @@ -1635,7 +1628,6 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } void MemCpyCommandHost::emitInstrumentationData() { @@ -1705,7 +1697,6 @@ pi_int32 MemCpyCommandHost::enqueueImp() { EmptyCommand::EmptyCommand(QueueImplPtr Queue) : Command(CommandType::EMPTY_TASK, std::move(Queue)) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } pi_int32 EmptyCommand::enqueueImp() { @@ -1899,7 +1890,6 @@ ExecCGCommand::ExecCGCommand( } emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -3200,7 +3190,6 @@ KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue) : Command(Command::CommandType::FUSION, Queue), MStatus(FusionStatus::ACTIVE) { emitInstrumentationDataProxy(); - MEvent->setProducesPiEvent(producesPiEvent()); } std::vector &KernelFusionCommand::auxiliaryCommands() { diff --git a/sycl/unittests/scheduler/Commands.cpp b/sycl/unittests/scheduler/Commands.cpp index 6221203bc04dc..a995800643421 100644 --- a/sycl/unittests/scheduler/Commands.cpp +++ b/sycl/unittests/scheduler/Commands.cpp @@ -8,7 +8,6 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" -#include #include #include @@ -85,40 +84,3 @@ TEST_F(SchedulerTest, WaitEmptyEventWithBarrier) { MS.Scheduler::addCG(std::move(CommandGroup), QueueImpl); } } - -TEST_F(SchedulerTest, CommandsPiEventExpectation) { - sycl::unittest::PiMock Mock; - sycl::platform Plt = Mock.getPlatform(); - context Ctx{Plt}; - queue Queue{Ctx, default_selector_v}; - detail::QueueImplPtr QueueImpl = detail::getSyclObjImpl(Queue); - MockScheduler MS; - - buffer Buf{range<1>(1)}; - std::shared_ptr BufImpl = detail::getSyclObjImpl(Buf); - detail::Requirement MockReq = getMockRequirement(Buf); - MockReq.MDims = 1; - MockReq.MSYCLMemObj = BufImpl.get(); - - std::vector AuxCmds; - detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds); - detail::AllocaCommandBase *AllocaCmd = - MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds); - EXPECT_EQ(AllocaCmd->producesPiEvent(), - AllocaCmd->getEvent()->producesPiEvent()); - EXPECT_EQ(AllocaCmd->producesPiEvent(), false); - - std::unique_ptr CG{ - new detail::CGFill(/*Pattern*/ {}, &MockReq, - detail::CG::StorageInitHelper( - /*ArgsStorage*/ {}, - /*AccStorage*/ {}, - /*SharedPtrStorage*/ {}, - /*Requirements*/ {&MockReq}, - /*Events*/ {}))}; - detail::EventImplPtr Event = MS.addCG(std::move(CG), QueueImpl); - auto *Cmd = static_cast(Event->getCommand()); - EXPECT_EQ(Cmd->producesPiEvent(), Event->producesPiEvent()); - EXPECT_EQ(Cmd->producesPiEvent(), true); -} \ No newline at end of file From 696861b5c231c776cf416926440e37e20f582280 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 07:49:16 -0800 Subject: [PATCH 24/40] Revert unrelated changes --- sycl/source/detail/scheduler/commands.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index cbeb4e7569fa5..09e08ded2df49 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1757,7 +1757,8 @@ void EmptyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EMPTY NODE" << "\\n"; + Stream << "EMPTY NODE" + << "\\n"; Stream << "\"];" << std::endl; From ddf8b9fa771cdf60235a74938680f2617aabad5a Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 07:59:09 -0800 Subject: [PATCH 25/40] Revert barrier changes --- sycl/source/detail/scheduler/commands.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 09e08ded2df49..db14a10943ce3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3062,18 +3062,16 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = getPiEventsBlocking(Events); - if (MQueue->getDeviceImplPtr()->is_host()) { + if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) { // NOP for host device. + // If Events is empty, then the barrier has no effect. return PI_SUCCESS; } const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); - // This should not be skipped even for in order queue, we need a proper - // event to wait for. Plugin->call( - MQueue->getHandleRef(), PiEvents.size(), - PiEvents.empty() ? nullptr : &PiEvents[0], Event); + MQueue->getHandleRef(), PiEvents.size(), &PiEvents[0], Event); return PI_SUCCESS; } From 9f2c79d3b414eef7a4055aaaafc7c64de8ae1603 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 4 Jan 2024 10:02:25 -0800 Subject: [PATCH 26/40] Apply comments --- sycl/source/detail/queue_impl.cpp | 62 +++++++++++++++---------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index dd17493b6f8b3..f6bb4865f3915 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -39,20 +39,17 @@ getPIEvents(const std::vector &DepEvents, }; if (ExtraDepEvent) AddEvent(ExtraDepEvent); - for_each(DepEvents.begin(), DepEvents.end(), - [&RetPiEvents, &AddEvent](const sycl::event &Event) { - auto EventImpl = detail::getSyclObjImpl(Event); - return AddEvent(EventImpl); - }); + for (const sycl::event &Event : DepEvents) + AddEvent(detail::getSyclObjImpl(Event)); return RetPiEvents; } -static bool isEventsReady(const std::vector &DepEvents, - const EventImplPtr &ExtraDepEventPtr, - ContextImplPtr Context) { +static bool canBypassScheduler(const std::vector &DepEvents, + const EventImplPtr &ExtraDepEventPtr, + ContextImplPtr Context) { auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { - // throwaway events created with empty constructor will not have a - // context (which is set lazily) calling getContextImpl() would set that + // Throwaway events created with empty constructor will not have a + // context (it is set lazily). Calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. if (!SyclEventImplPtr->isContextInitialized() && !SyclEventImplPtr->is_host()) { @@ -61,13 +58,12 @@ static bool isEventsReady(const std::vector &DepEvents, if (SyclEventImplPtr->is_host()) { return SyclEventImplPtr->isCompleted(); } - // Cross-context dependencies can't be passed directly. + // Cross-context dependencies can't be passed to the backend directly. if (SyclEventImplPtr->getContextImpl() != Context) return false; // A nullptr here means that the commmand does not produce a PI event or it - // hasn't been enqueued yet. Either way, this dependency needs to be handled - // by the scheduler. + // hasn't been enqueued yet. return SyclEventImplPtr->getHandleRef() != nullptr; }; @@ -151,7 +147,7 @@ event queue_impl::memset(const std::shared_ptr &Self, guard.lock(); ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::fill_usm(Ptr, Self, Count, Value, getPIEvents(DepEvents, ExtraEventToWait), @@ -248,7 +244,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, guard.lock(); ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_usm(Src, Self, Count, Dest, getPIEvents(DepEvents, ExtraEventToWait), @@ -294,7 +290,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, guard.lock(); ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::advise_usm(Ptr, Self, Length, Advice, getPIEvents(DepEvents, ExtraEventToWait), @@ -342,7 +338,7 @@ event queue_impl::memcpyToDeviceGlobal( guard.lock(); ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_to_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, @@ -390,7 +386,7 @@ event queue_impl::memcpyFromDeviceGlobal( guard.lock(); ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; } - if (isEventsReady(DepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(DepEvents, ExtraEventToWait, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_from_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, @@ -438,9 +434,9 @@ void queue_impl::addEvent(const event &Event) { assert(EImpl && "Event implementation is missing"); auto *Cmd = static_cast(EImpl->getCommand()); if (!Cmd) { - // if there is no command on the event, we cannot track it with - // MEventsWeak as that will leave it with no owner. Track in MEventsShared - // only if we're unable to call piQueueFinish during wait. + // if there is no command on the event, we cannot track it with MEventsWeak + // as that will leave it with no owner. Track in MEventsShared only if we're + // unable to call piQueueFinish during wait. if (is_host() || MEmulateOOO) addSharedEvent(Event); } @@ -462,18 +458,18 @@ void queue_impl::addSharedEvent(const event &Event) { // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not // make, and ~queue_impl(). If the number of events grows large enough, - // there's a good chance that most of them are already completed and - // ownership of them can be released. + // there's a good chance that most of them are already completed and ownership + // of them can be released. const size_t EventThreshold = 128; if (MEventsShared.size() >= EventThreshold) { // Generally, the vector is ordered so that the oldest events are in the - // front and the newer events are in the end. So, search to find the - // first event that isn't yet complete. All the events prior to that can - // be erased. This could leave some few events further on that have - // completed not yet erased, but that is OK. This cleanup doesn't have to - // be perfect. This also keeps the algorithm linear rather than quadratic - // because it doesn't continually recheck things towards the back of the - // list that really haven't had time to complete. + // front and the newer events are in the end. So, search to find the first + // event that isn't yet complete. All the events prior to that can be + // erased. This could leave some few events further on that have completed + // not yet erased, but that is OK. This cleanup doesn't have to be perfect. + // This also keeps the algorithm linear rather than quadratic because it + // doesn't continually recheck things towards the back of the list that + // really haven't had time to complete. MEventsShared.erase( MEventsShared.begin(), std::find_if( @@ -598,9 +594,9 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { } // If the queue is either a host one or does not support OOO (and we use // multiple in-order queues as a result of that), wait for each event - // directly. Otherwise, only wait for unenqueued or host task events, - // starting from the latest submitted task in order to minimize total amount - // of calls, then handle the rest with piQueueFinish. + // directly. Otherwise, only wait for unenqueued or host task events, starting + // from the latest submitted task in order to minimize total amount of calls, + // then handle the rest with piQueueFinish. const bool SupportsPiFinish = !is_host() && !MEmulateOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { From 3214f35faf33f8672f43a3a57b0d853f86aebc95 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 8 Jan 2024 09:24:42 -0800 Subject: [PATCH 27/40] Create the last event if it doesn't exist --- sycl/source/detail/queue_impl.cpp | 9 ++++++--- sycl/source/detail/queue_impl.hpp | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f6bb4865f3915..c9284cb30e787 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -423,10 +423,13 @@ event queue_impl::memcpyFromDeviceGlobal( Self, {}); } -event queue_impl::getLastEvent() const { +event queue_impl::getLastEvent() { std::lock_guard Lock{MMutex}; - return MDiscardEvents ? createDiscardedEvent() - : detail::createSyclObjFromImpl(MLastEventPtr); + if (MDiscardEvents) + return createDiscardedEvent(); + if (!MLastEventPtr) + MLastEventPtr = std::make_shared(std::nullopt); + return detail::createSyclObjFromImpl(MLastEventPtr); } void queue_impl::addEvent(const event &Event) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 6fce5582b2803..fcb5b413728f5 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -211,7 +211,7 @@ class queue_impl { #endif } - event getLastEvent() const; + event getLastEvent(); private: void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) { From 0ba5561f8d3d8bb1457aa8045e69dd07b988917c Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 9 Jan 2024 09:47:14 -0800 Subject: [PATCH 28/40] Account for separate tracking of graph events in getLastEvent --- sycl/source/detail/queue_impl.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c9284cb30e787..198c4e354ff90 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -427,9 +427,10 @@ event queue_impl::getLastEvent() { std::lock_guard Lock{MMutex}; if (MDiscardEvents) return createDiscardedEvent(); - if (!MLastEventPtr) - MLastEventPtr = std::make_shared(std::nullopt); - return detail::createSyclObjFromImpl(MLastEventPtr); + EventImplPtr &LastEvent = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; + if (!LastEvent) + LastEvent = std::make_shared(std::nullopt); + return detail::createSyclObjFromImpl(LastEvent); } void queue_impl::addEvent(const event &Event) { From 1abe98cb2cedf7b35556bb4036030eb214ecb79a Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 9 Jan 2024 09:57:14 -0800 Subject: [PATCH 29/40] Appease clang-format --- sycl/CMakeLists.txt | 1 + sycl/source/detail/queue_impl.cpp | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index f6d5d7db6c487..563e9a2b2f2b1 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -12,6 +12,7 @@ option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" O option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) +add_definitions(-g -O0) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") endif() diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 198c4e354ff90..fc622423c7dbc 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -427,7 +427,8 @@ event queue_impl::getLastEvent() { std::lock_guard Lock{MMutex}; if (MDiscardEvents) return createDiscardedEvent(); - EventImplPtr &LastEvent = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; + EventImplPtr &LastEvent = + MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; if (!LastEvent) LastEvent = std::make_shared(std::nullopt); return detail::createSyclObjFromImpl(LastEvent); From 5588795bd6e40eef77c6b9ba78263a2892815eed Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 9 Jan 2024 10:25:42 -0800 Subject: [PATCH 30/40] Remove accidental edit --- sycl/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 563e9a2b2f2b1..f6d5d7db6c487 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -12,7 +12,6 @@ option(SYCL_ENABLE_COVERAGE "Enables code coverage for runtime and unit tests" O option(SYCL_ENABLE_STACK_PRINTING "Enables stack printing on crashes of SYCL applications" OFF) option(SYCL_LIB_WITH_DEBUG_SYMBOLS "Builds SYCL runtime libraries with debug symbols" OFF) -add_definitions(-g -O0) if (NOT SYCL_COVERAGE_PATH) set(SYCL_COVERAGE_PATH "${CMAKE_CURRENT_BINARY_DIR}/profiles") endif() From d32f9029564b3710d587ffd9db6343a98f45a56a Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 9 Jan 2024 11:01:04 -0800 Subject: [PATCH 31/40] Reuse last non-graph event if a graph one doesn't exist --- sycl/source/detail/queue_impl.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index fc622423c7dbc..606ce872a929e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -427,11 +427,11 @@ event queue_impl::getLastEvent() { std::lock_guard Lock{MMutex}; if (MDiscardEvents) return createDiscardedEvent(); - EventImplPtr &LastEvent = - MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - if (!LastEvent) - LastEvent = std::make_shared(std::nullopt); - return detail::createSyclObjFromImpl(LastEvent); + if (!MGraph.expired() && MGraphLastEventPtr) + return detail::createSyclObjFromImpl(MGraphLastEventPtr); + if (!MLastEventPtr) + MLastEventPtr = std::make_shared(std::nullopt); + return detail::createSyclObjFromImpl(MLastEventPtr); } void queue_impl::addEvent(const event &Event) { From 3196bfc10fefdb26c9b8c3b56071da836b1969f8 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 17 Jan 2024 09:46:06 -0800 Subject: [PATCH 32/40] Reuse getExtendDependencyList for the extra event --- sycl/source/detail/queue_impl.cpp | 128 ++++++++++++------------------ sycl/source/detail/queue_impl.hpp | 3 +- 2 files changed, 53 insertions(+), 78 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b97fb52ecf8f7..5729cc7efcadb 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -29,23 +29,17 @@ namespace detail { std::atomic queue_impl::MNextAvailableQueueID = 0; static std::vector -getPIEvents(const std::vector &DepEvents, - const EventImplPtr &ExtraDepEvent) { +getPIEvents(const std::vector &DepEvents) { std::vector RetPiEvents; - auto AddEvent = [&RetPiEvents](const EventImplPtr &EventImpl) { - if (EventImpl->getHandleRef() == nullptr) - return; - RetPiEvents.push_back(EventImpl->getHandleRef()); - }; - if (ExtraDepEvent) - AddEvent(ExtraDepEvent); - for (const sycl::event &Event : DepEvents) - AddEvent(detail::getSyclObjImpl(Event)); + for (const sycl::event &Event : DepEvents) { + const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event); + if (EventImpl->getHandleRef() != nullptr) + RetPiEvents.push_back(EventImpl->getHandleRef()); + } return RetPiEvents; } static bool canBypassScheduler(const std::vector &DepEvents, - const EventImplPtr &ExtraDepEventPtr, ContextImplPtr Context) { auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { // Throwaway events created with empty constructor will not have a @@ -67,8 +61,7 @@ static bool canBypassScheduler(const std::vector &DepEvents, return SyclEventImplPtr->getHandleRef() != nullptr; }; - return (!ExtraDepEventPtr || CheckEvent(ExtraDepEventPtr)) && - std::all_of(DepEvents.begin(), DepEvents.end(), + return std::all_of(DepEvents.begin(), DepEvents.end(), [&Context, &CheckEvent](const sycl::event &Event) { auto SyclEventImplPtr = detail::getSyclObjImpl(Event); return CheckEvent(SyclEventImplPtr); @@ -109,16 +102,25 @@ static event createDiscardedEvent() { const std::vector & queue_impl::getExtendDependencyList(const std::vector &DepEvents, - std::vector &MutableVec) { - if (isInOrder()) { - std::optional ExternalEvent = popExternalEvent(); - if (ExternalEvent) { - MutableVec = DepEvents; - MutableVec.push_back(*ExternalEvent); - return MutableVec; - } - } - return DepEvents; + std::vector &MutableVec, + std::unique_lock &QueueLock) { + if (!isInOrder()) + return DepEvents; + + QueueLock.lock(); + EventImplPtr ExtraEvent = + MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; + std::optional ExternalEvent = popExternalEvent(); + + if (!ExternalEvent && !ExtraEvent) + return DepEvents; + + MutableVec = DepEvents; + if (ExternalEvent) + MutableVec.push_back(*ExternalEvent); + if (ExtraEvent) + MutableVec.push_back(detail::createSyclObjFromImpl(ExtraEvent)); + return MutableVec; } event queue_impl::memset(const std::shared_ptr &Self, @@ -155,29 +157,23 @@ event queue_impl::memset(const std::shared_ptr &Self, // We need to submit command and update the last event under same lock if we // have in-order queue. { - std::unique_lock guard(MMutex, std::defer_lock); - EventImplPtr ExtraEventToWait = nullptr; + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (isInOrder()) { - guard.lock(); - ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - } - if (canBypassScheduler(ExpandedDepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), - nullptr); + getPIEvents(ExpandedDepEvents), nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), + getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); @@ -257,29 +253,23 @@ event queue_impl::memcpy(const std::shared_ptr &Self, } { - std::unique_lock guard(MMutex, std::defer_lock); - EventImplPtr ExtraEventToWait = nullptr; + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (isInOrder()) { - guard.lock(); - ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - } - if (canBypassScheduler(ExpandedDepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), - nullptr); + getPIEvents(ExpandedDepEvents), nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), + getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); @@ -308,29 +298,23 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, pi_mem_advice Advice, const std::vector &DepEvents) { { - std::unique_lock guard(MMutex, std::defer_lock); - EventImplPtr ExtraEventToWait = nullptr; + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (isInOrder()) { - guard.lock(); - ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - } - if (canBypassScheduler(ExpandedDepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), - nullptr); + getPIEvents(ExpandedDepEvents), nullptr); return createDiscardedEvent(); } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), + getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) { return MDiscardEvents ? createDiscardedEvent() : event(); @@ -361,22 +345,17 @@ event queue_impl::memcpyToDeviceGlobal( const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock guard(MMutex, std::defer_lock); - EventImplPtr ExtraEventToWait = nullptr; + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (isInOrder()) { - guard.lock(); - ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - } - if (canBypassScheduler(ExpandedDepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_to_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), nullptr); + getPIEvents(ExpandedDepEvents), nullptr); return createDiscardedEvent(); } @@ -384,7 +363,7 @@ event queue_impl::memcpyToDeviceGlobal( auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::copy_to_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), &EventImpl->getHandleRef(), + getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); @@ -414,22 +393,17 @@ event queue_impl::memcpyFromDeviceGlobal( const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { { - std::unique_lock guard(MMutex, std::defer_lock); - EventImplPtr ExtraEventToWait = nullptr; + std::unique_lock Lock(MMutex, std::defer_lock); std::vector MutableDepEvents; const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents); + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (isInOrder()) { - guard.lock(); - ExtraEventToWait = MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr; - } - if (canBypassScheduler(ExpandedDepEvents, ExtraEventToWait, MContext)) { + if (canBypassScheduler(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_from_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), nullptr); + getPIEvents(ExpandedDepEvents), nullptr); return createDiscardedEvent(); } @@ -437,7 +411,7 @@ event queue_impl::memcpyFromDeviceGlobal( auto EventImpl = detail::getSyclObjImpl(ResEvent); MemoryManager::copy_from_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getPIEvents(ExpandedDepEvents, ExtraEventToWait), &EventImpl->getHandleRef(), + getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); if (MContext->is_host()) return MDiscardEvents ? createDiscardedEvent() : event(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 21d1b2b0ca7be..9a2729c638541 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -734,7 +734,8 @@ class queue_impl { const std::vector & getExtendDependencyList(const std::vector &DepEvents, - std::vector &MutableVec); + std::vector &MutableVec, + std::unique_lock &QueueLock); protected: event discard_or_return(const event &Event); From 0f79f7649db45064b2c5f856c2421ff925239f98 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 17 Jan 2024 10:13:07 -0800 Subject: [PATCH 33/40] Rename a function to account for other cases where scheduler can't be bypassed --- sycl/source/detail/queue_impl.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 12e5024978589..47e467aa2634d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -39,8 +39,9 @@ getPIEvents(const std::vector &DepEvents) { return RetPiEvents; } -static bool canBypassScheduler(const std::vector &DepEvents, - ContextImplPtr Context) { +static bool +checkEventsForSchedulerBypass(const std::vector &DepEvents, + ContextImplPtr Context) { auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { // Throwaway events created with empty constructor will not have a // context (it is set lazily). Calling getContextImpl() would set that @@ -163,7 +164,7 @@ event queue_impl::memset(const std::shared_ptr &Self, const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (canBypassScheduler(ExpandedDepEvents, MContext)) { + if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::fill_usm(Ptr, Self, Count, Value, getPIEvents(ExpandedDepEvents), nullptr); @@ -259,7 +260,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (canBypassScheduler(ExpandedDepEvents, MContext)) { + if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_usm(Src, Self, Count, Dest, getPIEvents(ExpandedDepEvents), nullptr); @@ -306,7 +307,8 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, // If we have a command graph set we need to capture the advise through normal // queue submission. - if (!MGraph.lock() && canBypassScheduler(ExpandedDepEvents, MContext)) { + if (!MGraph.lock() && + checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::advise_usm(Ptr, Self, Length, Advice, getPIEvents(ExpandedDepEvents), nullptr); @@ -353,7 +355,7 @@ event queue_impl::memcpyToDeviceGlobal( const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (canBypassScheduler(ExpandedDepEvents, MContext)) { + if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_to_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, @@ -401,7 +403,7 @@ event queue_impl::memcpyFromDeviceGlobal( const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (canBypassScheduler(ExpandedDepEvents, MContext)) { + if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::copy_from_device_global( DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, From 2063434f0d264e6de3525924a44efe7c942e454c Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 17 Jan 2024 10:19:19 -0800 Subject: [PATCH 34/40] Reduce code duplication for graph-related submissions --- sycl/source/detail/queue_impl.cpp | 39 +++++++++++++++++-------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 47e467aa2634d..4341df0a1cb97 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -239,14 +239,17 @@ event queue_impl::memcpy(const std::shared_ptr &Self, #endif // If we have a command graph set we need to capture the copy through normal // queue submission rather than execute the copy directly. - if (MGraph.lock()) { + auto submitWithScheduler = [&]() { return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); CGH.memcpy(Dest, Src, Count); }, Self, {}); - } + }; + if (MGraph.lock()) + return submitWithScheduler(); + if ((!Src || !Dest) && Count != 0) { report(CodeLoc); throw runtime_error("NULL pointer argument in memory copy operation.", @@ -286,18 +289,26 @@ event queue_impl::memcpy(const std::shared_ptr &Self, } } - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.memcpy(Dest, Src, Count); - }, - Self, {}); + return submitWithScheduler(); } event queue_impl::mem_advise(const std::shared_ptr &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector &DepEvents) { + // If we have a command graph set we need to capture the advise through normal + // queue submission. + auto submitWithScheduler = [&]() { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.mem_advise(Ptr, Length, Advice); + }, + Self, {}); + }; + if (MGraph.lock()) + return submitWithScheduler(); + { std::unique_lock Lock(MMutex, std::defer_lock); @@ -305,10 +316,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - // If we have a command graph set we need to capture the advise through normal - // queue submission. - if (!MGraph.lock() && - checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { + if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemoryManager::advise_usm(Ptr, Self, Length, Advice, getPIEvents(ExpandedDepEvents), nullptr); @@ -336,12 +344,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, } } - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.mem_advise(Ptr, Length, Advice); - }, - Self, {}); + return submitWithScheduler(); } event queue_impl::memcpyToDeviceGlobal( From 7f21bb76f1d3b91370a5bce10b63e1b624b5019d Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 18 Jan 2024 06:35:32 -0800 Subject: [PATCH 35/40] Adjust the failing test to new behavior --- sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp index d46fe6c80cc33..d760cdebff5c0 100644 --- a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp +++ b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp @@ -54,7 +54,7 @@ int main() { // CHECK: Test2 // CHECK: ---> piEnqueueEventsWaitWithBarrier( - // CHECK: ZE ---> zeEventCreate + // CHECK: ZE ---> zeEventHostReset // CHECK: ZE ---> zeCommandListAppendWaitOnEvents // CHECK: ZE ---> zeCommandListAppendSignalEvent // CHECK: ) ---> pi_result : PI_SUCCESS From 5f5191db722c5dc4cb2e640e4f88985d7d4d7aac Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 18 Jan 2024 08:03:31 -0800 Subject: [PATCH 36/40] Adjust the failing test --- sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp index d760cdebff5c0..91171d6ce0dd1 100644 --- a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp +++ b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp @@ -54,7 +54,7 @@ int main() { // CHECK: Test2 // CHECK: ---> piEnqueueEventsWaitWithBarrier( - // CHECK: ZE ---> zeEventHostReset + // CHECK: ZE ---> {{zeEventCreate|zeEventHostReset}} // CHECK: ZE ---> zeCommandListAppendWaitOnEvents // CHECK: ZE ---> zeCommandListAppendSignalEvent // CHECK: ) ---> pi_result : PI_SUCCESS From e882b194f65b172277fd2a03c1fd48287cb8cdcb Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 23 Jan 2024 07:41:57 -0800 Subject: [PATCH 37/40] Refactor common submission code + apply other comments --- sycl/source/detail/queue_impl.cpp | 335 ++++++++++++------------------ sycl/source/detail/queue_impl.hpp | 17 ++ 2 files changed, 154 insertions(+), 198 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4341df0a1cb97..62288268f0c88 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -39,36 +39,6 @@ getPIEvents(const std::vector &DepEvents) { return RetPiEvents; } -static bool -checkEventsForSchedulerBypass(const std::vector &DepEvents, - ContextImplPtr Context) { - auto CheckEvent = [&Context](const EventImplPtr &SyclEventImplPtr) { - // Throwaway events created with empty constructor will not have a - // context (it is set lazily). Calling getContextImpl() would set that - // context, which we wish to avoid as it is expensive. - if (!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) { - return true; - } - if (SyclEventImplPtr->is_host()) { - return SyclEventImplPtr->isCompleted(); - } - // Cross-context dependencies can't be passed to the backend directly. - if (SyclEventImplPtr->getContextImpl() != Context) - return false; - - // A nullptr here means that the commmand does not produce a PI event or it - // hasn't been enqueued yet. - return SyclEventImplPtr->getHandleRef() != nullptr; - }; - - return std::all_of(DepEvents.begin(), DepEvents.end(), - [&Context, &CheckEvent](const sycl::event &Event) { - auto SyclEventImplPtr = detail::getSyclObjImpl(Event); - return CheckEvent(SyclEventImplPtr); - }); -} - template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; @@ -155,40 +125,19 @@ event queue_impl::memset(const std::shared_ptr &Self, "for use with the SYCL Graph extension."); } - // We need to submit command and update the last event under same lock if we - // have in-order queue. - { - std::unique_lock Lock(MMutex, std::defer_lock); - - std::vector MutableDepEvents; - const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - - if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(ExpandedDepEvents), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::fill_usm(Ptr, Self, Count, Value, - getPIEvents(ExpandedDepEvents), - &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { - auto &EventToStoreIn = - MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; - EventToStoreIn = EventImpl; - } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return discard_or_return(ResEvent); - } - } + auto DiscardPiEventFunc = [&](const std::vector &PiDepEvents) { + MemoryManager::fill_usm(Ptr, Self, Count, Value, PiDepEvents, nullptr); + }; + auto KeepPiEventFunc = [&](const std::vector &PiDepEvents, + pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + MemoryManager::fill_usm(Ptr, Self, Count, Value, PiDepEvents, OutEvent, + OutEventImpl); + }; + std::optional Result = tryBypassingScheduler( + Self, DepEvents, DiscardPiEventFunc, KeepPiEventFunc); + if (Result) + return *Result; return submit( [&](handler &CGH) { @@ -256,38 +205,19 @@ event queue_impl::memcpy(const std::shared_ptr &Self, PI_ERROR_INVALID_VALUE); } - { - std::unique_lock Lock(MMutex, std::defer_lock); - - std::vector MutableDepEvents; - const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - - if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(ExpandedDepEvents), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_usm(Src, Self, Count, Dest, - getPIEvents(ExpandedDepEvents), - &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { - auto &EventToStoreIn = - MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; - EventToStoreIn = EventImpl; - } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return discard_or_return(ResEvent); - } - } + auto DiscardPiEventFunc = [&](const std::vector &PiDepEvents) { + MemoryManager::copy_usm(Src, Self, Count, Dest, PiDepEvents, nullptr); + }; + auto KeepPiEventFunc = [&](const std::vector &PiDepEvents, + pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + MemoryManager::copy_usm(Src, Self, Count, Dest, PiDepEvents, OutEvent, + OutEventImpl); + }; + std::optional Result = tryBypassingScheduler( + Self, DepEvents, DiscardPiEventFunc, KeepPiEventFunc); + if (Result) + return *Result; return submitWithScheduler(); } @@ -309,40 +239,19 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, if (MGraph.lock()) return submitWithScheduler(); - { - std::unique_lock Lock(MMutex, std::defer_lock); - - std::vector MutableDepEvents; - const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - - if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(ExpandedDepEvents), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::advise_usm(Ptr, Self, Length, Advice, - getPIEvents(ExpandedDepEvents), - &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) { - return MDiscardEvents ? createDiscardedEvent() : event(); - } - if (isInOrder()) { - auto &EventToStoreIn = - MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; - EventToStoreIn = EventImpl; - } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - - return discard_or_return(ResEvent); - } - } + auto DiscardPiEventFunc = [&](const std::vector &PiDepEvents) { + MemoryManager::advise_usm(Ptr, Self, Length, Advice, PiDepEvents, nullptr); + }; + auto KeepPiEventFunc = [&](const std::vector &PiDepEvents, + pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + MemoryManager::advise_usm(Ptr, Self, Length, Advice, PiDepEvents, OutEvent, + OutEventImpl); + }; + std::optional Result = tryBypassingScheduler( + Self, DepEvents, DiscardPiEventFunc, KeepPiEventFunc); + if (Result) + return *Result; return submitWithScheduler(); } @@ -351,40 +260,22 @@ event queue_impl::memcpyToDeviceGlobal( const std::shared_ptr &Self, void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { - { - std::unique_lock Lock(MMutex, std::defer_lock); - - std::vector MutableDepEvents; - const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - - if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::copy_to_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getPIEvents(ExpandedDepEvents), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_to_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Src, - getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), - EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { - auto &EventToStoreIn = - MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; - EventToStoreIn = EventImpl; - } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return discard_or_return(ResEvent); - } - } + auto DiscardPiEventFunc = [&](const std::vector &PiDepEvents) { + MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Src, + PiDepEvents, nullptr); + }; + auto KeepPiEventFunc = [&](const std::vector &PiDepEvents, + pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + MemoryManager::copy_to_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Src, + PiDepEvents, OutEvent, OutEventImpl); + }; + std::optional Result = tryBypassingScheduler( + Self, DepEvents, DiscardPiEventFunc, KeepPiEventFunc); + if (Result) + return *Result; return submit( [&](handler &CGH) { @@ -399,40 +290,22 @@ event queue_impl::memcpyFromDeviceGlobal( const std::shared_ptr &Self, void *Dest, const void *DeviceGlobalPtr, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, const std::vector &DepEvents) { - { - std::unique_lock Lock(MMutex, std::defer_lock); - - std::vector MutableDepEvents; - const std::vector &ExpandedDepEvents = - getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - - if (checkEventsForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { - MemoryManager::copy_from_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getPIEvents(ExpandedDepEvents), nullptr); - return createDiscardedEvent(); - } - - event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); - auto EventImpl = detail::getSyclObjImpl(ResEvent); - MemoryManager::copy_from_device_global( - DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest, - getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), - EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { - auto &EventToStoreIn = - MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; - EventToStoreIn = EventImpl; - } - // Track only if we won't be able to handle it with piQueueFinish. - if (MEmulateOOO) - addSharedEvent(ResEvent); - return discard_or_return(ResEvent); - } - } + auto DiscardPiEventFunc = [&](const std::vector &PiDepEvents) { + MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Dest, + PiDepEvents, nullptr); + }; + auto KeepPiEventFunc = [&](const std::vector &PiDepEvents, + pi::PiEvent *OutEvent, + const detail::EventImplPtr &OutEventImpl) { + MemoryManager::copy_from_device_global(DeviceGlobalPtr, IsDeviceImageScope, + Self, NumBytes, Offset, Dest, + PiDepEvents, OutEvent, OutEventImpl); + }; + std::optional Result = tryBypassingScheduler( + Self, DepEvents, DiscardPiEventFunc, KeepPiEventFunc); + if (Result) + return *Result; return submit( [&](handler &CGH) { @@ -506,6 +379,72 @@ void queue_impl::addSharedEvent(const event &Event) { MEventsShared.push_back(Event); } +static bool +areEventsSafeForSchedulerBypass(const std::vector &DepEvents, + ContextImplPtr Context) { + auto CheckEvent = [&Context](const sycl::event &Event) { + const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event); + // Events that don't have an initialized context are throwaway evemts that + // don't represent actual dependencies. Calling getContextImpl() would set + // their context, which we wish to avoid as it is expensive. + if (!SyclEventImplPtr->isContextInitialized() && + !SyclEventImplPtr->is_host()) { + return true; + } + if (SyclEventImplPtr->is_host()) { + return SyclEventImplPtr->isCompleted(); + } + // Cross-context dependencies can't be passed to the backend directly. + if (SyclEventImplPtr->getContextImpl() != Context) + return false; + + // A nullptr here means that the commmand does not produce a PI event or it + // hasn't been enqueued yet. + return SyclEventImplPtr->getHandleRef() != nullptr; + }; + + return std::all_of(DepEvents.begin(), DepEvents.end(), + [&Context, &CheckEvent](const sycl::event &Event) { + return CheckEvent(Event); + }); +} + +template +std::optional queue_impl::tryBypassingScheduler( + const std::shared_ptr &Self, + const std::vector &DepEvents, + DiscardPiEventFuncT DiscardPiEventFunc, KeepPiEventFuncT KeepPiEventFunc) { + // We need to submit command and update the last event under same lock if we + // have in-order queue. + std::unique_lock Lock(MMutex, std::defer_lock); + + std::vector MutableDepEvents; + const std::vector &ExpandedDepEvents = + getExtendDependencyList(DepEvents, MutableDepEvents, Lock); + + if (!areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) + return std::nullopt; + if (MHasDiscardEventsSupport) { + DiscardPiEventFunc(getPIEvents(ExpandedDepEvents)); + return createDiscardedEvent(); + } + + event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); + auto EventImpl = detail::getSyclObjImpl(ResEvent); + KeepPiEventFunc(getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), + EventImpl); + if (MContext->is_host()) + return MDiscardEvents ? createDiscardedEvent() : event(); + if (isInOrder()) { + auto &EventToStoreIn = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + EventToStoreIn = EventImpl; + } + // Track only if we won't be able to handle it with piQueueFinish. + if (MEmulateOOO) + addSharedEvent(ResEvent); + return discard_or_return(ResEvent); +} + void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, std::string &Name, int32_t StreamID, uint64_t &IId) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 9a2729c638541..2300f4a6f308e 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -841,6 +841,23 @@ class queue_impl { return Event; } + /// Helper function for memory operation shortcuts. Checks if the scheduler + /// can be bypassed and submits the operation directly if so. + /// + /// \param Self is a shared_ptr to this queue. + /// \param DepEvents is a vector of dependencies of the operation. + /// \param DiscardPiEventFunc is a function that performs submission while + /// discarding the PI event. + /// \param KeepPiEventFunc is a function that performs submission while + /// keeping the PI event. + /// \return a SYCL event if direct submission is possible. + template + std::optional + tryBypassingScheduler(const std::shared_ptr &Self, + const std::vector &DepEvents, + DiscardPiEventFuncT DiscardPiEventFunc, + KeepPiEventFuncT KeepPiEventFunc); + // When instrumentation is enabled emits trace event for wait begin and // returns the telemetry event generated for the wait void *instrumentationProlog(const detail::code_location &CodeLoc, From b509f6811b23fba95b18576b18f31591bfd00fd8 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 24 Jan 2024 10:16:24 -0800 Subject: [PATCH 38/40] Apply clang-format & add/update comments --- sycl/source/detail/queue_impl.cpp | 28 ++++++++++++++++++---------- sycl/source/detail/queue_impl.hpp | 23 ++++++++++++++++------- 2 files changed, 34 insertions(+), 17 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 099f64776ad2e..813d051cfd0b2 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -126,8 +126,7 @@ event queue_impl::memset(const std::shared_ptr &Self, } return submitMemOpHelper( - Self, DepEvents, - [&](handler &CGH) {CGH.memset(Ptr, Value, Count);}, + Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self, Count, Value); } @@ -173,7 +172,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, #endif // If we have a command graph set we need to capture the copy through normal // queue submission rather than execute the copy directly. - auto HandlerFunc = [&](handler &CGH) {CGH.memcpy(Dest, Src, Count);}; + auto HandlerFunc = [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }; if (MGraph.lock()) return submitWithHandler(Self, DepEvents, HandlerFunc); @@ -195,7 +194,7 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const std::vector &DepEvents) { // If we have a command graph set we need to capture the advise through normal // queue submission. - auto HandlerFunc = [&](handler &CGH) {CGH.mem_advise(Ptr, Length, Advice);}; + auto HandlerFunc = [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); }; if (MGraph.lock()) return submitWithHandler(Self, DepEvents, HandlerFunc); @@ -211,7 +210,10 @@ event queue_impl::memcpyToDeviceGlobal( const std::vector &DepEvents) { return submitMemOpHelper( Self, DepEvents, - [&](handler &CGH) {CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, NumBytes, Offset);}, + [&](handler &CGH) { + CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope, + NumBytes, Offset); + }, [](const auto &...Args) { MemoryManager::copy_to_device_global(Args...); }, @@ -224,7 +226,10 @@ event queue_impl::memcpyFromDeviceGlobal( size_t Offset, const std::vector &DepEvents) { return submitMemOpHelper( Self, DepEvents, - [&](handler &CGH) {CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, NumBytes, Offset);}, + [&](handler &CGH) { + CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope, + NumBytes, Offset); + }, [](const auto &...Args) { MemoryManager::copy_from_device_global(Args...); }, @@ -325,11 +330,13 @@ areEventsSafeForSchedulerBypass(const std::vector &DepEvents, } template -event queue_impl::submitWithHandler(const std::shared_ptr &Self, const std::vector &DepEvents, HandlerFuncT HandlerFunc) { +event queue_impl::submitWithHandler(const std::shared_ptr &Self, + const std::vector &DepEvents, + HandlerFuncT HandlerFunc) { return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); - HandlerFunc(CGH); + HandlerFunc(CGH); }, Self, {}); } @@ -337,7 +344,7 @@ event queue_impl::submitWithHandler(const std::shared_ptr &Self, con template event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, - HandlerFuncT HandlerFunc, + HandlerFuncT HandlerFunc, MemOpFuncT MemOpFunc, MemOpArgTs... MemOpArgs) { // We need to submit command and update the last event under same lock if we @@ -365,7 +372,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, return MDiscardEvents ? createDiscardedEvent() : event(); if (isInOrder()) { - auto &EventToStoreIn = MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; + auto &EventToStoreIn = + MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr; EventToStoreIn = EventImpl; } // Track only if we won't be able to handle it with piQueueFinish. diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 015dd0664f594..722acf925cf8d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -841,26 +841,35 @@ class queue_impl { return Event; } + /// Helper function for submitting a memory operation with a handler. + /// \param Self is a shared_ptr to this queue. + /// \param DepEvents is a vector of dependencies of the operation. + /// \param HandlerFunc is a function that submits the operation with a + /// handler. template - event submitWithHandler(const std::shared_ptr &Self, const std::vector &DepEvents, HandlerFuncT HandlerFunc); + event submitWithHandler(const std::shared_ptr &Self, + const std::vector &DepEvents, + HandlerFuncT HandlerFunc); - /// Performs direct submission of a memory operation. + /// Performs submission of a memory operation directly if scheduler can be + /// bypassed, or with a handler otherwise. /// /// \param Self is a shared_ptr to this queue. /// \param DepEvents is a vector of dependencies of the operation. - /// \param HandlerFunc is a function that submits the operation via the - /// handler (and the scheduler). + /// \param HandlerFunc is a function that submits the operation with a + /// handler. /// \param MemMngrFunc is a function that forwards its arguments to the /// appropriate memory manager function. /// \param MemMngrArgs are all the arguments that need to be passed to memory /// manager except the last three: dependencies, PI event and /// EventImplPtr are filled out by this helper. /// \return an event representing the submitted operation. - template + template event submitMemOpHelper(const std::shared_ptr &Self, const std::vector &DepEvents, - HandlerFuncT HandlerFunc, - MemMngrFuncT MemMngrFunc, MemMngrArgTs... MemOpArgs); + HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc, + MemMngrArgTs... MemOpArgs); // When instrumentation is enabled emits trace event for wait begin and // returns the telemetry event generated for the wait From e31dfe3cd23418f7fc14886cadbbff76cb1a35be Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 24 Jan 2024 10:36:51 -0800 Subject: [PATCH 39/40] Trim unrelated edits --- sycl/source/detail/queue_impl.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 813d051cfd0b2..67ddecbb370c9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -118,7 +118,6 @@ event queue_impl::memset(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif - if (MGraph.lock()) { throw sycl::exception(make_error_code(errc::invalid), "The memset feature is not yet available " @@ -181,7 +180,6 @@ event queue_impl::memcpy(const std::shared_ptr &Self, throw runtime_error("NULL pointer argument in memory copy operation.", PI_ERROR_INVALID_VALUE); } - return submitMemOpHelper( Self, DepEvents, HandlerFunc, [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self, From 15eb641a2d43ba1c61f2a6a9746110d201ede220 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 29 Jan 2024 06:10:33 -0800 Subject: [PATCH 40/40] Fix comment typo --- sycl/source/detail/queue_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 67ddecbb370c9..6fdfdd7a4d11c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -302,7 +302,7 @@ areEventsSafeForSchedulerBypass(const std::vector &DepEvents, ContextImplPtr Context) { auto CheckEvent = [&Context](const sycl::event &Event) { const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event); - // Events that don't have an initialized context are throwaway evemts that + // Events that don't have an initialized context are throwaway events that // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. if (!SyclEventImplPtr->isContextInitialized() &&