From 92e35aeabe381de36fe0ea0cd4d1830184a216e1 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 21 Jul 2021 15:35:41 +0300 Subject: [PATCH 1/2] [SYCL] Revert queue::wait() to its old behaviour with Level Zero Revert the behaviour of queue::wait() to calling event::wait() on all events associated with it for Level Zero backend due to significant performance regressions introduced with that change. --- sycl/source/detail/queue_impl.cpp | 63 +++++++++++-------- .../plugins/level_zero_batch_event_status.cpp | 4 +- .../plugins/level_zero_batch_test.cpp | 6 +- sycl/unittests/queue/Wait.cpp | 8 +++ 4 files changed, 51 insertions(+), 30 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c92cfa1676918..fdf97f9c3b06b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -263,41 +263,54 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { WeakEvents.swap(MEventsWeak); SharedEvents.swap(MEventsShared); } + const detail::plugin &Plugin = getPlugin(); // If the queue is either a host one or does not support OOO (and we use // multiple in-order queues as a result of that), wait for each event // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - bool SupportsPiFinish = !is_host() && MSupportOOO; - for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); - EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtrIt->lock()) { - // A nullptr PI event indicates that piQueueFinish will not cover it, - // either because it's a host task event or an unenqueued one. - if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) { - EventImplSharedPtr->wait(EventImplSharedPtr); - } - } - } - if (SupportsPiFinish) { - const detail::plugin &Plugin = getPlugin(); - Plugin.call(getHandleRef()); + // TODO the new workflow has worse performance with Level Zero, keep the old + // behavior until this is addressed + if (Plugin.getBackend() == backend::level_zero) { for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) - EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); - // FIXME these events are stored for level zero until as a workaround, - // remove once piEventRelease no longer calls wait on the event in the - // plugin. - if (Plugin.getBackend() == backend::level_zero) { - SharedEvents.clear(); - } - assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " - "shouldn't have shared events"); - } else { + EventImplSharedPtr->wait(EventImplSharedPtr); for (event &Event : SharedEvents) Event.wait(); + } else { + bool SupportsPiFinish = !is_host() && MSupportOOO; + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr PI event indicates that piQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. + if (!SupportsPiFinish || + nullptr == EventImplSharedPtr->getHandleRef()) { + EventImplSharedPtr->wait(EventImplSharedPtr); + } + } + } + if (SupportsPiFinish) { + Plugin.call(getHandleRef()); + for (std::weak_ptr &EventImplWeakPtr : WeakEvents) + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtr.lock()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (Plugin.getBackend() == backend::level_zero) { + SharedEvents.clear(); + } + assert(SharedEvents.empty() && + "Queues that support calling piQueueFinish " + "shouldn't have shared events"); + } else { + for (event &Event : SharedEvents) + Event.wait(); + } } #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); diff --git a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp index 167df039b4474..d98140ac4b1ff 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp @@ -25,10 +25,10 @@ // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists // CHECK: ---> piEventGetInfo -// CHECK-NOT: piQueueFinish +// CHECK-NOT: piEventsWait // CHECK: ---> piEnqueueKernelLaunch // CHECK: ZE ---> zeCommandListAppendLaunchKernel -// CHECK: ---> piQueueFinish +// CHECK: ---> piEventsWait // Look for close and Execute after piEventsWait // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists diff --git a/sycl/test/on-device/plugins/level_zero_batch_test.cpp b/sycl/test/on-device/plugins/level_zero_batch_test.cpp index 43426e6fb6dd1..f834e2bfab121 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_test.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_test.cpp @@ -86,7 +86,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -142,7 +142,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -198,7 +198,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 417b2c5078c94..8592b5cd8d8a7 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -92,6 +92,14 @@ bool preparePiMock(platform &Plt) { << std::endl; return false; } + // TODO remove once queue:wait() is lowered to PiQueueFinish with level zero + // as well. + if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() == + backend::level_zero) { + std::cout << "Not run on Level Zero, old behavior is kept there temporarily" + << std::endl; + return false; + } unittest::PiMock Mock{Plt}; Mock.redefine(redefinedQueueCreate); From 333b711061f055f9f3d863ea010fffc98e983bb4 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 21 Jul 2021 17:51:14 +0300 Subject: [PATCH 2/2] Fix attempting to get a plugin for host device --- sycl/source/detail/queue_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index fdf97f9c3b06b..33cd17500207d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -263,7 +263,6 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { WeakEvents.swap(MEventsWeak); SharedEvents.swap(MEventsShared); } - const detail::plugin &Plugin = getPlugin(); // If the queue is either a host one or does not support OOO (and we use // multiple in-order queues as a result of that), wait for each event // directly. Otherwise, only wait for unenqueued or host task events, starting @@ -271,7 +270,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // then handle the rest with piQueueFinish. // TODO the new workflow has worse performance with Level Zero, keep the old // behavior until this is addressed - if (Plugin.getBackend() == backend::level_zero) { + if (!is_host() && getPlugin().getBackend() == backend::level_zero) { for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) @@ -293,6 +292,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { } } if (SupportsPiFinish) { + const detail::plugin &Plugin = getPlugin(); Plugin.call(getHandleRef()); for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr =