diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c92cfa1676918..33cd17500207d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -268,36 +268,49 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - bool SupportsPiFinish = !is_host() && MSupportOOO; - for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); - EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { - if (std::shared_ptr EventImplSharedPtr = - EventImplWeakPtrIt->lock()) { - // A nullptr PI event indicates that piQueueFinish will not cover it, - // either because it's a host task event or an unenqueued one. - if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) { - EventImplSharedPtr->wait(EventImplSharedPtr); - } - } - } - if (SupportsPiFinish) { - const detail::plugin &Plugin = getPlugin(); - Plugin.call(getHandleRef()); + // TODO the new workflow has worse performance with Level Zero, keep the old + // behavior until this is addressed + if (!is_host() && getPlugin().getBackend() == backend::level_zero) { for (std::weak_ptr &EventImplWeakPtr : WeakEvents) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtr.lock()) - EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); - // FIXME these events are stored for level zero until as a workaround, - // remove once piEventRelease no longer calls wait on the event in the - // plugin. - if (Plugin.getBackend() == backend::level_zero) { - SharedEvents.clear(); - } - assert(SharedEvents.empty() && "Queues that support calling piQueueFinish " - "shouldn't have shared events"); - } else { + EventImplSharedPtr->wait(EventImplSharedPtr); for (event &Event : SharedEvents) Event.wait(); + } else { + bool SupportsPiFinish = !is_host() && MSupportOOO; + for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); + EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtrIt->lock()) { + // A nullptr PI event indicates that piQueueFinish will not cover it, + // either because it's a host task event or an unenqueued one. + if (!SupportsPiFinish || + nullptr == EventImplSharedPtr->getHandleRef()) { + EventImplSharedPtr->wait(EventImplSharedPtr); + } + } + } + if (SupportsPiFinish) { + const detail::plugin &Plugin = getPlugin(); + Plugin.call(getHandleRef()); + for (std::weak_ptr &EventImplWeakPtr : WeakEvents) + if (std::shared_ptr EventImplSharedPtr = + EventImplWeakPtr.lock()) + EventImplSharedPtr->cleanupCommand(EventImplSharedPtr); + // FIXME these events are stored for level zero until as a workaround, + // remove once piEventRelease no longer calls wait on the event in the + // plugin. + if (Plugin.getBackend() == backend::level_zero) { + SharedEvents.clear(); + } + assert(SharedEvents.empty() && + "Queues that support calling piQueueFinish " + "shouldn't have shared events"); + } else { + for (event &Event : SharedEvents) + Event.wait(); + } } #ifdef XPTI_ENABLE_INSTRUMENTATION instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); diff --git a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp index 167df039b4474..d98140ac4b1ff 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_event_status.cpp @@ -25,10 +25,10 @@ // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists // CHECK: ---> piEventGetInfo -// CHECK-NOT: piQueueFinish +// CHECK-NOT: piEventsWait // CHECK: ---> piEnqueueKernelLaunch // CHECK: ZE ---> zeCommandListAppendLaunchKernel -// CHECK: ---> piQueueFinish +// CHECK: ---> piEventsWait // Look for close and Execute after piEventsWait // CHECK: ZE ---> zeCommandListClose // CHECK: ZE ---> zeCommandQueueExecuteCommandLists diff --git a/sycl/test/on-device/plugins/level_zero_batch_test.cpp b/sycl/test/on-device/plugins/level_zero_batch_test.cpp index 43426e6fb6dd1..f834e2bfab121 100644 --- a/sycl/test/on-device/plugins/level_zero_batch_test.cpp +++ b/sycl/test/on-device/plugins/level_zero_batch_test.cpp @@ -86,7 +86,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -142,7 +142,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( @@ -198,7 +198,7 @@ // CKB4: ZE ---> zeCommandQueueExecuteCommandLists( // CKB8: ZE ---> zeCommandListClose( // CKB8: ZE ---> zeCommandQueueExecuteCommandLists( -// CKALL: ---> piQueueFinish( +// CKALL: ---> piEventsWait( // CKB3: ZE ---> zeCommandListClose( // CKB3: ZE ---> zeCommandQueueExecuteCommandLists( // CKB5: ZE ---> zeCommandListClose( diff --git a/sycl/unittests/queue/Wait.cpp b/sycl/unittests/queue/Wait.cpp index 417b2c5078c94..8592b5cd8d8a7 100644 --- a/sycl/unittests/queue/Wait.cpp +++ b/sycl/unittests/queue/Wait.cpp @@ -92,6 +92,14 @@ bool preparePiMock(platform &Plt) { << std::endl; return false; } + // TODO remove once queue:wait() is lowered to PiQueueFinish with level zero + // as well. + if (detail::getSyclObjImpl(Plt)->getPlugin().getBackend() == + backend::level_zero) { + std::cout << "Not run on Level Zero, old behavior is kept there temporarily" + << std::endl; + return false; + } unittest::PiMock Mock{Plt}; Mock.redefine(redefinedQueueCreate);