From 1a0033de339e92c3732e2e51783a49961de77e0b Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 10 Jun 2025 19:01:13 +0000 Subject: [PATCH 1/8] [SYCL][UR][L0 v2] implement OOO immediate queue by using multiple in-order queues and round-robin strategy to dispatch work. With this approach we don't need to worry about events' lifetime. Since we are still using counter-based events, we don't need any special logic to handle cases where event release is called right after being passed as signal event or as part of a wait list. --- .../source/adapters/level_zero/CMakeLists.txt | 2 + .../adapters/level_zero/v2/event_pool.hpp | 30 ++ .../adapters/level_zero/v2/lockable.hpp | 1 + .../adapters/level_zero/v2/queue_create.cpp | 15 +- .../adapters/level_zero/v2/queue_handle.hpp | 4 +- .../v2/queue_immediate_in_order.cpp | 6 +- .../v2/queue_immediate_in_order.hpp | 121 ++--- .../v2/queue_immediate_out_of_order.cpp | 189 +++++++ .../v2/queue_immediate_out_of_order.hpp | 503 ++++++++++++++++++ 9 files changed, 805 insertions(+), 66 deletions(-) create mode 100644 unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp create mode 100644 unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 8aaf2da885a27..8532263dd8aa9 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -164,6 +164,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp @@ -180,6 +181,7 @@ if(UR_BUILD_ADAPTER_L0_V2) 
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp ) install_ur_library(ur_adapter_level_zero_v2) diff --git a/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp b/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp index a92a7fc72fa36..df517268a647e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp @@ -60,4 +60,34 @@ class event_pool { ur_mutex mutex; }; +// Only create an event when requested by the user. +static inline ur_event_handle_t +createEventIfRequested(event_pool *eventPool, ur_event_handle_t *phEvent, + ur_queue_t_ *queue) { + if (phEvent == nullptr) { + return nullptr; + } + + (*phEvent) = eventPool->allocate(); + (*phEvent)->setQueue(queue); + return (*phEvent); +} + +// Always creates an event (used in functions that need to store the event +// internally). If event was requested by the user, also increase ref count of +// that event to avoid pre-mature release. 
+static inline ur_event_handle_t createEventAndRetain(event_pool *eventPool, + ur_event_handle_t *phEvent, + ur_queue_t_ *queue) { + auto hEvent = eventPool->allocate(); + hEvent->setQueue(queue); + + if (phEvent) { + (*phEvent) = hEvent; + hEvent->retain(); + } + + return hEvent; +} + } // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp index dd8670295ef7a..93442d40acac8 100644 --- a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp @@ -18,6 +18,7 @@ template struct locked { object_ = object; } T *operator->() { return object_; } + auto &operator[](size_t index) { return (*object_)[index]; } private: std::unique_lock lock_; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index 23259d9c34139..3bc67ce04868f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -69,9 +69,18 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, auto zeIndex = v2::getZeIndex(pProperties); - *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), - zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0 && + !zeIndex.has_value()) { + *phQueue = + ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), + v2::getZePriority(flags), v2::eventFlagsFromQueueFlags(flags), + flags); + } else { + *phQueue = ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } return UR_RESULT_SUCCESS; } catch (...) 
{ diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp index 75bf4a16faf61..9831afdbc9e4c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp @@ -15,11 +15,13 @@ #include "../common.hpp" #include "queue_immediate_in_order.hpp" +#include "queue_immediate_out_of_order.hpp" #include #include struct ur_queue_handle_t_ : ur::handle_base { - using data_variant = std::variant; + using data_variant = std::variant; data_variant queue_data; static constexpr uintptr_t queue_offset = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 258cd45eb407c..cc9b464333e70 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -140,10 +140,12 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // zeCommandListAppendWaitOnEvents if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { return commandListManager.lock()->appendEventsWaitWithBarrier( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } else { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index d7d879d9df417..362a6ea31c9f4 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ 
b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -33,32 +33,6 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_queue_flags_t flags; v2::raii::cache_borrowed_event_pool eventPool; - // Only create an event when requested by the user. - ur_event_handle_t createEventIfRequested(ur_event_handle_t *phEvent) { - if (phEvent == nullptr) { - return nullptr; - } - - (*phEvent) = eventPool->allocate(); - (*phEvent)->setQueue(this); - return (*phEvent); - } - - // Always creates an event (used in functions that need to store the event - // internally). If event was requested by the user, also increase ref count of - // that event to avoid pre-mature release. - ur_event_handle_t createEventAndRetain(ur_event_handle_t *phEvent) { - auto hEvent = eventPool->allocate(); - hEvent->setQueue(this); - - if (phEvent) { - (*phEvent) = hEvent; - hEvent->retain(); - } - - return hEvent; - } - public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal, @@ -88,7 +62,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, @@ -99,7 +74,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -117,7 +93,8 @@ 
struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferRead( hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, @@ -128,7 +105,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferWrite( hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferReadRect( @@ -141,7 +119,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferWriteRect( @@ -154,7 +133,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -165,7 +145,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return 
commandListManager.lock()->appendMemBufferCopy( hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopyRect( @@ -178,7 +159,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, @@ -189,7 +171,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferFill( hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, @@ -201,7 +184,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -213,7 +197,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, 
createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -224,7 +209,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageCopy( hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, @@ -235,7 +221,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppRetMap) override { return commandListManager.lock()->appendMemBufferMap( hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent), ppRetMap); + phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), + ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, @@ -244,7 +231,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemUnmap( hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, @@ -254,7 +241,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFill( pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, @@ -263,7 +250,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return 
commandListManager.lock()->appendUSMMemcpy( blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, @@ -273,7 +260,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFill2D( pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, @@ -284,7 +272,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMMemcpy2D( blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, @@ -294,14 +283,15 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMPrefetch( pMem, size, flags, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAdvise( - pMem, size, advice, 0, nullptr, createEventIfRequested(phEvent)); + pMem, size, advice, 0, nullptr, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueDeviceGlobalVariableWrite( @@ -311,7 +301,8 @@ struct ur_queue_immediate_in_order_t : ur_object, 
ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendDeviceGlobalVariableWrite( hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueDeviceGlobalVariableRead( @@ -321,7 +312,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendDeviceGlobalVariableRead( hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, @@ -332,7 +324,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendReadHostPipe( hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, @@ -343,7 +336,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendWriteHostPipe( hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMDeviceAllocExp( @@ -353,7 +347,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppMem, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), 
UR_USM_TYPE_DEVICE); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_DEVICE); } ur_result_t enqueueUSMSharedAllocExp( @@ -363,7 +358,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppMem, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_SHARED); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_SHARED); } ur_result_t @@ -374,7 +370,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_HOST); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_HOST); } ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, @@ -383,7 +380,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFreeExp( this, pPool, pMem, numEventsInWaitList, phEventWaitList, - createEventAndRetain(phEvent)); + createEventAndRetain(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesImageCopyExp( @@ -398,7 +395,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -408,7 +406,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return 
commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesSignalExternalSemaphoreExp( @@ -418,7 +417,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -427,7 +427,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendTimestampRecordingExp( blocking, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -437,7 +437,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendCommandBufferExp( hCommandBuffer, numEventsInWaitList, phEventWaitList, - createEventAndRetain(phEvent)); + createEventAndRetain(eventPool.get(), phEvent, this)); } ur_result_t enqueueNativeCommandExp( @@ -448,7 +448,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp 
b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp new file mode 100644 index 0000000000000..6feab20f65792 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -0,0 +1,189 @@ +//===------- queue_immediate_out_of_order.cpp - Level Zero Adapter -------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue_immediate_out_of_order.hpp" +#include "../common/latency_tracker.hpp" +#include "ur.hpp" + +namespace v2 { + +// Helper function to initialize std::array of command list managers. +// This is needed because command list manager does not have a default +// constructor. +template +std::array createCommandListManagers( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::index_sequence) { + return { + ((void)Is, ur_command_list_manager( + hContext, hDevice, + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority)))...}; +} + +template +std::array +createCommandListManagers(ur_context_handle_t hContext, + ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority) { + return createCommandListManagers(hContext, hDevice, ordinal, priority, + std::make_index_sequence{}); +} + +ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, event_flags_t eventFlags, + ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), + commandListManagers(createCommandListManagers( + 
hContext, hDevice, ordinal, priority)), + eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) + .borrow(hDevice->Id.value(), eventFlags)), + flags(flags) { + for (size_t i = 0; i < numCommandLists; i++) { + barrierEvents[i] = eventPool->allocate(); + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo( + ur_queue_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hContext); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hDevice); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{RefCount.load()}); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(flags); + case UR_QUEUE_INFO_SIZE: + case UR_QUEUE_INFO_DEVICE_DEFAULT: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_QUEUE_INFO_EMPTY: { + auto isCmdListEmpty = [](ze_command_list_handle_t cmdList) { + auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, (cmdList, 0)); + if (status == ZE_RESULT_SUCCESS) { + return true; + } else if (status == ZE_RESULT_NOT_READY) { + return false; + } else { + throw ze2urResult(status); + } + }; + + auto commandListManagersLocked = commandListManagers.lock(); + + bool empty = std::all_of( + commandListManagersLocked->begin(), commandListManagersLocked->end(), + [&](auto &cmdListManager) { + return isCmdListEmpty(cmdListManager.getZeCommandList()); + }); + + return ReturnValue(empty); + } + default: + UR_LOG(ERR, + "Unsupported ParamName in urQueueGetInfo: " + "ParamName=ParamName={}(0x{})", + propName, logger::toHex(propName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle( + ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { 
+ *phNativeQueue = reinterpret_cast( + (*commandListManagers.get_no_lock())[getNextCommandListId()] + .getZeCommandList()); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() { + TRACK_SCOPE_LATENCY("ur_queue_immediate_out_of_order_t::queueFinish"); + + auto commandListManagersLocked = commandListManagers.lock(); + + for (size_t i = 0; i < numCommandLists; i++) { + ZE2UR_CALL(zeCommandListHostSynchronize, + (commandListManagersLocked[i].getZeCommandList(), UINT64_MAX)); + UR_CALL(commandListManagersLocked[i].releaseSubmittedKernels()); + } + + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueFlush() { + return UR_RESULT_SUCCESS; +} + +ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() { + try { + UR_CALL_THROWS(queueFinish()); + + for (size_t i = 0; i < numCommandLists; i++) { + barrierEvents[i]->release(); + } + } catch (...) { + // Ignore errors during destruction + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier"); + // Each underlying command list is in-order, so it needs no real L0 barrier: + // just wait for requested events in potentially different queues and add a + // "barrier" event signal, because it is already guaranteed that previous + // commands in that command list are completed when the signal is started. + // However, we do need to use barrier if profiling is enabled: see + // zeCommandListAppendWaitOnEvents + bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; + auto appendEventsWaitFn = + needsRealBarrier ? 
&ur_command_list_manager::appendEventsWaitWithBarrier + : &ur_command_list_manager::appendEventsWait; + + auto commandListManagersLocked = commandListManagers.lock(); + + // Enqueue wait for the user-provided events on the first command list. + std::invoke(appendEventsWaitFn, commandListManagersLocked[0], + numEventsInWaitList, phEventWaitList, barrierEvents[0]); + + // Submit barrier or request barrierEvents[id] to be signaled on remaining + // command lists. + for (size_t id = 1; id < numCommandLists; id++) { + std::invoke(appendEventsWaitFn, commandListManagersLocked[id], 0, nullptr, + barrierEvents[id]); + } + + if (phEvent) { + UR_CALL(commandListManagersLocked[0].appendEventsWait( + numCommandLists, barrierEvents.data(), + createEventIfRequested(eventPool.get(), phEvent, this))); + } + + return UR_RESULT_SUCCESS; +} + +} // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp new file mode 100644 index 0000000000000..eb58ed017736d --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -0,0 +1,503 @@ +//===------- queue_immediate_out_of_order.hpp - Level Zero Adapter -------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "../common.hpp" +#include "../device.hpp" + +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" + +#include "command_list_manager.hpp" +#include "lockable.hpp" +#include "ur/ur.hpp" + +namespace v2 { + +struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { +private: + static constexpr size_t numCommandLists = 4; + + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + std::atomic commandListIndex = 0; + lockable> + commandListManagers; + + v2::raii::cache_borrowed_event_pool eventPool; + + ur_queue_flags_t flags; + + std::array barrierEvents; + + uint32_t getNextCommandListId() { + return commandListIndex.fetch_add(1, std::memory_order_relaxed) % + numCommandLists; + } + +public: + ur_queue_immediate_out_of_order_t(ur_context_handle_t, ur_device_handle_t, + uint32_t ordinal, + ze_command_queue_priority_t priority, + event_flags_t eventFlags, + ur_queue_flags_t flags); + + ~ur_queue_immediate_out_of_order_t(); + + ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) override; + ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) override; + ur_result_t queueFinish() override; + ur_result_t queueFlush() override; + ur_result_t enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return 
commandListManagers.lock()[commandListId].appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur_result_t + enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendEventsWait( + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur_result_t + enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); + } + + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferRead( + hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, + const void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferWrite( + hBuffer, 
blockingWrite, offset, size, pSrc, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, + size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, + size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferReadRect( + hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferWriteRect( + hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return 
commandListManagers.lock()[commandListId].appendMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageRead( + hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, + numEventsInWaitList, phEventWaitList, + 
createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageWrite( + hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, + void **ppRetMap) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferMap( + hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), + ppRetMap); + } + + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = 
getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemUnmap( + hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFill( + pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMMemcpy( + blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return 
commandListManagers.lock()[commandListId].appendUSMMemcpy2D( + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAdvise( + pMem, size, advice, 0, nullptr, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendDeviceGlobalVariableWrite( + hProgram, name, blockingWrite, count, offset, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + 
.appendDeviceGlobalVariableRead( + hProgram, name, blockingRead, count, offset, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendReadHostPipe( + hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendWriteHostPipe( + hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_DEVICE); + } + + ur_result_t enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t 
*phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_SHARED); + } + + ur_result_t + enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_HOST); + } + + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFreeExp( + this, pPool, pMem, numEventsInWaitList, phEventWaitList, + createEventAndRetain(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, 
pCopyRegion, imageCopyFlags, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendTimestampRecordingExp( + blocking, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendCommandBufferExp( + hCommandBuffer, 
numEventsInWaitList, phEventWaitList, + createEventAndRetain(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendNativeCommandExp( + pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } +}; + +} // namespace v2 From 44d5522e7b0ec565ca3b3fb083526ea3462a6960 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Wed, 11 Jun 2025 22:22:23 +0000 Subject: [PATCH 2/8] Move eventPool member up - it should be destroyed after cmd list manager --- .../adapters/level_zero/v2/queue_immediate_out_of_order.cpp | 4 ++-- .../adapters/level_zero/v2/queue_immediate_out_of_order.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index 6feab20f65792..4a1e69555c326 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -44,10 +44,10 @@ ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( ze_command_queue_priority_t priority, event_flags_t eventFlags, ur_queue_flags_t flags) : hContext(hContext), hDevice(hDevice), - commandListManagers(createCommandListManagers( - hContext, hDevice, ordinal, priority)), eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) .borrow(hDevice->Id.value(), eventFlags)), + 
commandListManagers(createCommandListManagers( + hContext, hDevice, ordinal, priority)), flags(flags) { for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i] = eventPool->allocate(); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index eb58ed017736d..48bd6328c08e8 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -30,12 +30,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_context_handle_t hContext; ur_device_handle_t hDevice; + v2::raii::cache_borrowed_event_pool eventPool; + std::atomic commandListIndex = 0; lockable> commandListManagers; - v2::raii::cache_borrowed_event_pool eventPool; - ur_queue_flags_t flags; std::array barrierEvents; From 8a3e47ece090e9870cb140846445a0104895c8c3 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 12 Jun 2025 17:46:20 +0000 Subject: [PATCH 3/8] fixes --- .../adapters/level_zero/v2/queue_create.cpp | 7 ++- .../v2/queue_immediate_out_of_order.cpp | 51 ++++++++----------- .../v2/queue_immediate_out_of_order.hpp | 7 ++- unified-runtime/source/common/ur_util.hpp | 15 ++++++ 4 files changed, 44 insertions(+), 36 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index 3bc67ce04868f..2ed41e8c535c4 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -69,13 +69,12 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, auto zeIndex = v2::getZeIndex(pProperties); - if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0 && - !zeIndex.has_value()) { + if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0) { *phQueue = 
ur_queue_handle_t_::create( hContext, hDevice, v2::getZeOrdinal(hDevice), - v2::getZePriority(flags), v2::eventFlagsFromQueueFlags(flags), - flags); + v2::getZePriority(flags), zeIndex, + v2::eventFlagsFromQueueFlags(flags), flags); } else { *phQueue = ur_queue_handle_t_::create( hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index 4a1e69555c326..5465c29bb278b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -1,6 +1,6 @@ //===--------- queue_immediate_in_order.cpp - Level Zero Adapter ---------===// // -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2025 Intel Corporation // // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM // Exceptions. See LICENSE.TXT @@ -14,40 +14,29 @@ namespace v2 { -// Helper function to intialize std::array of command list manager. -// This is needed because command list manager does not have a default -// constructor. 
-template -std::array createCommandListManagers( - ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, - ze_command_queue_priority_t priority, std::index_sequence) { - return { - ((void)Is, ur_command_list_manager( - hContext, hDevice, - hContext->getCommandListCache().getImmediateCommandList( - hDevice->ZeDevice, - {true, ordinal, true /* always enable copy offload */}, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority)))...}; -} - template -std::array -createCommandListManagers(ur_context_handle_t hContext, - ur_device_handle_t hDevice, uint32_t ordinal, - ze_command_queue_priority_t priority) { - return createCommandListManagers(hContext, hDevice, ordinal, priority, - std::make_index_sequence{}); +std::array createCommandListManagers( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index) { + return createArrayOf([&](size_t) { + return ur_command_list_manager( + hContext, hDevice, + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)); + }); } ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, - ze_command_queue_priority_t priority, event_flags_t eventFlags, - ur_queue_flags_t flags) + ze_command_queue_priority_t priority, std::optional index, + event_flags_t eventFlags, ur_queue_flags_t flags) : hContext(hContext), hDevice(hDevice), eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) .borrow(hDevice->Id.value(), eventFlags)), commandListManagers(createCommandListManagers( - hContext, hDevice, ordinal, priority)), + hContext, hDevice, ordinal, priority, index)), flags(flags) { for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i] = eventPool->allocate(); @@ -153,11 +142,11 @@ ur_result_t 
ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY( "ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier"); - // For in-order queue we don't need a real L0 barrier, just wait for - // requested events in potentially different queues and add a "barrier" - // event signal because it is already guaranteed that previous commands - // in this queue are completed when the signal is started. However, we do - // need to use barrier if profiling is enabled: see + // Since we use L0 in-order command lists, we don't need a real L0 barrier, + // just wait for requested events in potentially different queues and add a + // "barrier" event signal because it is already guaranteed that previous + // commands in this queue are completed when the signal is started. However, + // we do need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; auto appendEventsWaitFn = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index 48bd6328c08e8..1d0bf5636d58c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -1,6 +1,6 @@ //===--------- queue_immediate_in_order.hpp - Level Zero Adapter ---------===// // -// Copyright (C) 2024 Intel Corporation +// Copyright (C) 2025 Intel Corporation // // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM // Exceptions. 
See LICENSE.TXT @@ -25,6 +25,10 @@ namespace v2 { struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { private: + // Number of command lists was chosen experimentally as a compromise + // between number of allowed concurrent launches and overhead of + // iterating over the command lists to synchronize them. + // This might need to be changed for future hardware. static constexpr size_t numCommandLists = 4; ur_context_handle_t hContext; @@ -49,6 +53,7 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_queue_immediate_out_of_order_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal, ze_command_queue_priority_t priority, + std::optional index, event_flags_t eventFlags, ur_queue_flags_t flags); diff --git a/unified-runtime/source/common/ur_util.hpp b/unified-runtime/source/common/ur_util.hpp index b76223a63bfa2..37595846b7a46 100644 --- a/unified-runtime/source/common/ur_util.hpp +++ b/unified-runtime/source/common/ur_util.hpp @@ -553,4 +553,19 @@ inline bool isPointerAlignedTo(uint32_t Alignment, void *Ptr) { reinterpret_cast(Ptr) % Alignment == 0; } +template +std::array createArrayOfHelper(F &&f, + std::index_sequence) { + return {(f(Is))...}; +} + +// Helper function to initialize std::array of non-default constructible +// types. Calls provided ctor function (passing index to the array) to create +// each element of the array.
+template +std::array createArrayOf(F &&ctor) { + return createArrayOfHelper(std::forward(f), + std::make_index_sequence{}); +} + #endif /* UR_UTIL_H */ From f69ea849746cbfc312538ae82967490505a120d5 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 12 Jun 2025 18:05:06 +0000 Subject: [PATCH 4/8] fix typo --- .../adapters/level_zero/v2/queue_immediate_out_of_order.cpp | 2 +- unified-runtime/source/common/ur_util.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index 5465c29bb278b..00be9c761fc49 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -18,7 +18,7 @@ template std::array createCommandListManagers( ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, ze_command_queue_priority_t priority, std::optional index) { - return createArrayOf([&](size_t) { + return createArrayOf([&](size_t) { return ur_command_list_manager( hContext, hDevice, hContext->getCommandListCache().getImmediateCommandList( diff --git a/unified-runtime/source/common/ur_util.hpp b/unified-runtime/source/common/ur_util.hpp index 37595846b7a46..b3f652325a1da 100644 --- a/unified-runtime/source/common/ur_util.hpp +++ b/unified-runtime/source/common/ur_util.hpp @@ -564,7 +564,7 @@ std::array createArrayOfHelper(F &&f, // each element of the array. 
template std::array createArrayOf(F &&ctor) { - return createArrayOfHelper(std::forward(f), + return createArrayOfHelper(std::forward(ctor), std::make_index_sequence{}); } From 015403942cddc2b004bd2b77176f0f40c5489882 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Thu, 12 Jun 2025 19:48:26 +0000 Subject: [PATCH 5/8] retain context in queue/command buffer --- .../adapters/level_zero/v2/queue_immediate_out_of_order.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index 00be9c761fc49..0b2b8dbc71908 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -41,6 +41,8 @@ ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i] = eventPool->allocate(); } + + ur::level_zero::urContextRetain(hContext); } ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo( @@ -132,6 +134,8 @@ ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() { for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i]->release(); } + + ur::level_zero::urContextRelease(hContext); } catch (...) 
{ // Ignore errors during destruction } From 6a791c6497395a7d7fb7288ad5618e7ad7c14ba7 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Fri, 13 Jun 2025 18:20:56 +0000 Subject: [PATCH 6/8] fix barrier impl --- .../v2/queue_immediate_out_of_order.cpp | 29 ++++++++++++------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index 0b2b8dbc71908..b547380d22e22 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -153,27 +153,34 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( // we do need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; - auto appendEventsWaitFn = - needsRealBarrier ? &ur_command_list_manager::appendEventsWaitWithBarrier + auto barrierFn = needsRealBarrier + ? &ur_command_list_manager::appendEventsWaitWithBarrier : &ur_command_list_manager::appendEventsWait; auto commandListManagersLocked = commandListManagers.lock(); // Enqueue wait for the user-provider events on the first command list. - std::invoke(appendEventsWaitFn, commandListManagersLocked[0], - numEventsInWaitList, phEventWaitList, barrierEvents[0]); + UR_CALL(commandListManagersLocked[0].appendEventsWait( + numEventsInWaitList, phEventWaitList, barrierEvents[0])); - // Submit barrier or request barrierEvents[id] to be signaled on remaining - // command lists. + // Request barrierEvents[id] to be signaled on remaining command lists. 
for (size_t id = 1; id < numCommandLists; id++) { - std::invoke(appendEventsWaitFn, commandListManagersLocked[id], 0, nullptr, - barrierEvents[id]); + UR_CALL(commandListManagersLocked[id].appendEventsWait(0, nullptr, + barrierEvents[id])); } + // Enqueue a barrier on each command list: each waits on all barrierEvents. + if (phEvent) { - UR_CALL(commandListManagersLocked[0].appendEventsWait( - numCommandLists, barrierEvents.data(), - createEventIfRequested(eventPool.get(), phEvent, this))); + UR_CALL( + std::invoke(barrierFn, commandListManagersLocked[0], numCommandLists, + barrierEvents.data(), + createEventIfRequested(eventPool.get(), phEvent, this))); + } + + for (size_t id = phEvent ? 1 : 0; id < numCommandLists; id++) { + UR_CALL(std::invoke(barrierFn, commandListManagersLocked[id], + numCommandLists, barrierEvents.data(), nullptr)); } return UR_RESULT_SUCCESS; From 95f51c2c8db08f2459b29789c86eabcbe4915ca1 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 24 Jun 2025 16:24:24 +0000 Subject: [PATCH 7/8] mark failing tests as unsupported --- sycl/test-e2e/ProfilingTag/profiling_queue.cpp | 3 +++ sycl/test-e2e/WorkGroupMemory/basic_usage.cpp | 2 ++ 2 files changed, 5 insertions(+) diff --git a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp index d7c98f06060ad..297b1ef294b5a 100644 --- a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp +++ b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp @@ -24,6 +24,9 @@ // UNSUPPORTED: cuda // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14053 +// UNSUPPORTED: level_zero_v2_adapter +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19116 + #include "common.hpp" int main() { diff --git a/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp index c63f16733b289..42940a999ac2f 100644 --- a/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp +++ b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp @@ -1,5 +1,7 @@ // UNSUPPORTED: hip
// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/17339 +// UNSUPPORTED: level_zero_v2_adapter +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19116 // RUN: %{build} -o %t.out // RUN: %{run} %t.out // XFAIL: spirv-backend From daf9e2dda7dae6b13f6f9b71c075ffcff710b201 Mon Sep 17 00:00:00 2001 From: Igor Chorazewicz Date: Tue, 24 Jun 2025 16:25:29 +0000 Subject: [PATCH 8/8] context release/retain is handled by cmd list mgr --- .../adapters/level_zero/v2/queue_immediate_out_of_order.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index b547380d22e22..bfb6079af3ea5 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -41,8 +41,6 @@ ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i] = eventPool->allocate(); } - - ur::level_zero::urContextRetain(hContext); } ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo( @@ -134,8 +132,6 @@ ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() { for (size_t i = 0; i < numCommandLists; i++) { barrierEvents[i]->release(); } - - ur::level_zero::urContextRelease(hContext); } catch (...) { // Ignore errors during destruction }