diff --git a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp index d7c98f06060ad..297b1ef294b5a 100644 --- a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp +++ b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp @@ -24,6 +24,9 @@ // UNSUPPORTED: cuda // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/14053 +// UNSUPPORTED: level_zero_v2_adapter +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19116 + #include "common.hpp" int main() { diff --git a/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp index c63f16733b289..42940a999ac2f 100644 --- a/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp +++ b/sycl/test-e2e/WorkGroupMemory/basic_usage.cpp @@ -1,5 +1,7 @@ // UNSUPPORTED: hip // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/17339 +// UNSUPPORTED: level_zero_v2_adapter +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/19116 // RUN: %{build} -o %t.out // RUN: %{run} %t.out // XFAIL: spirv-backend diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 8aaf2da885a27..8532263dd8aa9 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -164,6 +164,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp @@ -180,6 +181,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp ) install_ur_library(ur_adapter_level_zero_v2) diff --git a/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp b/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp index a92a7fc72fa36..df517268a647e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event_pool.hpp @@ -60,4 +60,34 @@ class event_pool { ur_mutex mutex; }; +// Only create an event when requested by the user. +static inline ur_event_handle_t +createEventIfRequested(event_pool *eventPool, ur_event_handle_t *phEvent, + ur_queue_t_ *queue) { + if (phEvent == nullptr) { + return nullptr; + } + + (*phEvent) = eventPool->allocate(); + (*phEvent)->setQueue(queue); + return (*phEvent); +} + +// Always creates an event (used in functions that need to store the event +// internally). If event was requested by the user, also increase ref count of +// that event to avoid premature release. +static inline ur_event_handle_t createEventAndRetain(event_pool *eventPool, + ur_event_handle_t *phEvent, + ur_queue_t_ *queue) { + auto hEvent = eventPool->allocate(); + hEvent->setQueue(queue); + + if (phEvent) { + (*phEvent) = hEvent; + hEvent->retain(); + } + + return hEvent; +} + } // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp index dd8670295ef7a..93442d40acac8 100644 --- a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp @@ -18,6 +18,7 @@ template struct locked { object_ = object; } T *operator->() { return object_; } + auto &operator[](size_t index) { return (*object_)[index]; } private: std::unique_lock lock_; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index 23259d9c34139..2ed41e8c535c4 100644 --- 
a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -69,9 +69,17 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, auto zeIndex = v2::getZeIndex(pProperties); - *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), - zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0) { + *phQueue = + ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), + v2::getZePriority(flags), zeIndex, + v2::eventFlagsFromQueueFlags(flags), flags); + } else { + *phQueue = ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } return UR_RESULT_SUCCESS; } catch (...) { diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp index 75bf4a16faf61..9831afdbc9e4c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp @@ -15,11 +15,13 @@ #include "../common.hpp" #include "queue_immediate_in_order.hpp" +#include "queue_immediate_out_of_order.hpp" #include #include struct ur_queue_handle_t_ : ur::handle_base { - using data_variant = std::variant; + using data_variant = std::variant; data_variant queue_data; static constexpr uintptr_t queue_offset = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 258cd45eb407c..cc9b464333e70 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -140,10 +140,12 @@ ur_result_t 
ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // zeCommandListAppendWaitOnEvents if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { return commandListManager.lock()->appendEventsWaitWithBarrier( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } else { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index d7d879d9df417..362a6ea31c9f4 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -33,32 +33,6 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_queue_flags_t flags; v2::raii::cache_borrowed_event_pool eventPool; - // Only create an event when requested by the user. - ur_event_handle_t createEventIfRequested(ur_event_handle_t *phEvent) { - if (phEvent == nullptr) { - return nullptr; - } - - (*phEvent) = eventPool->allocate(); - (*phEvent)->setQueue(this); - return (*phEvent); - } - - // Always creates an event (used in functions that need to store the event - // internally). If event was requested by the user, also increase ref count of - // that event to avoid pre-mature release. 
- ur_event_handle_t createEventAndRetain(ur_event_handle_t *phEvent) { - auto hEvent = eventPool->allocate(); - hEvent->setQueue(this); - - if (phEvent) { - (*phEvent) = hEvent; - hEvent->retain(); - } - - return hEvent; - } - public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal, @@ -88,7 +62,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, @@ -99,7 +74,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -117,7 +93,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferRead( hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, @@ -128,7 +105,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferWrite( hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, 
createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferReadRect( @@ -141,7 +119,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferWriteRect( @@ -154,7 +133,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -165,7 +145,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferCopy( hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopyRect( @@ -178,7 +159,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t 
enqueueMemBufferFill(ur_mem_handle_t hBuffer, @@ -189,7 +171,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemBufferFill( hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, @@ -201,7 +184,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -213,7 +197,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -224,7 +209,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemImageCopy( hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, @@ -235,7 +221,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppRetMap) override { 
return commandListManager.lock()->appendMemBufferMap( hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent), ppRetMap); + phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), + ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, @@ -244,7 +231,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendMemUnmap( hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, @@ -254,7 +241,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFill( pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, @@ -263,7 +250,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMMemcpy( blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, @@ -273,7 +260,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFill2D( pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t 
dstPitch, @@ -284,7 +272,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMMemcpy2D( blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, @@ -294,14 +283,15 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMPrefetch( pMem, size, flags, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAdvise( - pMem, size, advice, 0, nullptr, createEventIfRequested(phEvent)); + pMem, size, advice, 0, nullptr, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueDeviceGlobalVariableWrite( @@ -311,7 +301,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendDeviceGlobalVariableWrite( hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueDeviceGlobalVariableRead( @@ -321,7 +312,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendDeviceGlobalVariableRead( hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + 
createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, @@ -332,7 +324,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendReadHostPipe( hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, @@ -343,7 +336,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendWriteHostPipe( hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMDeviceAllocExp( @@ -353,7 +347,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppMem, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_DEVICE); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_DEVICE); } ur_result_t enqueueUSMSharedAllocExp( @@ -363,7 +358,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { void **ppMem, ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_SHARED); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_SHARED); } ur_result_t @@ -374,7 +370,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return 
commandListManager.lock()->appendUSMAllocHelper( this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(phEvent), UR_USM_TYPE_HOST); + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_HOST); } ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, @@ -383,7 +380,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendUSMFreeExp( this, pPool, pMem, numEventsInWaitList, phEventWaitList, - createEventAndRetain(phEvent)); + createEventAndRetain(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesImageCopyExp( @@ -398,7 +395,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { return commandListManager.lock()->bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -408,7 +406,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesSignalExternalSemaphoreExp( @@ -418,7 +417,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, createEventIfRequested(phEvent)); + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } 
ur_result_t @@ -427,7 +427,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendTimestampRecordingExp( blocking, numEventsInWaitList, phEventWaitList, - createEventIfRequested(phEvent)); + createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -437,7 +437,7 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendCommandBufferExp( hCommandBuffer, numEventsInWaitList, phEventWaitList, - createEventAndRetain(phEvent)); + createEventAndRetain(eventPool.get(), phEvent, this)); } ur_result_t enqueueNativeCommandExp( @@ -448,7 +448,8 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_event_handle_t *phEvent) override { return commandListManager.lock()->appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, phEventWaitList, createEventIfRequested(phEvent)); + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); } }; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp new file mode 100644 index 0000000000000..bfb6079af3ea5 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -0,0 +1,185 @@ +//===------- queue_immediate_out_of_order.cpp - Level Zero Adapter -------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue_immediate_out_of_order.hpp" +#include "../common/latency_tracker.hpp" +#include "ur.hpp" + +namespace v2 { + +template +std::array createCommandListManagers( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index) { + return createArrayOf([&](size_t) { + return ur_command_list_manager( + hContext, hDevice, + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)); + }); +} + +ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index, + event_flags_t eventFlags, ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), + eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate) + .borrow(hDevice->Id.value(), eventFlags)), + commandListManagers(createCommandListManagers( + hContext, hDevice, ordinal, priority, index)), + flags(flags) { + for (size_t i = 0; i < numCommandLists; i++) { + barrierEvents[i] = eventPool->allocate(); + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo( + ur_queue_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hContext); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hDevice); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{RefCount.load()}); + case UR_QUEUE_INFO_FLAGS: 
+ return ReturnValue(flags); + case UR_QUEUE_INFO_SIZE: + case UR_QUEUE_INFO_DEVICE_DEFAULT: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_QUEUE_INFO_EMPTY: { + auto isCmdListEmpty = [](ze_command_list_handle_t cmdList) { + auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, (cmdList, 0)); + if (status == ZE_RESULT_SUCCESS) { + return true; + } else if (status == ZE_RESULT_NOT_READY) { + return false; + } else { + throw ze2urResult(status); + } + }; + + auto commandListManagersLocked = commandListManagers.lock(); + + bool empty = std::all_of( + commandListManagersLocked->begin(), commandListManagersLocked->end(), + [&](auto &cmdListManager) { + return isCmdListEmpty(cmdListManager.getZeCommandList()); + }); + + return ReturnValue(empty); + } + default: + UR_LOG(ERR, + "Unsupported ParamName in urQueueGetInfo: " + "ParamName=ParamName={}(0x{})", + propName, logger::toHex(propName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle( + ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { + *phNativeQueue = reinterpret_cast( + (*commandListManagers.get_no_lock())[getNextCommandListId()] + .getZeCommandList()); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() { + TRACK_SCOPE_LATENCY("ur_queue_immediate_out_of_order_t::queueFinish"); + + auto commandListManagersLocked = commandListManagers.lock(); + + for (size_t i = 0; i < numCommandLists; i++) { + ZE2UR_CALL(zeCommandListHostSynchronize, + (commandListManagersLocked[i].getZeCommandList(), UINT64_MAX)); + UR_CALL(commandListManagersLocked[i].releaseSubmittedKernels()); + } + + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + + return UR_RESULT_SUCCESS; +} + +ur_result_t 
ur_queue_immediate_out_of_order_t::queueFlush() { + return UR_RESULT_SUCCESS; +} + +ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() { + try { + UR_CALL_THROWS(queueFinish()); + + for (size_t i = 0; i < numCommandLists; i++) { + barrierEvents[i]->release(); + } + } catch (...) { + // Ignore errors during destruction + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier"); + // Since we use L0 in-order command lists, we don't need a real L0 barrier, + // just wait for requested events in potentially different queues and add a + // "barrier" event signal because it is already guaranteed that previous + // commands in this queue are completed when the signal is started. However, + // we do need to use barrier if profiling is enabled: see + // zeCommandListAppendWaitOnEvents + bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; + auto barrierFn = needsRealBarrier + ? &ur_command_list_manager::appendEventsWaitWithBarrier + : &ur_command_list_manager::appendEventsWait; + + auto commandListManagersLocked = commandListManagers.lock(); + + // Enqueue wait for the user-provided events on the first command list. + UR_CALL(commandListManagersLocked[0].appendEventsWait( + numEventsInWaitList, phEventWaitList, barrierEvents[0])); + + // Request barrierEvents[id] to be signaled on remaining command lists. + for (size_t id = 1; id < numCommandLists; id++) { + UR_CALL(commandListManagersLocked[id].appendEventsWait(0, nullptr, + barrierEvents[id])); + } + + // Enqueue barriers on all command lists by waiting on barrierEvents.
+ + if (phEvent) { + UR_CALL( + std::invoke(barrierFn, commandListManagersLocked[0], numCommandLists, + barrierEvents.data(), + createEventIfRequested(eventPool.get(), phEvent, this))); + } + + for (size_t id = phEvent ? 1 : 0; id < numCommandLists; id++) { + UR_CALL(std::invoke(barrierFn, commandListManagersLocked[id], + numCommandLists, barrierEvents.data(), nullptr)); + } + + return UR_RESULT_SUCCESS; +} + +} // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp new file mode 100644 index 0000000000000..1d0bf5636d58c --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -0,0 +1,508 @@ +//===------- queue_immediate_out_of_order.hpp - Level Zero Adapter -------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "../common.hpp" +#include "../device.hpp" + +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "queue_api.hpp" + +#include "command_list_manager.hpp" +#include "lockable.hpp" +#include "ur/ur.hpp" + +namespace v2 { + +struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { +private: + // Number of command lists was chosen experimentally as a compromise + // between number of allowed concurrent launches and overhead of + // iterating over the command lists to synchronize them. + // This might need to be changed for future hardware.
+ static constexpr size_t numCommandLists = 4; + + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + v2::raii::cache_borrowed_event_pool eventPool; + + std::atomic commandListIndex = 0; + lockable> + commandListManagers; + + ur_queue_flags_t flags; + + std::array barrierEvents; + + uint32_t getNextCommandListId() { + return commandListIndex.fetch_add(1, std::memory_order_relaxed) % + numCommandLists; + } + +public: + ur_queue_immediate_out_of_order_t(ur_context_handle_t, ur_device_handle_t, + uint32_t ordinal, + ze_command_queue_priority_t priority, + std::optional index, + event_flags_t eventFlags, + ur_queue_flags_t flags); + + ~ur_queue_immediate_out_of_order_t(); + + ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) override; + ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) override; + ur_result_t queueFinish() override; + ur_result_t queueFlush() override; + ur_result_t enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur_result_t + enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + 
ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendEventsWait( + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + ur_result_t + enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); + } + + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferRead( + hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, + const void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferWrite( + hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, + size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, + size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto 
commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferReadRect( + hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferWriteRect( + hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + 
ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageRead( + hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageWrite( + hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, + 
numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, + void **ppRetMap) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemBufferMap( + hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, + phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), + ppRetMap); + } + + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendMemUnmap( + hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFill( + pMem, patternSize, 
pPattern, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMMemcpy( + blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMMemcpy2D( + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMPrefetch( + pMem, size, flags, 
numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAdvise( + pMem, size, advice, 0, nullptr, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendDeviceGlobalVariableWrite( + hProgram, name, blockingWrite, count, offset, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendDeviceGlobalVariableRead( + hProgram, name, blockingRead, count, offset, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendReadHostPipe( + hProgram, pipe_symbol, blocking, 
pDst, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendWriteHostPipe( + hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_DEVICE); + } + + ur_result_t enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_SHARED); + } + + ur_result_t + enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + 
ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMAllocHelper( + this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, + ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + UR_USM_TYPE_HOST); + } + + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendUSMFreeExp( + this, pPool, pMem, numEventsInWaitList, phEventWaitList, + createEventAndRetain(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, + 
phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId] + .appendTimestampRecordingExp( + blocking, numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } + + ur_result_t + enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendCommandBufferExp( + hCommandBuffer, numEventsInWaitList, phEventWaitList, + createEventAndRetain(eventPool.get(), phEvent, this)); + } + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto commandListId = getNextCommandListId(); + return commandListManagers.lock()[commandListId].appendNativeCommandExp( + pfnNativeEnqueue, 
data, numMemsInMemList, phMemList, pProperties, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(eventPool.get(), phEvent, this)); + } +}; + +} // namespace v2 diff --git a/unified-runtime/source/common/ur_util.hpp b/unified-runtime/source/common/ur_util.hpp index b76223a63bfa2..b3f652325a1da 100644 --- a/unified-runtime/source/common/ur_util.hpp +++ b/unified-runtime/source/common/ur_util.hpp @@ -553,4 +553,19 @@ inline bool isPointerAlignedTo(uint32_t Alignment, void *Ptr) { reinterpret_cast(Ptr) % Alignment == 0; } +template +std::array createArrayOfHelper(F &&f, + std::index_sequence) { + return {(f(Is))...}; +} + +// Helper function to initialize std::array of non-default constructible +// types. Calls provided ctor function (passing index to the array) to create +// each element of the array. +template +std::array createArrayOf(F &&ctor) { + return createArrayOfHelper(std::forward(ctor), + std::make_index_sequence{}); +} + #endif /* UR_UTIL_H */