From 642f267e8ab159e0025efd199fc0286fcf7d5955 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 10 Feb 2023 19:56:41 -0800 Subject: [PATCH 01/50] [SYCL][UR][L0] First version of UR L0 adapter Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/CMakeLists.txt | 4 +- sycl/plugins/level_zero/pi_level_zero.cpp | 8232 +---------------- sycl/plugins/level_zero/pi_level_zero.hpp | 1322 +-- sycl/plugins/level_zero/ur_bindings.hpp | 38 - sycl/plugins/unified_runtime/CMakeLists.txt | 5 +- sycl/plugins/unified_runtime/pi2ur.hpp | 2723 +++++- .../unified_runtime/pi_unified_runtime.cpp | 1017 +- .../ur/adapters/level_zero/ur_level_zero.cpp | 1569 ---- .../ur/adapters/level_zero/ur_level_zero.hpp | 196 +- .../level_zero/ur_level_zero_common.hpp | 169 +- .../level_zero/ur_level_zero_context.cpp | 684 ++ .../level_zero/ur_level_zero_context.hpp | 231 +- .../level_zero/ur_level_zero_device.cpp | 1256 +++ .../level_zero/ur_level_zero_device.hpp | 156 + .../level_zero/ur_level_zero_event.cpp | 1167 +++ .../level_zero/ur_level_zero_event.hpp | 261 +- .../level_zero/ur_level_zero_kernel.cpp | 771 ++ .../level_zero/ur_level_zero_kernel.hpp | 97 + .../adapters/level_zero/ur_level_zero_mem.cpp | 3058 +++++- .../adapters/level_zero/ur_level_zero_mem.hpp | 293 +- .../level_zero/ur_level_zero_module.cpp | 9 - .../level_zero/ur_level_zero_module.hpp | 18 - .../level_zero/ur_level_zero_platform.cpp | 531 ++ .../level_zero/ur_level_zero_platform.hpp | 44 + .../level_zero/ur_level_zero_program.cpp | 758 ++ .../level_zero/ur_level_zero_program.hpp | 123 +- .../level_zero/ur_level_zero_queue.cpp | 1782 ++++ .../level_zero/ur_level_zero_queue.hpp | 502 +- .../level_zero/ur_level_zero_sampler.cpp | 203 + .../level_zero/ur_level_zero_sampler.hpp | 7 +- .../level_zero/ur_loader_interface.cpp | 201 +- sycl/plugins/unified_runtime/ur/ur.hpp | 43 +- sycl/plugins/unified_runtime/ur_bindings.hpp | 41 - 33 files changed, 16142 insertions(+), 11369 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp mode change 100755 => 100644 sycl/plugins/unified_runtime/ur_bindings.hpp diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 2b9dff977029d..3cd25f2dc6826 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -106,7 +106,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp" - "../unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp" @@ -117,7 +117,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp" - 
"../unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp" diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 5e53aac3c6a2d..44d747c12b871 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -12,2174 +12,17 @@ /// \ingroup sycl_pi_level_zero #include "pi_level_zero.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ur/usm_allocator_config.hpp" #include "ur_bindings.hpp" -extern "C" { -// Forward declarartions. -static pi_result piQueueReleaseInternal(pi_queue Queue); -static pi_result piEventReleaseInternal(pi_event Event); -static pi_result EventCreate(pi_context Context, pi_queue Queue, - bool HostVisible, pi_event *RetEvent); -} - // Defined in tracing.cpp void enableZeTracing(); void disableZeTracing(); -namespace { - -// This is an experimental option to test performance of device to device copy -// operations on copy engines (versus compute engine) -static const bool UseCopyEngineForD2DCopy = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); - const char *CopyEngineForD2DCopy = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); -}(); - -// This is an experimental option that allows the use of copy engine, if -// available in the device, in Level Zero plugin for copy operations submitted -// to an in-order queue. The default is 1. -static const bool UseCopyEngineForInOrderQueue = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); - const char *CopyEngineForInOrderQueue = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - return (!CopyEngineForInOrderQueue || - (std::stoi(CopyEngineForInOrderQueue) != 0)); -}(); - -// This is an experimental option that allows the use of multiple command lists -// when submitting barriers. The default is 0. -static const bool UseMultipleCmdlistBarriers = [] { - const char *UrRet = std::getenv("UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); - const char *UseMultipleCmdlistBarriersFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!UseMultipleCmdlistBarriersFlag) - return true; - return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; -}(); - -// This is an experimental option that allows to disable caching of events in -// the context. -static const bool DisableEventsCaching = [] { - const char *UrRet = std::getenv("UR_L0_DISABLE_EVENTS_CACHING"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); - const char *DisableEventsCachingFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!DisableEventsCachingFlag) - return false; - return std::stoi(DisableEventsCachingFlag) != 0; -}(); - -// This is an experimental option that allows reset and reuse of uncompleted -// events in the in-order queue with discard_events property. 
-static const bool ReuseDiscardedEvents = [] { - const char *UrRet = std::getenv("UR_L0_REUSE_DISCARDED_EVENTS"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); - const char *ReuseDiscardedEventsFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!ReuseDiscardedEventsFlag) - return true; - return std::stoi(ReuseDiscardedEventsFlag) > 0; -}(); - -// Due to a bug with 2D memory copy to and from non-USM pointers, this option is -// disabled by default. -static const bool UseMemcpy2DOperations = [] { - const char *UrRet = std::getenv("UR_L0_USE_NATIVE_USM_MEMCPY2D"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); - const char *UseMemcpy2DOperationsFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!UseMemcpy2DOperationsFlag) - return false; - return std::stoi(UseMemcpy2DOperationsFlag) > 0; -}(); - -// Map from L0 to PI result. -static inline pi_result mapError(ze_result_t Result) { - return ur2piResult(ze2urResult(Result)); -} - -// Trace a call to Level-Zero RT -#define ZE_CALL(ZeName, ZeArgs) \ - { \ - ze_result_t ZeResult = ZeName ZeArgs; \ - if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \ - return mapError(Result); \ - } - -// Trace an internal PI call; returns in case of an error. -#define PI_CALL(Call) \ - { \ - if (PrintTrace) \ - fprintf(stderr, "PI ---> %s\n", #Call); \ - pi_result Result = (Call); \ - if (Result != PI_SUCCESS) \ - return Result; \ - } - -// Controls if we should choose doing eager initialization -// to make it happen on warmup paths and have the reportable -// paths be less likely affected. -// -static bool doEagerInit = [] { - const char *UrRet = std::getenv("UR_L0_EAGER_INIT"); - const char *PiRet = std::getenv("SYCL_EAGER_INIT"); - const char *EagerInit = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - return EagerInit ? std::atoi(EagerInit) != 0 : false; -}(); - -// Maximum number of events that can be present in an event ZePool is captured -// here. Setting it to 256 gave best possible performance for several -// benchmarks. -static const pi_uint32 MaxNumEventsPerPool = [] { - const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - const char *MaxNumEventsPerPoolEnv = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - pi_uint32 Result = - MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; - if (Result <= 0) - Result = 256; - return Result; -}(); - -// Helper function to implement zeHostSynchronize. -// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. -// This allows for a much more responsive debugging of hangs. -// -template -ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) { - if (!UrL0Debug) { - return Api(Handle, UINT64_MAX); - } - - ze_result_t R; - while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY) - ; - return R; -} - -// Template function to do various types of host synchronizations. -// This is intended to be used instead of direct calls to specific -// Level-Zero synchronization APIs. 
-// -template ze_result_t zeHostSynchronize(T Handle); -template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) { - return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle); -} -template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) { - return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle); -} - -} // anonymous namespace - -// UR_L0_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in -// which case all compute commands will be submitted to the command-queue -// with the given index in the compute command group. If it is instead set -// to negative then all available compute engines may be used. -// -// The default value is "0". -// -static const std::pair getRangeOfAllowedComputeEngines() { - const char *UrRet = std::getenv("UR_L0_USE_COMPUTE_ENGINE"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); - const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - // If the environment variable is not set only use "0" CCS for now. - // TODO: allow all CCSs when HW support is complete. - if (!EnvVar) - return std::pair(0, 0); - - auto EnvVarValue = std::atoi(EnvVar); - if (EnvVarValue >= 0) { - return std::pair(EnvVarValue, EnvVarValue); - } - - return std::pair(0, INT_MAX); -} - -pi_platform _pi_context::getPlatform() const { return Devices[0]->Platform; } - -bool _pi_context::isValidDevice(pi_device Device) const { - while (Device) { - if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end()) - return true; - Device = Device->RootDevice; - } - return false; -} - -pi_result -_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, - size_t &Index, bool HostVisible, - bool ProfilingEnabled) { - // Lock while updating event pool machinery. - std::scoped_lock Lock(ZeEventPoolCacheMutex); - - std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled); - - if (!ZePoolCache->empty()) { - if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { - if (DisableEventsCaching) { - // Remove full pool from the cache if events caching is disabled. - ZePoolCache->erase(ZePoolCache->begin()); - } else { - // If event caching is enabled then we don't destroy events so there is - // no need to remove pool from the cache and add it back when it has - // available slots. Just keep it in the tail of the cache so that all - // pools can be destroyed during context destruction. - ZePoolCache->push_front(nullptr); - } - } - } - if (ZePoolCache->empty()) { - ZePoolCache->push_back(nullptr); - } - - // We shall be adding an event to the front pool. 
- ze_event_pool_handle_t *ZePool = &ZePoolCache->front(); - Index = 0; - // Create one event ZePool per MaxNumEventsPerPool events - if (*ZePool == nullptr) { - ZeStruct ZeEventPoolDesc; - ZeEventPoolDesc.count = MaxNumEventsPerPool; - ZeEventPoolDesc.flags = 0; - if (HostVisible) - ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; - if (ProfilingEnabled) - ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); - - std::vector ZeDevices; - std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) { - ZeDevices.push_back(D->ZeDevice); - }); - - ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(), - &ZeDevices[0], ZePool)); - NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1; - NumEventsUnreleasedInEventPool[*ZePool] = 1; - } else { - Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool]; - --NumEventsAvailableInEventPool[*ZePool]; - ++NumEventsUnreleasedInEventPool[*ZePool]; - } - Pool = *ZePool; - return PI_SUCCESS; -} - -pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { - std::shared_lock EventLock(Event->Mutex, std::defer_lock); - std::scoped_lock> LockAll( - ZeEventPoolCacheMutex, EventLock); - if (!Event->ZeEventPool) { - // This must be an interop event created on a users's pool. - // Do nothing. - return PI_SUCCESS; - } - - std::list *ZePoolCache = - getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); - - // Put the empty pool to the cache of the pools. - if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) - die("Invalid event release: event pool doesn't have unreleased events"); - if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { - if (ZePoolCache->front() != Event->ZeEventPool) { - ZePoolCache->push_back(Event->ZeEventPool); - } - NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; - } - - return PI_SUCCESS; -} - -// Forward declarations -static pi_result enqueueMemCopyHelper(pi_command_type CommandType, - pi_queue Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, - const void *Src, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, - pi_event *Event, bool PreferCopyEngine); - -static pi_result enqueueMemCopyRectHelper( - pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer, - void *DstBuffer, pi_buff_rect_offset SrcOrigin, - pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, - size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, - size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event, - bool PreferCopyEngine = false); - -bool _pi_queue::doReuseDiscardedEvents() { - return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); -} - -pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { - if (LastCommandEvent && LastCommandEvent->IsDiscarded) { - ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); - ZE_CALL(zeCommandListAppendEventReset, - (CommandList->first, LastCommandEvent->ZeEvent)); - - // Create new pi_event but with the same ze_event_handle_t. We are going - // to use this pi_event for the next command with discarded event. 
- pi_event PiEvent; - try { - PiEvent = new _pi_event(LastCommandEvent->ZeEvent, - LastCommandEvent->ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (LastCommandEvent->isHostVisible()) - PiEvent->HostVisibleEvent = PiEvent; - - PI_CALL(addEventToQueueCache(PiEvent)); - } - - return PI_SUCCESS; -} - -// This helper function creates a pi_event and associate a pi_queue. -// Note that the caller of this function must have acquired lock on the Queue -// that is passed in. -// \param Queue pi_queue to associate with a new event. -// \param Event a pointer to hold the newly created pi_event -// \param CommandType various command type determined by the caller -// \param CommandList is the command list where the event is added -// \param IsInternal tells if the event is internal, i.e. visible in the L0 -// plugin only. -// \param HostVisible tells if the event must be created in the -// host-visible pool. If not set then this function will decide. -inline static pi_result -createEventAndAssociateQueue(pi_queue Queue, pi_event *Event, - pi_command_type CommandType, - pi_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt) { - - if (!HostVisible.has_value()) { - // Internal/discarded events do not need host-scope visibility. - HostVisible = - IsInternal ? false : Queue->Device->ZeEventsScope == AllHostVisible; - } - - // If event is discarded then try to get event from the queue cache. - *Event = - IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; - - if (*Event == nullptr) - PI_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); - - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->IsDiscarded = IsInternal; - (*Event)->CommandList = CommandList; - // Discarded event doesn't own ze_event, it is used by multiple pi_event - // objects. We destroy corresponding ze_event by releasing events from the - // events cache at queue destruction. Event in the cache owns the Level Zero - // event. - if (IsInternal) - (*Event)->OwnZeEvent = false; - - // Append this Event to the CommandList, if any - if (CommandList != Queue->CommandListMap.end()) { - CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - } - - // We need to increment the reference counter here to avoid pi_queue - // being released before the associated pi_event is released because - // piEventRelease requires access to the associated pi_queue. - // In piEventRelease, the reference counter of the Queue is decremented - // to release it. - Queue->RefCount.increment(); - - // SYCL RT does not track completion of the events, so it could - // release a PI event as soon as that's not being waited in the app. - // But we have to ensure that the event is not destroyed before - // it is really signalled, so retain it explicitly here and - // release in CleanupCompletedEvent(Event). - // If the event is internal then don't increment the reference count as this - // event will not be waited/released by SYCL RT, so it must be destroyed by - // EventRelease in resetCommandList. 
- if (!IsInternal) - PI_CALL(piEventRetain(*Event)); - - return PI_SUCCESS; -} - -pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded( - pi_command_list_ptr_t CommandList) { - // We signal new event at the end of command list only if we have queue with - // discard_events property and the last command event is discarded. - if (!(doReuseDiscardedEvents() && LastCommandEvent && - LastCommandEvent->IsDiscarded)) - return PI_SUCCESS; - - // NOTE: We create this "glue" event not as internal so it is not - // participating in the discarded events reset/reuse logic, but - // with no host-visibility since it is not going to be waited - // from the host. - pi_event Event; - PI_CALL(createEventAndAssociateQueue( - this, &Event, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ false)); - PI_CALL(piEventReleaseInternal(Event)); - LastCommandEvent = Event; - - ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, Event->ZeEvent)); - return PI_SUCCESS; -} - -pi_event _pi_queue::getEventFromQueueCache(bool HostVisible) { - auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; - - // If we don't have any events, return nullptr. - // If we have only a single event then it was used by the last command and we - // can't use it now because we have to enforce round robin between two events. - if (Cache->size() < 2) - return nullptr; - - // If there are two events then return an event from the beginning of the list - // since event of the last command is added to the end of the list. - auto It = Cache->begin(); - pi_event RetEvent = *It; - Cache->erase(It); - return RetEvent; -} - -pi_result _pi_queue::addEventToQueueCache(pi_event Event) { - auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; - Cache->emplace_back(Event); - return PI_SUCCESS; -} - -// Get value of the threshold for number of events in immediate command lists. -// If number of events in the immediate command list exceeds this threshold then -// cleanup process for those events is executed. -static const size_t ImmCmdListsEventCleanupThreshold = [] { - const char *UrRet = - std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - const char *PiRet = std::getenv( - "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - const char *ImmCmdListsEventCleanupThresholdStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - static constexpr int Default = 1000; - if (!ImmCmdListsEventCleanupThresholdStr) - return Default; - - int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr); - - // Basically disable threshold if negative value is provided. - if (Threshold < 0) - return INT_MAX; - - return Threshold; -}(); - -// Get value of the threshold for number of active command lists allowed before -// we start heuristically cleaning them up. -static const size_t CmdListsCleanupThreshold = [] { - const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); - const char *CmdListsCleanupThresholdStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - static constexpr int Default = 20; - if (!CmdListsCleanupThresholdStr) - return Default; - - int Threshold = std::atoi(CmdListsCleanupThresholdStr); - - // Basically disable threshold if negative value is provided. 
- if (Threshold < 0) - return INT_MAX; - - return Threshold; -}(); - -pi_device _pi_context::getRootDevice() const { - assert(Devices.size() > 0); - - if (Devices.size() == 1) - return Devices[0]; - - // Check if we have context with subdevices of the same device (context - // may include root device itself as well) - pi_device ContextRootDevice = - Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0]; - - // For context with sub subdevices, the ContextRootDevice might still - // not be the root device. - // Check whether the ContextRootDevice is the subdevice or root device. - if (ContextRootDevice->isSubDevice()) { - ContextRootDevice = ContextRootDevice->RootDevice; - } - - for (auto &Device : Devices) { - if ((!Device->RootDevice && Device != ContextRootDevice) || - (Device->RootDevice && Device->RootDevice != ContextRootDevice)) { - ContextRootDevice = nullptr; - break; - } - } - return ContextRootDevice; -} - -pi_result _pi_context::initialize() { - - // Helper lambda to create various USM allocators for a device. - // Note that the CCS devices and their respective subdevices share a - // common ze_device_handle and therefore, also share USM allocators. - auto createUSMAllocators = [this](pi_device Device) { - SharedMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple( - std::unique_ptr( - new USMSharedMemoryAlloc(this, Device)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Shared])); - - SharedReadOnlyMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple(std::unique_ptr( - new USMSharedReadOnlyMemoryAlloc(this, Device)), - USMAllocatorConfigInstance - .Configs[usm_settings::MemType::SharedReadOnly])); - - DeviceMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple( - std::unique_ptr( - new USMDeviceMemoryAlloc(this, Device)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Device])); - }; - - // Recursive helper to call createUSMAllocators for all sub-devices - std::function createUSMAllocatorsRecursive; - createUSMAllocatorsRecursive = - [createUSMAllocators, - &createUSMAllocatorsRecursive](pi_device Device) -> void { - createUSMAllocators(Device); - for (auto &SubDevice : Device->SubDevices) - createUSMAllocatorsRecursive(SubDevice); - }; - - // Create USM allocator context for each pair (device, context). - // - for (auto &Device : Devices) { - createUSMAllocatorsRecursive(Device); - } - // Create USM allocator context for host. Device and Shared USM allocations - // are device-specific. Host allocations are not device-dependent therefore - // we don't need a map with device as key. - HostMemAllocContext = std::make_unique( - std::unique_ptr(new USMHostMemoryAlloc(this)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Host]); - - // We may allocate memory to this root device so create allocators. - if (SingleRootDevice && - DeviceMemAllocContexts.find(SingleRootDevice->ZeDevice) == - DeviceMemAllocContexts.end()) { - createUSMAllocators(SingleRootDevice); - } - - // Create the immediate command list to be used for initializations. - // Created as synchronous so level-zero performs implicit synchronization and - // there is no need to query for completion in the plugin - // - // TODO: we use Device[0] here as the single immediate command-list - // for buffer creation and migration. 
Initialization is in - // in sync and is always performed to Devices[0] as well but - // D2D migartion, if no P2P, is broken since it should use - // immediate command-list for the specfic devices, and this single one. - // - pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0]; - - // Prefer to use copy engine for initialization copies, - // if available and allowed (main copy engine with index 0). - ZeStruct ZeCommandQueueDesc; - const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); - ZeCommandQueueDesc.ordinal = - Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal; - if (Range.first >= 0 && - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal != - -1) - ZeCommandQueueDesc.ordinal = - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal; - - ZeCommandQueueDesc.index = 0; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; - ZE_CALL( - zeCommandListCreateImmediate, - (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit)); - return PI_SUCCESS; -} - -pi_result _pi_context::finalize() { - // This function is called when pi_context is deallocated, piContextRelease. - // There could be some memory that may have not been deallocated. - // For example, event and event pool caches would be still alive. - - if (!DisableEventsCaching) { - std::scoped_lock Lock(EventCacheMutex); - for (auto &EventCache : EventCaches) { - for (auto &Event : EventCache) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - delete Event; - } - EventCache.clear(); - } - } - { - std::scoped_lock Lock(ZeEventPoolCacheMutex); - for (auto &ZePoolCache : ZeEventPoolCache) { - for (auto &ZePool : ZePoolCache) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - ZePoolCache.clear(); - } - } - - // Destroy the command list used for initializations - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - std::scoped_lock Lock(ZeCommandListCacheMutex); - for (auto &List : ZeComputeCommandListCache) { - for (auto &Item : List.second) { - ze_command_list_handle_t ZeCommandList = Item.first; - if (ZeCommandList) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - } - for (auto &List : ZeCopyCommandListCache) { - for (auto &Item : List.second) { - ze_command_list_handle_t ZeCommandList = Item.first; - if (ZeCommandList) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); - // Gracefully handle the case that L0 was already unloaded. 
- if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - } - return PI_SUCCESS; -} - -bool pi_command_list_info_t::isCopy(pi_queue Queue) const { - return ZeQueueDesc.ordinal != - (uint32_t)Queue->Device - ->QueueGroup[_pi_device::queue_group_info_t::type::Compute] - .ZeOrdinal; -} - -bool _pi_queue::isInOrderQueue() const { - // If out-of-order queue property is not set, then this is a in-order queue. - return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == - 0); -} - -bool _pi_queue::isDiscardEvents() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); -} - -bool _pi_queue::isPriorityLow() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); -} - -bool _pi_queue::isPriorityHigh() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); -} - -pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, - bool MakeAvailable, - std::vector &EventListToCleanup, - bool CheckStatus) { - bool UseCopyEngine = CommandList->second.isCopy(this); - - // Immediate commandlists do not have an associated fence. - if (CommandList->second.ZeFence != nullptr) { - // Fence had been signalled meaning the associated command-list completed. - // Reset the fence and put the command list into a cache for reuse in PI - // calls. - ZE_CALL(zeFenceReset, (CommandList->second.ZeFence)); - ZE_CALL(zeCommandListReset, (CommandList->first)); - CommandList->second.ZeFenceInUse = false; - CommandList->second.IsClosed = false; - } - - auto &EventList = CommandList->second.EventList; - // Check if standard commandlist or fully synced in-order queue. - // If one of those conditions is met then we are sure that all events are - // completed so we don't need to check event status. - if (!CheckStatus || CommandList->second.ZeFence != nullptr || - (isInOrderQueue() && !LastCommandEvent)) { - // Remember all the events in this command list which needs to be - // released/cleaned up and clear event list associated with command list. - std::move(std::begin(EventList), std::end(EventList), - std::back_inserter(EventListToCleanup)); - EventList.clear(); - } else if (!isDiscardEvents()) { - // If events in the queue are discarded then we can't check their status. - // Helper for checking of event completion - auto EventCompleted = [](pi_event Event) -> bool { - std::scoped_lock EventLock(Event->Mutex); - ze_result_t ZeResult = - Event->Completed - ? ZE_RESULT_SUCCESS - : ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent)); - return ZeResult == ZE_RESULT_SUCCESS; - }; - // Handle in-order specially as we can just in few checks (with binary - // search) a completed event and then all events before it are also - // done. - if (isInOrderQueue()) { - size_t Bisect = EventList.size(); - size_t Iter = 0; - for (auto it = EventList.rbegin(); it != EventList.rend(); ++Iter) { - if (!EventCompleted(*it)) { - if (Bisect > 1 && Iter < 3) { // Heuristically limit by 3 checks - Bisect >>= 1; - it += Bisect; - continue; - } - break; - } - // Bulk move of event up to "it" to the list ready for cleanup - std::move(it, EventList.rend(), std::back_inserter(EventListToCleanup)); - EventList.erase(EventList.begin(), it.base()); - break; - } - return PI_SUCCESS; - } - // For immediate commandlist reset only those events that have signalled. 
- for (auto it = EventList.begin(); it != EventList.end();) { - // Break early as soon as we found first incomplete event because next - // events are submitted even later. We are not trying to find all - // completed events here because it may be costly. I.e. we are checking - // only elements which are most likely completed because they were - // submitted earlier. It is guaranteed that all events will be eventually - // cleaned up at queue sync/release. - if (!EventCompleted(*it)) - break; - - EventListToCleanup.push_back(std::move((*it))); - it = EventList.erase(it); - } - } - - // Standard commandlists move in and out of the cache as they are recycled. - // Immediate commandlists are always available. - if (CommandList->second.ZeFence != nullptr && MakeAvailable) { - std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex); - auto &ZeCommandListCache = - UseCopyEngine - ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] - : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; - ZeCommandListCache.push_back( - {CommandList->first, CommandList->second.ZeQueueDesc}); - } - - return PI_SUCCESS; -} - -// Configuration of the command-list batching. -struct zeCommandListBatchConfig { - // Default value of 0. This specifies to use dynamic batch size adjustment. - // Other values will try to collect specified amount of commands. - pi_uint32 Size{0}; - - // If doing dynamic batching, specifies start batch size. - pi_uint32 DynamicSizeStart{4}; - - // The maximum size for dynamic batch. - pi_uint32 DynamicSizeMax{64}; - - // The step size for dynamic batch increases. - pi_uint32 DynamicSizeStep{1}; - - // Thresholds for when increase batch size (number of closed early is small - // and number of closed full is high). - pi_uint32 NumTimesClosedEarlyThreshold{3}; - pi_uint32 NumTimesClosedFullThreshold{8}; - - // Tells the starting size of a batch. - pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; } - // Tells is we are doing dynamic batch size adjustment. - bool dynamic() const { return Size == 0; } -}; - -// Helper function to initialize static variables that holds batch config info -// for compute and copy command batching. -static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { - zeCommandListBatchConfig Config{}; // default initialize - - // Default value of 0. This specifies to use dynamic batch size adjustment. - const char *UrRet = nullptr; - const char *PiRet = nullptr; - if (IsCopy) { - UrRet = std::getenv("UR_L0_COPY_BATCH_SIZE"); - PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE"); - } else { - UrRet = std::getenv("UR_L0_BATCH_SIZE"); - PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); - } - const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (BatchSizeStr) { - pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); - // Level Zero may only support a limted number of commands per command - // list. The actual upper limit is not specified by the Level Zero - // Specification. For now we allow an arbitrary upper limit. - if (BatchSizeStrVal > 0) { - Config.Size = BatchSizeStrVal; - } else if (BatchSizeStrVal == 0) { - Config.Size = 0; - // We are requested to do dynamic batching. Collect specifics, if any. - // The extended format supported is ":" separated values. - // - // NOTE: these extra settings are experimental and are intended to - // be used only for finding a better default heuristic. 
- // - std::string BatchConfig(BatchSizeStr); - size_t Ord = 0; - size_t Pos = 0; - while (true) { - if (++Ord > 5) - break; - - Pos = BatchConfig.find(":", Pos); - if (Pos == std::string::npos) - break; - ++Pos; // past the ":" - - pi_uint32 Val; - try { - Val = std::stoi(BatchConfig.substr(Pos)); - } catch (...) { - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: failed to parse value\n"); - else - urPrint("UR_L0_BATCH_SIZE: failed to parse value\n"); - break; - } - switch (Ord) { - case 1: - Config.DynamicSizeStart = Val; - break; - case 2: - Config.DynamicSizeMax = Val; - break; - case 3: - Config.DynamicSizeStep = Val; - break; - case 4: - Config.NumTimesClosedEarlyThreshold = Val; - break; - case 5: - Config.NumTimesClosedFullThreshold = Val; - break; - default: - die("Unexpected batch config"); - } - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: dynamic batch param " - "#%d: %d\n", - (int)Ord, (int)Val); - else - urPrint("UR_L0_BATCH_SIZE: dynamic batch param #%d: %d\n", (int)Ord, - (int)Val); - }; - - } else { - // Negative batch sizes are silently ignored. - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: ignored negative value\n"); - else - urPrint("UR_L0_BATCH_SIZE: ignored negative value\n"); - } - } - return Config; -} - -// Static variable that holds batch config info for compute command batching. -static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] { - using IsCopy = bool; - return ZeCommandListBatchConfig(IsCopy{false}); -}(); - -// Static variable that holds batch config info for copy command batching. -static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { - using IsCopy = bool; - return ZeCommandListBatchConfig(IsCopy{true}); -}(); - -// Control if wait with barrier is implemented by signal of an event -// as opposed by true barrier command for in-order queue. -static const bool InOrderBarrierBySignal = [] { - const char *UrRet = std::getenv("UR_L0_IN_ORDER_BARRIER_BY_SIGNAL"); - return (UrRet ? std::atoi(UrRet) : true); -}(); - -_pi_queue::_pi_queue(std::vector &ComputeQueues, - std::vector &CopyQueues, - pi_context Context, pi_device Device, - bool OwnZeCommandQueue, - pi_queue_properties PiQueueProperties, - int ForceComputeIndex) - : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, - Properties(PiQueueProperties) { - UsingImmCmdLists = Device->useImmediateCommandLists(); - urPrint("ImmCmdList setting (%s)\n", (UsingImmCmdLists ? "YES" : "NO")); - - // Compute group initialization. - // First, see if the queue's device allows for round-robin or it is - // fixed to one particular compute CCS (it is so for sub-sub-devices). - auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute]; - pi_queue_group_t ComputeQueueGroup{this, queue_type::Compute}; - ComputeQueueGroup.ZeQueues = ComputeQueues; - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - if (UsingImmCmdLists) { - ComputeQueueGroup.ImmCmdLists = std::vector( - ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - if (ComputeQueueGroupInfo.ZeIndex >= 0) { - // Sub-sub-device - - // sycl::ext::intel::property::queue::compute_index works with any - // backend/device by allowing single zero index if multiple compute CCSes - // are not supported. Sub-sub-device falls into the same bucket. 
- assert(ForceComputeIndex <= 0); - ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex; - ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex; - ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex; - } else if (ForceComputeIndex >= 0) { - ComputeQueueGroup.LowerIndex = ForceComputeIndex; - ComputeQueueGroup.UpperIndex = ForceComputeIndex; - ComputeQueueGroup.NextIndex = ForceComputeIndex; - } else { - // Set-up to round-robin across allowed range of engines. - uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first; - uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second; - FilterUpperIndex = std::min((size_t)FilterUpperIndex, - FilterLowerIndex + ComputeQueues.size() - 1); - if (FilterLowerIndex <= FilterUpperIndex) { - ComputeQueueGroup.LowerIndex = FilterLowerIndex; - ComputeQueueGroup.UpperIndex = FilterUpperIndex; - ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex; - } else { - die("No compute queue available/allowed."); - } - } - if (UsingImmCmdLists) { - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - ComputeQueueGroup.ImmCmdLists = std::vector( - ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - ComputeQueueGroupsByTID.set(ComputeQueueGroup); - - // Copy group initialization. - pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy}; - const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); - if (Range.first < 0 || Range.second < 0) { - // We are asked not to use copy engines, just do nothing. - // Leave CopyQueueGroup.ZeQueues empty, and it won't be used. - } else { - uint32_t FilterLowerIndex = Range.first; - uint32_t FilterUpperIndex = Range.second; - FilterUpperIndex = std::min((size_t)FilterUpperIndex, - FilterLowerIndex + CopyQueues.size() - 1); - if (FilterLowerIndex <= FilterUpperIndex) { - CopyQueueGroup.ZeQueues = CopyQueues; - CopyQueueGroup.LowerIndex = FilterLowerIndex; - CopyQueueGroup.UpperIndex = FilterUpperIndex; - CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex; - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - if (UsingImmCmdLists) { - CopyQueueGroup.ImmCmdLists = std::vector( - CopyQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - } - } - CopyQueueGroupsByTID.set(CopyQueueGroup); - - // Initialize compute/copy command batches. - ComputeCommandBatch.OpenCommandList = CommandListMap.end(); - CopyCommandBatch.OpenCommandList = CommandListMap.end(); - ComputeCommandBatch.QueueBatchSize = - ZeCommandListBatchComputeConfig.startSize(); - CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); -} - -static pi_result CleanupCompletedEvent(pi_event Event, - bool QueueLocked = false); - -// Helper function to perform the necessary cleanup of the events from reset cmd -// list. -static pi_result -CleanupEventListFromResetCmdList(std::vector &EventListToCleanup, - bool QueueLocked = false) { - for (auto &Event : EventListToCleanup) { - // We don't need to synchronize the events since the fence associated with - // the command list was synchronized. - { - std::scoped_lock EventLock(Event->Mutex); - Event->Completed = true; - } - PI_CALL(CleanupCompletedEvent(Event, QueueLocked)); - // This event was removed from the command list, so decrement ref count - // (it was incremented when they were added to the command list). - PI_CALL(piEventReleaseInternal(Event)); - } - return PI_SUCCESS; -} - -/// @brief Cleanup events in the immediate lists of the queue. 
-/// @param Queue Queue where events need to be cleaned up. -/// @param QueueLocked Indicates if the queue mutex is locked by caller. -/// @param QueueSynced 'true' if queue was synchronized before the -/// call and no other commands were submitted after synchronization, 'false' -/// otherwise. -/// @param CompletedEvent Hint providing an event which was synchronized before -/// the call, in case of in-order queue it allows to cleanup all preceding -/// events. -/// @return PI_SUCCESS if successful, PI error code otherwise. -static pi_result CleanupEventsInImmCmdLists(pi_queue Queue, - bool QueueLocked = false, - bool QueueSynced = false, - pi_event CompletedEvent = nullptr) { - // Handle only immediate command lists here. - if (!Queue || !Queue->UsingImmCmdLists) - return PI_SUCCESS; - - std::vector EventListToCleanup; - { - std::unique_lock QueueLock(Queue->Mutex, std::defer_lock); - if (!QueueLocked) - QueueLock.lock(); - // If queue is locked and fully synchronized then cleanup all events. - // If queue is not locked then by this time there may be new submitted - // commands so we can't do full cleanup. - if (QueueLocked && - (QueueSynced || (Queue->isInOrderQueue() && - (CompletedEvent == Queue->LastCommandEvent || - !Queue->LastCommandEvent)))) { - Queue->LastCommandEvent = nullptr; - for (auto &&It = Queue->CommandListMap.begin(); - It != Queue->CommandListMap.end(); ++It) { - PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup, - /* CheckStatus */ false)); - } - } else if (Queue->isInOrderQueue() && CompletedEvent) { - // If the queue is in-order and we have information about completed event - // then cleanup all events in the command list preceding to CompletedEvent - // including itself. - - // Check that the comleted event has associated command list. - if (!(CompletedEvent->CommandList && - CompletedEvent->CommandList.value() != Queue->CommandListMap.end())) - return PI_SUCCESS; - - auto &CmdListEvents = - CompletedEvent->CommandList.value()->second.EventList; - auto CompletedEventIt = - std::find(CmdListEvents.begin(), CmdListEvents.end(), CompletedEvent); - if (CompletedEventIt != CmdListEvents.end()) { - // We can cleanup all events prior to the completed event in this - // command list and completed event itself. - // TODO: we can potentially cleanup more events here by finding - // completed events on another command lists, but it is currently not - // implemented. - std::move(std::begin(CmdListEvents), CompletedEventIt + 1, - std::back_inserter(EventListToCleanup)); - CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1); - } - } else { - // Fallback to resetCommandList over all command lists. - for (auto &&It = Queue->CommandListMap.begin(); - It != Queue->CommandListMap.end(); ++It) { - PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup, - /* CheckStatus */ true)); - } - } - } - PI_CALL(CleanupEventListFromResetCmdList(EventListToCleanup, QueueLocked)); - return PI_SUCCESS; -} - -/// @brief Reset signalled command lists in the queue and put them to the cache -/// of command lists. Also cleanup events associated with signalled command -/// lists. Queue must be locked by the caller for modification. -/// @param Queue Queue where we look for signalled command lists and cleanup -/// events. -/// @return PI_SUCCESS if successful, PI error code otherwise. -static pi_result resetCommandLists(pi_queue Queue) { - // Handle immediate command lists here, they don't need to be reset and we - // only need to cleanup events. 
- if (Queue->UsingImmCmdLists) { - PI_CALL(CleanupEventsInImmCmdLists(Queue, true /*locked*/)); - return PI_SUCCESS; - } - - // We need events to be cleaned up out of scope where queue is locked to avoid - // nested locks, because event cleanup requires event to be locked. Nested - // locks are hard to control and can cause deadlocks if mutexes are locked in - // different order. - std::vector EventListToCleanup; - - // We check for command lists that have been already signalled, but have not - // been added to the available list yet. Each command list has a fence - // associated which tracks if a command list has completed dispatch of its - // commands and is ready for reuse. If a command list is found to have been - // signalled, then the command list & fence are reset and command list is - // returned to the command list cache. All events associated with command - // list are cleaned up if command list was reset. - for (auto &&it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // Immediate commandlists don't use a fence and are handled separately - // above. - assert(it->second.ZeFence != nullptr); - // It is possible that the fence was already noted as signalled and - // reset. In that case the ZeFenceInUse flag will be false. - if (it->second.ZeFenceInUse) { - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); - if (ZeResult == ZE_RESULT_SUCCESS) - PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup)); - } - } - CleanupEventListFromResetCmdList(EventListToCleanup, true /*locked*/); - return PI_SUCCESS; -} - -// Retrieve an available command list to be used in a PI call. -pi_result _pi_context::getAvailableCommandList( - pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine, - bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { - // Immediate commandlists have been pre-allocated and are always available. - if (Queue->UsingImmCmdLists) { - CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - if (CommandList->second.EventList.size() > - ImmCmdListsEventCleanupThreshold) { - std::vector EventListToCleanup; - Queue->resetCommandList(CommandList, false, EventListToCleanup); - CleanupEventListFromResetCmdList(EventListToCleanup, true); - } - PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) - return Res; - return PI_SUCCESS; - } else { - // Cleanup regular command-lists if there are too many. - // It handles the case that the queue is not synced to the host - // for a long time and we want to reclaim the command-lists for - // use by other queues. - if (Queue->CommandListMap.size() > CmdListsCleanupThreshold) { - resetCommandLists(Queue); - } - } - - auto &CommandBatch = - UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; - // Handle batching of commands - // First see if there is an command-list open for batching commands - // for this queue. - if (Queue->hasOpenCommandList(UseCopyEngine)) { - if (AllowBatching) { - CommandList = CommandBatch.OpenCommandList; - PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - return PI_SUCCESS; - } - // If this command isn't allowed to be batched or doesn't match the forced - // command queue, then we need to go ahead and execute what is already in - // the batched list, and then go on to process this. On exit from - // executeOpenCommandList OpenCommandList will be invalidated. 
- if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) - return Res; - // Note that active barriers do not need to be inserted here as they will - // have been enqueued into the command-list when they were created. - } - - // Create/Reuse the command list, because in Level Zero commands are added to - // the command lists, and later are then added to the command queue. - // Each command list is paired with an associated fence to track when the - // command list is available for reuse. - _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES; - - // Initally, we need to check if a command list has already been created - // on this device that is available for use. If so, then reuse that - // Level-Zero Command List and Fence for this PI call. - { - // Make sure to acquire the lock before checking the size, or there - // will be a race condition. - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - // Under mutex since operator[] does insertion on the first usage for every - // unique ZeDevice. - auto &ZeCommandListCache = - UseCopyEngine - ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - - for (auto ZeCommandListIt = ZeCommandListCache.begin(); - ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - auto &ZeCommandList = ZeCommandListIt->first; - auto it = Queue->CommandListMap.find(ZeCommandList); - if (it != Queue->CommandListMap.end()) { - if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) - continue; - CommandList = it; - if (CommandList->second.ZeFence != nullptr) - CommandList->second.ZeFenceInUse = true; - } else { - // If there is a command list available on this context, but it - // wasn't yet used in this queue then create a new entry in this - // queue's map to hold the fence and other associated command - // list information. - auto &QGroup = Queue->getQueueGroup(UseCopyEngine); - uint32_t QueueGroupOrdinal; - auto &ZeCommandQueue = ForcedCmdQueue - ? *ForcedCmdQueue - : QGroup.getZeQueue(&QueueGroupOrdinal); - if (ForcedCmdQueue) - QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - ZeStruct ZeQueueDesc; - ZeQueueDesc.ordinal = QueueGroupOrdinal; - CommandList = - Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, false, - ZeCommandQueue, ZeQueueDesc}) - .first; - } - ZeCommandListCache.erase(ZeCommandListIt); - if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) - return Res; - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) - return Res; - return PI_SUCCESS; - } - } - - // If there are no available command lists in the cache, then we check for - // command lists that have already signalled, but have not been added to the - // available list yet. Each command list has a fence associated which tracks - // if a command list has completed dispatch of its commands and is ready for - // reuse. If a command list is found to have been signalled, then the - // command list & fence are reset and we return. - for (auto it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // Make sure this is the command list type needed. 
- if (UseCopyEngine != it->second.isCopy(Queue)) - continue; - - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); - if (ZeResult == ZE_RESULT_SUCCESS) { - std::vector EventListToCleanup; - Queue->resetCommandList(it, false, EventListToCleanup); - CleanupEventListFromResetCmdList(EventListToCleanup, - true /* QueueLocked */); - CommandList = it; - CommandList->second.ZeFenceInUse = true; - if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) - return Res; - return PI_SUCCESS; - } - } - - // If there are no available command lists nor signalled command lists, - // then we must create another command list. - pi_result = Queue->createCommandList(UseCopyEngine, CommandList); - CommandList->second.ZeFenceInUse = true; - return pi_result; -} - -_pi_queue::pi_queue_group_t &_pi_queue::getQueueGroup(bool UseCopyEngine) { - auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID); - return Map.get(); -} - -// Helper function to create a new command-list to this queue and associated -// fence tracking its completion. This command list & fence are added to the -// map of command lists in this queue with ZeFenceInUse = false. -// The caller must hold a lock of the queue already. -pi_result -_pi_queue::createCommandList(bool UseCopyEngine, - pi_command_list_ptr_t &CommandList, - ze_command_queue_handle_t *ForcedCmdQueue) { - - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - ze_command_list_handle_t ZeCommandList; - - uint32_t QueueGroupOrdinal; - auto &QGroup = getQueueGroup(UseCopyEngine); - auto &ZeCommandQueue = - ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); - if (ForcedCmdQueue) - QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - - ZeStruct ZeCommandListDesc; - ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - - ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); - - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - ZeStruct ZeQueueDesc; - ZeQueueDesc.ordinal = QueueGroupOrdinal; - std::tie(CommandList, std::ignore) = CommandListMap.insert( - std::pair( - ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); - - PI_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); - PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); - return PI_SUCCESS; -} - -void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - auto &ZeCommandListBatchConfig = - IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; - pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize; - // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) - return; - CommandBatch.NumTimesClosedFull += 1; - - // If the number of times the list has been closed early is low, and - // the number of times it has been closed full is high, then raise - // the batching size slowly. Don't raise it if it is already pretty - // high. 
- if (CommandBatch.NumTimesClosedEarly <= - ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold && - CommandBatch.NumTimesClosedFull > - ZeCommandListBatchConfig.NumTimesClosedFullThreshold) { - if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) { - QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep; - urPrint("Raising QueueBatchSize to %d\n", QueueBatchSize); - } - CommandBatch.NumTimesClosedEarly = 0; - CommandBatch.NumTimesClosedFull = 0; - } -} - -void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - auto &ZeCommandListBatchConfig = - IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; - pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize; - // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) - return; - CommandBatch.NumTimesClosedEarly += 1; - - // If we are closing early more than about 3x the number of times - // it is closing full, lower the batch size to the value of the - // current open command list. This is trying to quickly get to a - // batch size that will be able to be closed full at least once - // in a while. - if (CommandBatch.NumTimesClosedEarly > - (CommandBatch.NumTimesClosedFull + 1) * 3) { - QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1; - if (QueueBatchSize < 1) - QueueBatchSize = 1; - urPrint("Lowering QueueBatchSize to %d\n", QueueBatchSize); - CommandBatch.NumTimesClosedEarly = 0; - CommandBatch.NumTimesClosedFull = 0; - } -} - -void _pi_queue::CaptureIndirectAccesses() { - for (auto &Kernel : KernelsToBeSubmitted) { - if (!Kernel->hasIndirectAccess()) - continue; - - auto &Contexts = Device->Platform->Contexts; - for (auto &Ctx : Contexts) { - for (auto &Elem : Ctx->MemAllocs) { - const auto &Pair = Kernel->MemAllocs.insert(&Elem); - // Kernel is referencing this memory allocation from now. - // If this memory allocation was already captured for this kernel, it - // means that kernel is submitted several times. Increase reference - // count only once because we release all allocations only when - // SubmissionsCount turns to 0. We don't want to know how many times - // allocation was retained by each submission. - if (Pair.second) - Elem.second.RefCount.increment(); - } - } - Kernel->SubmissionsCount++; - } - KernelsToBeSubmitted.clear(); -} - -pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, - bool IsBlocking, - bool OKToBatchCommand) { - // Do nothing if command list is already closed. - if (CommandList->second.IsClosed) - return PI_SUCCESS; - - bool UseCopyEngine = CommandList->second.isCopy(this); - - // If the current LastCommandEvent is the nullptr, then it means - // either that no command has ever been issued to the queue - // or it means that the LastCommandEvent has been signalled and - // therefore that this Queue is idle. - // - // NOTE: this behavior adds some flakyness to the batching - // since last command's event may or may not be completed by the - // time we get here depending on timings and system/gpu load. - // So, disable it for modes where we print PI traces. Printing - // traces incurs much different timings than real execution - // ansyway, and many regression tests use it. - // - bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr; - - // The list can be empty if command-list only contains signals of proxy - // events. 
It is possible that executeCommandList is called twice for the same
-  // command list without a new appended command. We don't want to process
-  // the same last command event twice; that's why we additionally check that
-  // a new command was appended to the command list.
-  if (!CommandList->second.EventList.empty() &&
-      this->LastCommandEvent != CommandList->second.EventList.back()) {
-    this->LastCommandEvent = CommandList->second.EventList.back();
-    if (doReuseDiscardedEvents()) {
-      PI_CALL(resetDiscardedEvent(CommandList));
-    }
-  }
-
-  this->LastUsedCommandList = CommandList;
-
-  if (!UsingImmCmdLists) {
-    // Batch if allowed to, but don't batch if we know there are no kernels
-    // from this queue that are currently executing. This is intended to get
-    // kernels started as soon as possible when there are no kernels from this
-    // queue awaiting execution, while allowing batching to occur when there
-    // are kernels already executing. Also, if we are using fixed size batching,
-    // as indicated by !ZeCommandListBatch.dynamic(), then just ignore
-    // CurrentlyEmpty as we want to strictly follow the batching the user
-    // specified.
-    auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
-    auto &ZeCommandListBatchConfig = UseCopyEngine
-                                         ? ZeCommandListBatchCopyConfig
-                                         : ZeCommandListBatchComputeConfig;
-    if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) &&
-        (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) {
-
-      if (hasOpenCommandList(UseCopyEngine) &&
-          CommandBatch.OpenCommandList != CommandList)
-        die("executeCommandList: OpenCommandList should be equal to "
-            "null or CommandList");
-
-      if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
-        CommandBatch.OpenCommandList = CommandList;
-        return PI_SUCCESS;
-      }
-
-      adjustBatchSizeForFullBatch(UseCopyEngine);
-      CommandBatch.OpenCommandList = CommandListMap.end();
-    }
-  }
-
-  auto &ZeCommandQueue = CommandList->second.ZeQueue;
-  // Scope of the lock must be till the end of the function, otherwise new mem
-  // allocs can be created between the moment when we made a snapshot and the
-  // moment when the command list is closed and executed. But the mutex is only
-  // locked if indirect access tracking is enabled, because std::defer_lock is
-  // used. The unique_lock destructor at the end of the function will unlock
-  // the mutex if it was locked (which happens only if
-  // IndirectAccessTrackingEnabled is true).
-  std::unique_lock ContextsLock(
-      Device->Platform->ContextsMutex, std::defer_lock);
-
-  if (IndirectAccessTrackingEnabled) {
-    // We are going to submit kernels for execution. If the indirect access
-    // flag is set for a kernel then we need to make a snapshot of existing
-    // memory allocations in all contexts in the platform. We need to lock the
-    // mutex guarding the list of contexts in the platform to prevent creation
-    // of new memory allocations in any context before we submit the kernel
-    // for execution.
-    ContextsLock.lock();
-    CaptureIndirectAccesses();
-  }
-
-  if (!UsingImmCmdLists) {
-    // In this mode all inner-batch events have device visibility only,
-    // and we want the last command in the batch to signal a host-visible
-    // event that anybody waiting for any event in the batch will
-    // really be using.
-    // We need to create a proxy host-visible event only if the list of events
-    // in the command list is not empty, otherwise we are going to just create
-    // and remove the proxy event right away and dereference a deleted object
-    // afterwards.
- if (Device->ZeEventsScope == LastCommandInBatchHostVisible && - !CommandList->second.EventList.empty()) { - // If there are only internal events in the command list then we don't - // need to create host proxy event. - auto Result = - std::find_if(CommandList->second.EventList.begin(), - CommandList->second.EventList.end(), - [](pi_event E) { return E->hasExternalRefs(); }); - if (Result != CommandList->second.EventList.end()) { - // Create a "proxy" host-visible event. - // - pi_event HostVisibleEvent; - auto Res = createEventAndAssociateQueue( - this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); - if (Res) - return Res; - - // Update each command's event in the command-list to "see" this - // proxy event as a host-visible counterpart. - for (auto &Event : CommandList->second.EventList) { - std::scoped_lock EventLock(Event->Mutex); - // Internal event doesn't need host-visible proxy. - if (!Event->hasExternalRefs()) - continue; - - if (!Event->HostVisibleEvent) { - Event->HostVisibleEvent = HostVisibleEvent; - HostVisibleEvent->RefCount.increment(); - } - } - - // Decrement the reference count of the event such that all the - // remaining references are from the other commands in this batch and - // from the command-list itself. This host-visible event will not be - // waited/released by SYCL RT, so it must be destroyed after all events - // in the batch are gone. We know that refcount is more than 2 because - // we check that EventList of the command list is not empty above, i.e. - // after createEventAndAssociateQueue ref count is 2 and then +1 for - // each event in the EventList. - PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - - if (doReuseDiscardedEvents()) { - // If we have in-order queue with discarded events then we want to - // treat this event as regular event. We insert a barrier in the next - // command list to wait for this event. - LastCommandEvent = HostVisibleEvent; - } else { - // For all other queues treat this as a special event and indicate no - // cleanup is needed. - // TODO: always treat this host event as a regular event. - PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - HostVisibleEvent->CleanedUp = true; - } - - // Finally set to signal the host-visible event at the end of the - // command-list after a barrier that waits for all commands - // completion. - if (doReuseDiscardedEvents() && LastCommandEvent && - LastCommandEvent->IsDiscarded) { - // If we the last event is discarded then we already have a barrier - // inserted, so just signal the event. - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); - } else { - ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); - } - } else { - // If we don't have host visible proxy then signal event if needed. - this->signalEventFromCmdListIfLastEventDiscarded(CommandList); - } - } else { - // If we don't have host visible proxy then signal event if needed. - this->signalEventFromCmdListIfLastEventDiscarded(CommandList); - } - - // Close the command list and have it ready for dispatch. - ZE_CALL(zeCommandListClose, (CommandList->first)); - // Mark this command list as closed. 
- CommandList->second.IsClosed = true; - this->LastUsedCommandList = CommandListMap.end(); - // Offload command list to the GPU for asynchronous execution - auto ZeCommandList = CommandList->first; - auto ZeResult = ZE_CALL_NOCHECK( - zeCommandQueueExecuteCommandLists, - (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); - if (ZeResult != ZE_RESULT_SUCCESS) { - this->Healthy = false; - if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) { - // Turn into a more informative end-user error. - return PI_ERROR_COMMAND_EXECUTION_FAILURE; - } - return mapError(ZeResult); - } - } - - // Check global control to make every command blocking for debugging. - if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { - if (UsingImmCmdLists) { - synchronize(); - } else { - // Wait until command lists attached to the command queue are executed. - ZE_CALL(zeHostSynchronize, (ZeCommandQueue)); - } - } - return PI_SUCCESS; -} - -bool _pi_queue::isBatchingAllowed(bool IsCopy) const { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - return (CommandBatch.QueueBatchSize > 0 && - ((UrL0Serialize & UrL0SerializeBlock) == 0)); -} - -// Return the index of the next queue to use based on a -// round robin strategy and the queue group ordinal. -uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal, - uint32_t *QueueIndex, - bool QueryOnly) { - auto CurrentIndex = NextIndex; - - if (!QueryOnly) { - ++NextIndex; - if (NextIndex > UpperIndex) - NextIndex = LowerIndex; - } - - // Find out the right queue group ordinal (first queue might be "main" or - // "link") - auto QueueType = Type; - if (QueueType != queue_type::Compute) - QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine()) - ? queue_type::MainCopy - : queue_type::LinkCopy; - - *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal; - // Adjust the index to the L0 queue group since we represent "main" and - // "link" - // L0 groups with a single copy group ("main" would take "0" index). - auto ZeCommandQueueIndex = CurrentIndex; - if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) { - ZeCommandQueueIndex -= 1; - } - *QueueIndex = ZeCommandQueueIndex; - - return CurrentIndex; -} - -int32_t _pi_queue::pi_queue_group_t::getCmdQueueOrdinal( - ze_command_queue_handle_t CmdQueue) { - // Find out the right queue group ordinal (first queue might be "main" or - // "link") - auto QueueType = Type; - if (QueueType != queue_type::Compute) - QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine()) - ? queue_type::MainCopy - : queue_type::LinkCopy; - return Queue->Device->QueueGroup[QueueType].ZeOrdinal; -} - -// This function will return one of possibly multiple available native -// queues and the value of the queue group ordinal. -ze_command_queue_handle_t & -_pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { - - // QueueIndex is the proper L0 index. - // Index is the plugins concept of index, with main and link copy engines in - // one range. 
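The index adjustment described in the two comment lines above is easier to follow with concrete numbers. The sketch below models getQueueIndex's copy-engine mapping for a hypothetical device with one main copy engine and three link copy engines; the group ordinals (1 and 2) are invented for the example and do not correspond to any real device.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

// Hypothetical layout: main copy group has ordinal 1 (one queue), link copy
// group has ordinal 2 (three queues). The plugin exposes them as one flat
// range of copy indices [0..3].
constexpr bool HasMainCopyEngine = true;
constexpr uint32_t MainCopyOrdinal = 1;
constexpr uint32_t LinkCopyOrdinal = 2;

// Returns {ZeOrdinal, ZeIndex} for a flat plugin-side copy index: index 0 goes
// to the main copy engine and link-copy indices are shifted down by one, as in
// the adjustment performed by getQueueIndex.
std::pair<uint32_t, uint32_t> mapCopyIndex(uint32_t FlatIndex) {
  if (HasMainCopyEngine && FlatIndex == 0)
    return {MainCopyOrdinal, 0};
  uint32_t ZeIndex = HasMainCopyEngine ? FlatIndex - 1 : FlatIndex;
  return {LinkCopyOrdinal, ZeIndex};
}

int main() {
  for (uint32_t I = 0; I < 4; ++I) {
    auto [Ordinal, Index] = mapCopyIndex(I);
    std::cout << "plugin index " << I << " -> ordinal " << Ordinal
              << ", index " << Index << "\n";
  }
}
```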
- uint32_t QueueIndex; - auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex); - - ze_command_queue_handle_t &ZeQueue = ZeQueues[Index]; - if (ZeQueue) - return ZeQueue; - - ZeStruct ZeCommandQueueDesc; - ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal; - ZeCommandQueueDesc.index = QueueIndex; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; - const char *Priority = "Normal"; - if (Queue->isPriorityLow()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - Priority = "Low"; - } else if (Queue->isPriorityHigh()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - Priority = "High"; - } - - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { - ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; - } - - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); - - auto ZeResult = ZE_CALL_NOCHECK( - zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeQueue)); - if (ZeResult) { - die("[L0] getZeQueue: failed to create queue"); - } - - return ZeQueue; -} - -// This function will return one of possibly multiple available -// immediate commandlists associated with this Queue. -pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() { - uint32_t QueueIndex, QueueOrdinal; - auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex); - - if (ImmCmdLists[Index] != Queue->CommandListMap.end()) - return ImmCmdLists[Index]; - - ZeStruct ZeCommandQueueDesc; - ZeCommandQueueDesc.ordinal = QueueOrdinal; - ZeCommandQueueDesc.index = QueueIndex; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; - const char *Priority = "Normal"; - if (Queue->isPriorityLow()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - Priority = "Low"; - } else if (Queue->isPriorityHigh()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - Priority = "High"; - } - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { - ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; - } - - // Check if context's command list cache has an immediate command list with - // matching index. - ze_command_list_handle_t ZeCommandList = nullptr; - { - // Acquire lock to avoid race conditions. - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - // Under mutex since operator[] does insertion on the first usage for every - // unique ZeDevice. - auto &ZeCommandListCache = - isCopy() - ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - for (auto ZeCommandListIt = ZeCommandListCache.begin(); - ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - const auto &Desc = (*ZeCommandListIt).second; - if (Desc.index == ZeCommandQueueDesc.index && - Desc.flags == ZeCommandQueueDesc.flags && - Desc.mode == ZeCommandQueueDesc.mode && - Desc.priority == ZeCommandQueueDesc.priority) { - ZeCommandList = (*ZeCommandListIt).first; - ZeCommandListCache.erase(ZeCommandListIt); - break; - } - } - } - - // If cache didn't contain a command list, create one. 
- if (!ZeCommandList) { - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); - - ZE_CALL_NOCHECK(zeCommandListCreateImmediate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeCommandList)); - } - - ImmCmdLists[Index] = - Queue->CommandListMap - .insert(std::pair{ - ZeCommandList, - {nullptr, true, false, nullptr, ZeCommandQueueDesc}}) - .first; - - return ImmCmdLists[Index]; -} - -pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) { - using IsCopy = bool; - - if (UsingImmCmdLists) { - // When using immediate commandlists there are no open command lists. - return CommandListMap.end(); - } - - if (hasOpenCommandList(IsCopy{false})) { - const auto &ComputeEventList = - ComputeCommandBatch.OpenCommandList->second.EventList; - if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) != - ComputeEventList.end()) - return ComputeCommandBatch.OpenCommandList; - } - if (hasOpenCommandList(IsCopy{true})) { - const auto &CopyEventList = - CopyCommandBatch.OpenCommandList->second.EventList; - if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) != - CopyEventList.end()) - return CopyCommandBatch.OpenCommandList; - } - return CommandListMap.end(); -} - -pi_result _pi_queue::insertStartBarrierIfDiscardEventsMode( - pi_command_list_ptr_t &CmdList) { - // If current command list is different from the last command list then insert - // a barrier waiting for the last command event. - if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList && - LastCommandEvent) { - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); - LastCommandEvent = nullptr; - } - return PI_SUCCESS; -} - -pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, - bool UseCopyEngine) { - // Early exit if there are no active barriers. - if (ActiveBarriers.empty()) - return PI_SUCCESS; - - // Create a wait-list and retain events. - _pi_ze_event_list_t ActiveBarriersWaitList; - if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList( - ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), this, - UseCopyEngine)) - return Res; - - // We can now replace active barriers with the ones in the wait list. - if (auto Res = ActiveBarriers.clear()) - return Res; - - if (ActiveBarriersWaitList.Length == 0) { - return PI_SUCCESS; - } - - for (pi_uint32 I = 0; I < ActiveBarriersWaitList.Length; ++I) { - auto &Event = ActiveBarriersWaitList.PiEventList[I]; - ActiveBarriers.add(Event); - } - - pi_event Event = nullptr; - if (auto Res = createEventAndAssociateQueue( - this, &Event, PI_COMMAND_TYPE_USER, CmdList, /*IsInternal*/ true)) - return Res; - - Event->WaitList = ActiveBarriersWaitList; - Event->OwnZeEvent = true; - - // If there are more active barriers, insert a barrier on the command-list. We - // do not need an event for finishing so we pass nullptr. - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, nullptr, ActiveBarriersWaitList.Length, - ActiveBarriersWaitList.ZeEventList)); - return PI_SUCCESS; -} - -pi_result _pi_queue::executeOpenCommandList(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - // If there are any commands still in the open command list for this - // queue, then close and execute that command list now. 
- if (hasOpenCommandList(IsCopy)) { - adjustBatchSizeForPartialBatch(IsCopy); - auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false); - CommandBatch.OpenCommandList = CommandListMap.end(); - return Res; - } - - return PI_SUCCESS; -} - -static const bool FilterEventWaitList = [] { - const char *UrRet = std::getenv("UR_L0_FILTER_EVENT_WAIT_LIST"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); - return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -}(); - -pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( - pi_uint32 EventListLength, const pi_event *EventList, pi_queue CurQueue, - bool UseCopyEngine) { - this->Length = 0; - this->ZeEventList = nullptr; - this->PiEventList = nullptr; - - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { - if (CurQueue->UsingImmCmdLists) { - if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { - // If queue is in-order with discarded events and if - // new command list is different from the last used command list then - // signal new event from the last immediate command list. We are going - // to insert a barrier in the new command list waiting for that event. - auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); - uint32_t QueueGroupOrdinal, QueueIndex; - auto NextIndex = - QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, - /*QueryOnly */ true); - auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; - if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && - CurQueue->LastUsedCommandList != NextImmCmdList) { - CurQueue->signalEventFromCmdListIfLastEventDiscarded( - CurQueue->LastUsedCommandList); - } - } - } else { - // Ensure LastCommandEvent's batch is submitted if it is differrent - // from the one this command is going to. If we reuse discarded events - // then signalEventFromCmdListIfLastEventDiscarded will be called at batch - // close if needed. - const auto &OpenCommandList = - CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); - if (OpenCommandList != CurQueue->CommandListMap.end() && - OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { - - if (auto Res = CurQueue->executeOpenCommandList( - OpenCommandList->second.isCopy(CurQueue))) - return Res; - } - } - } - - // For in-order queues, every command should be executed only after the - // previous command has finished. The event associated with the last - // enqueued command is added into the waitlist to ensure in-order semantics. - bool IncludeLastCommandEvent = - CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; - - // If the last event is discarded then we already have a barrier waiting for - // that event, so must not include the last command event into the wait - // list because it will cause waiting for event which was reset. 
- if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && - CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) - IncludeLastCommandEvent = false; - - try { - pi_uint32 TmpListLength = 0; - - if (IncludeLastCommandEvent) { - this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; - this->PiEventList = new pi_event[EventListLength + 1]; - std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); - this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; - this->PiEventList[0] = CurQueue->LastCommandEvent; - TmpListLength = 1; - } else if (EventListLength > 0) { - this->ZeEventList = new ze_event_handle_t[EventListLength]; - this->PiEventList = new pi_event[EventListLength]; - } - - if (EventListLength > 0) { - for (pi_uint32 I = 0; I < EventListLength; I++) { - PI_ASSERT(EventList[I] != nullptr, PI_ERROR_INVALID_VALUE); - { - std::shared_lock Lock(EventList[I]->Mutex); - if (EventList[I]->Completed) - continue; - - // Poll of the host-visible events. - auto HostVisibleEvent = EventList[I]->HostVisibleEvent; - if (FilterEventWaitList && HostVisibleEvent) { - auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, - (HostVisibleEvent->ZeEvent)); - if (Res == ZE_RESULT_SUCCESS) { - // Event has already completed, don't put it into the list - continue; - } - } - } - - auto Queue = EventList[I]->Queue; - if (Queue) { - // The caller of createAndRetainPiZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? std::unique_lock() - : std::unique_lock(Queue->Mutex)); - - // If the event that is going to be waited is in an open batch - // different from where this next command is going to be added, - // then we have to force execute of that open command-list - // to avoid deadlocks. - // - const auto &OpenCommandList = - Queue->eventOpenCommandList(EventList[I]); - if (OpenCommandList != Queue->CommandListMap.end()) { - - if (Queue == CurQueue && - OpenCommandList->second.isCopy(Queue) == UseCopyEngine) { - // Don't force execute the batch yet since the new command - // is going to the same open batch as the dependent event. - } else { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - } else { - // There is a dependency on an interop-event. - // Similarily to the above to avoid dead locks ensure that - // execution of all prior commands in the current command- - // batch is visible to the host. This may not be the case - // when we intended to have only last command in the batch - // produce host-visible event, e.g. - // - // event0 = interop event - // event1 = command1 (already in batch, no deps) - // event2 = command2 (is being added, dep on event0) - // event3 = signal host-visible event for the batch - // event1.wait() - // event0.signal() - // - // Make sure that event1.wait() will wait for a host-visible - // event that is signalled before the command2 is enqueued. - if (CurQueue->Device->ZeEventsScope != AllHostVisible) { - CurQueue->executeAllOpenCommandLists(); - } - } - - std::shared_lock Lock(EventList[I]->Mutex); - this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; - this->PiEventList[TmpListLength] = EventList[I]; - TmpListLength += 1; - } - } - - this->Length = TmpListLength; - - } catch (...) 
{ - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - for (pi_uint32 I = 0; I < this->Length; I++) { - this->PiEventList[I]->RefCount.increment(); - } - - return PI_SUCCESS; -} - -static void printZeEventList(const _pi_ze_event_list_t &PiZeEventList) { - urPrint(" NumEventsInWaitList %d:", PiZeEventList.Length); - - for (pi_uint32 I = 0; I < PiZeEventList.Length; I++) { - urPrint(" %#llx", ur_cast(PiZeEventList.ZeEventList[I])); - } - - urPrint("\n"); -} - -pi_result _pi_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( - std::list &EventsToBeReleased) { - // acquire a lock before reading the length and list fields. - // Acquire the lock, copy the needed data locally, and reset - // the fields, then release the lock. - // Only then do we do the actual actions to release and destroy, - // holding the lock for the minimum time necessary. - pi_uint32 LocLength = 0; - ze_event_handle_t *LocZeEventList = nullptr; - pi_event *LocPiEventList = nullptr; - - { - // acquire the lock and copy fields locally - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(this->PiZeEventListMutex); - - LocLength = Length; - LocZeEventList = ZeEventList; - LocPiEventList = PiEventList; - - Length = 0; - ZeEventList = nullptr; - PiEventList = nullptr; - - // release lock by ending scope. - } - - for (pi_uint32 I = 0; I < LocLength; I++) { - // Add the event to be released to the list - EventsToBeReleased.push_back(LocPiEventList[I]); - } - - if (LocZeEventList != nullptr) { - delete[] LocZeEventList; - } - if (LocPiEventList != nullptr) { - delete[] LocPiEventList; - } - - return PI_SUCCESS; -} - extern "C" { // Forward declarations decltype(piEventCreate) piEventCreate; -static ze_result_t -checkUnresolvedSymbols(ze_module_handle_t ZeModule, - ze_module_build_log_handle_t *ZeBuildLog); - pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); @@ -2188,10 +31,6 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - urPrint("==========================\n"); - urPrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str()); - urPrint("==========================\n"); - // To distinguish this L0 platform from Unified Runtime one. 
if (ParamName == PI_PLATFORM_INFO_NAME) { ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); @@ -2203,86 +42,17 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, pi_result piextPlatformGetNativeHandle(pi_platform Platform, pi_native_handle *NativeHandle) { - PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDriver = ur_cast(NativeHandle); - // Extract the Level Zero driver handle from the given PI platform - *ZeDriver = Platform->ZeDriver; - return PI_SUCCESS; + return pi2ur::piextPlatformGetNativeHandle(Platform, NativeHandle); } pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, pi_platform *Platform) { - PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDriver = ur_cast(NativeHandle); - - pi_uint32 NumPlatforms = 0; - pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms); - if (Res != PI_SUCCESS) { - return Res; - } - - if (NumPlatforms) { - std::vector Platforms(NumPlatforms); - PI_CALL(piPlatformsGet(NumPlatforms, Platforms.data(), nullptr)); - - // The SYCL spec requires that the set of platforms must remain fixed for - // the duration of the application's execution. We assume that we found all - // of the Level Zero drivers when we initialized the platform cache, so the - // "NativeHandle" must already be in the cache. If it is not, this must not - // be a valid Level Zero driver. - for (const pi_platform &CachedPlatform : Platforms) { - if (CachedPlatform->ZeDriver == ZeDriver) { - *Platform = CachedPlatform; - return PI_SUCCESS; - } - } - } - - return PI_ERROR_INVALID_VALUE; + return pi2ur::piextPlatformCreateWithNativeHandle(NativeHandle, Platform); } pi_result piPluginGetLastError(char **message) { - return pi2ur::piPluginGetLastError(message); -} - -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return '-ze-opt-disable' for frontend_option = -O0. -// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. -// Return '-ze-opt-level=2' for frontend_option = -O3. -pi_result piPluginGetBackendOption(pi_platform, const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - if (frontend_option == "-O0"sv) { - *backend_option = "-ze-opt-disable"; - return PI_SUCCESS; - } - if (frontend_option == "-O1"sv || frontend_option == "-O2"sv) { - *backend_option = "-ze-opt-level=1"; - return PI_SUCCESS; - } - if (frontend_option == "-O3"sv) { - *backend_option = "-ze-opt-level=2"; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - -pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, - pi_uint32 NumEntries, pi_device *Devices, - pi_uint32 *NumDevices) { - return pi2ur::piDevicesGet(Platform, DeviceType, NumEntries, Devices, NumDevices); } @@ -2313,95 +83,22 @@ pi_result piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? 
pi_device_binary *Binaries, pi_uint32 NumBinaries, pi_uint32 *SelectedBinaryInd) { - - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE); - - // TODO: this is a bare-bones implementation for choosing a device image - // that would be compatible with the targeted device. An AOT-compiled - // image is preferred over SPIR-V for known devices (i.e. Intel devices) - // The implementation makes no effort to differentiate between multiple images - // for the given device, and simply picks the first one compatible. - // - // Real implementation will use the same mechanism OpenCL ICD dispatcher - // uses. Something like: - // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); - // return context->dispatch->piextDeviceSelectIR( - // ctx, images, num_images, selected_image); - // where context->dispatch is set to the dispatch table provided by PI - // plugin for platform/device the ctx was created for. - - // Look for GEN binary, which we known can only be handled by Level-Zero now. - const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; - - // Find the appropriate device image, fallback to spirv if not found - constexpr pi_uint32 InvalidInd = std::numeric_limits::max(); - pi_uint32 Spirv = InvalidInd; - - for (pi_uint32 i = 0; i < NumBinaries; ++i) { - if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { - *SelectedBinaryInd = i; - return PI_SUCCESS; - } - if (strcmp(Binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) - Spirv = i; - } - // Points to a spirv image, if such indeed was found - if ((*SelectedBinaryInd = Spirv) != InvalidInd) - return PI_SUCCESS; - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; + return pi2ur::piextDeviceSelectBinary(Device, Binaries, NumBinaries, + SelectedBinaryInd); } pi_result piextDeviceGetNativeHandle(pi_device Device, pi_native_handle *NativeHandle) { - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDevice = ur_cast(NativeHandle); - // Extract the Level Zero module handle from the given PI device - *ZeDevice = Device->ZeDevice; - return PI_SUCCESS; + return pi2ur::piextDeviceGetNativeHandle(Device, NativeHandle); } pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, pi_platform Platform, pi_device *Device) { - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeDevice = ur_cast(NativeHandle); - - // The SYCL spec requires that the set of devices must remain fixed for the - // duration of the application's execution. We assume that we found all of the - // Level Zero devices when we initialized the platforms/devices cache, so the - // "NativeHandle" must already be in the cache. If it is not, this must not be - // a valid Level Zero device. - // - // TODO: maybe we should populate cache of platforms if it wasn't already. - // For now assert that is was populated. - PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE); - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - - pi_device Dev = nullptr; - for (pi_platform ThePlatform : *PiPlatformsCache) { - Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. 
- PI_ASSERT(!Platform || Platform == ThePlatform, - PI_ERROR_INVALID_PLATFORM); - break; - } - } - if (Dev == nullptr) - return PI_ERROR_INVALID_VALUE; - - *Device = Dev; - return PI_SUCCESS; + return pi2ur::piextDeviceCreateWithNativeHandle(NativeHandle, Platform, + Device); } pi_result piContextCreate(const pi_context_properties *Properties, @@ -2410,96 +107,28 @@ pi_result piContextCreate(const pi_context_properties *Properties, const void *PrivateInfo, size_t CB, void *UserData), void *UserData, pi_context *RetContext) { - (void)Properties; - (void)PFnNotify; - (void)UserData; - PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); - - pi_platform Platform = (*Devices)->Platform; - ZeStruct ContextDesc; - ContextDesc.flags = 0; - - ze_context_handle_t ZeContext; - ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext)); - try { - *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true); - (*RetContext)->initialize(); - if (IndirectAccessTrackingEnabled) { - std::scoped_lock Lock(Platform->ContextsMutex); - Platform->Contexts.push_back(*RetContext); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piContextCreate(Properties, NumDevices, Devices, PFnNotify, + UserData, RetContext); } pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_CONTEXT_INFO_DEVICES: - return ReturnValue(&Context->Devices[0], Context->Devices.size()); - case PI_CONTEXT_INFO_NUM_DEVICES: - return ReturnValue(pi_uint32(Context->Devices.size())); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Context->RefCount.load()}); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - // 2D USM memcpy is supported unless disabled through - // UR_L0_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D. - return ReturnValue(pi_bool{UseMemcpy2DOperations}); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM fill and memset is not supported. - return ReturnValue(pi_bool{false}); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. 
- setErrorMessage("These queries should have never come here.", - UR_RESULT_ERROR_INVALID_VALUE); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - default: - // TODO: implement other parameters - die("piGetContextInfo: unsuppported ParamName."); - } - - return PI_SUCCESS; + return pi2ur::piContextGetInfo(Context, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } // FIXME: Dummy implementation to prevent link fail pi_result piextContextSetExtendedDeleter(pi_context Context, pi_context_extended_deleter Function, void *UserData) { - (void)Context; - (void)Function; - (void)UserData; - die("piextContextSetExtendedDeleter: not supported"); - return PI_SUCCESS; + return pi2ur::piextContextSetExtendedDeleter(Context, Function, UserData); } pi_result piextContextGetNativeHandle(pi_context Context, pi_native_handle *NativeHandle) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeContext = ur_cast(NativeHandle); - // Extract the Level Zero queue handle from the given PI queue - *ZeContext = Context->ZeContext; - return PI_SUCCESS; + return pi2ur::piextContextGetNativeHandle(Context, NativeHandle); } pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -2507,81 +136,17 @@ pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle, const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); - - try { - *RetContext = new _pi_context(ur_cast(NativeHandle), - NumDevices, Devices, OwnNativeHandle); - (*RetContext)->initialize(); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piextContextCreateWithNativeHandle( + NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); } pi_result piContextRetain(pi_context Context) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - Context->RefCount.increment(); - return PI_SUCCESS; -} - -// Helper function to release the context, a caller must lock the platform-level -// mutex guarding the container with contexts because the context can be removed -// from the list of tracked contexts. -pi_result ContextReleaseHelper(pi_context Context) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - if (!Context->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (IndirectAccessTrackingEnabled) { - pi_platform Plt = Context->getPlatform(); - auto &Contexts = Plt->Contexts; - auto It = std::find(Contexts.begin(), Contexts.end(), Context); - if (It != Contexts.end()) - Contexts.erase(It); - } - ze_context_handle_t DestoryZeContext = - Context->OwnZeContext ? Context->ZeContext : nullptr; - - // Clean up any live memory associated with Context - pi_result Result = Context->finalize(); - - // We must delete Context first and then destroy zeContext because - // Context deallocation requires ZeContext in some member deallocation of - // pi_context. - delete Context; - - // Destruction of some members of pi_context uses L0 context - // and therefore it must be valid at that point. - // Technically it should be placed to the destructor of pi_context - // but this makes API error handling more complex. 
- if (DestoryZeContext) { - auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestoryZeContext)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - return Result; + return pi2ur::piContextRetain(Context); } pi_result piContextRelease(pi_context Context) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) - ContextsLock.lock(); - - return ContextReleaseHelper(Context); + return pi2ur::piContextRelease(Context); } pi_result piQueueCreate(pi_context Context, pi_device Device, @@ -2592,1063 +157,83 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, pi_result piextQueueCreate(pi_context Context, pi_device Device, pi_queue_properties *Properties, pi_queue *Queue) { - PI_ASSERT(Properties, PI_ERROR_INVALID_VALUE); - // Expect flags mask to be passed first. - PI_ASSERT(Properties[0] == PI_QUEUE_FLAGS, PI_ERROR_INVALID_VALUE); - pi_queue_properties Flags = Properties[1]; - - PI_ASSERT(Properties[2] == 0 || - (Properties[2] == PI_QUEUE_COMPUTE_INDEX && Properties[4] == 0), - PI_ERROR_INVALID_VALUE); - auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX - ? static_cast(Properties[3]) - : -1; // Use default/round-robin. - - // Check that unexpected bits are not set. - PI_ASSERT( - !(Flags & ~(PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | - PI_QUEUE_FLAG_PROFILING_ENABLE | PI_QUEUE_FLAG_ON_DEVICE | - PI_QUEUE_FLAG_ON_DEVICE_DEFAULT | - PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS | - PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW | - PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH)), - PI_ERROR_INVALID_VALUE); - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_DEVICE); - - // Create placeholder queues in the compute queue group. - // Actual L0 queues will be created at first use. - std::vector ZeComputeCommandQueues( - Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues, - nullptr); - - // Create placeholder queues in the copy queue group (main and link - // native groups are combined into one group). - // Actual L0 queues will be created at first use. - size_t NumCopyGroups = 0; - if (Device->hasMainCopyEngine()) { - NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy] - .ZeProperties.numQueues; - } - if (Device->hasLinkCopyEngine()) { - NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy] - .ZeProperties.numQueues; - } - std::vector ZeCopyCommandQueues(NumCopyGroups, - nullptr); - - try { - *Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context, - Device, true, Flags, ForceComputeIndex); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Do eager initialization of Level Zero handles on request. - if (doEagerInit) { - pi_queue Q = *Queue; - // Creates said number of command-lists. - auto warmupQueueGroup = [Q](bool UseCopyEngine, - uint32_t RepeatCount) -> pi_result { - pi_command_list_ptr_t CommandList; - while (RepeatCount--) { - if (Q->UsingImmCmdLists) { - CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); - } else { - // Heuristically create some number of regular command-list to reuse. 
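The Properties parsing at the top of piextQueueCreate above implies a specific zero-terminated key/value layout at the call site. A hedged usage fragment follows; it assumes the PI declarations come from <sycl/detail/pi.h> and that a context and device already exist, so it is a sketch rather than a drop-in test. The profiling flag and compute index 0 are arbitrary choices for the example.

```cpp
#include <sycl/detail/pi.h>

// Layout expected by the parsing above:
//   {PI_QUEUE_FLAGS, <flags>, [PI_QUEUE_COMPUTE_INDEX, <index>,] 0}
pi_result createProfilingQueue(pi_context Context, pi_device Device,
                               pi_queue *OutQueue) {
  pi_queue_properties Props[] = {PI_QUEUE_FLAGS,
                                 PI_QUEUE_FLAG_PROFILING_ENABLE,
                                 PI_QUEUE_COMPUTE_INDEX, 0,
                                 0}; // terminator
  return piextQueueCreate(Context, Device, Props, OutQueue);
}
```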
- for (int I = 0; I < 10; ++I) { - PI_CALL(Q->createCommandList(UseCopyEngine, CommandList)); - // Immediately return them to the cache of available command-lists. - std::vector EventsUnused; - PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */, - EventsUnused)); - } - } - } - return PI_SUCCESS; - }; - // Create as many command-lists as there are queues in the group. - // With this the underlying round-robin logic would initialize all - // native queues, and create command-lists and their fences. - // At this point only the thread creating the queue will have associated - // command-lists. Other threads have not accessed the queue yet. So we can - // only warmup the initial thread's command-lists. - auto QueueGroup = Q->ComputeQueueGroupsByTID.get(); - PI_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex - - QueueGroup.LowerIndex + 1)); - if (Q->useCopyEngine()) { - auto QueueGroup = Q->CopyQueueGroupsByTID.get(); - PI_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex - - QueueGroup.LowerIndex + 1)); - } - // TODO: warmup event pools. Both host-visible and device-only. - } - return PI_SUCCESS; + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock Lock(Queue->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - // TODO: consider support for queue properties and size - switch (ParamName) { - case PI_QUEUE_INFO_CONTEXT: - return ReturnValue(Queue->Context); - case PI_QUEUE_INFO_DEVICE: - return ReturnValue(Queue->Device); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Queue->RefCount.load()}); - case PI_QUEUE_INFO_PROPERTIES: - die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n"); - break; - case PI_QUEUE_INFO_SIZE: - die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n"); - break; - case PI_QUEUE_INFO_DEVICE_DEFAULT: - die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n"); - break; - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - // We can exit early if we have in-order queue. - if (Queue->isInOrderQueue()) { - if (!Queue->LastCommandEvent) - return ReturnValue(pi_bool{true}); - - // We can check status of the event only if it isn't discarded otherwise - // it may be reset (because we are free to reuse such events) and - // zeEventQueryStatus will hang. - // TODO: use more robust way to check that ZeEvent is not owned by - // LastCommandEvent. - if (!Queue->LastCommandEvent->IsDiscarded) { - ze_result_t ZeResult = ZE_CALL_NOCHECK( - zeEventQueryStatus, (Queue->LastCommandEvent->ZeEvent)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - return ReturnValue(pi_bool{true}); - } - // For immediate command lists we have to check status of the event - // because immediate command lists are not associated with level zero - // queue. Conservatively return false in this case because last event is - // discarded and we can't check its status. - if (Queue->UsingImmCmdLists) - return ReturnValue(pi_bool{false}); - } - - // If we have any open command list which is not empty then return false - // because it means that there are commands which are not even submitted for - // execution yet. 
- using IsCopy = bool; - if (Queue->hasOpenCommandList(IsCopy{true}) || - Queue->hasOpenCommandList(IsCopy{false})) - return ReturnValue(pi_bool{false}); - - for (const auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) { - for (const auto &QueueGroup : QueueMap) { - if (Queue->UsingImmCmdLists) { - // Immediate command lists are not associated with any Level Zero - // queue, that's why we have to check status of events in each - // immediate command list. Start checking from the end and exit early - // if some event is not completed. - for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) { - if (ImmCmdList == Queue->CommandListMap.end()) - continue; - - auto EventList = ImmCmdList->second.EventList; - for (auto It = EventList.crbegin(); It != EventList.crend(); It++) { - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - } - } else { - for (const auto &ZeQueue : QueueGroup.second.ZeQueues) { - if (!ZeQueue) - continue; - // Provide 0 as the timeout parameter to immediately get the status - // of the Level Zero queue. - ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize, - (ZeQueue, /* timeout */ 0)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - } - } - } - return ReturnValue(pi_bool{true}); - } - default: - urPrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; + return pi2ur::piQueueGetInfo(Queue, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -pi_result piQueueRetain(pi_queue Queue) { - { - std::scoped_lock Lock(Queue->Mutex); - Queue->RefCountExternal++; - } - Queue->RefCount.increment(); - return PI_SUCCESS; -} +pi_result piQueueRetain(pi_queue Queue) { return pi2ur::piQueueRetain(Queue); } pi_result piQueueRelease(pi_queue Queue) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - std::vector EventListToCleanup; - - { - std::scoped_lock Lock(Queue->Mutex); - - if ((--Queue->RefCountExternal) != 0) - return PI_SUCCESS; - - // When external reference count goes to zero it is still possible - // that internal references still exists, e.g. command-lists that - // are not yet completed. So do full queue synchronization here - // and perform proper cleanup. - // - // It is possible to get to here and still have an open command list - // if no wait or finish ever occurred for this queue. - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - - // Make sure all commands get executed. - Queue->synchronize(); - - // Destroy all the fences created associated with this queue. - for (auto it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // This fence wasn't yet signalled when we polled it for recycling - // the command-list, so need to release the command-list too. - // For immediate commandlists we don't need to do an L0 reset of the - // commandlist but do need to do event cleanup which is also in the - // resetCommandList function. - // If the fence is a nullptr we are using immediate commandlists, - // otherwise regular commandlists which use a fence. 
- if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) { - Queue->resetCommandList(it, true, EventListToCleanup); - } - // TODO: remove "if" when the problem is fixed in the level zero - // runtime. Destroy only if a queue is healthy. Destroying a fence may - // cause a hang otherwise. - // If the fence is a nullptr we are using immediate commandlists. - if (Queue->Healthy && it->second.ZeFence != nullptr) { - auto ZeResult = ZE_CALL_NOCHECK(zeFenceDestroy, (it->second.ZeFence)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) { - std::scoped_lock Lock( - Queue->Context->ZeCommandListCacheMutex); - const pi_command_list_info_t &MapEntry = it->second; - if (MapEntry.CanReuse) { - // Add commandlist to the cache for future use. - // It will be deleted when the context is destroyed. - auto &ZeCommandListCache = - MapEntry.isCopy(Queue) - ? Queue->Context - ->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc}); - } else { - // A non-reusable comamnd list that came from a make_queue call is - // destroyed since it cannot be recycled. - ze_command_list_handle_t ZeCommandList = it->first; - if (ZeCommandList) { - ZE_CALL(zeCommandListDestroy, (ZeCommandList)); - } - } - } - } - Queue->CommandListMap.clear(); - } - - for (auto &Event : EventListToCleanup) { - // We don't need to synchronize the events since the queue - // synchronized above already does that. - { - std::scoped_lock EventLock(Event->Mutex); - Event->Completed = true; - } - PI_CALL(CleanupCompletedEvent(Event)); - // This event was removed from the command list, so decrement ref count - // (it was incremented when they were added to the command list). - PI_CALL(piEventReleaseInternal(Event)); - } - PI_CALL(piQueueReleaseInternal(Queue)); - return PI_SUCCESS; + return pi2ur::piQueueRelease(Queue); } -static pi_result piQueueReleaseInternal(pi_queue Queue) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (!Queue->RefCount.decrementAndTest()) - return PI_SUCCESS; - - for (auto &Cache : Queue->EventCaches) - for (auto &Event : Cache) - PI_CALL(piEventReleaseInternal(Event)); - - if (Queue->OwnZeCommandQueue) { - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - - urPrint("piQueueRelease(compute) NumTimesClosedFull %d, " - "NumTimesClosedEarly %d\n", - Queue->ComputeCommandBatch.NumTimesClosedFull, - Queue->ComputeCommandBatch.NumTimesClosedEarly); - urPrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly " - "%d\n", - Queue->CopyCommandBatch.NumTimesClosedFull, - Queue->CopyCommandBatch.NumTimesClosedEarly); - - delete Queue; - - return PI_SUCCESS; -} - -pi_result piQueueFinish(pi_queue Queue) { - // Wait until command lists attached to the command queue are executed. - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (Queue->UsingImmCmdLists) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock Lock(Queue->Mutex); - - Queue->synchronize(); - } else { - std::unique_lock Lock(Queue->Mutex); - std::vector ZeQueues; - - // execute any command list that may still be open. - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - - // Make a copy of queues to sync and release the lock. - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - std::copy(QueueGroup.second.ZeQueues.begin(), - QueueGroup.second.ZeQueues.end(), - std::back_inserter(ZeQueues)); - - // Remember the last command's event. - auto LastCommandEvent = Queue->LastCommandEvent; - - // Don't hold a lock to the queue's mutex while waiting. - // This allows continue working with the queue from other threads. - // TODO: this currently exhibits some issues in the driver, so - // we control this with an env var. Remove this control when - // we settle one way or the other. - const char *UrRet = std::getenv("UR_L0_QUEUE_FINISH_HOLD_LOCK"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK"); - const bool HoldLock = - UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0); - - if (!HoldLock) { - Lock.unlock(); - } - - for (auto &ZeQueue : ZeQueues) { - if (ZeQueue) - ZE_CALL(zeHostSynchronize, (ZeQueue)); - } - - // Prevent unneeded already finished events to show up in the wait list. - // We can only do so if nothing else was submitted to the queue - // while we were synchronizing it. - if (!HoldLock) { - std::scoped_lock Lock(Queue->Mutex); - if (LastCommandEvent == Queue->LastCommandEvent) { - Queue->LastCommandEvent = nullptr; - } - } else { - Queue->LastCommandEvent = nullptr; - } - } - // Reset signalled command lists and return them back to the cache of - // available command lists. Events in the immediate command lists are cleaned - // up in synchronize(). - if (!Queue->UsingImmCmdLists) { - std::unique_lock Lock(Queue->Mutex); - resetCommandLists(Queue); - } - return PI_SUCCESS; -} +pi_result piQueueFinish(pi_queue Queue) { return pi2ur::piQueueFinish(Queue); } -// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList, -// so this can be left as a no-op. -pi_result piQueueFlush(pi_queue Queue) { - (void)Queue; - return PI_SUCCESS; -} +pi_result piQueueFlush(pi_queue Queue) { return pi2ur::piQueueFlush(Queue); } pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandleDesc, PI_ERROR_INVALID_VALUE); - - // Lock automatically releases when this goes out of scope. - std::shared_lock lock(Queue->Mutex); - - // Get handle to this thread's queue group. 
-  auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
-
-  if (Queue->UsingImmCmdLists) {
-    auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeHandle);
-    // Extract the Level Zero command list handle from the given PI queue
-    *ZeCmdList = QueueGroup.getImmCmdList()->first;
-    *NativeHandleDesc = true;
-  } else {
-    auto ZeQueue = ur_cast<ze_command_queue_handle_t *>(NativeHandle);
-    // Extract a Level Zero compute queue handle from the given PI queue
-    uint32_t QueueGroupOrdinalUnused;
-    *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
-    *NativeHandleDesc = false;
-  }
-  return PI_SUCCESS;
-}
+                                    pi_native_handle *NativeHandle) {
 
-void _pi_queue::pi_queue_group_t::setImmCmdList(
-    ze_command_list_handle_t ZeCommandList) {
-  // An immediate command list was given to us but we don't have the queue
-  // descriptor information. Create a dummy and note that it is not recyclable.
-  ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
-  ImmCmdLists = std::vector<pi_command_list_ptr_t>(
-      1,
-      Queue->CommandListMap
-          .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
-              ZeCommandList,
-              {nullptr, true, false, nullptr, ZeQueueDesc, false}})
-          .first);
+  return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle);
 }
 
 pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
-                                           int32_t NativeHandleDesc,
                                            pi_context Context,
                                            pi_device Device,
                                            bool OwnNativeHandle,
-                                           pi_queue_properties *Properties,
                                            pi_queue *Queue) {
-  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
-  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
-
-  // The NativeHandleDesc has value 1 if the native handle is an immediate
-  // command list.
-  if (NativeHandleDesc == 1) {
-    std::vector<ze_command_queue_handle_t> ComputeQueues{nullptr};
-    std::vector<ze_command_queue_handle_t> CopyQueues;
-
-    *Queue = new _pi_queue(ComputeQueues, CopyQueues, Context, Device,
-                           OwnNativeHandle, Properties[1]);
-    auto &InitialGroup = (*Queue)->ComputeQueueGroupsByTID.begin()->second;
-    InitialGroup.setImmCmdList(
-        ur_cast<ze_command_list_handle_t>(NativeHandle));
-  } else {
-    auto ZeQueue = ur_cast<ze_command_queue_handle_t>(NativeHandle);
-    // Assume this is the "0" index queue in the compute command-group.
-    std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
-
-    // TODO: see what we can do to correctly initialize PI queue for
-    // compute vs. copy Level-Zero queue. Currently we will send
-    // all commands to the "ZeQueue".
-    std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
-
-    *Queue = new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device,
-                           OwnNativeHandle, Properties[1]);
-  }
-  (*Queue)->UsingImmCmdLists = (NativeHandleDesc == 1);
-  return PI_SUCCESS;
-}
-
-// If indirect access tracking is enabled then performs reference counting,
-// otherwise just calls zeMemAllocDevice.
-static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
-                                        pi_device Device, size_t Size) {
-  pi_platform Plt = Device->Platform;
-  std::unique_lock ContextsLock(Plt->ContextsMutex,
-                                std::defer_lock);
-  if (IndirectAccessTrackingEnabled) {
-    // Lock the mutex guarding the contexts container in the platform.
-    // This prevents new kernels from being submitted in any context while
-    // we are in the process of allocating memory; this is needed to
-    // properly capture allocations by kernels with indirect access.
-    ContextsLock.lock();
-    // We are going to defer memory release if there are kernels with
-    // indirect access; that is why we explicitly retain the context to be
-    // sure that it is released only after all memory allocations in this
-    // context are released.
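The retain described in the comment above is the very next call; the same pattern repeats in ZeHostMemAllocHelper and in the matching free path later in the file. A tiny standalone model of that ownership rule, with invented types, just to make the alloc/free pairing explicit:

```cpp
#include <cassert>
#include <unordered_set>

// Toy model: with indirect-access tracking, every allocation retains its
// context and every free releases that reference, so the context cannot be
// destroyed while a kernel might still dereference one of its allocations.
struct ToyContext {
  int RefCount = 1; // the application's own reference
  std::unordered_set<void *> MemAllocs;

  void onAlloc(void *Ptr) {
    ++RefCount;            // piContextRetain in the real helper
    MemAllocs.insert(Ptr); // Context->MemAllocs bookkeeping
  }
  void onFree(void *Ptr) {
    assert(MemAllocs.erase(Ptr) == 1);
    --RefCount;            // matching release on the free path
  }
};
```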
- PI_CALL(piContextRetain(Context)); - } - - ze_device_mem_alloc_desc_t ZeDesc = {}; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - ZE_CALL(zeMemAllocDevice, - (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr)); - - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return PI_SUCCESS; -} - -// If indirect access tracking is enabled then performs reference counting, -// otherwise just calls zeMemAllocHost. -static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context, - size_t Size) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - // Lock the mutex which is guarding contexts container in the platform. - // This prevents new kernels from being submitted in any context while - // we are in the process of allocating a memory, this is needed to - // properly capture allocations by kernels with indirect access. - ContextsLock.lock(); - // We are going to defer memory release if there are kernels with - // indirect access, that is why explicitly retain context to be sure - // that it is released after all memory allocations in this context are - // released. - PI_CALL(piContextRetain(Context)); - } - - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr)); - - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return PI_SUCCESS; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); } pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { - - // TODO: implement support for more access modes - if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || - (Flags & PI_MEM_ACCESS_READ_ONLY))) { - die("piMemBufferCreate: Level-Zero supports read-write and read-only " - "buffer," - "but not other accesses (such as write-only) yet."); - } - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); - - if (properties != nullptr) { - die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); - } - - if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of - // pinned host memory, see: - // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc - // We are however missing such functionality in Level Zero, so we just - // ignore the flag for now. - // - } - - // If USM Import feature is enabled and hostptr is supplied, - // import the hostptr if not already imported into USM. - // Data transfer rate is maximized when both source and destination - // are USM pointers. Promotion of the host pointer to USM thus - // optimizes data transfer performance. 
- bool HostPtrImported = false; - if (ZeUSMImport.Enabled && HostPtr != nullptr && - (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { - // Query memory type of the host pointer - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // If not shared of any type, we can import the ptr - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { - // Promote the host ptr to USM host memory - ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver; - ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size); - HostPtrImported = true; - } - } - - pi_buffer Buffer = nullptr; - auto HostPtrOrNull = - (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? ur_cast(HostPtr) : nullptr; - try { - Buffer = new _pi_buffer(Context, Size, HostPtrOrNull, HostPtrImported); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Initialize the buffer with user data - if (HostPtr) { - if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || - (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { - - // We don't yet know which device needs this buffer, so make the first - // device in the context be the master, and hold the initial valid - // allocation. - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, - Context->Devices[0])); - if (Buffer->OnHost) { - // Do a host to host copy. - // For an imported HostPtr the copy is unneeded. - if (!HostPtrImported) - memcpy(ZeHandleDst, HostPtr, Size); - } else { - // Initialize the buffer synchronously with immediate offload - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size, - nullptr, 0, nullptr)); - } - } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) { - // Nothing more to do. - } else { - die("piMemBufferCreate: not implemented"); - } - } - - *RetMem = Buffer; - return PI_SUCCESS; + return pi2ur::piMemBufferCreate(Context, Flags, Size, HostPtr, RetMem, + properties); } pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); - // piMemImageGetInfo must be used for images, except for shared params (like - // Context, AccessMode, etc) - PI_ASSERT(ParamName == PI_MEM_CONTEXT || !Mem->isImage(), - PI_ERROR_INVALID_VALUE); - - std::shared_lock Lock(Mem->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - switch (ParamName) { - case PI_MEM_CONTEXT: - return ReturnValue(Mem->Context); - case PI_MEM_SIZE: { - // Get size of the allocation - auto Buffer = ur_cast(Mem); - return ReturnValue(size_t{Buffer->Size}); - } - default: - die("piMemGetInfo: Parameter is not implemented"); - } - - return {}; -} - -pi_result piMemRetain(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - Mem->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piMemGetInfo(Mem, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. 
-static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - ContextsLock.lock(); - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return PI_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); +pi_result piMemRetain(pi_mem Mem) { return pi2ur::piMemRetain(Mem); } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - - return PI_SUCCESS; -} - -static pi_result USMFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle = true); - -pi_result piMemRelease(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - if (!Mem->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (Mem->isImage()) { - char *ZeHandleImage; - auto Image = static_cast(Mem); - if (Image->OwnZeMemHandle) { - PI_CALL(Mem->getZeHandle(ZeHandleImage, _pi_mem::write_only)); - auto ZeResult = ZE_CALL_NOCHECK( - zeImageDestroy, (ur_cast(ZeHandleImage))); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } else { - auto Buffer = static_cast(Mem); - Buffer->free(); - } - delete Mem; - - return PI_SUCCESS; -} - -static pi_result pi2zeImageDesc(const pi_image_format *ImageFormat, - const pi_image_desc *ImageDesc, - ZeStruct &ZeImageDesc) { - ze_image_format_type_t ZeImageFormatType; - size_t ZeImageFormatTypeSize; - switch (ImageFormat->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; - ZeImageFormatTypeSize = 8; - break; - default: - urPrint("piMemImageCreate: unsupported image data type: data type = %d\n", - 
ImageFormat->image_channel_data_type); - return PI_ERROR_INVALID_VALUE; - } - - // TODO: populate the layout mapping - ze_image_format_layout_t ZeImageFormatLayout; - switch (ImageFormat->image_channel_order) { - case PI_IMAGE_CHANNEL_ORDER_RGBA: - switch (ZeImageFormatTypeSize) { - case 8: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; - break; - case 16: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; - break; - case 32: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; - break; - default: - urPrint("piMemImageCreate: unexpected data type Size\n"); - return PI_ERROR_INVALID_VALUE; - } - break; - default: - urPrint("format layout = %d\n", ImageFormat->image_channel_order); - die("piMemImageCreate: unsupported image format layout\n"); - break; - } - - ze_image_format_t ZeFormatDesc = { - ZeImageFormatLayout, ZeImageFormatType, - // TODO: are swizzles deducted from image_format->image_channel_order? - ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G, - ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A}; - - ze_image_type_t ZeImageType; - switch (ImageDesc->image_type) { - case PI_MEM_TYPE_IMAGE1D: - ZeImageType = ZE_IMAGE_TYPE_1D; - break; - case PI_MEM_TYPE_IMAGE2D: - ZeImageType = ZE_IMAGE_TYPE_2D; - break; - case PI_MEM_TYPE_IMAGE3D: - ZeImageType = ZE_IMAGE_TYPE_3D; - break; - case PI_MEM_TYPE_IMAGE1D_ARRAY: - ZeImageType = ZE_IMAGE_TYPE_1DARRAY; - break; - case PI_MEM_TYPE_IMAGE2D_ARRAY: - ZeImageType = ZE_IMAGE_TYPE_2DARRAY; - break; - default: - urPrint("piMemImageCreate: unsupported image type\n"); - return PI_ERROR_INVALID_VALUE; - } - - ZeImageDesc.arraylevels = 0; - ZeImageDesc.flags = 0; - ZeImageDesc.type = ZeImageType; - ZeImageDesc.format = ZeFormatDesc; - ZeImageDesc.width = ur_cast(ImageDesc->image_width); - ZeImageDesc.height = ur_cast(ImageDesc->image_height); - ZeImageDesc.depth = ur_cast(ImageDesc->image_depth); - ZeImageDesc.arraylevels = ur_cast(ImageDesc->image_array_size); - ZeImageDesc.miplevels = ImageDesc->num_mip_levels; - - return PI_SUCCESS; -} +pi_result piMemRelease(pi_mem Mem) { return pi2ur::piMemRelease(Mem); } pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - - ZeStruct ZeImageDesc; - pi_result DescriptionResult = - pi2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); - if (DescriptionResult != PI_SUCCESS) - return DescriptionResult; - - std::shared_lock Lock(Context->Mutex); - - // Currently we have the "0" device in context with mutliple root devices to - // own the image. - // TODO: Implement explicit copying for acessing the image from other devices - // in the context. - pi_device Device = Context->SingleRootDevice ? 
Context->SingleRootDevice - : Context->Devices[0]; - ze_image_handle_t ZeHImage; - ZE_CALL(zeImageCreate, - (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeHImage)); - - try { - auto ZePIImage = new _pi_image(Context, ZeHImage, /*OwnNativeHandle=*/true); - *RetImage = ZePIImage; - -#ifndef NDEBUG - ZePIImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || - (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { - // Initialize image synchronously with immediate offload. - // zeCommandListAppendImageCopyFromMemory must not be called from - // simultaneous threads with the same command list handle, so we need - // exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendImageCopyFromMemory, - (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr, - 0, nullptr)); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + + return pi2ur::piMemImageCreate(Context, Flags, ImageFormat, ImageDesc, + HostPtr, RetImage); } pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - std::shared_lock Guard(Mem->Mutex); - char *ZeHandle; - PI_CALL(Mem->getZeHandle(ZeHandle, _pi_mem::read_write)); - *NativeHandle = ur_cast(ZeHandle); - return PI_SUCCESS; + return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, bool ownNativeHandle, pi_mem *Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - - // Get base of the allocation - void *Base; - size_t Size; - void *Ptr = ur_cast(NativeHandle); - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size)); - PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE); - - ZeStruct ZeMemProps; - ze_device_handle_t ZeDevice = nullptr; - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice)); - - // Check type of the allocation - switch (ZeMemProps.type) { - case ZE_MEMORY_TYPE_HOST: - case ZE_MEMORY_TYPE_SHARED: - case ZE_MEMORY_TYPE_DEVICE: - break; - case ZE_MEMORY_TYPE_UNKNOWN: - // Memory allocation is unrelated to the context - return PI_ERROR_INVALID_CONTEXT; - default: - die("Unexpected memory type"); - } - - pi_device Device = nullptr; - if (ZeDevice) { - Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice); - PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_CONTEXT); - } - - try { - *Mem = new _pi_buffer(Context, Size, Device, ur_cast(NativeHandle), - ownNativeHandle); - - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - // If we don't own the native handle then we can't control deallocation of - // that memory so there is no point of keeping track of the memory - // allocation for deferred memory release in the mode when indirect access - // tracking is enabled. - if (IndirectAccessTrackingEnabled && ownNativeHandle) { - // We need to keep track of all memory allocations in the context - ContextsLock.lock(); - // Retain context to be sure that it is released after all memory - // allocations in this context are released. 
- PI_CALL(piContextRetain(Context)); - - Context->MemAllocs.emplace( - std::piecewise_construct, std::forward_as_tuple(Ptr), - std::forward_as_tuple(Context, ownNativeHandle)); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Initialize the buffer as necessary - auto Buffer = ur_cast(*Mem); - if (Device) { - // If this allocation is on a device, then we re-use it for the buffer. - // Nothing to do. - } else if (Buffer->OnHost) { - // If this is host allocation and buffer always stays on host there - // nothing more to do. - } else { - // In all other cases (shared allocation, or host allocation that cannot - // represent the buffer in this context) copy the data to a newly - // created device allocation. - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Device)); - - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0, - nullptr)); - } - - return PI_SUCCESS; + return pi2ur::piextMemCreateWithNativeHandle(NativeHandle, Context, + ownNativeHandle, Mem); } pi_result piextMemImageCreateWithNativeHandle( pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, - [[maybe_unused]] const pi_image_format *ImageFormat, - [[maybe_unused]] const pi_image_desc *ImageDesc, pi_mem *RetImage) { + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *RetImage) { PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); @@ -3656,7 +241,7 @@ pi_result piextMemImageCreateWithNativeHandle( std::shared_lock Lock(Context->Mutex); - ze_image_handle_t ZeHImage = ur_cast(NativeHandle); + ze_image_handle_t ZeHImage = pi_cast(NativeHandle); try { auto ZePIImage = new _pi_image(Context, ZeHImage, OwnNativeHandle); @@ -3683,22 +268,7 @@ pi_result piextMemImageCreateWithNativeHandle( pi_result piProgramCreate(pi_context Context, const void *ILBytes, size_t Length, pi_program *Program) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - // NOTE: the Level Zero module creation is also building the program, so we - // are deferring it until the program is ready to be built. - - try { - *Program = new _pi_program(_pi_program::IL, Context, ILBytes, Length); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); } pi_result piProgramCreateWithBinary( @@ -3706,168 +276,26 @@ pi_result piProgramCreateWithBinary( const size_t *Lengths, const unsigned char **Binaries, size_t NumMetadataEntries, const pi_device_binary_property *Metadata, pi_int32 *BinaryStatus, pi_program *Program) { - (void)Metadata; - (void)NumMetadataEntries; - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - // For now we support only one device. 
- if (NumDevices != 1) { - urPrint("piProgramCreateWithBinary: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - if (!Binaries[0] || !Lengths[0]) { - if (BinaryStatus) - *BinaryStatus = PI_ERROR_INVALID_VALUE; - return PI_ERROR_INVALID_VALUE; - } - - size_t Length = Lengths[0]; - auto Binary = Binaries[0]; - - // In OpenCL, clCreateProgramWithBinary() can be used to load any of the - // following: "program executable", "compiled program", or "library of - // compiled programs". In addition, the loaded program can be either - // IL (SPIR-v) or native device code. For now, we assume that - // piProgramCreateWithBinary() is only used to load a "program executable" - // as native device code. - // If we wanted to support all the same cases as OpenCL, we would need to - // somehow examine the binary image to distinguish the cases. Alternatively, - // we could change the PI interface and have the caller pass additional - // information to distinguish the cases. - - try { - *Program = new _pi_program(_pi_program::Native, Context, Binary, Length); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (BinaryStatus) - *BinaryStatus = PI_SUCCESS; - return PI_SUCCESS; + return pi2ur::piProgramCreateWithBinary(Context, NumDevices, DeviceList, + Lengths, Binaries, NumMetadataEntries, + Metadata, BinaryStatus, Program); } pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count, const char **Strings, const size_t *Lengths, pi_program *RetProgram) { - - (void)Context; - (void)Count; - (void)Strings; - (void)Lengths; - (void)RetProgram; - urPrint("piclProgramCreateWithSource: not supported in Level Zero\n"); - return PI_ERROR_INVALID_OPERATION; + return pi2ur::piclProgramCreateWithSource(Context, Count, Strings, Lengths, + RetProgram); } pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Program->RefCount.load()}); - case PI_PROGRAM_INFO_NUM_DEVICES: - // TODO: return true number of devices this program exists for. - return ReturnValue(pi_uint32{1}); - case PI_PROGRAM_INFO_DEVICES: - // TODO: return all devices this program exists for. - return ReturnValue(Program->Context->Devices[0]); - case PI_PROGRAM_INFO_BINARY_SIZES: { - std::shared_lock Guard(Program->Mutex); - size_t SzBinary; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - SzBinary = Program->CodeLength; - } else if (Program->State == _pi_program::Exe) { - ZE_CALL(zeModuleGetNativeBinary, (Program->ZeModule, &SzBinary, nullptr)); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - // This is an array of 1 element, initialized as if it were scalar. - return ReturnValue(size_t{SzBinary}); - } - case PI_PROGRAM_INFO_BINARIES: { - // The caller sets "ParamValue" to an array of pointers, one for each - // device. Since Level Zero supports only one device, there is only one - // pointer. If the pointer is NULL, we don't do anything. Otherwise, we - // copy the program's binary image to the buffer at that pointer. 
- uint8_t **PBinary = ur_cast(ParamValue); - if (!PBinary[0]) - break; - - std::shared_lock Guard(Program->Mutex); - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); - } else if (Program->State == _pi_program::Exe) { - size_t SzBinary = 0; - ZE_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, PBinary[0])); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - break; - } - case PI_PROGRAM_INFO_NUM_KERNELS: { - std::shared_lock Guard(Program->Mutex); - uint32_t NumKernels; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } else if (Program->State == _pi_program::Exe) { - NumKernels = 0; - ZE_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &NumKernels, nullptr)); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - return ReturnValue(size_t{NumKernels}); - } - case PI_PROGRAM_INFO_KERNEL_NAMES: - try { - std::shared_lock Guard(Program->Mutex); - std::string PINames{""}; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } else if (Program->State == _pi_program::Exe) { - uint32_t Count = 0; - ZE_CALL(zeModuleGetKernelNames, (Program->ZeModule, &Count, nullptr)); - std::unique_ptr PNames(new const char *[Count]); - ZE_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, PNames.get())); - for (uint32_t I = 0; I < Count; ++I) { - PINames += (I > 0 ? ";" : ""); - PINames += PNames[I]; - } - } else { - return PI_ERROR_INVALID_PROGRAM; - } - return ReturnValue(PINames.c_str()); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - default: - die("piProgramGetInfo: not implemented"); - } - - return PI_SUCCESS; + return pi2ur::piProgramGetInfo(Program, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, @@ -3876,169 +304,9 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, const pi_program *InputPrograms, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData, pi_program *RetProgram) { - // We only support one device with Level Zero currently. - if (NumDevices != 1) { - urPrint("piProgramLink: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - - // We do not support any link flags at this time because the Level Zero API - // does not have any way to pass flags that are specific to linking. - if (Options && *Options != '\0') { - std::string ErrorMessage( - "Level Zero does not support kernel link flags: \""); - ErrorMessage.append(Options); - ErrorMessage.push_back('\"'); - pi_program Program = - new _pi_program(_pi_program::Invalid, Context, ErrorMessage); - *RetProgram = Program; - return PI_ERROR_LINK_PROGRAM_FAILURE; - } - - // Validate input parameters. 
- PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(Context->isValidDevice(DeviceList[0]), PI_ERROR_INVALID_DEVICE); - PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - if (NumInputPrograms == 0 || InputPrograms == nullptr) - return PI_ERROR_INVALID_VALUE; - - pi_result PiResult = PI_SUCCESS; - try { - // Acquire a "shared" lock on each of the input programs, and also validate - // that they are all in Object state. - // - // There is no danger of deadlock here even if two threads call - // piProgramLink simultaneously with the same input programs in a different - // order. If we were acquiring these with "exclusive" access, this could - // lead to a classic lock ordering deadlock. However, there is no such - // deadlock potential with "shared" access. There could also be a deadlock - // potential if there was some other code that holds more than one of these - // locks simultaneously with "exclusive" access. However, there is no such - // code like that, so this is also not a danger. - std::vector> Guards(NumInputPrograms); - for (pi_uint32 I = 0; I < NumInputPrograms; I++) { - std::shared_lock Guard(InputPrograms[I]->Mutex); - Guards[I].swap(Guard); - if (InputPrograms[I]->State != _pi_program::Object) { - return PI_ERROR_INVALID_OPERATION; - } - } - - // Previous calls to piProgramCompile did not actually compile the SPIR-V. - // Instead, we postpone compilation until this point, when all the modules - // are linked together. By doing compilation and linking together, the JIT - // compiler is able see all modules and do cross-module optimizations. - // - // Construct a ze_module_program_exp_desc_t which contains information about - // all of the modules that will be linked together. - ZeStruct ZeExtModuleDesc; - std::vector CodeSizes(NumInputPrograms); - std::vector CodeBufs(NumInputPrograms); - std::vector BuildFlagPtrs(NumInputPrograms); - std::vector SpecConstPtrs(NumInputPrograms); - std::vector<_pi_program::SpecConstantShim> SpecConstShims; - SpecConstShims.reserve(NumInputPrograms); - - for (pi_uint32 I = 0; I < NumInputPrograms; I++) { - pi_program Program = InputPrograms[I]; - CodeSizes[I] = Program->CodeLength; - CodeBufs[I] = Program->Code.get(); - BuildFlagPtrs[I] = Program->BuildFlags.c_str(); - SpecConstShims.emplace_back(Program); - SpecConstPtrs[I] = SpecConstShims[I].ze(); - } - - ZeExtModuleDesc.count = NumInputPrograms; - ZeExtModuleDesc.inputSizes = CodeSizes.data(); - ZeExtModuleDesc.pInputModules = CodeBufs.data(); - ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); - ZeExtModuleDesc.pConstants = SpecConstPtrs.data(); - - ZeStruct ZeModuleDesc; - ZeModuleDesc.pNext = &ZeExtModuleDesc; - ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; - - // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1", - // the driver does validation of the API calls, and it expects - // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This - // validation is wrong when using the "ze_module_program_exp_desc_t" - // extension because those fields are supposed to be ignored. As a - // workaround, set both fields to 1. - // - // TODO: Remove this workaround when the driver is fixed. - ZeModuleDesc.pInputModule = reinterpret_cast(1); - ZeModuleDesc.inputSize = 1; - - // We need a Level Zero extension to compile multiple programs together into - // a single Level Zero module. However, we don't need that extension if - // there happens to be only one input program. 
- // - // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the - // Level Zero driver. The driver's "ze_module_program_exp_desc_t" - // extension should work even in the case when there is just one input - // module. However, there is currently a bug in the driver that leads to a - // crash. As a workaround, do not use the extension when there is one - // input module. - // - // TODO: Remove this workaround when the driver is fixed. - if (!DeviceList[0]->Platform->ZeDriverModuleProgramExtensionFound || - (NumInputPrograms == 1)) { - if (NumInputPrograms == 1) { - ZeModuleDesc.pNext = nullptr; - ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0]; - ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0]; - ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0]; - ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0]; - } else { - urPrint("piProgramLink: level_zero driver does not have static linking " - "support."); - return PI_ERROR_INVALID_VALUE; - } - } - - // Call the Level Zero API to compile, link, and create the module. - ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice; - ze_context_handle_t ZeContext = Context->ZeContext; - ze_module_handle_t ZeModule = nullptr; - ze_module_build_log_handle_t ZeBuildLog = nullptr; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &ZeBuildLog)); - - // We still create a _pi_program object even if there is a BUILD_FAILURE - // because we need the object to hold the ZeBuildLog. There is no build - // log created for other errors, so we don't create an object. - PiResult = mapError(ZeResult); - if (ZeResult != ZE_RESULT_SUCCESS && - ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { - return PiResult; - } - - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with piProgramLink - // are supposed to be fully linked and ready to use. Therefore, do an extra - // check now for unresolved symbols. Note that we still create a - // _pi_program if there are unresolved symbols because the ZeBuildLog tells - // which symbols are unresolved. - if (ZeResult == ZE_RESULT_SUCCESS) { - ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); - if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { - PiResult = PI_ERROR_LINK_PROGRAM_FAILURE; - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - - _pi_program::state State = - (PiResult == PI_SUCCESS) ? _pi_program::Exe : _pi_program::Invalid; - *RetProgram = new _pi_program(State, Context, ZeModule, ZeBuildLog); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PiResult; + return pi2ur::piProgramLink(Context, NumDevices, DeviceList, Options, + NumInputPrograms, InputPrograms, PFnNotify, + UserData, RetProgram); } pi_result piProgramCompile( @@ -4046,532 +314,92 @@ pi_result piProgramCompile( const char *Options, pi_uint32 NumInputHeaders, const pi_program *InputHeaders, const char **HeaderIncludeNames, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { - (void)NumInputHeaders; - (void)InputHeaders; - (void)HeaderIncludeNames; - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) - return PI_ERROR_INVALID_VALUE; - - // These aren't supported. 
- PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - - std::scoped_lock Guard(Program->Mutex); - - // It's only valid to compile a program created from IL (we don't support - // programs created from source code). - // - // The OpenCL spec says that the header parameters are ignored when compiling - // IL programs, so we don't validate them. - if (Program->State != _pi_program::IL) - return PI_ERROR_INVALID_OPERATION; - - // We don't compile anything now. Instead, we delay compilation until - // piProgramLink, where we do both compilation and linking as a single step. - // This produces better code because the driver can do cross-module - // optimizations. Therefore, we just remember the compilation flags, so we - // can use them later. - if (Options) - Program->BuildFlags = Options; - Program->State = _pi_program::Object; - - return PI_SUCCESS; + return pi2ur::piProgramCompile(Program, NumDevices, DeviceList, Options, + NumInputHeaders, InputHeaders, + HeaderIncludeNames, PFnNotify, UserData); } pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, const char *Options, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { - - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) - return PI_ERROR_INVALID_VALUE; - - // We only support build to one device with Level Zero now. - // TODO: we should eventually build to the possibly multiple root - // devices in the context. - if (NumDevices != 1) { - urPrint("piProgramBuild: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - - // These aren't supported. - PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - - std::scoped_lock Guard(Program->Mutex); - // Check if device belongs to associated context. - PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(Program->Context->isValidDevice(DeviceList[0]), - PI_ERROR_INVALID_VALUE); - - // It is legal to build a program created from either IL or from native - // device code. - if (Program->State != _pi_program::IL && - Program->State != _pi_program::Native) - return PI_ERROR_INVALID_OPERATION; - - // We should have either IL or native device code. - PI_ASSERT(Program->Code, PI_ERROR_INVALID_PROGRAM); - - // Ask Level Zero to build and load the native code onto the device. - ZeStruct ZeModuleDesc; - _pi_program::SpecConstantShim Shim(Program); - ZeModuleDesc.format = (Program->State == _pi_program::IL) - ? ZE_MODULE_FORMAT_IL_SPIRV - : ZE_MODULE_FORMAT_NATIVE; - ZeModuleDesc.inputSize = Program->CodeLength; - ZeModuleDesc.pInputModule = Program->Code.get(); - ZeModuleDesc.pBuildFlags = Options; - ZeModuleDesc.pConstants = Shim.ze(); - - ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice; - ze_context_handle_t ZeContext = Program->Context->ZeContext; - ze_module_handle_t ZeModule = nullptr; - - pi_result Result = PI_SUCCESS; - Program->State = _pi_program::Exe; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &Program->ZeBuildLog)); - if (ZeResult != ZE_RESULT_SUCCESS) { - // We adjust pi_program below to avoid attempting to release zeModule when - // RT calls piProgramRelease(). 
- Program->State = _pi_program::Invalid; - Result = mapError(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; - } - } else { - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with - // piProgramBuild are supposed to be fully linked and ready to use. - // Therefore, do an extra check now for unresolved symbols. - ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog); - if (ZeResult != ZE_RESULT_SUCCESS) { - Program->State = _pi_program::Invalid; - Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) - ? PI_ERROR_BUILD_PROGRAM_FAILURE - : mapError(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; - } - } - } - - // We no longer need the IL / native code. - Program->Code.reset(); - Program->ZeModule = ZeModule; - return Result; + return pi2ur::piProgramBuild(Program, NumDevices, DeviceList, Options, + PFnNotify, UserData); } pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, pi_program_build_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - - std::shared_lock Guard(Program->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - if (ParamName == PI_PROGRAM_BUILD_INFO_BINARY_TYPE) { - pi_program_binary_type Type = PI_PROGRAM_BINARY_TYPE_NONE; - if (Program->State == _pi_program::Object) { - Type = PI_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; - } else if (Program->State == _pi_program::Exe) { - Type = PI_PROGRAM_BINARY_TYPE_EXECUTABLE; - } - return ReturnValue(pi_program_binary_type{Type}); - } - if (ParamName == PI_PROGRAM_BUILD_INFO_OPTIONS) { - // TODO: how to get module build options out of Level Zero? - // For the programs that we compiled we can remember the options - // passed with piProgramCompile/piProgramBuild, but what can we - // return for programs that were built outside and registered - // with piProgramRegister? - return ReturnValue(""); - } else if (ParamName == PI_PROGRAM_BUILD_INFO_LOG) { - // Check first to see if the plugin code recorded an error message. - if (!Program->ErrorMessage.empty()) { - return ReturnValue(Program->ErrorMessage.c_str()); - } - - // Next check if there is a Level Zero build log. - if (Program->ZeBuildLog) { - size_t LogSize = ParamValueSize; - ZE_CALL(zeModuleBuildLogGetString, - (Program->ZeBuildLog, &LogSize, ur_cast(ParamValue))); - if (ParamValueSizeRet) { - *ParamValueSizeRet = LogSize; - } - if (ParamValue) { - // When the program build fails in piProgramBuild(), we delayed cleaning - // up the build log because RT later calls this routine to get the - // failed build log. - // To avoid memory leaks, we should clean up the failed build log here - // because RT does not create sycl::program when piProgramBuild() fails, - // thus it won't call piProgramRelease() to clean up the build log. - if (Program->State == _pi_program::Invalid) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); - Program->ZeBuildLog = nullptr; - } - } - return PI_SUCCESS; - } - - // Otherwise, there is no error. The OpenCL spec says to return an empty - // string if there ws no previous attempt to compile, build, or link the - // program. 
- return ReturnValue(""); - } else { - urPrint("piProgramGetBuildInfo: unsupported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + + return pi2ur::piProgramGetBuildInfo(Program, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piProgramRetain(pi_program Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - Program->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piProgramRetain(Program); } pi_result piProgramRelease(pi_program Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - if (!Program->RefCount.decrementAndTest()) - return PI_SUCCESS; - - delete Program; - - return PI_SUCCESS; + return pi2ur::piProgramRelease(Program); } pi_result piextProgramGetNativeHandle(pi_program Program, pi_native_handle *NativeHandle) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeModule = ur_cast(NativeHandle); - - std::shared_lock Guard(Program->Mutex); - switch (Program->State) { - case _pi_program::Exe: { - *ZeModule = Program->ZeModule; - break; - } - - default: - return PI_ERROR_INVALID_OPERATION; - } - - return PI_SUCCESS; + return pi2ur::piextProgramGetNativeHandle(Program, NativeHandle); } pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, - bool ownNativeHandle, + bool OwnNativeHandle, pi_program *Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - auto ZeModule = ur_cast(NativeHandle); - - // We assume here that programs created from a native handle always - // represent a fully linked executable (state Exe) and not an unlinked - // executable (state Object). - - try { - *Program = - new _pi_program(_pi_program::Exe, Context, ZeModule, ownNativeHandle); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; -} - -_pi_program::~_pi_program() { - // According to Level Zero Specification, all kernels and build logs - // must be destroyed before the Module can be destroyed. So, be sure - // to destroy build log before destroying the module. - if (ZeBuildLog) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); - } - - if (ZeModule && OwnZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - } -} - -// Check to see if a Level Zero module has any unresolved symbols. -// -// @param ZeModule The module handle to check. -// @param ZeBuildLog If there are unresolved symbols, this build log handle is -// modified to receive information telling which symbols -// are unresolved. -// -// @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved -// symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other -// value indicates there was an error and we cannot tell if symbols are -// resolved. -static ze_result_t -checkUnresolvedSymbols(ze_module_handle_t ZeModule, - ze_module_build_log_handle_t *ZeBuildLog) { - - // First check to see if the module has any imported symbols. If there are - // no imported symbols, it's not possible to have any unresolved symbols. We - // do this check first because we assume it's faster than the call to - // zeModuleDynamicLink below. 
- ZeStruct ZeModuleProps; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps)); - if (ZeResult != ZE_RESULT_SUCCESS) - return ZeResult; - - // If there are imported symbols, attempt to "link" the module with itself. - // As a side effect, this will return the error - // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols. - if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) { - return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog)); - } - return ZE_RESULT_SUCCESS; + return pi2ur::piextProgramCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Program); } pi_result piKernelCreate(pi_program Program, const char *KernelName, pi_kernel *RetKernel) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE); - PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Program->Mutex); - if (Program->State != _pi_program::Exe) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } - - ZeStruct ZeKernelDesc; - ZeKernelDesc.flags = 0; - ZeKernelDesc.pKernelName = KernelName; - - ze_kernel_handle_t ZeKernel; - ZE_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel)); - - try { - *RetKernel = new _pi_kernel(ZeKernel, true, Program); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - PI_CALL((*RetKernel)->initialize()); - return PI_SUCCESS; -} - -pi_result _pi_kernel::initialize() { - // Retain the program and context to show it's used by this kernel. - PI_CALL(piProgramRetain(Program)); - if (IndirectAccessTrackingEnabled) - // TODO: do piContextRetain without the guard - PI_CALL(piContextRetain(Program->Context)); - - // Set up how to obtain kernel properties when needed. - ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) { - ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties)); - }; - - // Cache kernel name. - ZeKernelName.Compute = [this](std::string &Name) { - size_t Size = 0; - ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr)); - char *KernelName = new char[Size]; - ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName)); - Name = KernelName; - delete[] KernelName; - }; - - return PI_SUCCESS; + return pi2ur::piKernelCreate(Program, KernelName, RetKernel); } pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, const void *ArgValue) { - // OpenCL: "the arg_value pointer can be NULL or point to a NULL value - // in which case a NULL value will be used as the value for the argument - // declared as a pointer to global or constant memory in the kernel" - // - // We don't know the type of the argument but it seems that the only time - // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument - // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. - if (ArgSize == sizeof(void *) && ArgValue && - *(void **)(const_cast(ArgValue)) == nullptr) { - ArgValue = nullptr; - } - - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - std::scoped_lock Guard(Kernel->Mutex); - ZE_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), - ur_cast(ArgIndex), ur_cast(ArgSize), - ur_cast(ArgValue))); - - return PI_SUCCESS; + return pi2ur::piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue); } // Special version of piKernelSetArg to accept pi_mem. 
pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, const pi_mem *ArgValue) { - // TODO: the better way would probably be to add a new PI API for - // extracting native PI object from PI handle, and have SYCL - // RT pass that directly to the regular piKernelSetArg (and - // then remove this piextKernelSetArgMemObj). - - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - // We don't yet know the device where this kernel will next be run on. - // Thus we can't know the actual memory allocation that needs to be used. - // Remember the memory object being used as an argument for this kernel - // to process it later when the device is known (at the kernel enqueue). - // - // TODO: for now we have to conservatively assume the access as read-write. - // Improve that by passing SYCL buffer accessor type into - // piextKernelSetArgMemObj. - // - std::scoped_lock Guard(Kernel->Mutex); - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - auto Arg = ArgValue ? *ArgValue : nullptr; - Kernel->PendingArguments.push_back( - {ArgIndex, sizeof(void *), Arg, _pi_mem::read_write}); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgMemObj(Kernel, ArgIndex, ArgValue); } // Special version of piKernelSetArg to accept pi_sampler. pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex, const pi_sampler *ArgValue) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - std::scoped_lock Guard(Kernel->Mutex); - ZE_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), - ur_cast(ArgIndex), sizeof(void *), - &(*ArgValue)->ZeSampler)); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgSampler(Kernel, ArgIndex, ArgValue); } pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - switch (ParamName) { - case PI_KERNEL_INFO_CONTEXT: - return ReturnValue(pi_context{Kernel->Program->Context}); - case PI_KERNEL_INFO_PROGRAM: - return ReturnValue(pi_program{Kernel->Program}); - case PI_KERNEL_INFO_FUNCTION_NAME: - try { - std::string &KernelName = *Kernel->ZeKernelName.operator->(); - return ReturnValue(static_cast(KernelName.c_str())); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - case PI_KERNEL_INFO_NUM_ARGS: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->numKernelArgs}); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Kernel->RefCount.load()}); - case PI_KERNEL_INFO_ATTRIBUTES: - try { - uint32_t Size; - ZE_CALL(zeKernelGetSourceAttributes, (Kernel->ZeKernel, &Size, nullptr)); - char *attributes = new char[Size]; - ZE_CALL(zeKernelGetSourceAttributes, - (Kernel->ZeKernel, &Size, &attributes)); - auto Res = ReturnValue(attributes); - delete[] attributes; - return Res; - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - default: - urPrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piKernelGetInfo(Kernel, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, pi_kernel_group_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - switch (ParamName) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - struct { - size_t Arr[3]; - } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * - Device->ZeDeviceComputeProperties->maxGroupCountX), - (Device->ZeDeviceComputeProperties->maxGroupSizeY * - Device->ZeDeviceComputeProperties->maxGroupCountY), - (Device->ZeDeviceComputeProperties->maxGroupSizeZ * - Device->ZeDeviceComputeProperties->maxGroupCountZ)}}; - return ReturnValue(GlobalWorkSize); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - // As of right now, L0 is missing API to query kernel and device specific - // max work group size. - return ReturnValue( - pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - struct { - size_t Arr[3]; - } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX, - Kernel->ZeKernelProperties->requiredGroupSizeY, - Kernel->ZeKernelProperties->requiredGroupSizeZ}}; - return ReturnValue(WgSize); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->localMemSize}); - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->privateMemSize}); - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not " - "implemented\n"); - break; - } - default: - urPrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + + return pi2ur::piKernelGetGroupInfo(Kernel, Device, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); } pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device, @@ -4579,57 +407,20 @@ pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device, size_t InputValueSize, const void *InputValue, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - (void)InputValueSize; - (void)InputValue; - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - if (ParamName == PI_KERNEL_MAX_SUB_GROUP_SIZE) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize}); - } else if (ParamName == PI_KERNEL_MAX_NUM_SUB_GROUPS) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups}); - } else if (ParamName == PI_KERNEL_COMPILE_NUM_SUB_GROUPS) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups}); - } else if (ParamName == PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize}); - } else { - die("piKernelGetSubGroupInfo: 
parameter not implemented"); - return {}; - } - return PI_SUCCESS; + + return pi2ur::piKernelGetSubGroupInfo( + Kernel, Device, ParamName, InputValueSize, InputValue, ParamValueSize, + ParamValue, ParamValueSizeRet); } pi_result piKernelRetain(pi_kernel Kernel) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - Kernel->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piKernelRetain(Kernel); } pi_result piKernelRelease(pi_kernel Kernel) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - if (!Kernel->RefCount.decrementAndTest()) - return PI_SUCCESS; - - auto KernelProgram = Kernel->Program; - if (Kernel->OwnZeKernel) { - auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - if (IndirectAccessTrackingEnabled) { - PI_CALL(piContextRelease(KernelProgram->Context)); - } - // do a release on the program this kernel was part of - PI_CALL(piProgramRelease(KernelProgram)); - delete Kernel; - return PI_SUCCESS; + return pi2ur::piKernelRelease(Kernel); } pi_result @@ -4638,215 +429,9 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock( - Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); - if (GlobalWorkOffset != NULL) { - if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) { - urPrint("No global offset extension found on this driver\n"); - return PI_ERROR_INVALID_VALUE; - } - - ZE_CALL(zeKernelSetGlobalOffsetExp, - (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], - GlobalWorkOffset[2])); - } - - // If there are any pending arguments set them now. - for (auto &Arg : Kernel->PendingArguments) { - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - char **ZeHandlePtr = nullptr; - if (Arg.Value) { - PI_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - Queue->Device)); - } - ZE_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); - } - Kernel->PendingArguments.clear(); - - ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]; - - // global_work_size of unused dimensions must be set to 1 - PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE); - PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE); - - if (LocalWorkSize) { - WG[0] = ur_cast(LocalWorkSize[0]); - WG[1] = ur_cast(LocalWorkSize[1]); - WG[2] = ur_cast(LocalWorkSize[2]); - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. 
- bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE_CALL(zeKernelSuggestGroupSize, - (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], - GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. - uint32_t GroupSize[] = { - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]); - while (GlobalWorkSize[I] % GroupSize[I]) { - --GroupSize[I]; - } - if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { - urPrint("piEnqueueKernelLaunch: can't find a WG size " - "suitable for global work size > UINT32_MAX\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - urPrint("piEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n", - WG[0], WG[1], WG[2]); - } - } - - // TODO: assert if sizes do not fit into 32-bit? - switch (WorkDim) { - case 3: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize[1] / WG[1]); - ZeThreadGroupDimensions.groupCountZ = - ur_cast(GlobalWorkSize[2] / WG[2]); - break; - case 2: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize[1] / WG[1]); - WG[2] = 1; - break; - case 1: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - WG[1] = WG[2] = 1; - break; - - default: - urPrint("piEnqueueKernelLaunch: unsupported work_dim\n"); - return PI_ERROR_INVALID_VALUE; - } - - // Error handling for non-uniform group size case - if (GlobalWorkSize[0] != - size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 1st dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize[1] != - size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 2nd dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 3rd dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); - - bool UseCopyEngine = false; - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, true /* AllowBatching */)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - pi_result Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - // Save the kernel in the event, so that when the event is signalled - // the code can do a piKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; - - // Increment the reference count of the Kernel and indicate that the Kernel is - // in use. Once the event has been signalled, the code in - // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the - // reference count on the kernel, using the kernel saved in CommandData. - PI_CALL(piKernelRetain(Kernel)); - - // Add to list of kernels to be submitted - if (IndirectAccessTrackingEnabled) - Queue->KernelsToBeSubmitted.push_back(Kernel); - - if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) { - // If using immediate commandlists then gathering of indirect - // references and appending to the queue (which means submission) - // must be done together. - std::unique_lock ContextsLock( - Queue->Device->Platform->ContextsMutex, std::defer_lock); - // We are going to submit kernels for execution. If indirect access flag is - // set for a kernel then we need to make a snapshot of existing memory - // allocations in all contexts in the platform. We need to lock the mutex - // guarding the list of contexts in the platform to prevent creation of new - // memory alocations in any context before we submit the kernel for - // execution. - ContextsLock.lock(); - Queue->CaptureIndirectAccesses(); - // Add the command to the command list, which implies submission. - ZE_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); - } else { - // Add the command to the command list for later submission. - // No lock is needed here, unlike the immediate commandlist case above, - // because the kernels are not actually submitted yet. Kernels will be - // submitted only when the comamndlist is closed. Then, a lock is held. - ZE_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); - } - - urPrint("calling zeCommandListAppendLaunchKernel() with" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList((*Event)->WaitList); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. 
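The launch path retains the kernel and stores it in the event's CommandData so that the matching release happens only once the event completes. A miniature sketch of that ownership hand-off, using invented MiniKernel and MiniEvent types, since the same retain-at-enqueue / release-at-completion pattern recurs throughout the adapter:

#include <cassert>

// Invented miniature ref-counted kernel for the sketch.
struct MiniKernel {
  int RefCount = 1;
  void retain() { ++RefCount; }
  void release() { --RefCount; } // a real implementation destroys at zero
};

struct MiniEvent {
  MiniKernel *CommandData = nullptr; // kernel kept alive until completion
};

void enqueue(MiniEvent &E, MiniKernel &K) {
  K.retain();         // keep the kernel alive while the command is in flight
  E.CommandData = &K; // remembered so completion cleanup can find it
}

void onEventComplete(MiniEvent &E) {
  if (E.CommandData) {
    E.CommandData->release(); // balances the retain done at enqueue time
    E.CommandData = nullptr;
  }
}

int main() {
  MiniKernel K;
  MiniEvent E;
  enqueue(E, K);
  assert(K.RefCount == 2);
  onEventComplete(E);
  assert(K.RefCount == 1);
  return 0;
}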
- if (auto Res = Queue->executeCommandList(CommandList, false, true)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueKernelLaunch( + Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumEventsInWaitList, EventWaitList, OutEvent); } pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -4854,535 +439,42 @@ pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_program Program, bool OwnNativeHandle, pi_kernel *Kernel) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - auto ZeKernel = ur_cast(NativeHandle); - *Kernel = new _pi_kernel(ZeKernel, OwnNativeHandle, Program); - PI_CALL((*Kernel)->initialize()); - return PI_SUCCESS; + return pi2ur::piextKernelCreateWithNativeHandle( + NativeHandle, Context, Program, OwnNativeHandle, Kernel); } pi_result piextKernelGetNativeHandle(pi_kernel Kernel, pi_native_handle *NativeHandle) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Kernel->Mutex); - auto *ZeKernel = ur_cast(NativeHandle); - *ZeKernel = Kernel->ZeKernel; - return PI_SUCCESS; + return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle); } // // Events // -pi_result -_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT); - - std::scoped_lock Lock(Queue->Mutex, - this->Mutex); - - if (!HostVisibleEvent) { - if (Queue->Device->ZeEventsScope != OnDemandHostVisibleProxy) - die("getOrCreateHostVisibleEvent: missing host-visible event"); - - // Submit the command(s) signalling the proxy event to the queue. - // We have to first submit a wait for the device-only event for which this - // proxy is created. - // - // Get a new command list to be used on this call - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, false /* UseCopyEngine */, OkToBatch)) - return Res; - - // Create a "proxy" host-visible event. - auto Res = createEventAndAssociateQueue( - Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); - if (Res != PI_SUCCESS) - return Res; - - ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent)); - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) - return Res; - } - - ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; - return PI_SUCCESS; -} - -pi_result _pi_event::reset() { - Queue = nullptr; - CleanedUp = false; - Completed = false; - CommandData = nullptr; - CommandType = PI_COMMAND_TYPE_USER; - WaitList = {}; - RefCountExternal = 0; - RefCount.reset(); - CommandList = std::nullopt; - - if (!isHostVisible()) - HostVisibleEvent = nullptr; - - ZE_CALL(zeEventHostReset, (ZeEvent)); - return PI_SUCCESS; -} - -pi_event _pi_context::getEventFromContextCache(bool HostVisible, - bool WithProfiling) { - std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling); - if (Cache->empty()) - return nullptr; - - auto It = Cache->begin(); - pi_event Event = *It; - Cache->erase(It); - // We have to reset event before using it. 
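The context keeps completed events in per-kind caches so they can be recycled instead of recreated. A rough sketch of that bucketing, keyed by host visibility and profiling as above; the names are invented and the real cache additionally resets the underlying Level Zero event before reuse:

#include <array>
#include <mutex>
#include <vector>

struct Event; // opaque stand-in for the sketch

class EventCache {
  // Four buckets: {host-visible, profiling} x {yes, no}.
  std::array<std::vector<Event *>, 4> Buckets;
  std::mutex Mtx;

  static size_t index(bool HostVisible, bool WithProfiling) {
    return (HostVisible ? 2 : 0) + (WithProfiling ? 1 : 0);
  }

public:
  Event *take(bool HostVisible, bool WithProfiling) {
    std::scoped_lock Lock(Mtx);
    auto &B = Buckets[index(HostVisible, WithProfiling)];
    if (B.empty())
      return nullptr; // caller falls back to creating a fresh event
    Event *E = B.back();
    B.pop_back();
    return E; // caller must reset the event before reuse
  }

  void put(Event *E, bool HostVisible, bool WithProfiling) {
    std::scoped_lock Lock(Mtx);
    Buckets[index(HostVisible, WithProfiling)].push_back(E);
  }
};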
- Event->reset(); - return Event; -} - -void _pi_context::addEventToContextCache(pi_event Event) { - std::scoped_lock Lock(EventCacheMutex); - auto Cache = - getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); - Cache->emplace_back(Event); -} - -// Helper function for creating a PI event. -// The "Queue" argument specifies the PI queue where a command is submitted. -// The "HostVisible" argument specifies if event needs to be allocated from -// a host-visible pool. -// -static pi_result EventCreate(pi_context Context, pi_queue Queue, - bool HostVisible, pi_event *RetEvent) { - bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; - - if (auto CachedEvent = - Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { - *RetEvent = CachedEvent; - return PI_SUCCESS; - } - - ze_event_handle_t ZeEvent; - ze_event_pool_handle_t ZeEventPool = {}; - - size_t Index = 0; - - if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled)) - return Res; - - ZeStruct ZeEventDesc; - ZeEventDesc.index = Index; - ZeEventDesc.wait = 0; - - if (HostVisible) { - ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - } else { - // - // Set the scope to "device" for every event. This is sufficient for - // global device access and peer device access. If needed to be seen on - // the host we are doing special handling, see EventsScope options. - // - // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be - // used in some circumstances. - // - ZeEventDesc.signal = 0; - } - - ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent)); - - try { - PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE); - - *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (HostVisible) - (*RetEvent)->HostVisibleEvent = *RetEvent; - - return PI_SUCCESS; -} // External PI API entry pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { - pi_result Result = EventCreate(Context, nullptr, true, RetEvent); - (*RetEvent)->RefCountExternal++; - if (Result != PI_SUCCESS) - return Result; - ZE_CALL(zeEventHostSignal, ((*RetEvent)->ZeEvent)); - return PI_SUCCESS; + return pi2ur::piEventCreate(Context, RetEvent); } pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_EVENT_INFO_COMMAND_QUEUE: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(pi_queue{Event->Queue}); - } - case PI_EVENT_INFO_CONTEXT: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(pi_context{Event->Context}); - } - case PI_EVENT_INFO_COMMAND_TYPE: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(ur_cast(Event->CommandType)); - } - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - // Check to see if the event's Queue has an open command list due to - // batching. If so, go ahead and close and submit it, because it is - // possible that this is trying to query some event's status that - // is part of the batch. This isn't strictly required, but it seems - // like a reasonable thing to do. - auto Queue = Event->Queue; - if (Queue) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock lock(Queue->Mutex); - const auto &OpenCommandList = Queue->eventOpenCommandList(Event); - if (OpenCommandList != Queue->CommandListMap.end()) { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - - // Level Zero has a much more explicit notion of command submission than - // OpenCL. It doesn't happen unless the user submits a command list. We've - // done it just above so the status is at least PI_EVENT_SUBMITTED. - // - // NOTE: We currently cannot tell if command is currently running, so - // it will always show up "submitted" before it is finally "completed". - // - pi_int32 Result = PI_EVENT_SUBMITTED; - - // Make sure that we query a host-visible event only. - // If one wasn't yet created then don't create it here as well, and - // just conservatively return that event is not yet completed. - std::shared_lock EventLock(Event->Mutex); - auto HostVisibleEvent = Event->HostVisibleEvent; - if (Event->Completed) { - Result = PI_EVENT_COMPLETE; - } else if (HostVisibleEvent) { - ze_result_t ZeResult; - ZeResult = - ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent)); - if (ZeResult == ZE_RESULT_SUCCESS) { - Result = PI_EVENT_COMPLETE; - } - } - return ReturnValue(ur_cast(Result)); - } - case PI_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Event->RefCount.load()}); - default: - urPrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; + return pi2ur::piEventGetInfo(Event, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - std::shared_lock EventLock(Event->Mutex); - if (Event->Queue && - (Event->Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - pi_device Device = - Event->Queue ? Event->Queue->Device : Event->Context->Devices[0]; - - uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; - const uint64_t TimestampMaxValue = - ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - ze_kernel_timestamp_result_t tsResult; - - switch (ParamName) { - case PI_PROFILING_INFO_COMMAND_START: { - ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); - uint64_t ContextStartTime = - (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution; - return ReturnValue(ContextStartTime); - } - case PI_PROFILING_INFO_COMMAND_END: { - ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); - - uint64_t ContextStartTime = - (tsResult.global.kernelStart & TimestampMaxValue); - uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue); - - // - // Handle a possible wrap-around (the underlying HW counter is < 64-bit). - // Note, it will not report correct time if there were multiple wrap - // arounds, and the longer term plan is to enlarge the capacity of the - // HW timestamps. 
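Because the hardware timestamp counter is narrower than 64 bits, the end value can wrap past the start value; the correction above adds the counter's maximum value once before scaling by the timer resolution. A short worked example of that arithmetic, with assumed valid-bit width and resolution values:

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed device properties, for the example only.
  const uint32_t ValidBits = 32;                     // kernelTimestampValidBits
  const uint64_t TimerResolution = 83;               // ns per tick (assumed)
  const uint64_t MaxValue = (1ULL << ValidBits) - 1; // counter mask

  uint64_t Start = 0xFFFFFF00u & MaxValue; // counter just before the wrap
  uint64_t End = 0x00000100u & MaxValue;   // counter after the wrap

  if (End <= Start) // single wrap-around: shift the end value past the mask
    End += MaxValue;

  std::printf("duration = %llu ns\n",
              (unsigned long long)((End - Start) * TimerResolution));
  return 0;
}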
- // - if (ContextEndTime <= ContextStartTime) { - ContextEndTime += TimestampMaxValue; - } - ContextEndTime *= ZeTimerResolution; - return ReturnValue(ContextEndTime); - } - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No users for this case - // The "command_submit" time is implemented by recording submission - // timestamp with a call to piGetDeviceAndHostTimer before command enqueue. - // - return ReturnValue(uint64_t{0}); - default: - urPrint("piEventGetProfilingInfo: not supported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; -} - -} // extern "C" - -// Perform any necessary cleanup after an event has been signalled. -// This currently makes sure to release any kernel that may have been used by -// the event, updates the last command event in the queue and cleans up all dep -// events of the event. -// If the caller locks queue mutex then it must pass 'true' to QueueLocked. -static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { - pi_kernel AssociatedKernel = nullptr; - // List of dependent events. - std::list EventsToBeReleased; - pi_queue AssociatedQueue = nullptr; - { - std::scoped_lock EventLock(Event->Mutex); - // Exit early of event was already cleanedup. - if (Event->CleanedUp) - return PI_SUCCESS; - - AssociatedQueue = Event->Queue; - - // Remember the kernel associated with this event if there is one. We are - // going to release it later. - if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && - Event->CommandData) { - AssociatedKernel = ur_cast(Event->CommandData); - Event->CommandData = nullptr; - } - - // Make a list of all the dependent events that must have signalled - // because this event was dependent on them. - Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - - Event->CleanedUp = true; - } - - auto ReleaseIndirectMem = [](pi_kernel Kernel) { - if (IndirectAccessTrackingEnabled) { - // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as - // kernel execution has finished. This is the place where we need to - // release memory allocations. If kernel is not in use (not submitted by - // some other thread) then release referenced memory allocations. As a - // result, memory can be deallocated and context can be removed from - // container in the platform. That's why we need to lock a mutex here. - pi_platform Plt = Kernel->Program->Context->getPlatform(); - std::scoped_lock ContextsLock(Plt->ContextsMutex); - - if (--Kernel->SubmissionsCount == 0) { - // Kernel is not submitted for execution, release referenced memory - // allocations. - for (auto &MemAlloc : Kernel->MemAllocs) { - // std::pair *, Hash - USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, - MemAlloc->second.OwnZeMemHandle); - } - Kernel->MemAllocs.clear(); - } - } - }; - - // We've reset event data members above, now cleanup resources. - if (AssociatedKernel) { - ReleaseIndirectMem(AssociatedKernel); - PI_CALL(piKernelRelease(AssociatedKernel)); - } - - if (AssociatedQueue) { - { - // Lock automatically releases when this goes out of scope. - std::unique_lock QueueLock(AssociatedQueue->Mutex, - std::defer_lock); - if (!QueueLocked) - QueueLock.lock(); - - // If this event was the LastCommandEvent in the queue, being used - // to make sure that commands were executed in-order, remove this. - // If we don't do this, the event can get released and freed leaving - // a dangling pointer to this event. 
It could also cause unneeded - // already finished events to show up in the wait list. - if (AssociatedQueue->LastCommandEvent == Event) { - AssociatedQueue->LastCommandEvent = nullptr; - } - } - - // Release this event since we explicitly retained it on creation and - // association with queue. Events which don't have associated queue doesn't - // require this release because it means that they are not created using - // createEventAndAssociateQueue, i.e. additional retain was not made. - PI_CALL(piEventReleaseInternal(Event)); - } - - // The list of dependent events will be appended to as we walk it so that this - // algorithm doesn't go recursive due to dependent events themselves being - // dependent on other events forming a potentially very deep tree, and deep - // recursion. That turned out to be a significant problem with the recursive - // code that preceded this implementation. - while (!EventsToBeReleased.empty()) { - pi_event DepEvent = EventsToBeReleased.front(); - DepEvent->Completed = true; - EventsToBeReleased.pop_front(); - - pi_kernel DepEventKernel = nullptr; - { - std::scoped_lock DepEventLock(DepEvent->Mutex); - DepEvent->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - if (IndirectAccessTrackingEnabled) { - // DepEvent has finished, we can release the associated kernel if there - // is one. This is the earliest place we can do this and it can't be - // done twice, so it is safe. Lock automatically releases when this goes - // out of scope. - // TODO: this code needs to be moved out of the guard. - if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && - DepEvent->CommandData) { - DepEventKernel = ur_cast(DepEvent->CommandData); - DepEvent->CommandData = nullptr; - } - } - } - if (DepEventKernel) { - ReleaseIndirectMem(DepEventKernel); - PI_CALL(piKernelRelease(DepEventKernel)); - } - PI_CALL(piEventReleaseInternal(DepEvent)); - } - - return PI_SUCCESS; -} - -extern "C" { + return pi2ur::piEventGetProfilingInfo(Event, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { - - if (NumEvents && !EventList) { - return PI_ERROR_INVALID_EVENT; - } - for (uint32_t I = 0; I < NumEvents; I++) { - if (EventList[I]->Queue->Device->ZeEventsScope == - OnDemandHostVisibleProxy) { - // Make sure to add all host-visible "proxy" event signals if needed. - // This ensures that all signalling commands are submitted below and - // thus proxy events can be waited without a deadlock. - // - if (!EventList[I]->hasExternalRefs()) - die("piEventsWait must not be called for an internal event"); - - ze_event_handle_t ZeHostVisibleEvent; - if (auto Res = - EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) - return Res; - } - } - // Submit dependent open command lists for execution, if any - for (uint32_t I = 0; I < NumEvents; I++) { - auto Queue = EventList[I]->Queue; - if (Queue) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock lock(Queue->Mutex); - - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - } - } - std::unordered_set Queues; - for (uint32_t I = 0; I < NumEvents; I++) { - { - { - std::shared_lock EventLock(EventList[I]->Mutex); - if (!EventList[I]->hasExternalRefs()) - die("piEventsWait must not be called for an internal event"); - - if (!EventList[I]->Completed) { - auto HostVisibleEvent = EventList[I]->HostVisibleEvent; - if (!HostVisibleEvent) - die("The host-visible proxy event missing"); - - ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; - urPrint("ZeEvent = %#llx\n", ur_cast(ZeEvent)); - ZE_CALL(zeHostSynchronize, (ZeEvent)); - EventList[I]->Completed = true; - } - } - if (auto Q = EventList[I]->Queue) { - if (Q->UsingImmCmdLists && Q->isInOrderQueue()) - // Use information about waited event to cleanup completed events in - // the in-order queue. - CleanupEventsInImmCmdLists(EventList[I]->Queue, - /* QueueLocked */ false, - /* QueueSynced */ false, EventList[I]); - else { - // NOTE: we are cleaning up after the event here to free resources - // sooner in case run-time is not calling piEventRelease soon enough. - CleanupCompletedEvent(EventList[I]); - // For the case when we have out-of-order queue or regular command - // lists its more efficient to check fences so put the queue in the - // set to cleanup later. - Queues.insert(Q); - } - } - } - } - - // We waited some events above, check queue for signaled command lists and - // reset them. - for (auto &Q : Queues) { - std::unique_lock Lock(Q->Mutex); - resetCommandLists(Q); - } - return PI_SUCCESS; + return pi2ur::piEventsWait(NumEvents, EventList); } pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, @@ -5390,152 +482,32 @@ pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, pi_int32 EventCommandStatus, void *UserData), void *UserData) { - (void)Event; - (void)CommandExecCallbackType; - (void)PFnNotify; - (void)UserData; - die("piEventSetCallback: deprecated, to be removed"); - return PI_SUCCESS; + return pi2ur::piEventSetCallback(Event, CommandExecCallbackType, PFnNotify, + UserData); } pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { - (void)Event; - (void)ExecutionStatus; - die("piEventSetStatus: deprecated, to be removed"); - return PI_SUCCESS; + return pi2ur::piEventSetStatus(Event, ExecutionStatus); } -pi_result piEventRetain(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - Event->RefCountExternal++; - Event->RefCount.increment(); - return PI_SUCCESS; -} +pi_result piEventRetain(pi_event Event) { return pi2ur::piEventRetain(Event); } pi_result piEventRelease(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - Event->RefCountExternal--; - PI_CALL(piEventReleaseInternal(Event)); - return PI_SUCCESS; -} - -void _pi_queue::active_barriers::add(pi_event &Event) { - Event->RefCount.increment(); - Events.push_back(Event); -} - -pi_result _pi_queue::active_barriers::clear() { - for (const auto &Event : Events) - PI_CALL(piEventReleaseInternal(Event)); - Events.clear(); - return PI_SUCCESS; -} - -static pi_result piEventReleaseInternal(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - if (!Event->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP && - Event->CommandData) { - // Free the memory allocated in the piEnqueueMemBufferMap. 
- if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData)) - return Res; - Event->CommandData = nullptr; - } - if (Event->OwnZeEvent) { - if (DisableEventsCaching) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) - return Res; - } - } - // It is possible that host-visible event was never created. - // In case it was check if that's different from this same event - // and release a reference to it. - if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) { - // Decrement ref-count of the host-visible proxy event. - PI_CALL(piEventReleaseInternal(Event->HostVisibleEvent)); - } - - // Save pointer to the queue before deleting/resetting event. - // When we add an event to the cache we need to check whether profiling is - // enabled or not, so we access properties of the queue and that's why queue - // must released later. - auto Queue = Event->Queue; - if (DisableEventsCaching || !Event->OwnZeEvent) { - delete Event; - } else { - Event->Context->addEventToContextCache(Event); - } - - // We intentionally incremented the reference counter when an event is - // created so that we can avoid pi_queue is released before the associated - // pi_event is released. Here we have to decrement it so pi_queue - // can be released successfully. - if (Queue) { - PI_CALL(piQueueReleaseInternal(Queue)); - } - - return PI_SUCCESS; + return pi2ur::piEventRelease(Event); } pi_result piextEventGetNativeHandle(pi_event Event, pi_native_handle *NativeHandle) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - { - std::shared_lock Lock(Event->Mutex); - auto *ZeEvent = ur_cast(NativeHandle); - *ZeEvent = Event->ZeEvent; - } - // Event can potentially be in an open command-list, make sure that - // it is submitted for execution to avoid potential deadlock if - // interop app is going to wait for it. - auto Queue = Event->Queue; - if (Queue) { - std::scoped_lock lock(Queue->Mutex); - const auto &OpenCommandList = Queue->eventOpenCommandList(Event); - if (OpenCommandList != Queue->CommandListMap.end()) { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - return PI_SUCCESS; + return pi2ur::piextEventGetNativeHandle(Event, NativeHandle); } pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, pi_event *Event) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeEvent = ur_cast(NativeHandle); - *Event = new _pi_event(ZeEvent, nullptr /* ZeEventPool */, Context, - PI_COMMAND_TYPE_USER, OwnNativeHandle); - - // Assume native event is host-visible, or otherwise we'd - // need to create a host-visible proxy for it. - (*Event)->HostVisibleEvent = *Event; - - // Unlike regular events managed by SYCL RT we don't have to wait for interop - // events completion, and not need to do the their `cleanup()`. This in - // particular guarantees that the extra `piEventRelease` is not called on - // them. That release is needed to match the `piEventRetain` of regular events - // made for waiting for event completion, but not this interop event. 
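For completeness, a hedged usage sketch of wrapping an existing ze_event_handle_t through piextEventCreateWithNativeHandle as declared above. It assumes the PI and Level Zero headers are in scope (includes omitted) and that Ctx and ZeEvent were created elsewhere, so it is a shape-only fragment rather than a complete program:

// Shape-only sketch: wrap an existing Level Zero event as a pi_event.
pi_event wrapNativeEvent(pi_context Ctx, ze_event_handle_t ZeEvent) {
  pi_event Event = nullptr;
  auto Native = reinterpret_cast<pi_native_handle>(ZeEvent);
  // OwnNativeHandle = false: the caller remains responsible for destroying
  // ZeEvent; releasing the pi_event will not destroy the native handle.
  if (piextEventCreateWithNativeHandle(Native, Ctx, /*OwnNativeHandle=*/false,
                                       &Event) != PI_SUCCESS)
    return nullptr;
  // Interop events are treated as host-visible and already "cleaned up", so
  // no extra retain/release bookkeeping is done for them.
  return Event;
}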
- (*Event)->CleanedUp = true; - - return PI_SUCCESS; + return pi2ur::piextEventCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Event); } // @@ -5544,167 +516,23 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, pi_result piSamplerCreate(pi_context Context, const pi_sampler_properties *SamplerProperties, pi_sampler *RetSampler) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE); - - std::shared_lock Lock(Context->Mutex); - - // Have the "0" device in context to own the sampler. Rely on Level-Zero - // drivers to perform migration as necessary for sharing it across multiple - // devices in the context. - // - // TODO: figure out if we instead need explicit copying for acessing - // the sampler from other devices in the context. - // - pi_device Device = Context->Devices[0]; - - ze_sampler_handle_t ZeSampler; - ZeStruct ZeSamplerDesc; - - // Set the default values for the ZeSamplerDesc. - ZeSamplerDesc.isNormalized = PI_TRUE; - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - - // Update the values of the ZeSamplerDesc from the pi_sampler_properties list. - // Default values will be used if any of the following is true: - // a) SamplerProperties list is NULL - // b) SamplerProperties list is missing any properties - - if (SamplerProperties) { - const pi_sampler_properties *CurProperty = SamplerProperties; - - while (*CurProperty != 0) { - switch (*CurProperty) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - pi_bool CurValueBool = ur_cast(*(++CurProperty)); - - if (CurValueBool == PI_TRUE) - ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == PI_FALSE) - ZeSamplerDesc.isNormalized = PI_FALSE; - else { - urPrint("piSamplerCreate: unsupported " - "PI_SAMPLER_NORMALIZED_COORDS value\n"); - return PI_ERROR_INVALID_VALUE; - } - } break; - - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - pi_sampler_addressing_mode CurValueAddressingMode = - ur_cast( - ur_cast(*(++CurProperty))); - - // Level Zero runtime with API version 1.2 and lower has a bug: - // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to - // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to - // border", i.e. logic is flipped. Starting from API version 1.3 this - // problem is going to be fixed. That's why check for API version to set - // an address mode. - ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; - // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - switch (CurValueAddressingMode) { - case PI_SAMPLER_ADDRESSING_MODE_NONE: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; - break; - case PI_SAMPLER_ADDRESSING_MODE_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; - break; - case PI_SAMPLER_ADDRESSING_MODE_CLAMP: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP - : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - break; - case PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? 
ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - : ZE_SAMPLER_ADDRESS_MODE_CLAMP; - break; - case PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; - break; - default: - urPrint("piSamplerCreate: unsupported PI_SAMPLER_ADDRESSING_MODE " - "value\n"); - urPrint("PI_SAMPLER_ADDRESSING_MODE=%d\n", CurValueAddressingMode); - return PI_ERROR_INVALID_VALUE; - } - } break; - - case PI_SAMPLER_PROPERTIES_FILTER_MODE: { - pi_sampler_filter_mode CurValueFilterMode = - ur_cast( - ur_cast(*(++CurProperty))); - - if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; - else { - urPrint("PI_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode); - urPrint( - "piSamplerCreate: unsupported PI_SAMPLER_FILTER_MODE value\n"); - return PI_ERROR_INVALID_VALUE; - } - } break; - - default: - break; - } - CurProperty++; - } - } - - ZE_CALL(zeSamplerCreate, (Context->ZeContext, Device->ZeDevice, - &ZeSamplerDesc, // TODO: translate properties - &ZeSampler)); - - try { - *RetSampler = new _pi_sampler(ZeSampler); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piSamplerCreate(Context, SamplerProperties, RetSampler); } pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Sampler; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - die("piSamplerGetInfo: not implemented"); - return {}; + return pi2ur::piSamplerGetInfo(Sampler, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piSamplerRetain(pi_sampler Sampler) { - PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); - - Sampler->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piSamplerRetain(Sampler); } pi_result piSamplerRelease(pi_sampler Sampler) { - PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); - - if (!Sampler->RefCount.decrementAndTest()) - return PI_SUCCESS; - - auto ZeResult = ZE_CALL_NOCHECK(zeSamplerDestroy, (Sampler->ZeSampler)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - delete Sampler; - return PI_SUCCESS; + return pi2ur::piSamplerRelease(Sampler); } // @@ -5714,302 +542,17 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (EventWaitList) { - PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE); - - bool UseCopyEngine = false; - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList = {}; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &WaitList = (*Event)->WaitList; - auto ZeCommandList = CommandList->first; - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - - ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - // Execute command list asynchronously as the event will be used - // to track down its completion. - return Queue->executeCommandList(CommandList); - } - - { - // If wait-list is empty, then this particular command should wait until - // all previous enqueued commands to the command-queue have completed. - // - // TODO: find a way to do that without blocking the host. - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - if (OutEvent) { - auto Res = createEventAndAssociateQueue( - Queue, OutEvent, PI_COMMAND_TYPE_USER, Queue->CommandListMap.end(), - /* IsInternal */ false); - if (Res != PI_SUCCESS) - return Res; - } - - Queue->synchronize(); - - if (OutEvent) { - Queue->LastCommandEvent = *OutEvent; - - ZE_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent)); - (*OutEvent)->Completed = true; - } - } - - if (!Queue->UsingImmCmdLists) { - std::unique_lock Lock(Queue->Mutex); - resetCommandLists(Queue); - } - - return PI_SUCCESS; + return pi2ur::piEnqueueEventsWait(Queue, NumEventsInWaitList, EventWaitList, + OutEvent); } pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - // Helper function for appending a barrier to a command list. - auto insertBarrierIntoCmdList = [&Queue]( - pi_command_list_ptr_t CmdList, - const _pi_ze_event_list_t &EventWaitList, - pi_event &Event, bool IsInternal) { - // For in-order queue and empty wait-list just use the last command - // event as the barrier event. - if (Queue->isInOrderQueue() && !EventWaitList.Length && - Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { - PI_CALL(piEventRetain(Queue->LastCommandEvent)); - Event = Queue->LastCommandEvent; - return PI_SUCCESS; - } - - if (auto Res = createEventAndAssociateQueue( - Queue, &Event, PI_COMMAND_TYPE_USER, CmdList, IsInternal)) - return Res; - - Event->WaitList = EventWaitList; - - // For in-order queue we don't need a real barrier, just wait for requested - // events in potentially different queues and add a "barrier" event signal - // because it is already guaranteed that previous commands in this queue - // are completed when the signal is started. - // - // TODO: this and other special handling of in-order queues to be - // updated when/if Level Zero adds native support for in-order queues. 
- // - if (Queue->isInOrderQueue() && InOrderBarrierBySignal) { - if (EventWaitList.Length) { - ZE_CALL( - zeCommandListAppendWaitOnEvents, - (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); - } - ZE_CALL(zeCommandListAppendSignalEvent, (CmdList->first, Event->ZeEvent)); - } else { - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); - } - return PI_SUCCESS; - }; - - // If the queue is in-order then each command in it effectively acts as a - // barrier, so we don't need to do anything except if we were requested - // a "barrier" event to be created. Or if we need to wait for events in - // potentially different queues. - // - if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { - return PI_SUCCESS; - } - - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - - // Indicator for whether batching is allowed. This may be changed later in - // this function, but allow it by default. - bool OkToBatch = true; - - // If we have a list of events to make the barrier from, then we can create a - // barrier on these and use the resulting event as our future barrier. - // We use the same approach if - // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a - // positive value. - // We use the same approach if we have in-order queue because every command - // depends on previous one, so we don't need to insert barrier to multiple - // command lists. - if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || - Queue->isInOrderQueue()) { - // Retain the events as they will be owned by the result event. - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, - /*UseCopyEngine=*/false)) - return Res; - - // Get an arbitrary command-list in the queue. - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, - /*UseCopyEngine=*/false, OkToBatch)) - return Res; - - // Insert the barrier into the command-list and execute. - if (auto Res = - insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)) - return Res; - - if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) - return Res; - - // Because of the dependency between commands in the in-order queue we don't - // need to keep track of any active barriers if we have in-order queue. - if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { - Queue->ActiveBarriers.add(*Event); - } - return PI_SUCCESS; - } - // Since there are no events to explicitly create a barrier for, we are - // inserting a queue-wide barrier. - - // Command list(s) for putting barriers. - std::vector CmdLists; - - // There must be at least one L0 queue. - auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); - auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); - PI_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), - PI_ERROR_INVALID_QUEUE); - - size_t NumQueues = 0; - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - NumQueues += QueueGroup.second.ZeQueues.size(); - - OkToBatch = true; - // Get an available command list tied to each command queue. We need - // these so a queue-wide barrier can be inserted into each command - // queue. 
- CmdLists.reserve(NumQueues); - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) { - bool UseCopyEngine = - QueueGroup.second.Type != _pi_queue::queue_type::Compute; - if (Queue->UsingImmCmdLists) { - // If immediate command lists are being used, each will act as their own - // queue, so we must insert a barrier into each. - for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) - if (ImmCmdList != Queue->CommandListMap.end()) - CmdLists.push_back(ImmCmdList); - } else { - for (auto ZeQueue : QueueGroup.second.ZeQueues) { - if (ZeQueue) { - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)) - return Res; - CmdLists.push_back(CmdList); - } - } - } - } - - // If no activity has occurred on the queue then there will be no cmdlists. - // We need one for generating an Event, so create one. - if (CmdLists.size() == 0) { - // Get any available command list. - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, - /*UseCopyEngine=*/false, OkToBatch)) - return Res; - CmdLists.push_back(CmdList); - } - - if (CmdLists.size() > 1) { - // Insert a barrier into each unique command queue using the available - // command-lists. - std::vector EventWaitVector(CmdLists.size()); - for (size_t I = 0; I < CmdLists.size(); ++I) { - if (auto Res = - insertBarrierIntoCmdList(CmdLists[I], _pi_ze_event_list_t{}, - EventWaitVector[I], /*IsInternal*/ true)) - return Res; - } - // If there were multiple queues we need to create a "convergence" event to - // be our active barrier. This convergence event is signalled by a barrier - // on all the events from the barriers we have inserted into each queue. - // Use the first command list as our convergence command list. - pi_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; - - // Create an event list. It will take ownership over all relevant events so - // we relinquish ownership and let it keep all events it needs. - _pi_ze_event_list_t BaseWaitList; - if (auto Res = BaseWaitList.createAndRetainPiZeEventList( - EventWaitVector.size(), EventWaitVector.data(), Queue, - ConvergenceCmdList->second.isCopy(Queue))) - return Res; - - // Insert a barrier with the events from each command-queue into the - // convergence command list. The resulting event signals the convergence of - // all barriers. - if (auto Res = insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, - *Event, IsInternal)) - return Res; - } else { - // If there is only a single queue then insert a barrier and the single - // result event can be used as our active barrier and used as the return - // event. Take into account whether output event is discarded or not. - if (auto Res = insertBarrierIntoCmdList(CmdLists[0], _pi_ze_event_list_t{}, - *Event, IsInternal)) - return Res; - } - - // Execute each command list so the barriers can be encountered. 
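When barriers must cover several underlying Level Zero queues, the pattern above is two-level: append one barrier per command list, then append a single "convergence" barrier that waits on all of the per-queue events. A toy, self-contained sketch of just that structure, with an invented Append callable standing in for zeCommandListAppendBarrier:

#include <cstdio>
#include <functional>
#include <vector>

// Model: a "barrier" returns an event id and conceptually waits on the given
// events first. The callable is a stand-in for recording a real L0 barrier.
using EventId = int;
using AppendBarrier =
    std::function<EventId(int CmdListIndex, const std::vector<EventId> &Wait)>;

// Two-level pattern: one barrier per command list, then a single
// convergence barrier in list 0 that waits on all of them.
EventId makeConvergenceBarrier(int NumCmdLists, const AppendBarrier &Append) {
  if (NumCmdLists == 1)
    return Append(0, {}); // one queue: a single barrier suffices

  std::vector<EventId> PerQueue;
  PerQueue.reserve(NumCmdLists);
  for (int I = 0; I < NumCmdLists; ++I) // 1) barrier in every command list
    PerQueue.push_back(Append(I, {}));

  return Append(0, PerQueue); // 2) convergence barrier in list 0
}

int main() {
  int NextEvent = 0;
  // Toy Append that only hands out event ids; a real implementation would
  // record a Level Zero barrier into the chosen command list.
  AppendBarrier Append = [&](int, const std::vector<EventId> &) {
    return NextEvent++;
  };
  EventId Converged = makeConvergenceBarrier(3, Append);
  std::printf("convergence event id = %d\n", Converged); // prints 3
  return 0;
}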
- for (pi_command_list_ptr_t &CmdList : CmdLists) - if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) - return Res; - - if (auto Res = Queue->ActiveBarriers.clear()) - return Res; - Queue->ActiveBarriers.add(*Event); - return PI_SUCCESS; + return pi2ur::piEnqueueEventsWaitWithBarrier(Queue, NumEventsInWaitList, + EventWaitList, OutEvent); } pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, @@ -6018,19 +561,10 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Src->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - char *ZeHandleSrc; - PI_CALL(Src->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_READ, Queue, Dst, - BlockingRead, Size, ZeHandleSrc + Offset, - NumEventsInWaitList, EventWaitList, Event, - /* PreferCopyEngine */ true); + return pi2ur::piEnqueueMemBufferRead(Queue, Src, BlockingRead, Offset, Size, + Dst, NumEventsInWaitList, EventWaitList, + Event); } pi_result piEnqueueMemBufferReadRect( @@ -6041,255 +575,12 @@ pi_result piEnqueueMemBufferReadRect( pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Buffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, - static_cast(Ptr), BufferOffset, HostOffset, Region, - BufferRowPitch, HostRowPitch, BufferSlicePitch, HostSlicePitch, - BlockingRead, NumEventsInWaitList, EventWaitList, Event); -} - -} // extern "C" - -bool _pi_queue::useCopyEngine(bool PreferCopyEngine) const { - auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second; - return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 && - (!isInOrderQueue() || UseCopyEngineForInOrderQueue); -} - -// Wait on all operations in flight on this Queue. -// The caller is expected to hold a lock on the Queue. -// For standard commandlists sync the L0 queues directly. -// For immediate commandlists add barriers to all commandlists associated -// with the Queue. An alternative approach would be to wait on all Events -// associated with the in-flight operations. -// TODO: Event release in immediate commandlist mode is driven by the SYCL -// runtime. Need to investigate whether relase can be done earlier, at sync -// points such as this, to reduce total number of active Events. 
-pi_result _pi_queue::synchronize() { - if (!Healthy) - return PI_SUCCESS; - - auto syncImmCmdList = [](_pi_queue *Queue, pi_command_list_ptr_t ImmCmdList) { - if (ImmCmdList == Queue->CommandListMap.end()) - return PI_SUCCESS; - - pi_event Event; - pi_result Res = - createEventAndAssociateQueue(Queue, &Event, PI_COMMAND_TYPE_USER, - ImmCmdList, /* IsInternal */ false); - if (Res != PI_SUCCESS) - return Res; - auto zeEvent = Event->ZeEvent; - ZE_CALL(zeCommandListAppendBarrier, - (ImmCmdList->first, zeEvent, 0, nullptr)); - ZE_CALL(zeHostSynchronize, (zeEvent)); - Event->Completed = true; - PI_CALL(piEventRelease(Event)); - - // Cleanup all events from the synced command list. - auto EventListToCleanup = std::move(ImmCmdList->second.EventList); - ImmCmdList->second.EventList.clear(); - CleanupEventListFromResetCmdList(EventListToCleanup, true); - return PI_SUCCESS; - }; - - if (LastCommandEvent) { - // For in-order queue just wait for the last command. - // If event is discarded then it can be in reset state or underlying level - // zero handle can have device scope, so we can't synchronize the last - // event. - if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - ZE_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); - } else { - // Otherwise sync all L0 queues/immediate command-lists. - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { - for (auto &QueueGroup : QueueMap) { - if (UsingImmCmdLists) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE_CALL(zeHostSynchronize, (ZeQueue)); - } - } - } - } - LastCommandEvent = nullptr; - } - // With the entire queue synchronized, the active barriers must be done so we - // can remove them. - if (auto Res = ActiveBarriers.clear()) - return Res; - - return PI_SUCCESS; -} - -// Shared by all memory read/write/copy PI interfaces. -// PI interfaces must have queue's and destination buffer's mutexes locked for -// exclusive use and source buffer's mutex locked for shared use on entry. -static pi_result -enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, const void *Src, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent, - bool PreferCopyEngine) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - urPrint("calling zeCommandListAppendMemoryCopy() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length, - WaitList.ZeEventList)); - - if (auto Res = - Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)) - return Res; - - return PI_SUCCESS; -} - -// Shared by all memory read/write/copy rect PI interfaces. -// PI interfaces must have queue's and destination buffer's mutexes locked for -// exclusive use and source buffer's mutex locked for shared use on entry. -static pi_result enqueueMemCopyRectHelper( - pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer, - void *DstBuffer, pi_buff_rect_offset SrcOrigin, - pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, - size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, - size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent, bool PreferCopyEngine) { - - PI_ASSERT(Region && SrcOrigin && DstOrigin && Queue, PI_ERROR_INVALID_VALUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - urPrint("calling zeCommandListAppendMemoryCopy() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - uint32_t SrcOriginX = ur_cast(SrcOrigin->x_bytes); - uint32_t SrcOriginY = ur_cast(SrcOrigin->y_scalar); - uint32_t SrcOriginZ = ur_cast(SrcOrigin->z_scalar); - - uint32_t SrcPitch = SrcRowPitch; - if (SrcPitch == 0) - SrcPitch = ur_cast(Region->width_bytes); - - if (SrcSlicePitch == 0) - SrcSlicePitch = ur_cast(Region->height_scalar) * SrcPitch; - - uint32_t DstOriginX = ur_cast(DstOrigin->x_bytes); - uint32_t DstOriginY = ur_cast(DstOrigin->y_scalar); - uint32_t DstOriginZ = ur_cast(DstOrigin->z_scalar); - - uint32_t DstPitch = DstRowPitch; - if (DstPitch == 0) - DstPitch = ur_cast(Region->width_bytes); - - if (DstSlicePitch == 0) - DstSlicePitch = ur_cast(Region->height_scalar) * DstPitch; - - uint32_t Width = ur_cast(Region->width_bytes); - uint32_t Height = ur_cast(Region->height_scalar); - uint32_t Depth = ur_cast(Region->depth_scalar); - - const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, - Width, Height, Depth}; - const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, - Width, Height, Depth}; - - ZE_CALL(zeCommandListAppendMemoryCopyRegion, - (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, - SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, - WaitList.Length, WaitList.ZeEventList)); - - urPrint("calling zeCommandListAppendMemoryCopyRegion()\n"); - - ZE_CALL(zeCommandListAppendBarrier, (ZeCommandList, ZeEvent, 0, nullptr)); - - urPrint("calling zeCommandListAppendBarrier() with Event %#llx\n", - ur_cast(ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, Blocking, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferReadRect( + Queue, Buffer, BlockingRead, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } -extern "C" { - pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, size_t Offset, size_t Size, const void *Ptr, @@ -6297,20 +588,9 @@ pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_WRITE, Queue, - ZeHandleDst + Offset, // dst - BlockingWrite, Size, - Ptr, // src - NumEventsInWaitList, EventWaitList, Event, - /* PreferCopyEngine */ true); + return pi2ur::piEnqueueMemBufferWrite(Queue, Buffer, BlockingWrite, Offset, + Size, Ptr, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemBufferWriteRect( @@ -6321,20 +601,10 @@ pi_result piEnqueueMemBufferWriteRect( pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - 
Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, Queue, - const_cast(static_cast(Ptr)), ZeHandleDst, - HostOffset, BufferOffset, Region, HostRowPitch, BufferRowPitch, - HostSlicePitch, BufferSlicePitch, BlockingWrite, NumEventsInWaitList, - EventWaitList, Event); + return pi2ur::piEnqueueMemBufferWriteRect( + Queue, Buffer, BlockingWrite, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, @@ -6342,38 +612,10 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, size_t Size, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(!SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(!DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); - - std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); - - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. - bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - char *ZeHandleSrc; - PI_CALL( - SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - char *ZeHandleDst; - PI_CALL( - DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - return enqueueMemCopyHelper( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset, - false, // blocking - Size, ZeHandleSrc + SrcOffset, NumEventsInWaitList, EventWaitList, Event, - PreferCopyEngine); + return pi2ur::piEnqueueMemBufferCopy(Queue, SrcMem, DstMem, SrcOffset, + DstOffset, Size, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemBufferCopyRect( @@ -6382,133 +624,13 @@ pi_result piEnqueueMemBufferCopyRect( size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(!DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); - - std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); - - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. 
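// A minimal sketch of the engine-selection rule stated in the two comment
// lines above, written as a free-standing predicate. The type and function
// names are assumptions made for illustration; the code that follows derives
// the same boolean inline from SrcBuffer and DstBuffer:
//
//   struct BufferLocation { bool OnHost; };
//   // Prefer a copy engine only when at least one side of the transfer is
//   // host-resident; device-to-device copies stay on the compute engine.
//   static bool preferCopyEngine(BufferLocation Src, BufferLocation Dst) {
//     return Src.OnHost || Dst.OnHost;
//   }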
- bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); - - char *ZeHandleSrc; - PI_CALL( - SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - char *ZeHandleDst; - PI_CALL( - DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst, - SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch, - DstSlicePitch, - false, // blocking - NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); -} - -} // extern "C" - -// Default to using compute engine for fill operation, but allow to -// override this with an environment variable. -static bool PreferCopyEngine = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); - return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -}(); - -// PI interfaces must have queue's and buffer's mutexes locked on entry. -static pi_result -enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, - const void *Pattern, size_t PatternSize, size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - // Pattern size must be a power of two. - PI_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), - PI_ERROR_INVALID_VALUE); - - auto &Device = Queue->Device; - - // Make sure that pattern size matches the capability of the copy queues. - // Check both main and link groups as we don't known which one will be used. - // - if (PreferCopyEngine && Device->hasCopyEngine()) { - if (Device->hasMainCopyEngine() && - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy] - .ZeProperties.maxMemoryFillPatternSize < PatternSize) { - PreferCopyEngine = false; - } - if (Device->hasLinkCopyEngine() && - Device->QueueGroup[_pi_device::queue_group_info_t::LinkCopy] - .ZeProperties.maxMemoryFillPatternSize < PatternSize) { - PreferCopyEngine = false; - } - } - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - if (!UseCopyEngine) { - // Pattern size must fit the compute queue capabilities. - PI_ASSERT(PatternSize <= - Device->QueueGroup[_pi_device::queue_group_info_t::Compute] - .ZeProperties.maxMemoryFillPatternSize, - PI_ERROR_INVALID_VALUE); - } - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - pi_command_list_ptr_t CommandList{}; - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - ZE_CALL(zeCommandListAppendMemoryFill, - (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, - WaitList.Length, WaitList.ZeEventList)); - - urPrint("calling zeCommandListAppendMemoryFill() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferCopyRect( + Queue, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumEventsInWaitList, + EventWaitList, Event); } -extern "C" { - pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, const void *Pattern, size_t PatternSize, size_t Offset, size_t Size, @@ -6516,502 +638,38 @@ pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemFillHelper(PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue, - ZeHandleDst + Offset, Pattern, PatternSize, Size, - NumEventsInWaitList, EventWaitList, Event); + return pi2ur::piEnqueueMemBufferFill(Queue, Buffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + EventWaitList, Event); } -static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment); - pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent, void **RetMap) { - // TODO: we don't implement read-only or write-only, always read-write. - // assert((map_flags & PI_MAP_READ) != 0); - // assert((map_flags & PI_MAP_WRITE) != 0); - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Buffer = ur_cast(Mem); - - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - ze_event_handle_t ZeEvent = nullptr; - - bool UseCopyEngine = false; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - auto Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_MAP, - Queue->CommandListMap.end(), IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } - - // Translate the host access mode info. 
- _pi_mem::access_mode_t AccessMode = _pi_mem::unknown; - if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) - AccessMode = _pi_mem::write_only; - else { - if (MapFlags & PI_MAP_READ) { - AccessMode = _pi_mem::read_only; - if (MapFlags & PI_MAP_WRITE) - AccessMode = _pi_mem::read_write; - } else if (MapFlags & PI_MAP_WRITE) - AccessMode = _pi_mem::write_only; - } - PI_ASSERT(AccessMode != _pi_mem::unknown, PI_ERROR_INVALID_VALUE); - - // TODO: Level Zero is missing the memory "mapping" capabilities, so we are - // left to doing new memory allocation and a copy (read) on discrete devices. - // For integrated devices, we have allocated the buffer in host memory so no - // actions are needed here except for synchronizing on incoming events. - // A host-to-host copy is done if a host pointer had been supplied during - // buffer creation on integrated devices. - // - // TODO: for discrete, check if the input buffer is already allocated - // in shared memory and thus is accessible from the host as is. - // Can we get SYCL RT to predict/allocate in shared memory - // from the beginning? - - // For integrated devices the buffer has been allocated in host memory. - if (Buffer->OnHost) { - // Wait on incoming events before doing the copy - if (NumEventsInWaitList > 0) - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - - if (Queue->isInOrderQueue()) - PI_CALL(piQueueFinish(Queue)); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Guard(Buffer->Mutex); - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); - - if (Buffer->MapHostPtr) { - *RetMap = Buffer->MapHostPtr + Offset; - if (ZeHandleSrc != Buffer->MapHostPtr && - AccessMode != _pi_mem::write_only) { - memcpy(*RetMap, ZeHandleSrc + Offset, Size); - } - } else { - *RetMap = ZeHandleSrc + Offset; - } - - auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); - // False as the second value in pair means that mapping was not inserted - // because mapping already exists. - if (!Res.second) { - urPrint("piEnqueueMemBufferMap: duplicate mapping detected\n"); - return PI_ERROR_INVALID_VALUE; - } - - // Signal this event - ZE_CALL(zeEventHostSignal, (ZeEvent)); - (*Event)->Completed = true; - return PI_SUCCESS; - } - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - if (Buffer->MapHostPtr) { - *RetMap = Buffer->MapHostPtr + Offset; - } else { - // TODO: use USM host allocator here - // TODO: Do we even need every map to allocate new host memory? - // In the case when the buffer is "OnHost" we use single allocation. - if (auto Res = ZeHostMemAllocHelper(RetMap, Queue->Context, Size)) - return Res; - } - - // Take a shortcut if the host is not going to read buffer's data. - if (AccessMode == _pi_mem::write_only) { - (*Event)->Completed = true; - } else { - // For discrete devices we need a command list - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // Add the event to the command list. 
- CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, - WaitList.Length, WaitList.ZeEventList)); - - if (auto Res = Queue->executeCommandList(CommandList, BlockingMap)) - return Res; - } - - auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); - // False as the second value in pair means that mapping was not inserted - // because mapping already exists. - if (!Res.second) { - urPrint("piEnqueueMemBufferMap: duplicate mapping detected\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferMap(Queue, Mem, BlockingMap, MapFlags, Offset, + Size, NumEventsInWaitList, EventWaitList, + OutEvent, RetMap); } pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Buffer = ur_cast(Mem); - - bool UseCopyEngine = false; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - auto Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - Queue->CommandListMap.end(), IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } - - _pi_buffer::Mapping MapInfo = {}; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock Guard(Buffer->Mutex); - auto It = Buffer->Mappings.find(MappedPtr); - if (It == Buffer->Mappings.end()) { - urPrint("piEnqueueMemUnmap: unknown memory mapping\n"); - return PI_ERROR_INVALID_VALUE; - } - MapInfo = It->second; - Buffer->Mappings.erase(It); - - // NOTE: we still have to free the host memory allocated/returned by - // piEnqueueMemBufferMap, but can only do so after the above copy - // is completed. Instead of waiting for It here (blocking), we shall - // do so in piEventRelease called for the pi_event tracking the unmap. - // In the case of an integrated device, the map operation does not allocate - // any memory, so there is nothing to free. This is indicated by a nullptr. - (*Event)->CommandData = - (Buffer->OnHost ? nullptr : (Buffer->MapHostPtr ? nullptr : MappedPtr)); - } - - // For integrated devices the buffer is allocated in host memory. 
- if (Buffer->OnHost) { - // Wait on incoming events before doing the copy - if (NumEventsInWaitList > 0) - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - - if (Queue->isInOrderQueue()) - PI_CALL(piQueueFinish(Queue)); - - char *ZeHandleDst; - PI_CALL( - Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - std::scoped_lock Guard(Buffer->Mutex); - if (Buffer->MapHostPtr) - memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size); - - // Signal this event - ZE_CALL(zeEventHostSignal, (ZeEvent)); - (*Event)->Completed = true; - return PI_SUCCESS; - } - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - - const auto &ZeCommandList = CommandList->first; - - // TODO: Level Zero is missing the memory "mapping" capabilities, so we are - // left to doing copy (write back to the device). - // - // NOTE: Keep this in sync with the implementation of - // piEnqueueMemBufferMap. - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size, - ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemUnmap(Queue, Mem, MappedPtr, NumEventsInWaitList, + EventWaitList, OutEvent); } pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Image; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - - die("piMemImageGetInfo: not implemented"); - return {}; -} - -} // extern "C" - -static pi_result getImageRegionHelper(pi_mem Mem, pi_image_offset Origin, - pi_image_region Region, - ze_image_region_t &ZeRegion) { - - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Origin, PI_ERROR_INVALID_VALUE); - -#ifndef NDEBUG - PI_ASSERT(Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Image = static_cast<_pi_image *>(Mem); - ze_image_desc_t &ZeImageDesc = Image->ZeImageDesc; - - PI_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && - Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), - PI_ERROR_INVALID_VALUE); - - PI_ASSERT(Region->width && Region->height && Region->depth, - PI_ERROR_INVALID_VALUE); - PI_ASSERT( - (ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region->height == 1 && - Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), - PI_ERROR_INVALID_VALUE); -#endif // !NDEBUG - - uint32_t OriginX = ur_cast(Origin->x); - uint32_t OriginY = ur_cast(Origin->y); - uint32_t OriginZ = ur_cast(Origin->z); - - uint32_t Width = ur_cast(Region->width); - uint32_t Height = ur_cast(Region->height); - uint32_t Depth = ur_cast(Region->depth); - - ZeRegion = {OriginX, OriginY, OriginZ, 
Width, Height, Depth}; - - return PI_SUCCESS; -} - -// Helper function to implement image read/write/copy. -// PI interfaces must have queue's and destination image's mutexes locked for -// exclusive use and source image's mutex locked for shared use on entry. -static pi_result enqueueMemImageCommandHelper( - pi_command_type CommandType, pi_queue Queue, - const void *Src, // image or ptr - void *Dst, // image or ptr - pi_bool IsBlocking, pi_image_offset SrcOrigin, pi_image_offset DstOrigin, - pi_image_region Region, size_t RowPitch, size_t SlicePitch, - pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *OutEvent, bool PreferCopyEngine = false) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { - pi_mem SrcMem = ur_cast(const_cast(Src)); - - ze_image_region_t ZeSrcRegion; - auto Result = getImageRegionHelper(SrcMem, SrcOrigin, Region, ZeSrcRegion); - if (Result != PI_SUCCESS) - return Result; - - // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. - // Check that SYCL RT did not want pitch larger than default. - (void)RowPitch; - (void)SlicePitch; -#ifndef NDEBUG - PI_ASSERT(SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - - auto SrcImage = static_cast<_pi_image *>(SrcMem); - const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; - PI_ASSERT( - RowPitch == 0 || - // special case RGBA image pitch equal to region's width - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && - RowPitch == 4 * 4 * ZeSrcRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && - RowPitch == 4 * 2 * ZeSrcRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && - RowPitch == 4 * ZeSrcRegion.width), - PI_ERROR_INVALID_IMAGE_SIZE); - PI_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, - PI_ERROR_INVALID_IMAGE_SIZE); -#endif // !NDEBUG - - char *ZeHandleSrc; - PI_CALL( - SrcMem->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyToMemory, - (ZeCommandList, Dst, ur_cast(ZeHandleSrc), - &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); - } else if (CommandType == PI_COMMAND_TYPE_IMAGE_WRITE) { - pi_mem DstMem = ur_cast(Dst); - ze_image_region_t ZeDstRegion; - auto Result = getImageRegionHelper(DstMem, DstOrigin, Region, ZeDstRegion); - if (Result != PI_SUCCESS) - return Result; - - // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. 
- // Check that SYCL RT did not want pitch larger than default. -#ifndef NDEBUG - PI_ASSERT(DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - - auto DstImage = static_cast<_pi_image *>(DstMem); - const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; - PI_ASSERT( - RowPitch == 0 || - // special case RGBA image pitch equal to region's width - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && - RowPitch == 4 * 4 * ZeDstRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && - RowPitch == 4 * 2 * ZeDstRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && - RowPitch == 4 * ZeDstRegion.width), - PI_ERROR_INVALID_IMAGE_SIZE); - PI_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, - PI_ERROR_INVALID_IMAGE_SIZE); -#endif // !NDEBUG - char *ZeHandleDst; - PI_CALL( - DstMem->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyFromMemory, - (ZeCommandList, ur_cast(ZeHandleDst), Src, - &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); - } else if (CommandType == PI_COMMAND_TYPE_IMAGE_COPY) { - pi_mem SrcImage = ur_cast(const_cast(Src)); - pi_mem DstImage = ur_cast(Dst); - - ze_image_region_t ZeSrcRegion; - auto Result = - getImageRegionHelper(SrcImage, SrcOrigin, Region, ZeSrcRegion); - if (Result != PI_SUCCESS) - return Result; - ze_image_region_t ZeDstRegion; - Result = getImageRegionHelper(DstImage, DstOrigin, Region, ZeDstRegion); - if (Result != PI_SUCCESS) - return Result; - - char *ZeHandleSrc; - char *ZeHandleDst; - PI_CALL( - SrcImage->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - PI_CALL( - DstImage->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyRegion, - (ZeCommandList, ur_cast(ZeHandleDst), - ur_cast(ZeHandleSrc), &ZeDstRegion, - &ZeSrcRegion, ZeEvent, 0, nullptr)); - } else { - urPrint("enqueueMemImageUpdate: unsupported image command type\n"); - return PI_ERROR_INVALID_OPERATION; - } - - if (auto Res = Queue->executeCommandList(CommandList, IsBlocking, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piMemImageGetInfo(Image, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -extern "C" { - pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, pi_image_offset Origin, pi_image_region Region, size_t RowPitch, @@ -7019,19 +677,9 @@ pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Image->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - return enqueueMemImageCommandHelper( - PI_COMMAND_TYPE_IMAGE_READ, Queue, - Image, // src - Ptr, // dst - BlockingRead, - Origin, // SrcOrigin - nullptr, // DstOrigin - Region, RowPitch, SlicePitch, NumEventsInWaitList, EventWaitList, Event); + return pi2ur::piEnqueueMemImageRead( + Queue, Image, BlockingRead, Origin, Region, RowPitch, SlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, @@ -7042,19 +690,9 @@ pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Image->Mutex); - return 
enqueueMemImageCommandHelper(PI_COMMAND_TYPE_IMAGE_WRITE, Queue, - Ptr, // src - Image, // dst - BlockingWrite, - nullptr, // SrcOrigin - Origin, // DstOrigin - Region, InputRowPitch, InputSlicePitch, - NumEventsInWaitList, EventWaitList, - Event); + return pi2ur::piEnqueueMemImageWrite( + Queue, Image, BlockingWrite, Origin, Region, InputRowPitch, + InputSlicePitch, Ptr, NumEventsInWaitList, EventWaitList, Event); } pi_result @@ -7062,24 +700,9 @@ piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, pi_image_offset SrcOrigin, pi_image_offset DstOrigin, pi_image_region Region, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(SrcImage->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstImage->Mutex, Queue->Mutex); - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. - // Images are always allocated on device. - bool PreferCopyEngine = false; - return enqueueMemImageCommandHelper( - PI_COMMAND_TYPE_IMAGE_COPY, Queue, SrcImage, DstImage, - false, // is_blocking - SrcOrigin, DstOrigin, Region, - 0, // row pitch - 0, // slice pitch - NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); + return pi2ur::piEnqueueMemImageCopy(Queue, SrcImage, DstImage, SrcOrigin, + DstOrigin, Region, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, @@ -7088,59 +711,18 @@ pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Image; - (void)FillColor; - (void)Origin; - (void)Region; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Image->Mutex); - die("piEnqueueMemImageFill: not implemented"); - return {}; + return pi2ur::piEnqueueMemImageFill(Queue, Image, FillColor, Origin, Region, + NumEventsInWaitList, EventWaitList, + Event); } pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, void *BufferCreateInfo, pi_mem *RetMem) { - PI_ASSERT(Buffer && !Buffer->isImage() && - !(static_cast(Buffer))->isSubBuffer(), - PI_ERROR_INVALID_MEM_OBJECT); - - PI_ASSERT(BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION && - BufferCreateInfo && RetMem, - PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Buffer->Mutex); - - if (Flags != PI_MEM_FLAGS_ACCESS_RW) { - die("piMemBufferPartition: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - - auto Region = (pi_buffer_region)BufferCreateInfo; - - PI_ASSERT(Region->size != 0u, PI_ERROR_INVALID_BUFFER_SIZE); - PI_ASSERT(Region->origin <= (Region->origin + Region->size), - PI_ERROR_INVALID_VALUE); - - try { - *RetMem = new _pi_buffer(static_cast(Buffer), Region->origin, - Region->size); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piMemBufferPartition(Buffer, Flags, BufferCreateType, + BufferCreateInfo, RetMem); } pi_result piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), @@ -7150,725 +732,53 @@ pi_result piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)UserFunc; - (void)Args; - (void)CbArgs; - (void)NumMemObjects; - (void)MemList; - (void)ArgsMemLoc; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - die("piEnqueueNativeKernel: not implemented"); - return {}; -} - -// Function gets characters between delimeter's in str -// then checks if they are equal to the sub_str. -// returns true if there is at least one instance -// returns false if there are no instances of the name -static bool is_in_separated_string(const std::string &str, char delimiter, - const std::string &sub_str) { - size_t beg = 0; - size_t length = 0; - for (const auto &x : str) { - if (x == delimiter) { - if (str.substr(beg, length) == sub_str) - return true; - - beg += length + 1; - length = 0; - continue; - } - length++; - } - if (length != 0) - if (str.substr(beg, length) == sub_str) - return true; - - return false; + return pi2ur::piEnqueueNativeKernel( + Queue, UserFunc, Args, CbArgs, NumMemObjects, MemList, ArgsMemLoc, + NumEventsInWaitList, EventWaitList, Event); } // TODO: Check if the function_pointer_ret type can be converted to void**. pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, const char *FunctionName, pi_uint64 *FunctionPointerRet) { - (void)Device; - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - std::shared_lock Guard(Program->Mutex); - if (Program->State != _pi_program::Exe) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } - - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleGetFunctionPointer, - (Program->ZeModule, FunctionName, - reinterpret_cast(FunctionPointerRet))); - - // zeModuleGetFunctionPointer currently fails for all - // kernels regardless of if the kernel exist or not - // with ZE_RESULT_ERROR_INVALID_ARGUMENT - // TODO: remove when this is no longer the case - // If zeModuleGetFunctionPointer returns invalid argument, - // fallback to searching through kernel list and return - // PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE if the function exists - // or PI_ERROR_INVALID_KERNEL_NAME if the function does not exist. 
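// The fallback described above is a membership test on the ';'-separated
// string returned by PI_PROGRAM_INFO_KERNEL_NAMES. A minimal sketch of that
// test, assuming std::istringstream/std::getline for the splitting (the
// helper name is illustrative; the plugin performs the split with its own
// helper):
//
//   #include <sstream>
//   #include <string>
//
//   static bool nameInList(const std::string &Names, const std::string &Name) {
//     std::istringstream Stream(Names);
//     std::string Token;
//     while (std::getline(Stream, Token, ';'))
//       if (Token == Name)
//         return true;
//     return false;
//   }
//   // e.g. nameInList("foo;bar;baz", "bar") == true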
- // FunctionPointerRet should always be 0 - if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { - size_t Size; - *FunctionPointerRet = 0; - PI_CALL(piProgramGetInfo(Program, PI_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, - &Size)); - - std::string ClResult(Size, ' '); - PI_CALL(piProgramGetInfo(Program, PI_PROGRAM_INFO_KERNEL_NAMES, - ClResult.size(), &ClResult[0], nullptr)); - - // Get rid of the null terminator and search for kernel_name - // If function can be found return error code to indicate it - // exists - ClResult.pop_back(); - if (is_in_separated_string(ClResult, ';', std::string(FunctionName))) - return PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE; - - return PI_ERROR_INVALID_KERNEL_NAME; - } - - if (ZeResult == ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) { - *FunctionPointerRet = 0; - return PI_ERROR_INVALID_KERNEL_NAME; - } - - return mapError(ZeResult); -} - -enum class USMAllocationForceResidencyType { - // Do not force memory residency at allocation time. - None = 0, - // Force memory resident on the device of allocation at allocation time. - // For host allocation force residency on all devices in a context. - Device = 1, - // Force memory resident on all devices in the context with P2P - // access to the device of allocation. - // For host allocation force residency on all devices in a context. - P2PDevices = 2 -}; - -// Returns the desired USM residency setting -// Input value is of the form 0xHSD, where: -// 4-bits of D control device allocations -// 4-bits of S control shared allocations -// 4-bits of H control host allocations -// Each 4-bit value is holding a USMAllocationForceResidencyType enum value. -// The default is 0x2, i.e. force full residency for device allocations only. -// -static uint32_t USMAllocationForceResidency = [] { - const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); - const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - try { - if (Str) { - // Auto-detect radix to allow more convinient hex base - return std::stoi(Str, nullptr, 0); - } - } catch (...) 
{ - } - return 0x2; -}(); - -// Convert from an integer value to USMAllocationForceResidencyType enum value -static USMAllocationForceResidencyType -USMAllocationForceResidencyConvert(uint32_t Val) { - switch (Val) { - case 1: - return USMAllocationForceResidencyType::Device; - case 2: - return USMAllocationForceResidencyType::P2PDevices; - default: - return USMAllocationForceResidencyType::None; - }; -} - -static USMAllocationForceResidencyType USMHostAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0xf00) >> 8); -}(); -static USMAllocationForceResidencyType USMSharedAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0x0f0) >> 4); -}(); -static USMAllocationForceResidencyType USMDeviceAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0x00f)); -}(); - -// Make USM allocation resident as requested -static pi_result -USMAllocationMakeResident(USMAllocationForceResidencyType ForceResidency, - pi_context Context, - pi_device Device, // nullptr for host allocation - void *Ptr, size_t Size) { - if (ForceResidency == USMAllocationForceResidencyType::None) - return PI_SUCCESS; - - std::list Devices; - if (!Device) { - // Host allocation, make it resident on all devices in the context - Devices.insert(Devices.end(), Context->Devices.begin(), - Context->Devices.end()); - } else { - Devices.push_back(Device); - if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) { - ze_bool_t P2P; - for (const auto &D : Context->Devices) { - if (D == Device) - continue; - // TODO: Cache P2P devices for a context - ZE_CALL(zeDeviceCanAccessPeer, (D->ZeDevice, Device->ZeDevice, &P2P)); - if (P2P) - Devices.push_back(D); - } - } - } - for (const auto &D : Devices) { - ZE_CALL(zeContextMakeMemoryResident, - (Context->ZeContext, D->ZeDevice, Ptr, Size)); - } - return PI_SUCCESS; -} - -static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context, - pi_device Device, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - // Check that incorrect bits are not set in the properties. 
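// Worked example for the 0xHSD residency encoding parsed a few lines above
// (0x120 is an arbitrary value chosen for illustration, not a recommended
// setting):
//
//   uint32_t Val    = 0x120;
//   uint32_t Host   = (Val & 0xf00) >> 8; // 1 -> Device
//   uint32_t Shared = (Val & 0x0f0) >> 4; // 2 -> P2PDevices
//   uint32_t Device =  Val & 0x00f;       // 0 -> None
//
// With that setting, host allocations would be made resident on all devices
// in the context, shared allocations on the device of allocation plus its
// P2P-capable peers, and device allocations would not be forced resident.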
- PI_ASSERT(!Properties || *Properties == 0 || - (*Properties == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0), - PI_ERROR_INVALID_VALUE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDesc.pNext = &RelaxedDesc; - } - - ZE_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, - Device->ZeDevice, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMDeviceAllocationForceResidency, Context, Device, - *ResultPtr, Size); - return PI_SUCCESS; -} - -static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context, - pi_device Device, pi_usm_mem_properties *, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZeStruct ZeDevDesc; - ZeDevDesc.flags = 0; - ZeDevDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDevDesc.pNext = &RelaxedDesc; - } - - ZE_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, Size, - Alignment, Device->ZeDevice, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMSharedAllocationForceResidency, Context, Device, - *ResultPtr, Size); - - // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. - return PI_SUCCESS; -} - -static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - // Check that incorrect bits are not set in the properties. 
- PI_ASSERT(!Properties || *Properties == 0 || - (*Properties == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0), - PI_ERROR_INVALID_VALUE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZE_CALL(zeMemAllocHost, - (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMHostAllocationForceResidency, Context, nullptr, - *ResultPtr, Size); - return PI_SUCCESS; -} - -static pi_result USMFreeImpl(pi_context Context, void *Ptr) { - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); - return PI_SUCCESS; -} - -// Exception type to pass allocation errors -class UsmAllocationException { - const pi_result Error; - -public: - UsmAllocationException(pi_result Err) : Error{Err} {} - pi_result getError() const { return Error; } -}; - -pi_result USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -pi_result USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, - size_t Size, - pi_uint32 Alignment) { - pi_usm_mem_properties Props[] = {PI_MEM_ALLOC_FLAGS, - PI_MEM_ALLOC_DEVICE_READ_ONLY, 0}; - return USMSharedAllocImpl(ResultPtr, Context, Device, Props, Size, Alignment); -} - -pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); -} - -void *USMMemoryAllocBase::allocate(size_t Size) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } - - return Ptr; -} - -void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, Alignment); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } - return Ptr; -} - -void USMMemoryAllocBase::deallocate(void *Ptr) { - auto Res = USMFreeImpl(Context, Ptr); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } + return pi2ur::piextGetDeviceFunctionPointer(Device, Program, FunctionName, + FunctionPointerRet); } pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - PI_CALL(piContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = USMDeviceAllocImpl(ResultPtr, Context, Device, Properties, - Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return PI_ERROR_INVALID_VALUE; - - *ResultPtr = It->second.allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piextUSMDeviceAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); } pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; - // Check that incorrect bits are not set in the properties. - if (Properties && *Properties != 0) { - PI_ASSERT(*(Properties) == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0, - PI_ERROR_INVALID_VALUE); - DeviceReadOnly = *(Properties + 1) & PI_MEM_ALLOC_DEVICE_READ_ONLY; - } - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - if (IndirectAccessTrackingEnabled) { - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. 
- PI_CALL(piContextRetain(Context)); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = USMSharedAllocImpl(ResultPtr, Context, Device, Properties, - Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return PI_ERROR_INVALID_VALUE; - - *ResultPtr = It->second.allocate(Size, Alignment); - if (DeviceReadOnly) { - Context->SharedReadOnlyAllocs.insert(*ResultPtr); - } - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piextUSMSharedAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); } pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Context->getPlatform(); - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - PI_CALL(piContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. 
- ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = - USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - // There is a single allocator for Host USM allocations, so we don't need to - // find the allocator depending on context as we do for Shared and Device - // allocations. - try { - *ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; -} - -// Helper function to deallocate USM memory, if indirect access support is -// enabled then a caller must lock the platform-level mutex guarding the -// container with contexts because deallocating the memory can turn RefCount of -// a context to 0 and as a result the context being removed from the list of -// tracked contexts. -// If indirect access tracking is not enabled then caller must lock Context -// mutex. -static pi_result USMFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle) { - if (!OwnZeMemHandle) { - // Memory should not be freed - return PI_SUCCESS; - } - - if (IndirectAccessTrackingEnabled) { - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return PI_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - if (!UseUSMAllocator) { - pi_result Res = USMFreeImpl(Context, Ptr); - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return Res; - } - - // Query the device of the allocation to determine the right allocator context - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer we're freeing to determine the correct - // way to do it(directly or via an allocator) - auto ZeResult = - ZE_CALL_NOCHECK(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // Handle the case that L0 RT was already unloaded - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - } else if (ZeResult) { - return mapError(ZeResult); - } - - // If memory type is host release from host pool - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { - try { - Context->HostMemAllocContext->deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - } - - // Points out an allocation in SharedReadOnlyMemAllocContexts - auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); - - if (!ZeDeviceHandle) { - // The only case where it is OK not have device identified is - // if the memory is not known to the driver. We should not ever get - // this either, probably. - PI_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, - PI_ERROR_INVALID_DEVICE); - } else { - pi_device Device; - // All context member devices or their descendants are of the same platform. - auto Platform = Context->getPlatform(); - Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - auto DeallocationHelper = - [Context, Device, - Ptr](std::unordered_map - &AllocContextMap) { - try { - auto It = AllocContextMap.find(Device->ZeDevice); - if (It == AllocContextMap.end()) - return PI_ERROR_INVALID_VALUE; - - // The right context is found, deallocate the pointer - It->second.deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - }; - - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_SHARED: - // Distinguish device_read_only allocations since they have own pool. - SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); - return DeallocationHelper(SharedReadOnlyAllocsIterator != - Context->SharedReadOnlyAllocs.end() - ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - case ZE_MEMORY_TYPE_DEVICE: - return DeallocationHelper(Context->DeviceMemAllocContexts); - default: - // Handled below - break; - } - } - - pi_result Res = USMFreeImpl(Context, Ptr); - if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { - Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); - } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return Res; + return pi2ur::piextUSMHostAlloc(ResultPtr, Context, Properties, Size, + Alignment); } pi_result piextUSMFree(pi_context Context, void *Ptr) { - pi_platform Plt = Context->getPlatform(); - - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - return USMFreeHelper(Context, Ptr); + return pi2ur::piextUSMFree(Context, Ptr); } pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, const void *ArgValue) { - - PI_CALL(piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue)); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } /// USM Memset API @@ -7886,32 +796,8 @@ pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, size_t Count, pi_uint32 NumEventsInWaitlist, const pi_event *EventsWaitlist, pi_event *Event) { - if (!Ptr) { - return PI_ERROR_INVALID_VALUE; - } - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex); - return enqueueMemFillHelper( - // TODO: do we need a new command type for USM memset? - PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue, Ptr, - &Value, // It will be interpreted as an 8-bit value, - 1, // which is indicated with this pattern_size==1 - Count, NumEventsInWaitlist, EventsWaitlist, Event); -} - -// Helper function to check if a pointer is a device pointer. 
-static bool IsDevicePointer(pi_context Context, const void *Ptr) { - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); + return pi2ur::piextUSMEnqueueMemset( + Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, @@ -7920,26 +806,9 @@ pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, const pi_event *EventsWaitlist, pi_event *Event) { - if (!DstPtr) { - return PI_ERROR_INVALID_VALUE; - } - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock lock(Queue->Mutex); - - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). - bool PreferCopyEngine = !IsDevicePointer(Queue->Context, SrcPtr) || - !IsDevicePointer(Queue->Context, DstPtr); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - return enqueueMemCopyHelper( - // TODO: do we need a new command type for this? - PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, DstPtr, Blocking, Size, SrcPtr, - NumEventsInWaitlist, EventsWaitlist, Event, PreferCopyEngine); + return pi2ur::piextUSMEnqueueMemcpy(Queue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitlist, EventsWaitlist, + Event); } /// Hint to migrate memory to the device @@ -7957,63 +826,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, const pi_event *EventWaitList, pi_event *OutEvent) { - // flags is currently unused so fail if set - PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - bool UseCopyEngine = false; - - // Please note that the following code should be run before the - // subsequent getAvailableCommandList() call so that there is no - // dead-lock from waiting unsubmitted events in an open batch. - // The createAndRetainPiZeEventList() has the proper side-effect - // of submitting batches with dependent events. - // - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - // TODO: Change UseCopyEngine argument to 'true' once L0 backend - // support is added - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // TODO: do we need to create a unique command type for this? - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &WaitList = (*Event)->WaitList; - const auto &ZeCommandList = CommandList->first; - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } - // TODO: figure out how to translate "flags" - ZE_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Ptr, Size)); - - // TODO: Level Zero does not have a completion "event" with the prefetch API, - // so manually add command to signal our event. - ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, false)) - return Res; - - return PI_SUCCESS; + return pi2ur::piextUSMEnqueuePrefetch( + Queue, Ptr, Size, Flags, NumEventsInWaitList, EventWaitList, OutEvent); } /// USM memadvise API to govern behavior of automatic migration mechanisms @@ -8027,59 +841,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, size_t Length, pi_mem_advice Advice, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - auto ZeAdvice = ur_cast(Advice); - - bool UseCopyEngine = false; - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList(0, nullptr, Queue, - UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - // UseCopyEngine is set to 'false' here. - // TODO: Additional analysis is required to check if this operation will - // run faster on copy engines. - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // TODO: do we need to create a unique command type for this? - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } - ZE_CALL(zeCommandListAppendMemAdvise, - (ZeCommandList, Queue->Device->ZeDevice, Ptr, Length, ZeAdvice)); - - // TODO: Level Zero does not have a completion "event" with the advise API, - // so manually add command to signal our event. 
- ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - Queue->executeCommandList(CommandList, false); - - return PI_SUCCESS; + return pi2ur::piextUSMEnqueueMemAdvise(Queue, Ptr, Length, Advice, OutEvent); } /// USM 2D Fill API @@ -8094,25 +857,17 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, /// \param num_events_in_waitlist is the number of events to wait on /// \param events_waitlist is an array of events to wait on /// \param event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue queue, void *ptr, - size_t pitch, size_t pattern_size, - const void *pattern, size_t width, - size_t height, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - std::ignore = queue; - std::ignore = ptr; - std::ignore = pitch; - std::ignore = pattern_size; - std::ignore = pattern; - std::ignore = width; - std::ignore = height; - std::ignore = num_events_in_waitlist; - std::ignore = events_waitlist; - std::ignore = event; - die("piextUSMEnqueueFill2D: not implemented"); - return {}; +__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, + size_t Pitch, size_t PatternSize, + const void *Pattern, size_t Width, + size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueFill2D(Queue, Ptr, Pitch, PatternSize, Pattern, + Width, Height, NumEventsWaitList, + EventsWaitList, Event); } /// USM 2D Memset API @@ -8127,21 +882,16 @@ __SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue queue, void *ptr, /// \param num_events_in_waitlist is the number of events to wait on /// \param events_waitlist is an array of events to wait on /// \param event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D( - pi_queue queue, void *ptr, size_t pitch, int value, size_t width, - size_t height, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - std::ignore = queue; - std::ignore = ptr; - std::ignore = pitch; - std::ignore = value; - std::ignore = width; - std::ignore = height; - std::ignore = num_events_in_waitlist; - std::ignore = events_waitlist; - std::ignore = event; - die("piextUSMEnqueueMemset2D: not implemented"); - return {}; +__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, + size_t Pitch, int Value, + size_t Width, size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitlist, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueMemset2D(Queue, Ptr, Pitch, Value, Width, Height, + NumEventsWaitList, EventsWaitlist, + Event); } /// USM 2D Memcpy API @@ -8163,30 +913,10 @@ __SYCL_EXPORT pi_result piextUSMEnqueueMemcpy2D( const void *SrcPtr, size_t SrcPitch, size_t Width, size_t Height, pi_uint32 NumEventsInWaitlist, const pi_event *EventWaitlist, pi_event *Event) { - if (!DstPtr || !SrcPtr) - return PI_ERROR_INVALID_VALUE; - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - pi_buff_rect_offset_struct ZeroOffset{0, 0, 0}; - pi_buff_rect_region_struct Region{Width, Height, 0}; - - std::scoped_lock lock(Queue->Mutex); - - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). 
- bool PreferCopyEngine = !IsDevicePointer(Queue->Context, SrcPtr) || - !IsDevicePointer(Queue->Context, DstPtr); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - return enqueueMemCopyRectHelper( - // TODO: do we need a new command type for this? - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, Queue, SrcPtr, DstPtr, &ZeroOffset, - &ZeroOffset, &Region, SrcPitch, DstPitch, /*SrcSlicePitch=*/0, - /*DstSlicePitch=*/0, Blocking, NumEventsInWaitlist, EventWaitlist, Event, - PreferCopyEngine); + return pi2ur::piextUSMEnqueueMemcpy2D( + Queue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, + NumEventsInWaitlist, EventWaitlist, Event); } /// API to query information about USM allocated pointers. @@ -8209,61 +939,8 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, pi_mem_alloc_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_MEM_ALLOC_TYPE: { - pi_usm_type MemAllocaType; - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = PI_MEM_TYPE_UNKNOWN; - break; - case ZE_MEMORY_TYPE_HOST: - MemAllocaType = PI_MEM_TYPE_HOST; - break; - case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = PI_MEM_TYPE_DEVICE; - break; - case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = PI_MEM_TYPE_SHARED; - break; - default: - urPrint("piextUSMGetMemAllocInfo: unexpected usm memory type\n"); - return PI_ERROR_INVALID_VALUE; - } - return ReturnValue(MemAllocaType); - } - case PI_MEM_ALLOC_DEVICE: - if (ZeDeviceHandle) { - auto Platform = Context->getPlatform(); - auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - return Device ? ReturnValue(Device) : PI_ERROR_INVALID_VALUE; - } else { - return PI_ERROR_INVALID_VALUE; - } - case PI_MEM_ALLOC_BASE_PTR: { - void *Base; - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); - return ReturnValue(Base); - } - case PI_MEM_ALLOC_SIZE: { - size_t Size; - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); - return ReturnValue(Size); - } - default: - urPrint("piextUSMGetMemAllocInfo: unsupported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piextUSMGetMemAllocInfo(Context, Ptr, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); } /// API for writing data from host to a device global variable. 
@@ -8283,32 +960,9 @@ pi_result piextEnqueueDeviceGlobalVariableWrite(
     pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite,
     size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList,
     const pi_event *EventsWaitList, pi_event *Event) {
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-
-  std::scoped_lock lock(Queue->Mutex);
-
-  // Find global variable pointer
-  size_t GlobalVarSize = 0;
-  void *GlobalVarPtr = nullptr;
-  ZE_CALL(zeModuleGetGlobalPointer,
-          (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr));
-  if (GlobalVarSize < Offset + Count) {
-    setErrorMessage("Write device global variable is out of range.",
-                    UR_RESULT_ERROR_INVALID_VALUE);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  // Copy engine is preferred only for host to device transfer.
-  // Device to device transfers run faster on compute engines.
-  bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src);
-
-  // Temporary option added to use copy engine for D2D copy
-  PreferCopyEngine |= UseCopyEngineForD2DCopy;
-
-  return enqueueMemCopyHelper(PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE,
-                              Queue, ur_cast(GlobalVarPtr) + Offset,
-                              BlockingWrite, Count, Src, NumEventsInWaitList,
-                              EventsWaitList, Event, PreferCopyEngine);
+  return pi2ur::piextEnqueueDeviceGlobalVariableWrite(
+      Queue, Program, Name, BlockingWrite, Count, Offset, Src,
+      NumEventsInWaitList, EventsWaitList, Event);
 }
 
 /// API reading data from a device global variable to host.
@@ -8328,32 +982,12 @@ pi_result piextEnqueueDeviceGlobalVariableRead(
     pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead,
     size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList,
     const pi_event *EventsWaitList, pi_event *Event) {
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-  std::scoped_lock lock(Queue->Mutex);
-
-  // Find global variable pointer
-  size_t GlobalVarSize = 0;
-  void *GlobalVarPtr = nullptr;
-  ZE_CALL(zeModuleGetGlobalPointer,
-          (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr));
-  if (GlobalVarSize < Offset + Count) {
-    setErrorMessage("Read from device global variable is out of range.",
-                    UR_RESULT_ERROR_INVALID_VALUE);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  // Copy engine is preferred only for host to device transfer.
-  // Device to device transfers run faster on compute engines.
-  bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst);
+  return pi2ur::piextEnqueueDeviceGlobalVariableRead(
+      Queue, Program, Name, BlockingRead, Count, Offset, Dst,
+      NumEventsInWaitList, EventsWaitList, Event);
 
-  // Temporary option added to use copy engine for D2D copy
-  PreferCopyEngine |= UseCopyEngineForD2DCopy;
-
-  return enqueueMemCopyHelper(
-      PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, Queue, Dst, BlockingRead,
-      Count, ur_cast(GlobalVarPtr) + Offset, NumEventsInWaitList,
-      EventsWaitList, Event, PreferCopyEngine);
 }
 
 /// API for Read from host pipe.
 ///
@@ -8423,65 +1057,21 @@ pi_result piextEnqueueWriteHostPipe(pi_queue Queue, pi_program Program,
 pi_result piKernelSetExecInfo(pi_kernel Kernel, pi_kernel_exec_info ParamName,
                               size_t ParamValueSize, const void *ParamValue) {
-  (void)ParamValueSize;
-  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
-  PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE);
-
-  std::scoped_lock Guard(Kernel->Mutex);
-  if (ParamName == PI_USM_INDIRECT_ACCESS &&
-      *(static_cast(ParamValue)) == PI_TRUE) {
-    // The whole point for users really was to not need to know anything
-    // about the types of allocations kernel uses.
So in DPC++ we always
-    // just set all 3 modes for each kernel.
-    ze_kernel_indirect_access_flags_t IndirectFlags =
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST |
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE |
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
-    ZE_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags));
-  } else if (ParamName == PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) {
-    ze_cache_config_flag_t ZeCacheConfig;
-    switch (*(static_cast(ParamValue))) {
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM:
-      ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM;
-      break;
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA:
-      ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA;
-      break;
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT:
-      ZeCacheConfig = static_cast(0);
-      break;
-    default:
-      // Unexpected cache configuration value.
-      return PI_ERROR_INVALID_VALUE;
-    }
-    ZE_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig););
-  } else {
-    urPrint("piKernelSetExecInfo: unsupported ParamName\n");
-    return PI_ERROR_INVALID_VALUE;
-  }
-  return PI_SUCCESS;
+  return pi2ur::piKernelSetExecInfo(Kernel, ParamName, ParamValueSize,
+                                    ParamValue);
 }
 
 pi_result piextProgramSetSpecializationConstant(pi_program Prog,
-                                                pi_uint32 SpecID, size_t,
+                                                pi_uint32 SpecID, size_t Size,
                                                 const void *SpecValue) {
-  std::scoped_lock Guard(Prog->Mutex);
-
-  // Remember the value of this specialization constant until the program is
-  // built. Note that we only save the pointer to the buffer that contains the
-  // value. The caller is responsible for maintaining storage for this buffer.
-  //
-  // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by
-  // SpecID.
-  Prog->SpecConstants[SpecID] = SpecValue;
-
-  return PI_SUCCESS;
+  return pi2ur::piextProgramSetSpecializationConstant(Prog, SpecID, Size,
+                                                      SpecValue);
 }
 
 const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING;
 
 pi_result piPluginInit(pi_plugin *PluginInit) {
   PI_ASSERT(PluginInit, PI_ERROR_INVALID_VALUE);
 
   // Check that the major version matches in PiVersion and SupportedVersion
@@ -8505,9 +1095,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
 
 pi_result piextPluginGetOpaqueData(void *opaque_data_param,
                                    void **opaque_data_return) {
-  (void)opaque_data_param;
-  (void)opaque_data_return;
-  return PI_ERROR_UNKNOWN;
+  return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return);
 }
 
 // SYCL RT calls this api to notify the end of plugin lifetime.
@@ -8518,388 +1106,12 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param,
 // It can include all the jobs to tear down resources before
 // the plugin is unloaded from memory.
 pi_result piTearDown(void *PluginParameter) {
-  (void)PluginParameter;
-  bool LeakFound = false;
-  // reclaim pi_platform objects here since we don't have piPlatformRelease.
-  for (pi_platform Platform : *PiPlatformsCache) {
-    delete Platform;
-  }
-  delete PiPlatformsCache;
-  delete PiPlatformsCacheMutex;
-
-  // Print the balance of various create/destroy native calls.
-  // The idea is to verify if the number of create(+) and destroy(-) calls are
-  // matched.
-  if (ZeCallCount && (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) != 0) {
-    // clang-format off
-    //
-    // The format of this table is such that each row accounts for a
-    // specific type of objects, and all elements in the raw except the last
-    // one are allocating objects of that type, while the last element is known
-    // to deallocate objects of that type.
- // - std::vector> CreateDestroySet = { - {"zeContextCreate", "zeContextDestroy"}, - {"zeCommandQueueCreate", "zeCommandQueueDestroy"}, - {"zeModuleCreate", "zeModuleDestroy"}, - {"zeKernelCreate", "zeKernelDestroy"}, - {"zeEventPoolCreate", "zeEventPoolDestroy"}, - {"zeCommandListCreateImmediate", "zeCommandListCreate", "zeCommandListDestroy"}, - {"zeEventCreate", "zeEventDestroy"}, - {"zeFenceCreate", "zeFenceDestroy"}, - {"zeImageCreate", "zeImageDestroy"}, - {"zeSamplerCreate", "zeSamplerDestroy"}, - {"zeMemAllocDevice", "zeMemAllocHost", "zeMemAllocShared", "zeMemFree"}, - }; - - // A sample output aimed below is this: - // ------------------------------------------------------------------------ - // zeContextCreate = 1 \---> zeContextDestroy = 1 - // zeCommandQueueCreate = 1 \---> zeCommandQueueDestroy = 1 - // zeModuleCreate = 1 \---> zeModuleDestroy = 1 - // zeKernelCreate = 1 \---> zeKernelDestroy = 1 - // zeEventPoolCreate = 1 \---> zeEventPoolDestroy = 1 - // zeCommandListCreateImmediate = 1 | - // zeCommandListCreate = 1 \---> zeCommandListDestroy = 1 ---> LEAK = 1 - // zeEventCreate = 2 \---> zeEventDestroy = 2 - // zeFenceCreate = 1 \---> zeFenceDestroy = 1 - // zeImageCreate = 0 \---> zeImageDestroy = 0 - // zeSamplerCreate = 0 \---> zeSamplerDestroy = 0 - // zeMemAllocDevice = 0 | - // zeMemAllocHost = 1 | - // zeMemAllocShared = 0 \---> zeMemFree = 1 - // - // clang-format on - - fprintf(stderr, "ZE_DEBUG=%d: check balance of create/destroy calls\n", - UR_L0_DEBUG_CALL_COUNT); - fprintf(stderr, - "----------------------------------------------------------\n"); - for (const auto &Row : CreateDestroySet) { - int diff = 0; - for (auto I = Row.begin(); I != Row.end();) { - const char *ZeName = *I; - const auto &ZeCount = (*ZeCallCount)[*I]; - - bool First = (I == Row.begin()); - bool Last = (++I == Row.end()); - - if (Last) { - fprintf(stderr, " \\--->"); - diff -= ZeCount; - } else { - diff += ZeCount; - if (!First) { - fprintf(stderr, " | \n"); - } - } - - fprintf(stderr, "%30s = %-5d", ZeName, ZeCount); - } - - if (diff) { - LeakFound = true; - fprintf(stderr, " ---> LEAK = %d", diff); - } - fprintf(stderr, "\n"); - } - - ZeCallCount->clear(); - delete ZeCallCount; - ZeCallCount = nullptr; - } - if (LeakFound) - return PI_ERROR_INVALID_MEM_OBJECT; - - disableZeTracing(); - return PI_SUCCESS; -} - -pi_result _pi_buffer::getZeHandlePtr(char **&ZeHandlePtr, - access_mode_t AccessMode, - pi_device Device) { - char *ZeHandle; - PI_CALL(getZeHandle(ZeHandle, AccessMode, Device)); - ZeHandlePtr = &Allocations[Device].ZeHandle; - return PI_SUCCESS; -} - -size_t _pi_buffer::getAlignment() const { - // Choose an alignment that is at most 64 and is the next power of 2 - // for sizes less than 64. - auto Alignment = Size; - if (Alignment > 32UL) - Alignment = 64UL; - else if (Alignment > 16UL) - Alignment = 32UL; - else if (Alignment > 8UL) - Alignment = 16UL; - else if (Alignment > 4UL) - Alignment = 8UL; - else if (Alignment > 2UL) - Alignment = 4UL; - else if (Alignment > 1UL) - Alignment = 2UL; - else - Alignment = 1UL; - return Alignment; -} - -pi_result _pi_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, - pi_device Device) { - - // NOTE: There might be no valid allocation at all yet and we get - // here from piEnqueueKernelLaunch that would be doing the buffer - // initialization. In this case the Device is not null as kernel - // launch is always on a specific device. 
- if (!Device) - Device = LastDeviceWithValidAllocation; - // If the device is still not selected then use the first one in - // the context of the buffer. - if (!Device) - Device = Context->Devices[0]; - - auto &Allocation = Allocations[Device]; - - // Sub-buffers don't maintain own allocations but rely on parent buffer. - if (isSubBuffer()) { - PI_CALL(SubBuffer.Parent->getZeHandle(ZeHandle, AccessMode, Device)); - ZeHandle += SubBuffer.Origin; - // Still store the allocation info in the PI sub-buffer for - // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to - // be given a pointer to the allocation handle rather than its value. - // - Allocation.ZeHandle = ZeHandle; - Allocation.ReleaseAction = allocation_t::keep; - LastDeviceWithValidAllocation = Device; - return PI_SUCCESS; - } - - // First handle case where the buffer is represented by only - // a single host allocation. - if (OnHost) { - auto &HostAllocation = Allocations[nullptr]; - // The host allocation may already exists, e.g. with imported - // host ptr, or in case of interop buffer. - if (!HostAllocation.ZeHandle) { - if (USMAllocatorConfigInstance.EnableBuffers) { - HostAllocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMHostAlloc(ur_cast(&ZeHandle), Context, nullptr, - Size, getAlignment())); - } else { - HostAllocation.ReleaseAction = allocation_t::free_native; - PI_CALL( - ZeHostMemAllocHelper(ur_cast(&ZeHandle), Context, Size)); - } - HostAllocation.ZeHandle = ZeHandle; - HostAllocation.Valid = true; - } - Allocation = HostAllocation; - Allocation.ReleaseAction = allocation_t::keep; - ZeHandle = Allocation.ZeHandle; - LastDeviceWithValidAllocation = Device; - return PI_SUCCESS; - } - // Reads user setting on how to deal with buffers in contexts where - // all devices have the same root-device. Returns "true" if the - // preference is to have allocate on each [sub-]device and migrate - // normally (copy) to other sub-devices as needed. Returns "false" - // if the preference is to have single root-device allocations - // serve the needs of all [sub-]devices, meaning potentially more - // cross-tile traffic. - // - static const bool SingleRootDeviceBufferMigration = [] { - const char *UrRet = - std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); - const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (EnvStr) - return (std::stoi(EnvStr) != 0); - // The default is to migrate normally, which may not always be the - // best option (depends on buffer access patterns), but is an - // overall win on the set of the available benchmarks. - return true; - }(); - - // Peform actual device allocation as needed. - if (!Allocation.ZeHandle) { - if (!SingleRootDeviceBufferMigration && Context->SingleRootDevice && - Context->SingleRootDevice != Device) { - // If all devices in the context are sub-devices of the same device - // then we reuse root-device allocation by all sub-devices in the - // context. - // TODO: we can probably generalize this and share root-device - // allocations by its own sub-devices even if not all other - // devices in the context have the same root. 
- PI_CALL(getZeHandle(ZeHandle, AccessMode, Context->SingleRootDevice)); - Allocation.ReleaseAction = allocation_t::keep; - Allocation.ZeHandle = ZeHandle; - Allocation.Valid = true; - return PI_SUCCESS; - } else { // Create device allocation - if (USMAllocatorConfigInstance.EnableBuffers) { - Allocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMDeviceAlloc(ur_cast(&ZeHandle), Context, - Device, nullptr, Size, getAlignment())); - } else { - Allocation.ReleaseAction = allocation_t::free_native; - PI_CALL(ZeDeviceMemAllocHelper(ur_cast(&ZeHandle), Context, - Device, Size)); - } - } - Allocation.ZeHandle = ZeHandle; - } else { - ZeHandle = Allocation.ZeHandle; - } - - // If some prior access invalidated this allocation then make it valid again. - if (!Allocation.Valid) { - // LastDeviceWithValidAllocation should always have valid allocation. - if (Device == LastDeviceWithValidAllocation) - die("getZeHandle: last used allocation is not valid"); - - // For write-only access the allocation contents is not going to be used. - // So don't do anything to make it "valid". - bool NeedCopy = AccessMode != _pi_mem::write_only; - // It's also possible that the buffer doesn't have a valid allocation - // yet presumably when it is passed to a kernel that will perform - // it's intialization. - if (NeedCopy && !LastDeviceWithValidAllocation) { - NeedCopy = false; - } - char *ZeHandleSrc = nullptr; - if (NeedCopy) { - PI_CALL(getZeHandle(ZeHandleSrc, _pi_mem::read_only, - LastDeviceWithValidAllocation)); - // It's possible with the single root-device contexts that - // the buffer is represented by the single root-device - // allocation and then skip the copy to itself. - if (ZeHandleSrc == ZeHandle) - NeedCopy = false; - } - - if (NeedCopy) { - // Copy valid buffer data to this allocation. - // TODO: see if we should better use peer's device allocation used - // directly, if that capability is reported with zeDeviceCanAccessPeer, - // instead of maintaining a separate allocation and performing - // explciit copies. - // - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - ze_bool_t P2P = false; - ZE_CALL( - zeDeviceCanAccessPeer, - (Device->ZeDevice, LastDeviceWithValidAllocation->ZeDevice, &P2P)); - if (!P2P) { - // P2P copy is not possible, so copy through the host. - auto &HostAllocation = Allocations[nullptr]; - // The host allocation may already exists, e.g. with imported - // host ptr, or in case of interop buffer. - if (!HostAllocation.ZeHandle) { - void *ZeHandleHost; - if (USMAllocatorConfigInstance.EnableBuffers) { - HostAllocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMHostAlloc(&ZeHandleHost, Context, nullptr, Size, - getAlignment())); - } else { - HostAllocation.ReleaseAction = allocation_t::free_native; - PI_CALL(ZeHostMemAllocHelper(&ZeHandleHost, Context, Size)); - } - HostAllocation.ZeHandle = ur_cast(ZeHandleHost); - HostAllocation.Valid = false; - } - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - if (!HostAllocation.Valid) { - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, - HostAllocation.ZeHandle /* Dst */, ZeHandleSrc, Size, - nullptr, 0, nullptr)); - // Mark the host allocation data as valid so it can be reused. - // It will be invalidated below if the current access is not - // read-only. 
- HostAllocation.Valid = true; - } - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandle /* Dst */, - HostAllocation.ZeHandle, Size, nullptr, 0, nullptr)); - } else { - // Perform P2P copy. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandle /* Dst */, ZeHandleSrc, - Size, nullptr, 0, nullptr)); - } - } - Allocation.Valid = true; - LastDeviceWithValidAllocation = Device; - } - - // Invalidate other allocations that would become not valid if - // this access is not read-only. - if (AccessMode != _pi_mem::read_only) { - for (auto &Alloc : Allocations) { - if (Alloc.first != LastDeviceWithValidAllocation) - Alloc.second.Valid = false; - } - } - - urPrint("getZeHandle(pi_device{%p}) = %p\n", (void *)Device, - (void *)Allocation.ZeHandle); - return PI_SUCCESS; -} - -pi_result _pi_buffer::free() { - for (auto &Alloc : Allocations) { - auto &ZeHandle = Alloc.second.ZeHandle; - // It is possible that the real allocation wasn't made if the buffer - // wasn't really used in this location. - if (!ZeHandle) - continue; - - switch (Alloc.second.ReleaseAction) { - case allocation_t::keep: - break; - case allocation_t::free: { - pi_platform Plt = Context->getPlatform(); - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - PI_CALL(USMFreeHelper(Context, ZeHandle)); - break; - } - case allocation_t::free_native: - PI_CALL(ZeMemFreeHelper(Context, ZeHandle)); - break; - case allocation_t::unimport: - ZeUSMImport.doZeUSMRelease(Context->getPlatform()->ZeDriver, ZeHandle); - break; - default: - die("_pi_buffer::free(): Unhandled release action"); - } - ZeHandle = nullptr; // don't leave hanging pointers - } - return PI_SUCCESS; + return pi2ur::piTearDown(PluginParameter); } pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { - const uint64_t &ZeTimerResolution = - Device->ZeDeviceProperties->timerResolution; - const uint64_t TimestampMaxCount = - ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); - uint64_t DeviceClockCount, Dummy; - - ZE_CALL(zeDeviceGetGlobalTimestamps, - (Device->ZeDevice, HostTime == nullptr ? &Dummy : HostTime, - &DeviceClockCount)); - - if (DeviceTime != nullptr) { - *DeviceTime = (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; - } - return PI_SUCCESS; + return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } #ifdef _WIN32 diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 2634e03cae595..8acc1077eb713 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -25,1330 +25,10 @@ #define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - // Share code between this PI L0 Plugin and UR L0 Adapter +#include "ur/usm_allocator_config.hpp" #include #include #include -// Define the types that are opaque in pi.h in a manner suitabale for Level Zero -// plugin - -struct _pi_platform : public _ur_platform_handle_t { - using _ur_platform_handle_t::_ur_platform_handle_t; - - // Keep track of all contexts in the platform. 
This is needed to manage - // a lifetime of memory allocations in each context when there are kernels - // with indirect access. - // TODO: should be deleted when memory isolation in the context is implemented - // in the driver. - std::list Contexts; - ur_shared_mutex ContextsMutex; -}; - -// Implements memory allocation via L0 RT for USM allocator interface. -class USMMemoryAllocBase : public SystemMemory { -protected: - pi_context Context; - pi_device Device; - // Internal allocation routine which must be implemented for each allocation - // type - virtual pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) = 0; - -public: - USMMemoryAllocBase(pi_context Ctx, pi_device Dev) - : Context{Ctx}, Device{Dev} {} - void *allocate(size_t Size) override final; - void *allocate(size_t Size, size_t Alignment) override final; - void deallocate(void *Ptr) override final; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for shared memory type that is only modified from host. -class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMSharedReadOnlyMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for device memory type -class USMDeviceMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for host memory type -class USMHostMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {} -}; - -struct _pi_device : _ur_device_handle_t { - using _ur_device_handle_t::_ur_device_handle_t; -}; - -// Structure describing the specific use of a command-list in a queue. -// This is because command-lists are re-used across multiple queues -// in the same context. -struct pi_command_list_info_t { - // The Level-Zero fence that will be signalled at completion. - // Immediate commandlists do not have an associated fence. - // A nullptr for the fence indicates that this is an immediate commandlist. - ze_fence_handle_t ZeFence{nullptr}; - // Record if the fence is in use. - // This is needed to avoid leak of the tracked command-list if the fence - // was not yet signaled at the time all events in that list were already - // completed (we are polling the fence at events completion). The fence - // may be still "in-use" due to sporadic delay in HW. - bool ZeFenceInUse{false}; - - // Indicates if command list is in closed state. This is needed to avoid - // appending commands to the closed command list. - bool IsClosed{false}; - - // Record the queue to which the command list will be submitted. - ze_command_queue_handle_t ZeQueue{nullptr}; - - // Record the queue descriptor fields used when creating the command list - // because we cannot recover these fields from the command list. 
Immediate - // command lists are recycled across queues and then all fields are used. For - // standard command lists only the ordinal is used. For queues created through - // the make_queue API the descriptor is unavailable so a dummy descriptor is - // used and then this entry is marked as not eligible for recycling. - ZeStruct ZeQueueDesc; - bool CanReuse{true}; - - // Helper functions to tell if this is a copy command-list. - bool isCopy(pi_queue Queue) const; - - // Keeps events created by commands submitted into this command-list. - // TODO: use this for explicit wait/cleanup of events at command-list - // completion. - // TODO: use this for optimizing events in the same command-list, e.g. - // only have last one visible to the host. - std::vector EventList{}; - size_t size() const { return EventList.size(); } - void append(pi_event Event) { EventList.push_back(Event); } -}; - -// The map type that would track all command-lists in a queue. -using pi_command_list_map_t = - std::unordered_map; -// The iterator pointing to a specific command-list in use. -using pi_command_list_ptr_t = pi_command_list_map_t::iterator; - -struct _pi_context : _ur_object { - _pi_context(ze_context_handle_t ZeContext, pi_uint32 NumDevices, - const pi_device *Devs, bool OwnZeContext) - : ZeContext{ZeContext}, OwnZeContext{OwnZeContext}, - Devices{Devs, Devs + NumDevices}, SingleRootDevice(getRootDevice()), - ZeCommandListInit{nullptr} { - // NOTE: one must additionally call initialize() to complete - // PI context creation. - } - - // Initialize the PI context. - pi_result initialize(); - - // Finalize the PI context - pi_result finalize(); - - // Return the Platform, which is the same for all devices in the context - pi_platform getPlatform() const; - - // A L0 context handle is primarily used during creation and management of - // resources that may be used by multiple devices. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const ze_context_handle_t ZeContext; - - // Indicates if we own the ZeContext or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeContext; - - // Keep the PI devices this PI context was created for. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const std::vector Devices; - - // Checks if Device is covered by this context. - // For that the Device or its root devices need to be in the context. - bool isValidDevice(pi_device Device) const; - - // If context contains one device or sub-devices of the same device, we want - // to save this device. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const pi_device SingleRootDevice = nullptr; - - // Immediate Level Zero command list for the device in this context, to be - // used for initializations. To be created as: - // - Immediate command list: So any command appended to it is immediately - // offloaded to the device. - // - Synchronous: So implicit synchronization is made inside the level-zero - // driver. - // There will be a list of immediate command lists (for each device) when - // support of the multiple devices per context will be added. - ze_command_list_handle_t ZeCommandListInit; - - // Mutex for the immediate command list. 
Per the Level Zero spec memory copy - // operations submitted to an immediate command list are not allowed to be - // called from simultaneous threads. - ur_mutex ImmediateCommandListMutex; - - // Mutex Lock for the Command List Cache. This lock is used to control both - // compute and copy command list caches. - ur_mutex ZeCommandListCacheMutex; - // Cache of all currently available/completed command/copy lists. - // Note that command-list can only be re-used on the same device. - // - // TODO: explore if we should use root-device for creating command-lists - // as spec says that in that case any sub-device can re-use it: "The - // application must only use the command list for the device, or its - // sub-devices, which was provided during creation." - // - std::unordered_map>>> - ZeComputeCommandListCache; - std::unordered_map>>> - ZeCopyCommandListCache; - - // Retrieves a command list for executing on this device along with - // a fence to be used in tracking the execution of this command list. - // If a command list has been created on this device which has - // completed its commands, then that command list and its associated fence - // will be reused. Otherwise, a new command list and fence will be created for - // running on this device. L0 fences are created on a L0 command queue so the - // caller must pass a command queue to create a new fence for the new command - // list if a command list/fence pair is not available. All Command Lists & - // associated fences are destroyed at Device Release. - // If UseCopyEngine is true, the command will eventually be executed in a - // copy engine. Otherwise, the command will be executed in a compute engine. - // If AllowBatching is true, then the command list returned may already have - // command in it, if AllowBatching is false, any open command lists that - // already exist in Queue will be closed and executed. - // If ForcedCmdQueue is not nullptr, the resulting command list must be tied - // to the contained command queue. This option is ignored if immediate - // command lists are used. - // When using immediate commandlists, retrieves an immediate command list - // for executing on this device. Immediate commandlists are created only - // once for each SYCL Queue and after that they are reused. - pi_result - getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching = false, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); - - // Get index of the free slot in the available pool. If there is no available - // pool then create new one. The HostVisible parameter tells if we need a - // slot for a host-visible event. The ProfilingEnabled tells is we need a - // slot for an event with profiling capabilities. - pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, - bool HostVisible, - bool ProfilingEnabled); - - // Decrement number of events living in the pool upon event destroy - // and return the pool to the cache if there are no unreleased events. - pi_result decrementUnreleasedEventsInPool(pi_event Event); - - // Store USM allocator context(internal allocator structures) - // for USM shared and device allocations. There is 1 allocator context - // per each pair of (context, device) per each memory type. 
- std::unordered_map - DeviceMemAllocContexts; - std::unordered_map - SharedMemAllocContexts; - std::unordered_map - SharedReadOnlyMemAllocContexts; - - // Since L0 native runtime does not distinguisg "shared device_read_only" - // vs regular "shared" allocations, we have keep track of it to use - // proper USMAllocContext when freeing allocations. - std::unordered_set SharedReadOnlyAllocs; - - // Store the host allocator context. It does not depend on any device. - std::unique_ptr HostMemAllocContext; - - // We need to store all memory allocations in the context because there could - // be kernels with indirect access. Kernels with indirect access start to - // reference all existing memory allocations at the time when they are - // submitted to the device. Referenced memory allocations can be released only - // when kernel has finished execution. - std::unordered_map MemAllocs; - - // Get pi_event from cache. - pi_event getEventFromContextCache(bool HostVisible, bool WithProfiling); - - // Add pi_event to cache. - void addEventToContextCache(pi_event); - -private: - // If context contains one device then return this device. - // If context contains sub-devices of the same device, then return this parent - // device. Return nullptr if context consists of several devices which are not - // sub-devices of the same device. We call returned device the root device of - // a context. - // TODO: get rid of this when contexts with multiple devices are supported for - // images. - pi_device getRootDevice() const; - - // Following member variables are used to manage assignment of events - // to event pools. - // - // TODO: Create pi_event_pool class to encapsulate working with pools. - // This will avoid needing the use of maps below, and cleanup the - // pi_context overall. - // - - // The cache of event pools from where new events are allocated from. - // The head event pool is where the next event would be added to if there - // is still some room there. If there is no room in the head then - // the following event pool is taken (guranteed to be empty) and made the - // head. In case there is no next pool, a new pool is created and made the - // head. - // - // Cache of event pools to which host-visible events are added to. - std::vector> ZeEventPoolCache{4}; - auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; - else - return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; - } - - // This map will be used to determine if a pool is full or not - // by storing number of empty slots available in the pool. - std::unordered_map - NumEventsAvailableInEventPool; - // This map will be used to determine number of unreleased events in the pool. - // We use separate maps for number of event slots available in the pool from - // the number of events unreleased in the pool. - // This will help when we try to make the code thread-safe. - std::unordered_map - NumEventsUnreleasedInEventPool; - - // Mutex to control operations on event pool caches and the helper maps - // holding the current pool usage counts. - ur_mutex ZeEventPoolCacheMutex; - - // Mutex to control operations on event caches. - ur_mutex EventCacheMutex; - - // Caches for events. - std::vector> EventCaches{4}; - - // Get the cache of events for a provided scope and profiling mode. - auto getEventCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? 
&EventCaches[0] : &EventCaches[1]; - else - return WithProfiling ? &EventCaches[2] : &EventCaches[3]; - } -}; - -struct _pi_queue : _ur_object { - // ForceComputeIndex, if non-negative, indicates that the queue must be fixed - // to that particular compute CCS. - _pi_queue(std::vector &ComputeQueues, - std::vector &CopyQueues, - pi_context Context, pi_device Device, bool OwnZeCommandQueue, - pi_queue_properties Properties = 0, int ForceComputeIndex = -1); - - using queue_type = _pi_device::queue_group_info_t::type; - - // PI queue is in general a one to many mapping to L0 native queues. - struct pi_queue_group_t { - pi_queue Queue; - pi_queue_group_t() = delete; - - // The Queue argument captures the enclosing PI queue. - // The Type argument specifies the type of this queue group. - // The actual ZeQueues are populated at PI queue construction. - pi_queue_group_t(pi_queue Queue, queue_type Type) - : Queue(Queue), Type(Type) {} - - // The type of the queue group. - queue_type Type; - bool isCopy() const { return Type != queue_type::Compute; } - - // Level Zero command queue handles. - std::vector ZeQueues; - - // Immediate commandlist handles, one per Level Zero command queue handle. - // These are created only once, along with the L0 queues (see above) - // and reused thereafter. - std::vector ImmCmdLists; - - // Return the index of the next queue to use based on a - // round robin strategy and the queue group ordinal. - // If QueryOnly is true then return index values but don't update internal - // index data members of the queue. - uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, - bool QueryOnly = false); - - // Get the ordinal for a command queue handle. - int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue); - - // This function will return one of possibly multiple available native - // queues and the value of the queue group ordinal. - ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal); - - // This function sets an immediate commandlist from the interop interface. - void setImmCmdList(ze_command_list_handle_t); - - // This function returns the next immediate commandlist to use. - pi_command_list_ptr_t &getImmCmdList(); - - // These indices are to filter specific range of the queues to use, - // and to organize round-robin across them. - uint32_t UpperIndex{0}; - uint32_t LowerIndex{0}; - uint32_t NextIndex{0}; - }; - - // Helper class to facilitate per-thread queue groups - // We maintain a hashtable of queue groups if requested to do them per-thread. - // Otherwise it is just single entry used for all threads. - struct pi_queue_group_by_tid_t - : public std::unordered_map { - bool PerThread = false; - - // Returns thread id if doing per-thread, or a generic id that represents - // all the threads. - std::thread::id tid() const { - return PerThread ? std::this_thread::get_id() : std::thread::id(); - } - - // Make the specified queue group be the master - void set(const pi_queue_group_t &QueueGroup) { - const auto &Device = QueueGroup.Queue->Device; - PerThread = Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue; - assert(empty()); - insert({tid(), QueueGroup}); - } - - // Get a queue group to use for this thread - pi_queue_group_t &get() { - assert(!empty()); - auto It = find(tid()); - if (It != end()) { - return It->second; - } - // Add new queue group for this thread initialized from a master entry. 
- auto QueueGroup = begin()->second; - // Create space for queues and immediate commandlists, which are created - // on demand. - QueueGroup.ZeQueues = std::vector( - QueueGroup.ZeQueues.size(), nullptr); - QueueGroup.ImmCmdLists = std::vector( - QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end()); - - std::tie(It, std::ignore) = insert({tid(), QueueGroup}); - return It->second; - } - }; - - // A map of compute groups containing compute queue handles, one per thread. - // When a queue is accessed from multiple host threads, a separate queue group - // is created for each thread. The key used for mapping is the thread ID. - pi_queue_group_by_tid_t ComputeQueueGroupsByTID; - - // A group containing copy queue handles. The main copy engine, if available, - // comes first followed by link copy engines, if available. - // When a queue is accessed from multiple host threads, a separate queue group - // is created for each thread. The key used for mapping is the thread ID. - pi_queue_group_by_tid_t CopyQueueGroupsByTID; - - // Wait for all commandlists associated with this Queue to finish operations. - pi_result synchronize(); - - // Return the queue group to use based on standard/immediate commandlist mode, - // and if immediate mode, the thread-specific group. - pi_queue_group_t &getQueueGroup(bool UseCopyEngine); - - // This function considers multiple factors including copy engine - // availability and user preference and returns a boolean that is used to - // specify if copy engine will eventually be used for a particular command. - bool useCopyEngine(bool PreferCopyEngine = true) const; - - // Keeps the PI context to which this queue belongs. - // This field is only set at _pi_queue creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_queue. - const pi_context Context; - - // Keeps the PI device to which this queue belongs. - // This field is only set at _pi_queue creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_queue. - const pi_device Device; - - // A queue may use either standard or immediate commandlists. At queue - // construction time this is set based on the device and any env var settings - // that change the default for the device type. When an interop queue is - // constructed, the caller chooses the type of commandlists to use. - bool UsingImmCmdLists; - - // Keeps track of the event associated with the last enqueued command into - // this queue. this is used to add dependency with the last command to add - // in-order semantics and updated with the latest event each time a new - // command is enqueued. - pi_event LastCommandEvent = nullptr; - - // Kernel is not necessarily submitted for execution during - // piEnqueueKernelLaunch, it may be batched. That's why we need to save the - // list of kernels which is going to be submitted but have not been submitted - // yet. This is needed to capture memory allocations for each kernel with - // indirect access in the list at the moment when kernel is really submitted - // for execution. - std::vector KernelsToBeSubmitted; - - // Update map of memory references made by the kernels about to be submitted - void CaptureIndirectAccesses(); - - // Indicates if we own the ZeCommandQueue or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeCommandQueue; - - // Map of all command lists used in this queue. 
- pi_command_list_map_t CommandListMap; - - // Helper data structure to hold all variables related to batching - struct command_batch { - // These two members are used to keep track of how often the - // batching closes and executes a command list before reaching the - // QueueComputeBatchSize limit, versus how often we reach the limit. - // This info might be used to vary the QueueComputeBatchSize value. - pi_uint32 NumTimesClosedEarly = {0}; - pi_uint32 NumTimesClosedFull = {0}; - - // Open command list fields for batching commands into this queue. - pi_command_list_ptr_t OpenCommandList{}; - - // Approximate number of commands that are allowed to be batched for - // this queue. - // Added this member to the queue rather than using a global variable - // so that future implementation could use heuristics to change this on - // a queue specific basis. And by putting it in the queue itself, this - // is thread safe because of the locking of the queue that occurs. - pi_uint32 QueueBatchSize = {0}; - }; - - // ComputeCommandBatch holds data related to batching of non-copy commands. - // CopyCommandBatch holds data related to batching of copy commands. - command_batch ComputeCommandBatch, CopyCommandBatch; - - // Returns true if any commands for this queue are allowed to - // be batched together. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - bool isBatchingAllowed(bool IsCopy) const; - - // Keeps the properties of this queue. - pi_queue_properties Properties; - - // Returns true if the queue is a in-order queue. - bool isInOrderQueue() const; - - // Returns true if the queue has discard events property. - bool isDiscardEvents() const; - - // Returns true if the queue has explicit priority set by user. - bool isPriorityLow() const; - bool isPriorityHigh() const; - - // adjust the queue's batch size, knowing that the current command list - // is being closed with a full batch. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - void adjustBatchSizeForFullBatch(bool IsCopy); - - // adjust the queue's batch size, knowing that the current command list - // is being closed with only a partial batch of commands. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - void adjustBatchSizeForPartialBatch(bool IsCopy); - - // Helper function to create a new command-list to this queue and associated - // fence tracking its completion. This command list & fence are added to the - // map of command lists in this queue with ZeFenceInUse = false. - // The caller must hold a lock of the queue already. - pi_result - createCommandList(bool UseCopyEngine, pi_command_list_ptr_t &CommandList, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); - - /// @brief Resets the command list and associated fence in the map and removes - /// events from the command list. - /// @param CommandList The caller must verify that this command list and fence - /// have been signalled. - /// @param MakeAvailable If the reset command list should be made available, - /// then MakeAvailable needs to be set to true. - /// @param EventListToCleanup The EventListToCleanup contains a list of - /// events from the command list which need to be cleaned up. - /// @param CheckStatus Hint informing whether we need to check status of the - /// events before removing them from the immediate command list. 
This is - /// needed because immediate command lists are not associated with fences and - /// in general status of the event needs to be checked. - /// @return PI_SUCCESS if successful, PI error code otherwise. - pi_result resetCommandList(pi_command_list_ptr_t CommandList, - bool MakeAvailable, - std::vector &EventListToCleanup, - bool CheckStatus = true); - - // Returns true if an OpenCommandList has commands that need to be submitted. - // If IsCopy is 'true', then the OpenCommandList containing copy commands is - // checked. Otherwise, the OpenCommandList containing compute commands is - // checked. - bool hasOpenCommandList(bool IsCopy) const { - auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch; - return CommandBatch.OpenCommandList != CommandListMap.end(); - } - // Attach a command list to this queue. - // For non-immediate commandlist also close and execute it. - // Note that this command list cannot be appended to after this. - // The "IsBlocking" tells if the wait for completion is required. - // If OKToBatchCommand is true, then this command list may be executed - // immediately, or it may be left open for other future command to be - // batched into. - // If IsBlocking is true, then batching will not be allowed regardless - // of the value of OKToBatchCommand - // - // For immediate commandlists, no close and execute is necessary. - pi_result executeCommandList(pi_command_list_ptr_t CommandList, - bool IsBlocking = false, - bool OKToBatchCommand = false); - - // If there is an open command list associated with this queue, - // close it, execute it, and reset the corresponding OpenCommandList. - // If IsCopy is 'true', then the OpenCommandList containing copy commands is - // executed. Otherwise OpenCommandList containing compute commands is - // executed. - pi_result executeOpenCommandList(bool IsCopy); - - // Gets the open command containing the event, or CommandListMap.end() - pi_command_list_ptr_t eventOpenCommandList(pi_event Event); - - // Wrapper function to execute both OpenCommandLists (Copy and Compute). - // This wrapper is helpful when all 'open' commands need to be executed. - // Call-sites instances: piQuueueFinish, piQueueRelease, etc. - pi_result executeAllOpenCommandLists() { - using IsCopy = bool; - if (auto Res = executeOpenCommandList(IsCopy{false})) - return Res; - if (auto Res = executeOpenCommandList(IsCopy{true})) - return Res; - return PI_SUCCESS; - } - - // Inserts a barrier waiting for all unfinished events in ActiveBarriers into - // CmdList. Any finished events will be removed from ActiveBarriers. - pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList, - bool UseCopyEngine); - - // A helper structure to keep active barriers of the queue. - // It additionally manages ref-count of events in this list. - struct active_barriers { - std::vector Events; - void add(pi_event &Event); - pi_result clear(); - bool empty() { return Events.empty(); } - std::vector &vector() { return Events; } - }; - // A collection of currently active barriers. - // These should be inserted into a command list whenever an available command - // list is needed for a command. - active_barriers ActiveBarriers; - - // Besides each PI object keeping a total reference count in - // _ur_object::RefCount we keep special track of the queue *external* - // references. This way we are able to tell when the queue is being finished - // externally, and can wait for internal references to complete, and do proper - // cleanup of the queue. 
- // This counter doesn't track the lifetime of a queue object, it only tracks - // the number of external references. I.e. even if it reaches zero a queue - // object may not be destroyed and can be used internally in the plugin. - // That's why we intentionally don't use atomic type for this counter to - // enforce guarding with a mutex all the work involving this counter. - pi_uint32 RefCountExternal{1}; - - // Indicates that the queue is healthy and all operations on it are OK. - bool Healthy{true}; - - // The following data structures and methods are used only for handling - // in-order queue with discard_events property. Some commands in such queue - // may have discarded event. Which means that event is not visible outside of - // the plugin. It is possible to reset and reuse discarded events in the same - // in-order queue because of the dependency between commands. We don't have to - // wait event completion to do this. We use the following 2-event model to - // reuse events inside each command list: - // - // Operation1 = zeCommantListAppendMemoryCopy (signal ze_event1) - // zeCommandListAppendBarrier(wait for ze_event1) - // zeCommandListAppendEventReset(ze_event1) - // # Create new pi_event using ze_event1 and append to the cache. - // - // Operation2 = zeCommandListAppendMemoryCopy (signal ze_event2) - // zeCommandListAppendBarrier(wait for ze_event2) - // zeCommandListAppendEventReset(ze_event2) - // # Create new pi_event using ze_event2 and append to the cache. - // - // # Get pi_event from the beginning of the cache because there are two events - // # there. So it is guaranteed that we do round-robin between two events - - // # event from the last command is appended to the cache. - // Operation3 = zeCommandListAppendMemoryCopy (signal ze_event1) - // # The same ze_event1 is used for Operation1 and Operation3. - // - // When we switch to a different command list we need to signal new event and - // wait for it in the new command list using barrier. - // [CmdList1] - // Operation1 = zeCommantListAppendMemoryCopy (signal event1) - // zeCommandListAppendBarrier(wait for event1) - // zeCommandListAppendEventReset(event1) - // zeCommandListAppendSignalEvent(NewEvent) - // - // [CmdList2] - // zeCommandListAppendBarrier(wait for NewEvent) - // - // This barrier guarantees that command list execution starts only after - // completion of previous command list which signals aforementioned event. It - // allows to reset and reuse same event handles inside all command lists in - // scope of the queue. It means that we need 2 reusable events of each type - // (host-visible and device-scope) per queue at maximum. - - // This data member keeps track of the last used command list and allows to - // handle switch of immediate command lists because immediate command lists - // are never closed unlike regular command lists. - pi_command_list_ptr_t LastUsedCommandList = CommandListMap.end(); - - // Vector of 2 lists of reusable events: host-visible and device-scope. - // They are separated to allow faster access to stored events depending on - // requested type of event. Each list contains events which can be reused - // inside all command lists in the queue as described in the 2-event model. - // Leftover events in the cache are relased at the queue destruction. - std::vector> EventCaches{2}; - - // Get event from the queue's cache. 
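// Editorial aside: a hedged sketch of the round-robin cache lookup described
// here (types and names are illustrative, not the adapter's). An event is
// handed out only when the cache holds at least two entries, because the
// entry appended last belongs to the previous command and must not be reused
// back to back.
#include <list>

struct ReusableEventSketch {}; // stands in for a reusable native event

ReusableEventSketch *
getFromCacheSketch(std::list<ReusableEventSketch *> &Cache) {
  if (Cache.size() < 2)
    return nullptr; // empty, or only the previous command's event is present
  ReusableEventSketch *Ev = Cache.front();
  Cache.pop_front(); // round-robin: reuse the oldest event...
  return Ev;
}

void putToCacheSketch(std::list<ReusableEventSketch *> &Cache,
                      ReusableEventSketch *Ev) {
  Cache.push_back(Ev); // ...and the most recent command's event goes to the back
}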
- // Returns nullptr if the cache doesn't contain any reusable events or if the - // cache contains only one event which corresponds to the previous command and - // can't be used for the current command because we can't use the same event - // two times in a row and have to do round-robin between two events. Otherwise - // it picks an event from the beginning of the cache and returns it. Event - // from the last command is always appended to the end of the list. - pi_event getEventFromQueueCache(bool HostVisible); - - // Put pi_event to the cache. Provided pi_event object is not used by - // any command but its ZeEvent is used by many pi_event objects. - // Commands to wait and reset ZeEvent must be submitted to the queue before - // calling this method. - pi_result addEventToQueueCache(pi_event Event); - - // Append command to provided command list to wait and reset the last event if - // it is discarded and create new pi_event wrapper using the same native event - // and put it to the cache. We call this method after each command submission - // to make native event available to use by next commands. - pi_result resetDiscardedEvent(pi_command_list_ptr_t); - - // Append command to the command list to signal new event if the last event in - // the command list is discarded. While we submit commands in scope of the - // same command list we can reset and reuse events but when we switch to a - // different command list we currently need to signal new event and wait for - // it in the new command list using barrier. - pi_result signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t); - - // Insert a barrier waiting for the last command event into the beginning of - // command list. This barrier guarantees that command list execution starts - // only after completion of previous command list which signals aforementioned - // event. It allows to reset and reuse same event handles inside all command - // lists in the queue. - pi_result - insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); - - // Helper method telling whether we need to reuse discarded event in this - // queue. - bool doReuseDiscardedEvents(); -}; - -struct _pi_mem : _ur_object { - // Keeps the PI context of this memory handle. - pi_context Context; - - // Enumerates all possible types of accesses. - enum access_mode_t { unknown, read_write, read_only, write_only }; - - // Interface of the _pi_mem object - - // Get the Level Zero handle of the current memory object - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device Device = nullptr) = 0; - - // Get a pointer to the Level Zero handle of the current memory object - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device Device = nullptr) = 0; - - // Method to get type of the derived object (image or buffer) - virtual bool isImage() const = 0; - - virtual ~_pi_mem() = default; - -protected: - _pi_mem(pi_context Ctx) : Context{Ctx} {} -}; - -struct _pi_buffer; -using pi_buffer = _pi_buffer *; - -struct _pi_buffer final : _pi_mem { - // Buffer constructor - _pi_buffer(pi_context Context, size_t Size, char *HostPtr, - bool ImportedHostPtr = false) - : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} { - - // We treat integrated devices (physical memory shared with the CPU) - // differently from discrete devices (those with distinct memories). 
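// Editorial aside: an illustrative sketch of the integrated-device check this
// constructor relies on, assuming the Level Zero loader headers are available
// under this include path. Integrated devices share physical memory with the
// host, so the buffer can live in host memory and map/unmap can avoid copies.
#include <level_zero/ze_api.h>

bool isIntegratedDeviceSketch(ze_device_handle_t ZeDevice) {
  ze_device_properties_t Props{};
  Props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  if (zeDeviceGetProperties(ZeDevice, &Props) != ZE_RESULT_SUCCESS)
    return false; // be conservative if the query fails
  return (Props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0;
}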
- // For integrated devices, allocating the buffer in the host memory - // enables automatic access from the device, and makes copying - // unnecessary in the map/unmap operations. This improves performance. - OnHost = Context->Devices.size() == 1 && - Context->Devices[0]->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; - - // Fill the host allocation data. - if (HostPtr) { - MapHostPtr = HostPtr; - // If this host ptr is imported to USM then use this as a host - // allocation for this buffer. - if (ImportedHostPtr) { - Allocations[nullptr].ZeHandle = HostPtr; - Allocations[nullptr].Valid = true; - Allocations[nullptr].ReleaseAction = _pi_buffer::allocation_t::unimport; - } - } - - // This initialization does not end up with any valid allocation yet. - LastDeviceWithValidAllocation = nullptr; - } - - // Sub-buffer constructor - _pi_buffer(pi_buffer Parent, size_t Origin, size_t Size) - : _pi_mem(Parent->Context), Size(Size), SubBuffer{Parent, Origin} {} - - // Interop-buffer constructor - _pi_buffer(pi_context Context, size_t Size, pi_device Device, - char *ZeMemHandle, bool OwnZeMemHandle) - : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} { - - // Device == nullptr means host allocation - Allocations[Device].ZeHandle = ZeMemHandle; - Allocations[Device].Valid = true; - Allocations[Device].ReleaseAction = - OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep; - - // Check if this buffer can always stay on host - OnHost = false; - if (!Device) { // Host allocation - if (Context->Devices.size() == 1 && - Context->Devices[0]->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) { - OnHost = true; - MapHostPtr = ZeMemHandle; // map to this allocation - } - } - LastDeviceWithValidAllocation = Device; - } - - // Returns a pointer to the USM allocation representing this PI buffer - // on the specified Device. If Device is nullptr then the returned - // USM allocation is on the device where this buffer was used the latest. - // The returned allocation is always valid, i.e. its contents is - // up-to-date and any data copies needed for that are performed under - // the hood. - // - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device Device = nullptr) override; - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device Device = nullptr) override; - - bool isImage() const override { return false; } - - bool isSubBuffer() const { return SubBuffer.Parent != nullptr; } - - // Frees all allocations made for the buffer. - pi_result free(); - - // Information about a single allocation representing this buffer. - struct allocation_t { - // Level Zero memory handle is really just a naked pointer. - // It is just convenient to have it char * to simplify offset arithmetics. - char *ZeHandle{nullptr}; - // Indicates if this allocation's data is valid. - bool Valid{false}; - // Specifies the action that needs to be taken for this - // allocation at buffer destruction. - enum { - keep, // do nothing, the allocation is not owned by us - unimport, // release of the imported allocation - free, // free from the pooling context (default) - free_native // free with a native call - } ReleaseAction{free}; - }; - - // We maintain multiple allocations on possibly all devices in the context. - // The "nullptr" device identifies a host allocation representing buffer. - // Sub-buffers don't maintain own allocations but rely on parent buffer. 
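// Editorial aside: a simplified, self-contained sketch (placeholder types,
// not the real _pi_buffer) of the lookup rule spelled out above. Sub-buffers
// own no allocations and resolve their pointer through the parent plus an
// offset; regular buffers keep at most one allocation per device, with the
// nullptr key standing for the host allocation.
#include <cstddef>
#include <unordered_map>

struct BufferSketch {
  std::unordered_map<void *, char *> Allocations; // device handle -> USM pointer
  BufferSketch *Parent = nullptr;                 // non-null for sub-buffers
  size_t Origin = 0;                              // sub-buffer offset in the parent

  char *zeHandleFor(void *Device) {
    if (Parent) { // delegate to the parent buffer and apply the offset
      char *Base = Parent->zeHandleFor(Device);
      return Base ? Base + Origin : nullptr;
    }
    auto It = Allocations.find(Device);
    return It == Allocations.end() ? nullptr : It->second;
  }
};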
- std::unordered_map Allocations; - pi_device LastDeviceWithValidAllocation{nullptr}; - - // Flag to indicate that this memory is allocated in host memory. - // Integrated device accesses this memory. - bool OnHost{false}; - - // Tells the host allocation to use for buffer map operations. - char *MapHostPtr{nullptr}; - - // Supplementary data to keep track of the mappings of this buffer - // created with piEnqueueMemBufferMap. - struct Mapping { - // The offset in the buffer giving the start of the mapped region. - size_t Offset; - // The size of the mapped region. - size_t Size; - }; - - // The key is the host pointer representing an active mapping. - // The value is the information needed to maintain/undo the mapping. - std::unordered_map Mappings; - - // The size and alignment of the buffer - size_t Size; - size_t getAlignment() const; - - struct { - _pi_mem *Parent; - size_t Origin; // only valid if Parent != nullptr - } SubBuffer; -}; - -struct _pi_image; -using pi_image = _pi_image *; - -// TODO: add proper support for images on context with multiple devices. -struct _pi_image final : _pi_mem { - // Image constructor - _pi_image(pi_context Ctx, ze_image_handle_t Image, bool OwnNativeHandle) - : _pi_mem(Ctx), ZeImage{Image}, OwnZeMemHandle{OwnNativeHandle} {} - - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device = nullptr) override { - ZeHandle = ur_cast(ZeImage); - return PI_SUCCESS; - } - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device = nullptr) override { - ZeHandlePtr = ur_cast(&ZeImage); - return PI_SUCCESS; - } - - bool isImage() const override { return true; } - -#ifndef NDEBUG - // Keep the descriptor of the image (for debugging purposes) - ZeStruct ZeImageDesc; -#endif // !NDEBUG - - // Level Zero image handle. - ze_image_handle_t ZeImage; - - bool OwnZeMemHandle; -}; - -struct _pi_ze_event_list_t { - // List of level zero events for this event list. - ze_event_handle_t *ZeEventList = {nullptr}; - - // List of pi_events for this event list. - pi_event *PiEventList = {nullptr}; - - // length of both the lists. The actual allocation of these lists - // may be longer than this length. This length is the actual number - // of elements in the above arrays that are valid. - pi_uint32 Length = {0}; - - // A mutex is needed for destroying the event list. - // Creation is already thread-safe because we only create the list - // when an event is initially created. However, it might be - // possible to have multiple threads racing to destroy the list, - // so this will be used to make list destruction thread-safe. - ur_mutex PiZeEventListMutex; - - // Initialize this using the array of events in EventList, and retain - // all the pi_events in the created data structure. - // CurQueue is the pi_queue that the command with this event wait - // list is going to be added to. That is needed to flush command - // batches for wait events that are in other queues. - // UseCopyEngine indicates if the next command (the one that this - // event wait-list is for) is going to go to copy or compute - // queue. This is used to properly submit the dependent open - // command-lists. - pi_result createAndRetainPiZeEventList(pi_uint32 EventListLength, - const pi_event *EventList, - pi_queue CurQueue, bool UseCopyEngine); - - // Add all the events in this object's PiEventList to the end - // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data - // structure fields making it look empty. 
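// Editorial aside: a minimal sketch (illustrative types only) of what
// "collect for release and destroy" means for such a wait list. The retained
// events are handed off to the caller's list for later release, and the
// fields are cleared under the destruction mutex so concurrent destroyers
// cannot free the same arrays twice.
#include <list>
#include <mutex>

struct WaitListSketch {
  void **NativeEvents = nullptr; // stands in for the ZeEventList array
  void **Events = nullptr;       // stands in for the PiEventList array
  unsigned Length = 0;
  std::mutex Mutex;

  void collectForRelease(std::list<void *> &EventsToBeReleased) {
    std::scoped_lock<std::mutex> Lock(Mutex);
    for (unsigned I = 0; I < Length; ++I)
      EventsToBeReleased.push_back(Events[I]);
    // A full implementation would also free the two arrays here.
    NativeEvents = nullptr;
    Events = nullptr;
    Length = 0;
  }
};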
- pi_result collectEventsForReleaseAndDestroyPiZeEventList( - std::list &EventsToBeReleased); - - // Had to create custom assignment operator because the mutex is - // not assignment copyable. Just field by field copy of the other - // fields. - _pi_ze_event_list_t &operator=(const _pi_ze_event_list_t &other) { - if (this != &other) { - this->ZeEventList = other.ZeEventList; - this->PiEventList = other.PiEventList; - this->Length = other.Length; - } - return *this; - } -}; - -struct _pi_event : _ur_object { - _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, - pi_context Context, pi_command_type CommandType, bool OwnZeEvent) - : ZeEvent{ZeEvent}, OwnZeEvent{OwnZeEvent}, ZeEventPool{ZeEventPool}, - CommandType{CommandType}, Context{Context}, CommandData{nullptr} {} - - // Level Zero event handle. - ze_event_handle_t ZeEvent; - - // Indicates if we own the ZeEvent or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeEvent; - - // Level Zero event pool handle. - ze_event_pool_handle_t ZeEventPool; - - // In case we use device-only events this holds their host-visible - // counterpart. If this event is itself host-visble then HostVisibleEvent - // points to this event. If this event is not host-visible then this field can - // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI - // event created internally that host will actually be redirected - // to wait/query instead of this PI event. - // - // The HostVisibleEvent is a reference counted PI event and can be used more - // than by just this one event, depending on the mode (see EventsScope). - // - pi_event HostVisibleEvent = {nullptr}; - bool isHostVisible() const { return this == HostVisibleEvent; } - - // Get the host-visible event or create one and enqueue its signal. - pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); - - // Tells if this event is with profiling capabilities. - bool isProfilingEnabled() const { - return !Queue || // tentatively assume user events are profiling enabled - (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; - } - - // Keeps the command-queue and command associated with the event. - // These are NULL for the user events. - pi_queue Queue = {nullptr}; - pi_command_type CommandType; - // Provide direct access to Context, instead of going via queue. - // Not every PI event has a queue, and we need a handle to Context - // to get to event pool related information. - pi_context Context; - - // Opaque data to hold any data needed for CommandType. - void *CommandData; - - // List of events that were in the wait list of the command that will - // signal this event. These events must be retained when the command is - // enqueued, and must then be released when this event has signalled. - // This list must be destroyed once the event has signalled. - _pi_ze_event_list_t WaitList; - - // Command list associated with the pi_event. - std::optional CommandList; - - // Tracks if the needed cleanup was already performed for - // a completed event. This allows to control that some cleanup - // actions are performed only once. - // - bool CleanedUp = {false}; - - // Indicates that this PI event had already completed in the sense - // that no other synchromization is needed. Note that the underlying - // L0 event (if any) is not guranteed to have been signalled, or - // being visible to the host at all. - bool Completed = {false}; - - // Indicates that this event is discarded, i.e. 
it is not visible outside of - // plugin. - bool IsDiscarded = {false}; - - // Besides each PI object keeping a total reference count in - // _ur_object::RefCount we keep special track of the event *external* - // references. This way we are able to tell when the event is not referenced - // externally anymore, i.e. it can't be passed as a dependency event to - // piEnqueue* functions and explicitly waited meaning that we can do some - // optimizations: - // 1. For in-order queues we can reset and reuse event even if it was not yet - // completed by submitting a reset command to the queue (since there are no - // external references, we know that nobody can wait this event somewhere in - // parallel thread or pass it as a dependency which may lead to hang) - // 2. We can avoid creating host proxy event. - // This counter doesn't track the lifetime of an event object. Even if it - // reaches zero an event object may not be destroyed and can be used - // internally in the plugin. - std::atomic RefCountExternal{0}; - - bool hasExternalRefs() { return RefCountExternal != 0; } - - // Reset _pi_event object. - pi_result reset(); -}; - -struct _pi_program : _ur_object { - // Possible states of a program. - typedef enum { - // The program has been created from intermediate language (SPIR-V), but it - // is not yet compiled. - IL, - - // The program has been created by loading native code, but it has not yet - // been built. This is equivalent to an OpenCL "program executable" that - // is loaded via clCreateProgramWithBinary(). - Native, - - // The program was notionally compiled from SPIR-V form. However, since we - // postpone compilation until the module is linked, the internal state - // still represents the module as SPIR-V. - Object, - - // The program has been built or linked, and it is represented as a Level - // Zero module. - Exe, - - // An error occurred during piProgramLink, but we created a _pi_program - // object anyways in order to hold the ZeBuildLog. Note that the ZeModule - // may or may not be nullptr in this state, depending on the error. - Invalid - } state; - - // A utility class that converts specialization constants into the form - // required by the Level Zero driver. - class SpecConstantShim { - public: - SpecConstantShim(pi_program Program) { - ZeSpecConstants.numConstants = Program->SpecConstants.size(); - ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants); - ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants); - - for (auto &SpecConstant : Program->SpecConstants) { - ZeSpecContantsIds.push_back(SpecConstant.first); - ZeSpecContantsValues.push_back(SpecConstant.second); - } - ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data(); - ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data(); - } - - const ze_module_constants_t *ze() { return &ZeSpecConstants; } - - private: - std::vector ZeSpecContantsIds; - std::vector ZeSpecContantsValues; - ze_module_constants_t ZeSpecConstants; - }; - - // Construct a program in IL or Native state. - _pi_program(state St, pi_context Context, const void *Input, size_t Length) - : Context{Context}, OwnZeModule{true}, State{St}, - Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, - ZeBuildLog{nullptr} { - std::memcpy(Code.get(), Input, Length); - } - - // Construct a program in Exe or Invalid state. 
- _pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, - ze_module_build_log_handle_t ZeBuildLog) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{ZeBuildLog} {} - - // Construct a program in Exe state (interop). - _pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, - bool OwnZeModule) - : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, - ZeModule{ZeModule}, ZeBuildLog{nullptr} {} - - // Construct a program in Invalid state with a custom error message. - _pi_program(state St, pi_context Context, const std::string &ErrorMessage) - : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, - State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} - - ~_pi_program(); - - const pi_context Context; // Context of the program. - - // Indicates if we own the ZeModule or it came from interop that - // asked to not transfer the ownership to SYCL RT. - const bool OwnZeModule; - - // This error message is used only in Invalid state to hold a custom error - // message from a call to piProgramLink. - const std::string ErrorMessage; - - state State; - - // In IL and Object states, this contains the SPIR-V representation of the - // module. In Native state, it contains the native code. - std::unique_ptr Code; // Array containing raw IL / native code. - size_t CodeLength{0}; // Size (bytes) of the array. - - // Used only in IL and Object states. Contains the SPIR-V specialization - // constants as a map from the SPIR-V "SpecID" to a buffer that contains the - // associated value. The caller of the PI layer is responsible for - // maintaining the storage of this buffer. - std::unordered_map SpecConstants; - - // Used only in Object state. Contains the build flags from the last call to - // piProgramCompile(). - std::string BuildFlags; - - // The Level Zero module handle. Used primarily in Exe state. - ze_module_handle_t ZeModule; - - // The Level Zero build log from the last call to zeModuleCreate(). - ze_module_build_log_handle_t ZeBuildLog; -}; - -struct _pi_kernel : _ur_object { - _pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program) - : ZeKernel{Kernel}, OwnZeKernel{OwnZeKernel}, Program{Program}, - MemAllocs{}, SubmissionsCount{0} {} - - // Completed initialization of PI kernel. Must be called after construction. - pi_result initialize(); - - // Returns true if kernel has indirect access, false otherwise. - bool hasIndirectAccess() { - // Currently indirect access flag is set for all kernels and there is no API - // to check if kernel actually indirectly access smth. - return true; - } - - // Level Zero function handle. - ze_kernel_handle_t ZeKernel; - - // Indicates if we own the ZeKernel or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeKernel; - - // Keep the program of the kernel. - pi_program Program; - - // Hash function object for the unordered_set below. - struct Hash { - size_t operator()(const std::pair *P) const { - return std::hash()(P->first); - } - }; - - // If kernel has indirect access we need to make a snapshot of all existing - // memory allocations to defer deletion of these memory allocations to the - // moment when kernel execution has finished. - // We store pointers to the elements because pointers are not invalidated by - // insert/delete for std::unordered_map (iterators are invalidated). 
We need - // to take a snapshot instead of just reference-counting the allocations, - // because picture of active allocations can change during kernel execution - // (new allocations can be added) and we need to know which memory allocations - // were retained by this kernel to release them (and don't touch new - // allocations) at kernel completion. Same kernel may be submitted several - // times and retained allocations may be different at each submission. That's - // why we have a set of memory allocations here and increase ref count only - // once even if kernel is submitted many times. We don't want to know how many - // times and which allocations were retained by each submission. We release - // all allocations in the set only when SubmissionsCount == 0. - std::unordered_set *, Hash> MemAllocs; - - // Counter to track the number of submissions of the kernel. - // When this value is zero, it means that kernel is not submitted for an - // execution - at this time we can release memory allocations referenced by - // this kernel. We can do this when RefCount turns to 0 but it is too late - // because kernels are cached in the context by SYCL RT and they are released - // only during context object destruction. Regular RefCount is not usable to - // track submissions because user/SYCL RT can retain kernel object any number - // of times. And that's why there is no value of RefCount which can mean zero - // submissions. - std::atomic SubmissionsCount; - - // Keeps info about an argument to the kernel enough to set it with - // zeKernelSetArgumentValue. - struct ArgumentInfo { - uint32_t Index; - size_t Size; - const pi_mem Value; - _pi_mem::access_mode_t AccessMode{_pi_mem::unknown}; - }; - // Arguments that still need to be set (with zeKernelSetArgumentValue) - // before kernel is enqueued. - std::vector PendingArguments; - - // Cache of the kernel properties. - ZeCache> ZeKernelProperties; - ZeCache ZeKernelName; -}; - -struct _pi_sampler : _ur_object { - _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {} - - // Level Zero sampler handle. - ze_sampler_handle_t ZeSampler; -}; - #endif // PI_LEVEL_ZERO_HPP diff --git a/sycl/plugins/level_zero/ur_bindings.hpp b/sycl/plugins/level_zero/ur_bindings.hpp index 0504df7e2f0d9..faaab6d5e925b 100755 --- a/sycl/plugins/level_zero/ur_bindings.hpp +++ b/sycl/plugins/level_zero/ur_bindings.hpp @@ -9,41 +9,3 @@ #include "pi_level_zero.hpp" #include - -// Make the Unified Runtime handles definition complete. -// This is used in various "create" API where new handles are allocated. 
-struct ur_platform_handle_t_ : public _pi_platform { - using _pi_platform::_pi_platform; -}; - -struct ur_device_handle_t_ : public _pi_device { - using _pi_device::_pi_device; -}; - -struct ur_context_handle_t_ : public _pi_context { - using _pi_context::_pi_context; -}; - -struct ur_event_handle_t_ : public _pi_event { - using _pi_event::_pi_event; -}; - -struct ur_program_handle_t_ : public _pi_program { - using _pi_program::_pi_program; -}; - -struct ur_kernel_handle_t_ : public _pi_kernel { - using _pi_kernel::_pi_kernel; -}; - -struct ur_queue_handle_t_ : public _pi_queue { - using _pi_queue::_pi_queue; -}; - -struct ur_sampler_handle_t_ : public _pi_sampler { - using _pi_sampler::_pi_sampler; -}; - -struct ur_mem_handle_t_ : public _pi_mem { - using _pi_mem::_pi_mem; -}; diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 7dd2a7b96bcd3..a4eee6963601e 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -71,6 +71,7 @@ add_sycl_plugin(unified_runtime Threads::Threads UnifiedRuntimeLoader UnifiedRuntime-Headers + LevelZeroLoader-Headers # we need for #include in ur_level_zero_common.h ) # Build level zero adapter @@ -90,7 +91,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_device.hpp" "ur/adapters/level_zero/ur_level_zero_event.hpp" "ur/adapters/level_zero/ur_level_zero_mem.hpp" - "ur/adapters/level_zero/ur_level_zero_module.hpp" + "ur/adapters/level_zero/ur_level_zero_kernel.hpp" "ur/adapters/level_zero/ur_level_zero_platform.hpp" "ur/adapters/level_zero/ur_level_zero_program.hpp" "ur/adapters/level_zero/ur_level_zero_queue.hpp" @@ -101,7 +102,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_device.cpp" "ur/adapters/level_zero/ur_level_zero_event.cpp" "ur/adapters/level_zero/ur_level_zero_mem.cpp" - "ur/adapters/level_zero/ur_level_zero_module.cpp" + "ur/adapters/level_zero/ur_level_zero_kernel.cpp" "ur/adapters/level_zero/ur_level_zero_platform.cpp" "ur/adapters/level_zero/ur_level_zero_program.cpp" "ur/adapters/level_zero/ur_level_zero_queue.cpp" diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2d80f4c4ad20a..5ca4b1b9ae4f6 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -7,7 +7,9 @@ //===------------------------------------------------------------------===// #pragma once +#include "ur/adapters/level_zero/ur_level_zero.hpp" #include "ur_api.h" +#include #include #include @@ -52,6 +54,11 @@ static pi_result ur2piResult(ur_result_t urResult) { }; } +// Helper for one-liner validation +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + // Early exits on any error #define HANDLE_ERRORS(urCall) \ if (auto Result = urCall) \ @@ -375,54 +382,135 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } +struct _pi_context : ur_context_handle_t_ {}; + +struct _pi_queue : ur_context_handle_t_ {}; + +struct _pi_program : ur_program_handle_t_ {}; + +struct _pi_kernel : ur_kernel_handle_t_ {}; + +struct _pi_mem : ur_mem_handle_t_ {}; + +struct _pi_buffer : ur_mem_handle_t_ {}; + +struct _pi_image : ur_mem_handle_t_ {}; + +struct _pi_sampler : ur_sampler_handle_t_ {}; + +struct _pi_event : ur_event_handle_t_ {}; + namespace pi2ur { -inline pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { + 
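// Editorial aside: a hedged illustration (toy types, not the real handles) of
// the aliasing pattern used throughout this header. Each legacy _pi_* type
// derives from the matching ur_*_handle_t_ definition and adds no members, so
// a pi_* handle can be reinterpret_cast to the corresponding ur_* handle when
// a PI entry point forwards to the UR API.
struct ur_thing_handle_t_ { int Data = 0; }; // stands in for a UR handle type
using ur_thing_handle_t = ur_thing_handle_t_ *;

struct _pi_thing : ur_thing_handle_t_ {}; // PI-side alias, no extra members
using pi_thing = _pi_thing *;

int readThroughUrSketch(pi_thing PiHandle) {
  auto UrHandle = reinterpret_cast<ur_thing_handle_t>(PiHandle);
  return UrHandle->Data; // same object, viewed through the UR type
}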
+inline pi_result piTearDown(void *PluginParameter) { + std::ignore = PluginParameter; + HANDLE_ERRORS(urTearDown(nullptr)); + return PI_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +// Platform +inline pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, + pi_uint32 *NumPlatforms) { urInit(0); - uint32_t Count = num_entries; - auto phPlatforms = reinterpret_cast(platforms); - HANDLE_ERRORS(urPlatformGet(Count, phPlatforms, num_platforms)); + auto phPlatforms = reinterpret_cast(Platforms); + HANDLE_ERRORS(urPlatformGet(NumEntries, phPlatforms, NumPlatforms)); + return PI_SUCCESS; +} + +inline pi_result piextPlatformGetNativeHandle(pi_platform Platform, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + auto UrPlatform = reinterpret_cast(Platform); + + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urPlatformGetNativeHandle(UrPlatform, &UrNativeHandle)); + + *NativeHandle = reinterpret_cast(UrNativeHandle); + + return PI_SUCCESS; +} + +inline pi_result +piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_platform *Platform) { + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_platform_handle_t UrPlatform{}; + ur_native_handle_t UrNativeHandle = + reinterpret_cast(NativeHandle); + urPlatformCreateWithNativeHandle(UrNativeHandle, &UrPlatform); + + *Platform = reinterpret_cast(UrPlatform); + return PI_SUCCESS; } -inline pi_result piPlatformGetInfo(pi_platform platform, +inline pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - ur_platform_info_t InfoType; + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + + ur_platform_info_t UrParamName = {}; switch (ParamName) { - case PI_PLATFORM_INFO_EXTENSIONS: - InfoType = UR_PLATFORM_INFO_NAME; + case PI_PLATFORM_INFO_EXTENSIONS: { + UrParamName = UR_PLATFORM_INFO_EXTENSIONS; break; - case PI_PLATFORM_INFO_NAME: - InfoType = UR_PLATFORM_INFO_NAME; + } + case PI_PLATFORM_INFO_NAME: { + UrParamName = UR_PLATFORM_INFO_NAME; break; - case PI_PLATFORM_INFO_PROFILE: - InfoType = UR_PLATFORM_INFO_PROFILE; + } + case PI_PLATFORM_INFO_PROFILE: { + UrParamName = UR_PLATFORM_INFO_PROFILE; break; - case PI_PLATFORM_INFO_VENDOR: - InfoType = UR_PLATFORM_INFO_VENDOR_NAME; + } + case PI_PLATFORM_INFO_VENDOR: { + UrParamName = UR_PLATFORM_INFO_VENDOR_NAME; break; - case PI_PLATFORM_INFO_VERSION: - InfoType = UR_PLATFORM_INFO_VERSION; + } + case PI_PLATFORM_INFO_VERSION: { + UrParamName = UR_PLATFORM_INFO_VERSION; break; - case PI_EXT_PLATFORM_INFO_BACKEND: - InfoType = UR_PLATFORM_INFO_BACKEND; + } + case PI_EXT_PLATFORM_INFO_BACKEND: { + UrParamName = UR_PLATFORM_INFO_BACKEND; break; + } default: - return PI_ERROR_UNKNOWN; + die("urGetContextInfo: unsuppported ParamName."); } size_t SizeInOut = ParamValueSize; - auto hPlatform = reinterpret_cast(platform); - HANDLE_ERRORS(urPlatformGetInfo(hPlatform, InfoType, SizeInOut, ParamValue, - ParamValueSizeRet)); + auto UrPlatform = reinterpret_cast(Platform); + HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); - ur2piPlatformInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); return PI_SUCCESS; } +inline pi_result 
piextPluginGetOpaqueData(void *opaque_data_param, + void **opaque_data_return) { + (void)opaque_data_param; + (void)opaque_data_return; + return PI_ERROR_UNKNOWN; +} + +// Platform +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Device inline pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { @@ -444,26 +532,36 @@ inline pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, return PI_ERROR_UNKNOWN; } - uint32_t Count = NumEntries; - auto hPlatform = reinterpret_cast(Platform); - auto phDevices = reinterpret_cast(Devices); - HANDLE_ERRORS(urDeviceGet(hPlatform, Type, Count, phDevices, NumDevices)); + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + + auto UrPlatform = reinterpret_cast(Platform); + auto UrDevices = reinterpret_cast(Devices); + HANDLE_ERRORS( + urDeviceGet(UrPlatform, Type, NumEntries, UrDevices, NumDevices)); + return PI_SUCCESS; } inline pi_result piDeviceRetain(pi_device Device) { - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceRetain(hDevice)); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceRetain(UrDevice)); return PI_SUCCESS; } inline pi_result piDeviceRelease(pi_device Device) { - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceRelease(hDevice)); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceRelease(UrDevice)); return PI_SUCCESS; } -inline pi_result piPluginGetLastError(char **) { return PI_SUCCESS; } +inline pi_result piPluginGetLastError(char **message) { + std::ignore = message; + return PI_SUCCESS; +} inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, size_t ParamValueSize, void *ParamValue, @@ -800,9 +898,12 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_ERROR_UNKNOWN; }; + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + size_t SizeInOut = ParamValueSize; - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceGetInfo(hDevice, InfoType, SizeInOut, ParamValue, + auto UrDevice = reinterpret_cast(Device); + + HANDLE_ERRORS(urDeviceGetInfo(UrDevice, InfoType, SizeInOut, ParamValue, ParamValueSizeRet)); ur2piDeviceInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); @@ -810,10 +911,43 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_SUCCESS; } +inline pi_result piextDeviceGetNativeHandle(pi_device Device, + pi_native_handle *NativeHandle) { + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + auto UrDevice = reinterpret_cast(Device); + + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urDeviceGetNativeHandle(UrDevice, &UrNativeHandle)); + *NativeHandle = reinterpret_cast(UrNativeHandle); + return PI_SUCCESS; +} + +inline pi_result +piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_platform Platform, pi_device *Device) { + + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t UrNativeDevice = + reinterpret_cast(NativeHandle); + ur_platform_handle_t UrPlatform = + reinterpret_cast(Platform); + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS( + urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, 
UrDevice)); + + return PI_SUCCESS; +} + inline pi_result piDevicePartition( pi_device Device, const pi_device_partition_property *Properties, pi_uint32 NumEntries, pi_device *SubDevices, pi_uint32 *NumSubDevices) { + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + if (!Properties || !Properties[0]) return PI_ERROR_INVALID_VALUE; @@ -860,10 +994,2521 @@ inline pi_result piDevicePartition( ur_device_partition_property_t UrProperties[] = { ur_device_partition_property_t(Property), Value, 0}; - auto hDevice = reinterpret_cast(Device); - auto phSubDevices = reinterpret_cast(SubDevices); - HANDLE_ERRORS(urDevicePartition(hDevice, UrProperties, NumEntries, - phSubDevices, NumSubDevices)); + auto UrDevice = reinterpret_cast(Device); + auto UrSubDevices = reinterpret_cast(SubDevices); + HANDLE_ERRORS(urDevicePartition(UrDevice, UrProperties, NumEntries, + UrSubDevices, NumSubDevices)); + return PI_SUCCESS; +} + +inline pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, + uint64_t *HostTime) { + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceGetGlobalTimestamps(UrDevice, DeviceTime, HostTime)); + return PI_SUCCESS; +} + +inline pi_result +piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? + pi_device_binary *Binaries, pi_uint32 NumBinaries, + pi_uint32 *SelectedBinaryInd) { + + auto UrDevice = reinterpret_cast(Device); + const uint8_t **UrBinaries = + const_cast(reinterpret_cast(Binaries)); + HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries, NumBinaries, + SelectedBinaryInd)); + return PI_SUCCESS; +} + +// Device +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Context +inline pi_result piContextCreate(const pi_context_properties *Properties, + pi_uint32 NumDevices, const pi_device *Devices, + void (*PFnNotify)(const char *ErrInfo, + const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, pi_context *RetContext) { + auto UrDevices = reinterpret_cast(Devices); + + ur_context_handle_t *UrContext = + reinterpret_cast(RetContext); + // TODO: Parse PI Context Properties into UR + ur_context_properties_t UrProperties{}; + HANDLE_ERRORS( + urContextCreate(NumDevices, UrDevices, &UrProperties, UrContext)); + return PI_SUCCESS; +} + +// FIXME: Dummy implementation to prevent link fail +inline pi_result piextContextSetExtendedDeleter( + pi_context Context, pi_context_extended_deleter Function, void *UserData) { + std::ignore = Context; + std::ignore = Function; + std::ignore = UserData; + die("piextContextSetExtendedDeleter: not supported"); + return PI_SUCCESS; +} + +inline pi_result piextContextGetNativeHandle(pi_context Context, + pi_native_handle *NativeHandle) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urContextGetNativeHandle(UrContext, &UrNativeHandle)); + *NativeHandle = reinterpret_cast(UrNativeHandle); + return PI_SUCCESS; +} + +inline pi_result piextContextCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_uint32 NumDevices, + const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t NativeContext = + reinterpret_cast(NativeHandle); + ur_context_handle_t *UrContext = + 
reinterpret_cast(RetContext); + HANDLE_ERRORS(urContextCreateWithNativeHandle(NativeContext, UrContext)); + (*UrContext)->OwnZeContext = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_context_handle_t hContext = reinterpret_cast(Context); + ur_context_info_t ContextInfoType{}; + + switch (ParamName) { + case PI_CONTEXT_INFO_DEVICES: { + ContextInfoType = UR_CONTEXT_INFO_DEVICES; + break; + } + case PI_CONTEXT_INFO_PLATFORM: { + die("urGetContextInfo: unsuppported ParamName."); + } + case PI_CONTEXT_INFO_NUM_DEVICES: { + ContextInfoType = UR_CONTEXT_INFO_NUM_DEVICES; + break; + } + case PI_CONTEXT_INFO_PROPERTIES: { + die("urGetContextInfo: unsuppported ParamName."); + } + case PI_CONTEXT_INFO_REFERENCE_COUNT: { + ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; + break; + } + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { + ContextInfoType = UR_CONTEXT_INFO_USM_FILL2D_SUPPORT; + break; + } + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: { + ContextInfoType = UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT; + break; + } + case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // These queries should be dealt with in context_impl.cpp by calling the + // queries of each device separately and building the intersection set. + die("These queries should have never come here"); + } + default: { + die("piGetContextInfo: unsuppported ParamName."); + } + } + + HANDLE_ERRORS(urContextGetInfo(hContext, ContextInfoType, ParamValueSize, + ParamValue, ParamValueSizeRet)); + return PI_SUCCESS; +} + +inline pi_result piContextRetain(pi_context Context) { + ur_context_handle_t hContext = reinterpret_cast(Context); + + HANDLE_ERRORS(urContextRetain(hContext)); + + return PI_SUCCESS; +} + +inline pi_result piContextRelease(pi_context Context) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + HANDLE_ERRORS(urContextRelease(UrContext)); + return PI_SUCCESS; +} +// Context +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Queue +inline pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, pi_queue *Queue) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + ur_queue_property_t Props{}; + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &Props, UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piextQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + + PI_ASSERT(Properties, PI_ERROR_INVALID_VALUE); + // Expect flags mask to be passed first. + PI_ASSERT(Properties[0] == PI_QUEUE_FLAGS, PI_ERROR_INVALID_VALUE); + + PI_ASSERT(Properties[2] == 0 || + (Properties[2] == PI_QUEUE_COMPUTE_INDEX && Properties[4] == 0), + PI_ERROR_INVALID_VALUE); + + // Check that unexpected bits are not set. 
+ PI_ASSERT(!(Properties[1] & + ~(PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_FLAG_PROFILING_ENABLE | PI_QUEUE_FLAG_ON_DEVICE | + PI_QUEUE_FLAG_ON_DEVICE_DEFAULT | + PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS | + PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW | + PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH)), + PI_ERROR_INVALID_VALUE); + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_queue_property_t props[5]{}; + props[0] = UR_QUEUE_PROPERTIES_FLAGS; + if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) + props[1] |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) + props[1] |= UR_QUEUE_FLAG_PROFILING_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) + props[1] |= UR_QUEUE_FLAG_ON_DEVICE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) + props[1] |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) + props[1] |= UR_QUEUE_FLAG_DISCARD_EVENTS; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) + props[1] |= UR_QUEUE_FLAG_PRIORITY_LOW; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) + props[1] |= UR_QUEUE_FLAG_PRIORITY_HIGH; + + if (Properties[2] != 0) { + props[2] = UR_QUEUE_PROPERTIES_COMPUTE_INDEX; + props[3] = Properties[3]; + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, props, UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + pi_device Device, + bool OwnNativeHandle, + pi_queue *Queue) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_native_handle_t UrNativeHandle = + reinterpret_cast(NativeHandle); + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS( + urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, UrQueue)); + (*UrQueue)->OwnNativeHandle = OwnNativeHandle; + return PI_SUCCESS; +} + +inline pi_result piextQueueGetNativeHandle(pi_queue Queue, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_native_handle_t UrNativeQueue{}; + HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeQueue)); + + *NativeHandle = reinterpret_cast(UrNativeQueue); + + return PI_SUCCESS; +} + +inline pi_result piQueueRelease(pi_queue Queue) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueRelease(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueFinish(pi_queue Queue) { + // Wait until command lists attached to the command queue are executed. 
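// Editorial aside: a hedged usage sketch of the property-list layout that the
// validation above expects from callers (it assumes the PI header defining
// these macros is on the include path; the helper name is illustrative). The
// list starts with PI_QUEUE_FLAGS and a flag mask, may optionally carry
// PI_QUEUE_COMPUTE_INDEX followed by the index value, and is terminated by 0.
inline pi_result createProfilingQueueSketch(pi_context Context,
                                            pi_device Device,
                                            pi_queue *Queue) {
  pi_queue_properties Props[] = {PI_QUEUE_FLAGS,
                                 PI_QUEUE_FLAG_PROFILING_ENABLE,
                                 PI_QUEUE_COMPUTE_INDEX, 0, // compute index 0
                                 0};
  return pi2ur::piextQueueCreate(Context, Device, Props, Queue);
}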
+ PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueFinish(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_queue_info_t UrParamName{}; + + switch (ParamName) { + case PI_QUEUE_INFO_CONTEXT: { + UrParamName = UR_QUEUE_INFO_CONTEXT; + break; + } + case PI_QUEUE_INFO_DEVICE: { + UrParamName = UR_QUEUE_INFO_DEVICE; + break; + } + case PI_QUEUE_INFO_DEVICE_DEFAULT: { + UrParamName = UR_QUEUE_INFO_DEVICE_DEFAULT; + break; + } + case PI_QUEUE_INFO_PROPERTIES: { + UrParamName = UR_QUEUE_INFO_PROPERTIES; + break; + } + case PI_QUEUE_INFO_REFERENCE_COUNT: { + UrParamName = UR_QUEUE_INFO_REFERENCE_COUNT; + break; + } + case PI_QUEUE_INFO_SIZE: { + UrParamName = UR_QUEUE_INFO_SIZE; + break; + } + case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + UrParamName = UR_EXT_ONEAPI_QUEUE_INFO_EMPTY; + break; + } + default: { + die("Unsupported ParamName in piQueueGetInfo"); + return PI_ERROR_INVALID_VALUE; + } + } + + HANDLE_ERRORS(urQueueGetInfo(UrQueue, UrParamName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piQueueRetain(pi_queue Queue) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueRetain(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueFlush(pi_queue Queue) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueFlush(UrQueue)); + + return PI_SUCCESS; +} + +// Queue +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Program + +inline pi_result piProgramCreate(pi_context Context, const void *ILBytes, + size_t Length, pi_program *Program) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_program_properties_t UrProperties{}; + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS(urProgramCreateWithIL(UrContext, ILBytes, Length, &UrProperties, + UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piProgramCreateWithBinary( + pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, + size_t NumMetadataEntries, const pi_device_binary_property *Metadata, + pi_int32 *BinaryStatus, pi_program *Program) { + std::ignore = Metadata; + std::ignore = NumMetadataEntries; + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + // For now we support only one device. 
+ if (NumDevices != 1) { + die("piProgramCreateWithBinary: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + if (!Binaries[0] || !Lengths[0]) { + if (BinaryStatus) + *BinaryStatus = PI_ERROR_INVALID_VALUE; + return PI_ERROR_INVALID_VALUE; + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(DeviceList[0]); + + // TODO: Translate Metadata into Properties? + ur_program_properties_t Properties{}; + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS(urProgramCreateWithBinary(UrContext, UrDevice, Lengths[0], + Binaries[0], &Properties, UrProgram)); + + if (BinaryStatus) + *BinaryStatus = PI_SUCCESS; + + return PI_SUCCESS; +} + +inline pi_result piclProgramCreateWithSource(pi_context Context, + pi_uint32 Count, + const char **Strings, + const size_t *Lengths, + pi_program *RetProgram) { + std::ignore = Context; + std::ignore = Count; + std::ignore = Strings; + std::ignore = Lengths; + std::ignore = RetProgram; + die("piclProgramCreateWithSource: not supported in UR\n"); + return PI_ERROR_INVALID_OPERATION; +} + +inline pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + ur_program_info_t PropName{}; + + switch (ParamName) { + case PI_PROGRAM_INFO_REFERENCE_COUNT: { + PropName = UR_PROGRAM_INFO_REFERENCE_COUNT; + break; + } + case PI_PROGRAM_INFO_CONTEXT: { + PropName = UR_PROGRAM_INFO_CONTEXT; + break; + } + case PI_PROGRAM_INFO_NUM_DEVICES: { + PropName = UR_PROGRAM_INFO_NUM_DEVICES; + break; + } + case PI_PROGRAM_INFO_DEVICES: { + PropName = UR_PROGRAM_INFO_DEVICES; + break; + } + case PI_PROGRAM_INFO_SOURCE: { + PropName = UR_PROGRAM_INFO_SOURCE; + break; + } + case PI_PROGRAM_INFO_BINARY_SIZES: { + PropName = UR_PROGRAM_INFO_BINARY_SIZES; + break; + } + case PI_PROGRAM_INFO_BINARIES: { + PropName = UR_PROGRAM_INFO_BINARIES; + break; + } + case PI_PROGRAM_INFO_NUM_KERNELS: { + PropName = UR_PROGRAM_INFO_NUM_KERNELS; + break; + } + case PI_PROGRAM_INFO_KERNEL_NAMES: { + PropName = UR_PROGRAM_INFO_KERNEL_NAMES; + break; + } + default: { + die("urProgramGetInfo: not implemented"); + } + } + + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + return PI_SUCCESS; } + +inline pi_result +piProgramLink(pi_context Context, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + pi_uint32 NumInputPrograms, const pi_program *InputPrograms, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData, pi_program *RetProgram) { + // We only support one device with Level Zero currently. + if (NumDevices != 1) { + die("piProgramLink: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + + // Validate input parameters. 
+ PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + if (NumInputPrograms == 0 || InputPrograms == nullptr) + return PI_ERROR_INVALID_VALUE; + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + const ur_program_handle_t *UrInputPrograms = + reinterpret_cast(InputPrograms); + ur_program_handle_t *UrProgram = + reinterpret_cast(RetProgram); + + HANDLE_ERRORS(urProgramLink(UrContext, NumInputPrograms, UrInputPrograms, + Options, UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piProgramCompile( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, pi_uint32 NumInputHeaders, + const pi_program *InputHeaders, const char **HeaderIncludeNames, + void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { + + std::ignore = NumInputHeaders; + std::ignore = InputHeaders; + std::ignore = HeaderIncludeNames; + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) + return PI_ERROR_INVALID_VALUE; + + // These aren't supported. + PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + ur_program_info_t PropName = UR_PROGRAM_INFO_CONTEXT; + ur_context_handle_t UrContext{}; + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, sizeof(&UrContext), + &UrContext, nullptr)); + + HANDLE_ERRORS(urProgramCompile(UrContext, UrProgram, Options)); + + return PI_SUCCESS; +} + +inline pi_result +piProgramBuild(pi_program Program, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) { + return PI_ERROR_INVALID_VALUE; + } + + // We only support build to one device with Level Zero now. + // TODO: we should eventually build to the possibly multiple root + // devices in the context. + if (NumDevices != 1) { + die("piProgramBuild: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + + // These aren't supported. 
+ PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_program_info_t PropName = UR_PROGRAM_INFO_CONTEXT; + ur_context_handle_t UrContext{}; + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, sizeof(&UrContext), + &UrContext, nullptr)); + + HANDLE_ERRORS(urProgramBuild(UrContext, UrProgram, Options)); + + return PI_SUCCESS; +} + +inline pi_result piextProgramSetSpecializationConstant(pi_program Program, + pi_uint32 SpecID, + size_t Size, + const void *SpecValue) { + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + uint32_t Count = 1; + ur_specialization_constant_info_t SpecConstant{}; + SpecConstant.id = SpecID; + SpecConstant.size = Size; + SpecConstant.pValue = SpecValue; + HANDLE_ERRORS( + urProgramSetSpecializationConstants(UrProgram, Count, &SpecConstant)); + + return PI_SUCCESS; +} + +inline pi_result piKernelCreate(pi_program Program, const char *KernelName, + pi_kernel *RetKernel) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE); + PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_kernel_handle_t *UrKernel = + reinterpret_cast(RetKernel); + + HANDLE_ERRORS(urKernelCreate(UrProgram, KernelName, UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, const void *FillColor, + const size_t *Origin, const size_t *Region, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + + std::ignore = Image; + std::ignore = FillColor; + std::ignore = Origin; + std::ignore = Region; + std::ignore = NumEventsInWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + die("piEnqueueMemImageFill: not implemented"); + return PI_SUCCESS; +} + +inline pi_result +piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), void *Args, + size_t CbArgs, pi_uint32 NumMemObjects, + const pi_mem *MemList, const void **ArgsMemLoc, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + std::ignore = UserFunc; + std::ignore = Args; + std::ignore = CbArgs; + std::ignore = NumMemObjects; + std::ignore = MemList; + std::ignore = ArgsMemLoc; + std::ignore = NumEventsInWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + die("piEnqueueNativeKernel: not implemented"); + return PI_SUCCESS; +} + +inline pi_result piextGetDeviceFunctionPointer(pi_device Device, + pi_program Program, + const char *FunctionName, + pi_uint64 *FunctionPointerRet) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + auto UrDevice = reinterpret_cast(Device); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + void **FunctionPointer = reinterpret_cast(FunctionPointerRet); + + HANDLE_ERRORS(urProgramGetFunctionPointer(UrDevice, UrProgram, FunctionName, + FunctionPointer)); + return PI_SUCCESS; +} + +// Special version of piKernelSetArg to accept pi_mem. +inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_mem *ArgValue) { + + // TODO: the better way would probably be to add a new PI API for + // extracting native PI object from PI handle, and have SYCL + // RT pass that directly to the regular piKernelSetArg (and + // then remove this piextKernelSetArgMemObj). 
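// Editorial aside: a self-contained sketch (toy types) of the deferral
// strategy described in the comments around this entry point: at SetArg time
// only the memory object and argument index are recorded, and the concrete
// device allocation is resolved later, when the target device becomes known
// at kernel enqueue time.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct PendingMemArgSketch {
  uint32_t Index;  // kernel argument index
  void *MemObject; // which buffer was passed for that argument
};

struct KernelArgSketch {
  std::vector<PendingMemArgSketch> PendingArguments;

  void setArgMemObj(uint32_t Index, void *MemObject) {
    PendingArguments.push_back({Index, MemObject}); // defer, device unknown yet
  }

  // At enqueue time the device is known, so allocations can be looked up.
  void bindPendingArgs(std::unordered_map<void *, char *> &AllocationForMem) {
    for (const auto &Arg : PendingArguments) {
      char *DevicePtr = AllocationForMem[Arg.MemObject];
      (void)DevicePtr; // a real adapter would pass this to the native SetArg call
    }
    PendingArguments.clear();
  }
};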
+ + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_mem_handle_t UrMemory = reinterpret_cast(*ArgValue); + + // We don't yet know the device where this kernel will next be run on. + // Thus we can't know the actual memory allocation that needs to be used. + // Remember the memory object being used as an argument for this kernel + // to process it later when the device is known (at the kernel enqueue). + // + // TODO: for now we have to conservatively assume the access as read-write. + // Improve that by passing SYCL buffer accessor type into + // piextKernelSetArgMemObj. + // + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + HANDLE_ERRORS(urKernelSetArgMemObj(UrKernel, ArgIndex, UrMemory)); + return PI_SUCCESS; +} + +inline pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, const void *ArgValue) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelSetArgValue(UrKernel, ArgIndex, ArgSize, ArgValue)); + return PI_SUCCESS; +} + +inline pi_result +piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, pi_program Program, + bool OwnNativeHandle, pi_kernel *Kernel) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_native_handle_t UrNativeKernel = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + std::ignore = Program; + ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); + HANDLE_ERRORS( + urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, UrKernel)); + (*UrKernel)->OwnNativeHandle = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piProgramRetain(pi_program Program) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS( + urProgramRetain(reinterpret_cast(UrProgram))); + + return PI_SUCCESS; +} + +inline pi_result piKernelSetExecInfo(pi_kernel Kernel, + pi_kernel_exec_info ParamName, + size_t ParamValueSize, + const void *ParamValue) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_kernel_exec_info_t propName{}; + switch (ParamName) { + case PI_USM_INDIRECT_ACCESS: { + propName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + break; + } + case PI_USM_PTRS: { + propName = UR_KERNEL_EXEC_INFO_USM_PTRS; + break; + } + case PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG: { + propName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + HANDLE_ERRORS( + urKernelSetExecInfo(UrKernel, propName, ParamValueSize, ParamValue)); + + return PI_SUCCESS; +} + +inline pi_result piextProgramGetNativeHandle(pi_program Program, + pi_native_handle *NativeHandle) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_native_handle_t NativeProgram{}; + HANDLE_ERRORS(urProgramGetNativeHandle(UrProgram, &NativeProgram)); + + *NativeHandle = reinterpret_cast(NativeProgram); + + return PI_SUCCESS; +} + +inline pi_result +piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, bool ownNativeHandle, + pi_program *Program) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); 
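+  // Interop sketch (hypothetical variables, not part of this patch): a
+  // native handle exported from an existing program can be re-wrapped for
+  // the same context, e.g.
+  //   pi_native_handle NH;
+  //   piextProgramGetNativeHandle(ExistingProg, &NH);
+  //   pi_program Wrapped;
+  //   piextProgramCreateWithNativeHandle(NH, Ctx, /*ownNativeHandle=*/false,
+  //                                      &Wrapped);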
+ PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_native_handle_t NativeProgram = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS( + urProgramCreateWithNativeHandle(NativeProgram, UrContext, UrProgram)); + return PI_SUCCESS; +} + +inline pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_kernel_info_t UrParamName{}; + switch (ParamName) { + case PI_KERNEL_INFO_FUNCTION_NAME: { + UrParamName = UR_KERNEL_INFO_FUNCTION_NAME; + break; + } + case PI_KERNEL_INFO_NUM_ARGS: { + UrParamName = UR_KERNEL_INFO_NUM_ARGS; + break; + } + case PI_KERNEL_INFO_REFERENCE_COUNT: { + UrParamName = UR_KERNEL_INFO_REFERENCE_COUNT; + break; + } + case PI_KERNEL_INFO_CONTEXT: { + UrParamName = UR_KERNEL_INFO_CONTEXT; + break; + } + case PI_KERNEL_INFO_PROGRAM: { + UrParamName = UR_KERNEL_INFO_PROGRAM; + break; + } + case PI_KERNEL_INFO_ATTRIBUTES: { + UrParamName = UR_KERNEL_INFO_ATTRIBUTES; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + + HANDLE_ERRORS(urKernelGetInfo(UrKernel, UrParamName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, + pi_kernel_group_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + auto UrDevice = reinterpret_cast(Device); + + ur_kernel_group_info_t UrParamName{}; + switch (ParamName) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + UrParamName = UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE; + break; + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE; + break; + } + // The number of registers used by the compiled kernel (device specific) + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not " + "implemented\n"); + break; + } + default: { + die("Unknown ParamName in piKernelGetGroupInfo"); + return PI_ERROR_INVALID_VALUE; + } + } + + HANDLE_ERRORS(urKernelGetGroupInfo(UrKernel, UrDevice, UrParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piKernelRetain(pi_kernel Kernel) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelRetain(UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result piKernelRelease(pi_kernel Kernel) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + 
HANDLE_ERRORS(urKernelRelease(UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result piProgramRelease(pi_program Program) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + HANDLE_ERRORS(urProgramRelease(UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, + const void *ArgValue) { + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelSetArgValue(UrKernel, ArgIndex, ArgSize, ArgValue)); + + return PI_SUCCESS; +} + +inline pi_result piKernelGetSubGroupInfo( + pi_kernel Kernel, pi_device Device, pi_kernel_sub_group_info ParamName, + size_t InputValueSize, const void *InputValue, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + + std::ignore = InputValueSize; + std::ignore = InputValue; + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + auto UrDevice = reinterpret_cast(Device); + + ur_kernel_sub_group_info_t PropName{}; + switch (ParamName) { + case PI_KERNEL_MAX_SUB_GROUP_SIZE: { + PropName = UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE; + break; + } + case PI_KERNEL_MAX_NUM_SUB_GROUPS: { + PropName = UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS; + break; + } + case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { + PropName = UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS; + break; + } + case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { + PropName = UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL; + break; + } + } + HANDLE_ERRORS(urKernelGetSubGroupInfo(UrKernel, UrDevice, PropName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, + pi_program_build_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + auto UrDevice = reinterpret_cast(Device); + + ur_program_build_info_t PropName{}; + switch (ParamName) { + case PI_PROGRAM_BUILD_INFO_STATUS: { + PropName = UR_PROGRAM_BUILD_INFO_STATUS; + break; + } + case PI_PROGRAM_BUILD_INFO_OPTIONS: { + PropName = UR_PROGRAM_BUILD_INFO_OPTIONS; + break; + } + case PI_PROGRAM_BUILD_INFO_LOG: { + PropName = UR_PROGRAM_BUILD_INFO_LOG; + break; + } + case PI_PROGRAM_BUILD_INFO_BINARY_TYPE: { + PropName = UR_PROGRAM_BUILD_INFO_BINARY_TYPE; + break; + } + default: { + die("piProgramGetBuildInfo: not implemented"); + } + } + HANDLE_ERRORS(urProgramGetBuildInfo(UrProgram, UrDevice, PropName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piextKernelGetNativeHandle(pi_kernel Kernel, + pi_native_handle *NativeHandle) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_native_handle_t NativeKernel{}; + HANDLE_ERRORS(urKernelGetNativeHandle(UrKernel, &NativeKernel)); + + *NativeHandle = reinterpret_cast(NativeKernel); + + return PI_SUCCESS; +} + +/// API for writing data from host to a device global variable. 
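+///
+/// A hypothetical call (sketch only; Q, Prog, Ev and HostValue are
+/// illustrative names):
+///   pi_event Ev;
+///   piextEnqueueDeviceGlobalVariableWrite(Q, Prog, "my_device_global",
+///                                          /*BlockingWrite=*/PI_TRUE,
+///                                          sizeof(int), /*Offset=*/0,
+///                                          &HostValue, 0, nullptr, &Ev);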
+/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingWrite is true if the write should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Src is a pointer to where the data must be copied from +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +inline pi_result piextEnqueueDeviceGlobalVariableWrite( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite, + size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + HANDLE_ERRORS(urEnqueueDeviceGlobalVariableWrite( + UrQueue, UrProgram, Name, BlockingWrite, Count, Offset, Src, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +/// API reading data from a device global variable to host. +/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingRead is true if the read should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Dst is a pointer to where the data must be copied to +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +inline pi_result piextEnqueueDeviceGlobalVariableRead( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead, + size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueDeviceGlobalVariableRead( + UrQueue, UrProgram, Name, BlockingRead, Count, Offset, Dst, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Program +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Memory +inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, + size_t Size, void *HostPtr, pi_mem *RetMem, + const pi_mem_properties *properties) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); + + // TODO: implement support for more access modes + if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || + (Flags & PI_MEM_ACCESS_READ_ONLY))) { + die("piMemBufferCreate: Level-Zero supports read-write and read-only " + "buffer," + "but not other accesses (such as write-only) yet."); + } + + if (properties != 
nullptr) { + die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrBufferFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrBufferFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrBufferFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrBufferFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrBufferFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrBufferFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_mem_handle_t *UrBuffer = reinterpret_cast(RetMem); + HANDLE_ERRORS( + urMemBufferCreate(UrContext, UrBufferFlags, Size, HostPtr, UrBuffer)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS( + urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, Alignment, ResultPtr)); + return PI_SUCCESS; +} + +inline pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); + // piMemImageGetInfo must be used for images + + ur_mem_handle_t UrMemory = reinterpret_cast(Mem); + ur_mem_info_t MemInfoType{}; + switch (ParamName) { + case PI_MEM_CONTEXT: { + MemInfoType = UR_MEM_INFO_CONTEXT; + break; + } + case PI_MEM_SIZE: { + MemInfoType = UR_MEM_INFO_SIZE; + break; + } + default: { + die("piMemGetInfo: unsuppported ParamName."); + } + } + HANDLE_ERRORS(urMemGetInfo(UrMemory, MemInfoType, ParamValueSize, ParamValue, + ParamValueSizeRet)); + return PI_SUCCESS; +} + +inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, void *HostPtr, + pi_mem *RetImage) { + + // TODO: implement read-only, write-only + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + die("piMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_image_format_t UrFormat{}; + switch (ImageFormat->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; + break; + } + case 
PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT_101010: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; + break; + } + case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; + break; + } + case PI_IMAGE_CHANNEL_TYPE_FLOAT: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; + break; + } + default: { + die("piMemImageCreate: unsuppported image_channel_data_type."); + } + } + switch (ImageFormat->image_channel_order) { + case PI_IMAGE_CHANNEL_ORDER_A: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_A; + break; + } + case PI_IMAGE_CHANNEL_ORDER_R: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_R; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RG: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGB: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGBA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_BGRA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_ARGB: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; + break; + } + case PI_IMAGE_CHANNEL_ORDER_ABGR: { + UrFormat.channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + break; + } + case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; + break; + } + case PI_IMAGE_CHANNEL_ORDER_LUMINANCE: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; + break; + } + case PI_IMAGE_CHANNEL_ORDER_Rx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGBx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_sRGBA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; + break; + } + default: { + die("piMemImageCreate: unsuppported image_channel_data_type."); + } + } + ur_image_desc_t UrDesc{}; + UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc.depth = ImageDesc->image_depth; + UrDesc.height = ImageDesc->image_height; + UrDesc.numMipLevel = ImageDesc->num_mip_levels; + UrDesc.numSamples = ImageDesc->num_samples; + UrDesc.rowPitch = ImageDesc->image_row_pitch; + UrDesc.slicePitch = ImageDesc->image_slice_pitch; + switch (ImageDesc->image_type) { + case PI_MEM_TYPE_BUFFER: { + UrDesc.type = 
UR_MEM_TYPE_BUFFER; + break; + } + case PI_MEM_TYPE_IMAGE2D: { + UrDesc.type = UR_MEM_TYPE_IMAGE2D; + break; + } + case PI_MEM_TYPE_IMAGE3D: { + UrDesc.type = UR_MEM_TYPE_IMAGE3D; + break; + } + case PI_MEM_TYPE_IMAGE2D_ARRAY: { + UrDesc.type = UR_MEM_TYPE_IMAGE2D_ARRAY; + break; + } + case PI_MEM_TYPE_IMAGE1D: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D; + break; + } + case PI_MEM_TYPE_IMAGE1D_ARRAY: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D_ARRAY; + break; + } + case PI_MEM_TYPE_IMAGE1D_BUFFER: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D_BUFFER; + break; + } + default: { + die("piMemImageCreate: unsuppported image_type."); + } + } + UrDesc.width = ImageDesc->image_width; + UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc.arraySize = ImageDesc->image_array_size; + // TODO: UrDesc doesn't have something for ImageDesc->buffer + + ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + HANDLE_ERRORS( + urMemImageCreate(UrContext, UrFlags, &UrFormat, &UrDesc, HostPtr, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, + pi_buffer_create_type BufferCreateType, + void *BufferCreateInfo, pi_mem *RetMem) { + + PI_ASSERT(BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION && + BufferCreateInfo && RetMem, + PI_ERROR_INVALID_VALUE); + + auto Region = (pi_buffer_region)BufferCreateInfo; + PI_ASSERT(Region->size != 0u, PI_ERROR_INVALID_BUFFER_SIZE); + PI_ASSERT(Region->origin <= (Region->origin + Region->size), + PI_ERROR_INVALID_VALUE); + + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_buffer_create_type_t UrBufferCreateType{}; + if (BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION) { + UrBufferCreateType = UR_BUFFER_CREATE_TYPE_REGION; + } + + ur_buffer_region_t UrBufferCreateInfo{}; + UrBufferCreateInfo.origin = Region->origin; + UrBufferCreateInfo.size = Region->size; + ur_mem_handle_t *UrMem = reinterpret_cast(RetMem); + HANDLE_ERRORS(urMemBufferPartition(UrBuffer, UrFlags, UrBufferCreateType, + &UrBufferCreateInfo, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piextMemGetNativeHandle(pi_mem Mem, + pi_native_handle *NativeHandle) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + ur_native_handle_t NativeMem{}; + HANDLE_ERRORS(urMemGetNativeHandle(UrMem, &NativeMem)); + + *NativeHandle = reinterpret_cast(NativeMem); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_mem_handle_t UrImageSrc = reinterpret_cast(SrcImage); + ur_mem_handle_t UrImageDst = reinterpret_cast(DstImage); + + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x, SrcOrigin->y, SrcOrigin->z}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x, DstOrigin->y, DstOrigin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth 
= Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageCopy( + UrQueue, UrImageSrc, UrImageDst, UrSrcOrigin, UrDstOrigin, UrRegion, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + bool OwnNativeHandle, + pi_mem *Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_native_handle_t UrNativeMem = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_mem_handle_t *UrMem = reinterpret_cast(Mem); + // TODO: Pass OwnNativeHandle to the output parameter + // while we get it in interface + (*UrMem)->OwnNativeHandle = OwnNativeHandle; + HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS(urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, + Alignment, ResultPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + if (Properties && *Properties != 0) { + PI_ASSERT(*(Properties) == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0, + PI_ERROR_INVALID_VALUE); + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, + Alignment, ResultPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMFree(pi_context Context, void *Ptr) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + HANDLE_ERRORS(urUSMFree(UrContext, Ptr)); + return PI_SUCCESS; +} + +inline pi_result piMemRetain(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRetain(UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piMemRelease(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRelease(UrMem)); + + return PI_SUCCESS; +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitList is the number of events to wait on +/// @param EventsWaitList is an array of events to wait on +/// @param Event is the event that represents this operation +inline pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, + size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + // flags is currently unused so fail if set + 
PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + // TODO: to map from pi_usm_migration_flags to + // ur_usm_migration_flags_t + // once we have those defined + ur_usm_migration_flags_t UrFlags{}; + HANDLE_ERRORS(urEnqueueUSMPrefetch(UrQueue, Ptr, Size, UrFlags, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the data to be advised +/// @param Length is the size in bytes of the meory to advise +/// @param Advice is device specific advice +/// @param Event is the event that represents this operation +/// +inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, + size_t Length, pi_mem_advice Advice, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + // TODO: to map from pi_mem_advice to ur_mem_advice_t + // once we have those defined + ur_mem_advice_t UrAdvice{}; + HANDLE_ERRORS(urEnqueueUSMMemAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); + + return PI_SUCCESS; +} + +/// USM 2D Fill API +/// +/// \param queue is the queue to submit to +/// \param ptr is the ptr to fill +/// \param pitch is the total width of the destination memory including padding +/// \param pattern is a pointer with the bytes of the pattern to set +/// \param pattern_size is the size in bytes of the pattern +/// \param width is width in bytes of each row to fill +/// \param height is height the columns to fill +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, + size_t PatternSize, const void *Pattern, + size_t Width, size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = Pitch; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + die("piextUSMEnqueueFill2D: not implemented"); + return {}; +} + +inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, + size_t Pitch, int Value, size_t Width, + size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = Pitch; + std::ignore = Value; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + die("piextUSMEnqueueMemset2D: not implemented"); + return PI_SUCCESS; +} + +inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, + pi_mem_alloc_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_usm_alloc_info_t UrParamName{}; + switch (ParamName) { + case PI_MEM_ALLOC_TYPE: 
{ + UrParamName = UR_USM_ALLOC_INFO_TYPE; + break; + } + case PI_MEM_ALLOC_BASE_PTR: { + UrParamName = UR_USM_ALLOC_INFO_BASE_PTR; + break; + } + case PI_MEM_ALLOC_SIZE: { + UrParamName = UR_USM_ALLOC_INFO_SIZE; + break; + } + case PI_MEM_ALLOC_DEVICE: { + UrParamName = UR_USM_ALLOC_INFO_DEVICE; + break; + } + default: { + die("piextUSMGetMemAllocInfo: unsuppported ParamName."); + } + } + + HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet)) + return PI_SUCCESS; +} + +inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { // missing + std::ignore = Image; + std::ignore = ParamName; + std::ignore = ParamValueSize; + std::ignore = ParamValue; + std::ignore = ParamValueSizeRet; + + // TODO: use urMemImageGetInfo + + die("piMemImageGetInfo: not implemented"); + return {}; +} + +/// USM 2D Memcpy API +/// +/// \param queue is the queue to submit to +/// \param blocking is whether this operation should block the host +/// \param dst_ptr is the location the data will be copied +/// \param dst_pitch is the total width of the destination memory including +/// padding +/// \param src_ptr is the data to be copied +/// \param dst_pitch is the total width of the source memory including padding +/// \param width is width in bytes of each row to be copied +/// \param height is height the columns to be copied +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueMemcpy2D(pi_queue Queue, pi_bool Blocking, + void *DstPtr, size_t DstPitch, + const void *SrcPtr, size_t SrcPitch, + size_t Width, size_t Height, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + if (!DstPtr || !SrcPtr) + return PI_ERROR_INVALID_VALUE; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy2D( + UrQueue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Memory +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Enqueue + +inline pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueKernelLaunch( + UrQueue, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result 
+piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, + pi_image_offset Origin, pi_image_region Region, + size_t InputRowPitch, size_t InputSlicePitch, + const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageWrite( + UrQueue, UrImage, BlockingWrite, UrOrigin, UrRegion, InputRowPitch, + InputSlicePitch, const_cast(Ptr), NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, + pi_image_offset Origin, pi_image_region Region, + size_t RowPitch, size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageRead( + UrQueue, UrImage, BlockingRead, UrOrigin, UrRegion, RowPitch, SlicePitch, + Ptr, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferMap( + pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent, void **RetMap) { + // TODO: we don't implement read-only or write-only, always read-write. 
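+  //
+  // Map/unmap usage sketch (hypothetical values, not part of this patch):
+  //   pi_event MapEv, UnmapEv;
+  //   void *Host = nullptr;
+  //   piEnqueueMemBufferMap(Q, Buf, /*BlockingMap=*/PI_TRUE,
+  //                         PI_MAP_READ | PI_MAP_WRITE, /*Offset=*/0, Size,
+  //                         0, nullptr, &MapEv, &Host);
+  //   ... read or modify Host ...
+  //   piEnqueueMemUnmap(Q, Buf, Host, 0, nullptr, &UnmapEv);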
+ // assert((map_flags & PI_MAP_READ) != 0); + // assert((map_flags & PI_MAP_WRITE) != 0); + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + ur_map_flags_t UrMapFlags{}; + if (MapFlags & PI_MAP_READ) + UrMapFlags |= UR_MAP_FLAG_READ; + if (MapFlags & PI_MAP_WRITE) + UrMapFlags |= UR_MAP_FLAG_WRITE; + if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) + UrMapFlags |= UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferMap(UrQueue, UrMem, BlockingMap, UrMapFlags, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent, RetMap)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemUnmap(UrQueue, UrMem, MappedPtr, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, + const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, + pi_int32 Value, size_t Count, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Ptr, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + uint32_t Pattern = Value; + size_t PatternSize = sizeof(Pattern); + HANDLE_ERRORS(urEnqueueMemBufferFill( + UrQueue, UrBuffer, + const_cast(reinterpret_cast(&Pattern)), PatternSize, + 0, Count, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + 
PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x_bytes, SrcOrigin->y_scalar, + SrcOrigin->z_scalar}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x_bytes, DstOrigin->y_scalar, + DstOrigin->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopyRect( + UrQueue, UrBufferSrc, UrBufferDst, UrSrcOrigin, UrDstOrigin, UrRegion, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, + pi_mem DstMem, size_t SrcOffset, + size_t DstOffset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopy( + UrQueue, UrBufferSrc, UrBufferDst, SrcOffset, DstOffset, Size, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, + void *DstPtr, const void *SrcPtr, + size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy(UrQueue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWriteRect( + UrQueue, 
UrBuffer, BlockingWrite, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + const_cast(Ptr), NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, + pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWrite( + UrQueue, UrBuffer, BlockingWrite, Offset, Size, const_cast(Ptr), + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferReadRect( + UrQueue, UrBuffer, BlockingRead, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, + pi_bool BlockingRead, size_t Offset, + size_t Size, void *Dst, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Src); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferRead(UrQueue, UrBuffer, BlockingRead, Offset, + Size, Dst, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + 
HANDLE_ERRORS(urEnqueueEventsWaitWithBarrier(UrQueue, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueEventsWait(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + if (EventsWaitList) { + PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE); + } + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueEventsWait(UrQueue, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} +// Enqueue +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Events +inline pi_result piEventsWait(pi_uint32 NumEvents, + const pi_event *EventsWaitList) { + if (NumEvents && !EventsWaitList) { + return PI_ERROR_INVALID_EVENT; + } + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + HANDLE_ERRORS(urEventWait(NumEvents, UrEventsWaitList)); + + return PI_SUCCESS; +} + +inline pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_event_info_t PropName{}; + if (ParamName == PI_EVENT_INFO_COMMAND_QUEUE) { + PropName = UR_EVENT_INFO_COMMAND_QUEUE; + } else if (ParamName == PI_EVENT_INFO_CONTEXT) { + PropName = UR_EVENT_INFO_CONTEXT; + } else if (ParamName == PI_EVENT_INFO_COMMAND_TYPE) { + PropName = UR_EVENT_INFO_COMMAND_TYPE; + } else if (ParamName == PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) { + PropName = UR_EVENT_INFO_COMMAND_EXECUTION_STATUS; + } else if (ParamName == PI_EVENT_INFO_REFERENCE_COUNT) { + PropName = UR_EVENT_INFO_REFERENCE_COUNT; + } else { + return PI_ERROR_INVALID_VALUE; + } + + HANDLE_ERRORS(urEventGetInfo(UrEvent, PropName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piextEventGetNativeHandle(pi_event Event, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_native_handle_t *UrNativeEvent = + reinterpret_cast(NativeHandle); + HANDLE_ERRORS(urEventGetNativeHandle(UrEvent, UrNativeEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEventGetProfilingInfo(pi_event Event, + pi_profiling_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_profiling_info_t PropName{}; + switch (ParamName) { + case PI_PROFILING_INFO_COMMAND_QUEUED: { + PropName = UR_PROFILING_INFO_COMMAND_QUEUED; + break; + } + case PI_PROFILING_INFO_COMMAND_SUBMIT: { + PropName = UR_PROFILING_INFO_COMMAND_SUBMIT; + break; + } + case PI_PROFILING_INFO_COMMAND_START: { + PropName = UR_PROFILING_INFO_COMMAND_START; + break; + } + case PI_PROFILING_INFO_COMMAND_END: { + PropName = UR_PROFILING_INFO_COMMAND_END; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + + HANDLE_ERRORS(urEventGetProfilingInfo(UrEvent, PropName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + return 
PI_SUCCESS; +} + +inline pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_event_handle_t *UrEvent = reinterpret_cast(RetEvent); + // pass null for the hNativeHandle to use urEventCreateWithNativeHandle + // as urEventCreate + HANDLE_ERRORS(urEventCreateWithNativeHandle(nullptr, UrContext, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + bool OwnNativeHandle, + pi_event *Event) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t UrNativeKernel = + reinterpret_cast(NativeHandle); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_event_handle_t *UrEvent = reinterpret_cast(*Event); + HANDLE_ERRORS( + urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); + (*UrEvent)->OwnNativeHandle = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piEventSetCallback( + pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + std::ignore = Event; + std::ignore = CommandExecCallbackType; + std::ignore = PFnNotify; + std::ignore = UserData; + die("piEventSetCallback: deprecated, to be removed"); + return PI_SUCCESS; +} + +inline pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { + std::ignore = Event; + std::ignore = ExecutionStatus; + die("piEventSetStatus: deprecated, to be removed"); + return PI_SUCCESS; +} + +inline pi_result piEventRetain(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + HANDLE_ERRORS(urEventRetain(UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEventRelease(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + HANDLE_ERRORS(urEventRelease(UrEvent)); + + return PI_SUCCESS; +} + +// Events +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Sampler +inline pi_result piSamplerCreate(pi_context Context, + const pi_sampler_properties *SamplerProperties, + pi_sampler *RetSampler) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_sampler_property_t UrProps[6]{}; + UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; + UrProps[1] = SamplerProperties[1]; + + UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; + if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_NONE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + + UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; + if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_NEAREST) + UrProps[5] = 
UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + else if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_LINEAR) + UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + + ur_sampler_handle_t *UrSampler = + reinterpret_cast(RetSampler); + + HANDLE_ERRORS(urSamplerCreate(UrContext, UrProps, UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + std::ignore = Sampler; + std::ignore = ParamName; + std::ignore = ParamValueSize; + std::ignore = ParamValue; + std::ignore = ParamValueSizeRet; + + die("piSamplerGetInfo: not implemented"); + return PI_SUCCESS; +} + +// Special version of piKernelSetArg to accept pi_sampler. +inline pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_sampler *ArgValue) { + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_sampler_handle_t UrSampler = + reinterpret_cast(*ArgValue); + + HANDLE_ERRORS(urKernelSetArgSampler(UrKernel, ArgIndex, UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerRetain(pi_sampler Sampler) { + PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); + + ur_sampler_handle_t UrSampler = + reinterpret_cast(Sampler); + + HANDLE_ERRORS(urSamplerRetain(UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerRelease(pi_sampler Sampler) { + PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); + + ur_sampler_handle_t UrSampler = + reinterpret_cast(Sampler); + + HANDLE_ERRORS(urSamplerRelease(UrSampler)); + + return PI_SUCCESS; +} + +// Sampler +/////////////////////////////////////////////////////////////////////////////// + } // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index cb255fbd53229..ba1cb72e8518f 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -8,6 +8,7 @@ #include +// #include "ur/adapters/level_zero/ur_level_zero_common.hpp" #include #include @@ -18,10 +19,10 @@ static void DieUnsupported() { // All PI API interfaces are C interfaces extern "C" { -__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, - pi_platform *platforms, - pi_uint32 *num_platforms) { - return pi2ur::piPlatformsGet(num_entries, platforms, num_platforms); +__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 NumEntries, + pi_platform *Platforms, + pi_uint32 *NumPlatforms) { + return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); } __SYCL_EXPORT pi_result piPlatformGetInfo(pi_platform Platform, @@ -65,13 +66,903 @@ __SYCL_EXPORT pi_result piDevicePartition( } // Stub for the not yet supported API -__SYCL_EXPORT pi_result piextDeviceSelectBinary(pi_device, pi_device_binary *, - pi_uint32, pi_uint32 *) { - return PI_ERROR_INVALID_BINARY; +__SYCL_EXPORT pi_result piextDeviceSelectBinary(pi_device Device, + pi_device_binary *Binaries, + pi_uint32 NumBinaries, + pi_uint32 *SelectedBinaryInd) { + return pi2ur::piextDeviceSelectBinary(Device, Binaries, NumBinaries, + SelectedBinaryInd); +} + +__SYCL_EXPORT pi_result +piContextCreate(const pi_context_properties *Properties, pi_uint32 NumDevices, + const pi_device *Devices, + void (*PFnNotify)(const char *ErrInfo, const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, pi_context *RetContext) { + return pi2ur::piContextCreate(Properties, NumDevices, Devices, PFnNotify, + UserData, RetContext); +} + +__SYCL_EXPORT pi_result 
piContextRelease(pi_context Context) { + return pi2ur::piContextRelease(Context); +} + +__SYCL_EXPORT pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, + pi_queue *Queue) { + return pi2ur::piQueueCreate(Context, Device, Flags, Queue); +} + +__SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +__SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { + return pi2ur::piQueueRelease(Queue); +} + +__SYCL_EXPORT pi_result piProgramCreate(pi_context Context, const void *ILBytes, + size_t Length, pi_program *Program) { + return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); +} + +__SYCL_EXPORT pi_result piProgramBuild( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData) { + return pi2ur::piProgramBuild(Program, NumDevices, DeviceList, Options, + PFnNotify, UserData); +} + +__SYCL_EXPORT pi_result piextProgramSetSpecializationConstant( + pi_program Prog, pi_uint32 SpecID, size_t Size, const void *SpecValue) { + return pi2ur::piextProgramSetSpecializationConstant(Prog, SpecID, Size, + SpecValue); +} + +__SYCL_EXPORT pi_result +piProgramLink(pi_context Context, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + pi_uint32 NumInputPrograms, const pi_program *InputPrograms, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData, pi_program *RetProgram) { + return pi2ur::piProgramLink(Context, NumDevices, DeviceList, Options, + NumInputPrograms, InputPrograms, PFnNotify, + UserData, RetProgram); +} + +__SYCL_EXPORT pi_result piKernelCreate(pi_program Program, + const char *KernelName, + pi_kernel *RetKernel) { + return pi2ur::piKernelCreate(Program, KernelName, RetKernel); +} + +// Special version of piKernelSetArg to accept pi_mem. 
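+// For illustration only (Kernel and Buffer below are hypothetical handles, not
+// part of this entry-point table): a caller binding a buffer as kernel
+// argument 0 would invoke it roughly as
+//   pi_result Res = piextKernelSetArgMemObj(Kernel, /*ArgIndex=*/0, &Buffer);
+// where Buffer is a pi_mem created earlier with piMemBufferCreate.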
+__SYCL_EXPORT pi_result piextKernelSetArgMemObj(pi_kernel Kernel, + pi_uint32 ArgIndex, + const pi_mem *ArgValue) { + + return pi2ur::piextKernelSetArgMemObj(Kernel, ArgIndex, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, const void *ArgValue) { + + return pi2ur::piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, + pi_kernel_group_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piKernelGetGroupInfo(Kernel, Device, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piMemBufferCreate(pi_context Context, + pi_mem_flags Flags, size_t Size, + void *HostPtr, pi_mem *RetMem, + const pi_mem_properties *properties) { + + return pi2ur::piMemBufferCreate(Context, Flags, Size, HostPtr, RetMem, + properties); +} + +__SYCL_EXPORT pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMHostAlloc(ResultPtr, Context, Properties, Size, + Alignment); +} + +__SYCL_EXPORT pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piMemGetInfo(Mem, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, + void *HostPtr, pi_mem *RetImage) { + + return pi2ur::piMemImageCreate(Context, Flags, ImageFormat, ImageDesc, + HostPtr, RetImage); +} + +__SYCL_EXPORT pi_result piMemBufferPartition( + pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, + void *BufferCreateInfo, pi_mem *RetMem) { + return pi2ur::piMemBufferPartition(Buffer, Flags, BufferCreateType, + BufferCreateInfo, RetMem); +} + +__SYCL_EXPORT pi_result +piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +} + +__SYCL_EXPORT pi_result +piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + return pi2ur::piEnqueueMemImageCopy(Queue, SrcImage, DstImage, SrcOrigin, + DstOrigin, Region, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextMemCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool ownNativeHandle, + pi_mem *Mem) { + return pi2ur::piextMemCreateWithNativeHandle(NativeHandle, Context, + ownNativeHandle, Mem); +} + +__SYCL_EXPORT pi_result piEnqueueKernelLaunch( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent) { + + return pi2ur::piEnqueueKernelLaunch( + Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumEventsInWaitList, EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageWrite( + pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, pi_image_offset Origin, + pi_image_region Region, size_t InputRowPitch, size_t InputSlicePitch, + const void *Ptr, pi_uint32 NumEventsInWaitList, + 
const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemImageWrite( + Queue, Image, BlockingWrite, Origin, Region, InputRowPitch, + InputSlicePitch, Ptr, NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageRead( + pi_queue Queue, pi_mem Image, pi_bool BlockingRead, pi_image_offset Origin, + pi_image_region Region, size_t RowPitch, size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueMemImageRead( + Queue, Image, BlockingRead, Origin, Region, RowPitch, SlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextKernelCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, pi_program Program, + bool OwnNativeHandle, pi_kernel *Kernel) { + + return pi2ur::piextKernelCreateWithNativeHandle( + NativeHandle, Context, Program, OwnNativeHandle, Kernel); +} + +__SYCL_EXPORT pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, + void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piEnqueueMemUnmap(Queue, Mem, MappedPtr, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEventsWait(pi_uint32 NumEvents, + const pi_event *EventList) { + + return pi2ur::piEventsWait(NumEvents, EventList); +} + +__SYCL_EXPORT pi_result piQueueFinish(pi_queue Queue) { + return pi2ur::piQueueFinish(Queue); +} + +__SYCL_EXPORT pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piEventGetInfo(Event, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferMap( + pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent, void **RetMap) { + + return pi2ur::piEnqueueMemBufferMap(Queue, Mem, BlockingMap, MapFlags, Offset, + Size, NumEventsInWaitList, EventWaitList, + OutEvent, RetMap); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferFill( + pi_queue Queue, pi_mem Buffer, const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + return pi2ur::piEnqueueMemBufferFill(Queue, Buffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextUSMDeviceAlloc(void **ResultPtr, + pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMDeviceAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); +} + +__SYCL_EXPORT pi_result piKernelRetain(pi_kernel Kernel) { + return pi2ur::piKernelRetain(Kernel); +} + +__SYCL_EXPORT pi_result piKernelRelease(pi_kernel Kernel) { + + return pi2ur::piKernelRelease(Kernel); +} + +__SYCL_EXPORT pi_result piProgramRelease(pi_program Program) { + return pi2ur::piProgramRelease(Program); +} + +__SYCL_EXPORT pi_result piextUSMSharedAlloc(void **ResultPtr, + pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMSharedAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); +} + +__SYCL_EXPORT pi_result piextUSMFree(pi_context Context, void *Ptr) { + return 
pi2ur::piextUSMFree(Context, Ptr); +} + +__SYCL_EXPORT pi_result piContextRetain(pi_context Context) { + return pi2ur::piContextRetain(Context); +} + +__SYCL_EXPORT pi_result piextKernelSetArgPointer(pi_kernel Kernel, + pi_uint32 ArgIndex, + size_t ArgSize, + const void *ArgValue) { + return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); +} + +// Special version of piKernelSetArg to accept pi_sampler. +__SYCL_EXPORT pi_result piextKernelSetArgSampler(pi_kernel Kernel, + pi_uint32 ArgIndex, + const pi_sampler *ArgValue) { + + return pi2ur::piextKernelSetArgSampler(Kernel, ArgIndex, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelGetSubGroupInfo( + pi_kernel Kernel, pi_device Device, pi_kernel_sub_group_info ParamName, + size_t InputValueSize, const void *InputValue, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + + return pi2ur::piKernelGetSubGroupInfo( + Kernel, Device, ParamName, InputValueSize, InputValue, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piQueueGetInfo(Queue, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +/// USM Memset API +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the ptr to memset +/// @param Value is value to set. It is interpreted as an 8-bit value and the +/// upper +/// 24 bits are ignored +/// @param Count is the size in bytes to memset +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +__SYCL_EXPORT pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, + pi_int32 Value, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + return pi2ur::piextUSMEnqueueMemset( + Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferCopyRect( + Queue, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, + pi_mem DstMem, size_t SrcOffset, + size_t DstOffset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueMemBufferCopy(Queue, SrcMem, DstMem, SrcOffset, + DstOffset, Size, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, + void *DstPtr, const void *SrcPtr, + size_t Size, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueMemcpy(Queue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitlist, EventsWaitlist, + Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + 
pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + + return pi2ur::piEnqueueMemBufferWriteRect( + Queue, Buffer, BlockingWrite, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferWrite( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferWrite(Queue, Buffer, BlockingWrite, Offset, + Size, Ptr, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + + return pi2ur::piEnqueueMemBufferReadRect( + Queue, Buffer, BlockingRead, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferRead( + pi_queue Queue, pi_mem Src, pi_bool BlockingRead, size_t Offset, + size_t Size, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferRead(Queue, Src, BlockingRead, Offset, Size, + Dst, NumEventsInWaitList, EventWaitList, + Event); +} + +__SYCL_EXPORT pi_result piEnqueueEventsWaitWithBarrier( + pi_queue Queue, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent) { + + return pi2ur::piEnqueueEventsWaitWithBarrier(Queue, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piEnqueueEventsWait(Queue, NumEventsInWaitList, EventWaitList, + OutEvent); +} + +__SYCL_EXPORT pi_result +piextEventGetNativeHandle(pi_event Event, pi_native_handle *NativeHandle) { + + return pi2ur::piextEventGetNativeHandle(Event, NativeHandle); +} + +__SYCL_EXPORT pi_result piEventGetProfilingInfo(pi_event Event, + pi_profiling_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piEventGetProfilingInfo(Event, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piProgramRetain(pi_program Program) { + return pi2ur::piProgramRetain(Program); +} + +__SYCL_EXPORT pi_result piKernelSetExecInfo(pi_kernel Kernel, + pi_kernel_exec_info ParamName, + size_t ParamValueSize, + const void *ParamValue) { + + return pi2ur::piKernelSetExecInfo(Kernel, ParamName, ParamValueSize, + ParamValue); +} + +__SYCL_EXPORT pi_result piKernelGetInfo(pi_kernel Kernel, + pi_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piKernelGetInfo(Kernel, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piQueueRetain(pi_queue Queue) { + return pi2ur::piQueueRetain(Queue); +} + +__SYCL_EXPORT 
pi_result piQueueFlush(pi_queue Queue) { + return pi2ur::piQueueFlush(Queue); +} + +__SYCL_EXPORT pi_result piMemRetain(pi_mem Mem) { + return pi2ur::piMemRetain(Mem); +} + +__SYCL_EXPORT pi_result piProgramCreateWithBinary( + pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, + size_t NumMetadataEntries, const pi_device_binary_property *Metadata, + pi_int32 *BinaryStatus, pi_program *Program) { + + return pi2ur::piProgramCreateWithBinary(Context, NumDevices, DeviceList, + Lengths, Binaries, NumMetadataEntries, + Metadata, BinaryStatus, Program); +} + +__SYCL_EXPORT pi_result piclProgramCreateWithSource(pi_context Context, + pi_uint32 Count, + const char **Strings, + const size_t *Lengths, + pi_program *RetProgram) { + return pi2ur::piclProgramCreateWithSource(Context, Count, Strings, Lengths, + RetProgram); +} + +__SYCL_EXPORT pi_result piProgramGetInfo(pi_program Program, + pi_program_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piProgramGetInfo(Program, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piProgramCompile( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, pi_uint32 NumInputHeaders, + const pi_program *InputHeaders, const char **HeaderIncludeNames, + void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { + + return pi2ur::piProgramCompile(Program, NumDevices, DeviceList, Options, + NumInputHeaders, InputHeaders, + HeaderIncludeNames, PFnNotify, UserData); +} + +__SYCL_EXPORT pi_result piProgramGetBuildInfo( + pi_program Program, pi_device Device, pi_program_build_info ParamName, + size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { + + return pi2ur::piProgramGetBuildInfo(Program, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { + + return pi2ur::piEventCreate(Context, RetEvent); +} + +__SYCL_EXPORT pi_result piEventSetCallback( + pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + return pi2ur::piEventSetCallback(Event, CommandExecCallbackType, PFnNotify, + UserData); +} + +__SYCL_EXPORT pi_result piEventSetStatus(pi_event Event, + pi_int32 ExecutionStatus) { + return pi2ur::piEventSetStatus(Event, ExecutionStatus); +} + +__SYCL_EXPORT pi_result piEventRetain(pi_event Event) { + return pi2ur::piEventRetain(Event); +} + +__SYCL_EXPORT pi_result piEventRelease(pi_event Event) { + return pi2ur::piEventRelease(Event); +} + +__SYCL_EXPORT pi_result piextEventCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + pi_event *Event) { + return pi2ur::piextEventCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageFill( + pi_queue Queue, pi_mem Image, const void *FillColor, const size_t *Origin, + const size_t *Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemImageFill(Queue, Image, FillColor, Origin, Region, + NumEventsInWaitList, EventWaitList, + Event); +} + +__SYCL_EXPORT pi_result piextPlatformGetNativeHandle( + pi_platform Platform, pi_native_handle *NativeHandle) { + + return pi2ur::piextPlatformGetNativeHandle(Platform, 
NativeHandle); +} + +__SYCL_EXPORT pi_result piextPlatformCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_platform *Platform) { + + return pi2ur::piextPlatformCreateWithNativeHandle(NativeHandle, Platform); +} + +__SYCL_EXPORT pi_result +piextDeviceGetNativeHandle(pi_device Device, pi_native_handle *NativeHandle) { + + return pi2ur::piextDeviceGetNativeHandle(Device, NativeHandle); +} + +__SYCL_EXPORT pi_result piextDeviceCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_platform Platform, pi_device *Device) { + + return pi2ur::piextDeviceCreateWithNativeHandle(NativeHandle, Platform, + Device); +} + +// FIXME: Dummy implementation to prevent link fail +__SYCL_EXPORT pi_result piextContextSetExtendedDeleter( + pi_context Context, pi_context_extended_deleter Function, void *UserData) { + return pi2ur::piextContextSetExtendedDeleter(Context, Function, UserData); +} + +__SYCL_EXPORT pi_result piextContextGetNativeHandle( + pi_context Context, pi_native_handle *NativeHandle) { + + return pi2ur::piextContextGetNativeHandle(Context, NativeHandle); +} + +__SYCL_EXPORT pi_result piextContextCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_uint32 NumDevices, + const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { + return pi2ur::piextContextCreateWithNativeHandle( + NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); +} + +__SYCL_EXPORT pi_result +piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, pi_device Device, + bool OwnNativeHandle, pi_queue *Queue) { + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + +__SYCL_EXPORT pi_result piMemRelease(pi_mem Mem) { + return pi2ur::piMemRelease(Mem); +} + +__SYCL_EXPORT pi_result piEnqueueNativeKernel( + pi_queue Queue, void (*UserFunc)(void *), void *Args, size_t CbArgs, + pi_uint32 NumMemObjects, const pi_mem *MemList, const void **ArgsMemLoc, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueNativeKernel( + Queue, UserFunc, Args, CbArgs, NumMemObjects, MemList, ArgsMemLoc, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( + pi_device Device, pi_program Program, const char *FunctionName, + pi_uint64 *FunctionPointerRet) { + + return pi2ur::piextGetDeviceFunctionPointer(Device, Program, FunctionName, + FunctionPointerRet); +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +__SYCL_EXPORT pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, + size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piextUSMEnqueuePrefetch( + Queue, Ptr, Size, Flags, NumEventsInWaitList, EventWaitList, OutEvent); +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the 
queue to submit to
+/// @param Ptr is the data to be advised
+/// @param Length is the size in bytes of the memory to advise
+/// @param Advice is device specific advice
+/// @param Event is the event that represents this operation
+///
+__SYCL_EXPORT pi_result piextUSMEnqueueMemAdvise(pi_queue Queue,
+                                                 const void *Ptr, size_t Length,
+                                                 pi_mem_advice Advice,
+                                                 pi_event *OutEvent) {
+
+  return pi2ur::piextUSMEnqueueMemAdvise(Queue, Ptr, Length, Advice, OutEvent);
+}
+
+/// USM 2D Fill API
+///
+/// \param queue is the queue to submit to
+/// \param ptr is the ptr to fill
+/// \param pitch is the total width of the destination memory including padding
+/// \param pattern is a pointer with the bytes of the pattern to set
+/// \param pattern_size is the size in bytes of the pattern
+/// \param width is the width in bytes of each row to fill
+/// \param height is the height in rows of the region to fill
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr,
+                                              size_t Pitch, size_t PatternSize,
+                                              const void *Pattern, size_t Width,
+                                              size_t Height,
+                                              pi_uint32 NumEventsWaitList,
+                                              const pi_event *EventsWaitList,
+                                              pi_event *Event) {
+
+  return pi2ur::piextUSMEnqueueFill2D(Queue, Ptr, Pitch, PatternSize, Pattern,
+                                      Width, Height, NumEventsWaitList,
+                                      EventsWaitList, Event);
+}
+
+/// USM 2D Memset API
+///
+/// \param queue is the queue to submit to
+/// \param ptr is the ptr to fill
+/// \param pitch is the total width of the destination memory including padding
+/// \param value is the value to set; it is interpreted as an 8-bit value and
+/// the upper 24 bits are ignored
+/// \param width is the width in bytes of each row to fill
+/// \param height is the height in rows of the region to fill
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr,
+                                                size_t Pitch, int Value,
+                                                size_t Width, size_t Height,
+                                                pi_uint32 NumEventsWaitList,
+                                                const pi_event *EventsWaitlist,
+                                                pi_event *Event) {
+  return pi2ur::piextUSMEnqueueMemset2D(Queue, Ptr, Pitch, Value, Width, Height,
+                                        NumEventsWaitList, EventsWaitlist,
+                                        Event);
+}
+
+/// API to query information about USM allocated pointers.
+/// Valid Queries:
+/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_usm_type value
+/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if
+/// the queried pointer fell inside an allocation.
+/// Result must fit in void *
+/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's
+/// allocation is in bytes. Result is a size_t.
+/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against
+///
+/// @param Context is the pi_context
+/// @param Ptr is the pointer to query
+/// @param ParamName is the type of query to perform
+/// @param ParamValueSize is the size of the result in bytes
+/// @param ParamValueSizeRet is how many bytes were written
+/// @param ParamValue is the result
+__SYCL_EXPORT pi_result piextUSMGetMemAllocInfo(
+    pi_context Context, const void *Ptr, pi_mem_alloc_info ParamName,
+    size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) {
+  return pi2ur::piextUSMGetMemAllocInfo(Context, Ptr, ParamName, ParamValueSize,
+                                        ParamValue, ParamValueSizeRet);
+}
+
+__SYCL_EXPORT pi_result piextPluginGetOpaqueData(void *opaque_data_param,
+                                                 void **opaque_data_return) {
+  return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return);
+}
+
+__SYCL_EXPORT pi_result piextProgramGetNativeHandle(
+    pi_program Program, pi_native_handle *NativeHandle) {
+
+  return pi2ur::piextProgramGetNativeHandle(Program, NativeHandle);
+}
+
+__SYCL_EXPORT pi_result piextProgramCreateWithNativeHandle(
+    pi_native_handle NativeHandle, // missing
+    pi_context Context, bool ownNativeHandle, pi_program *Program) {
+  return pi2ur::piextProgramCreateWithNativeHandle(NativeHandle, Context,
+                                                   ownNativeHandle, Program);
+}
+
+__SYCL_EXPORT pi_result piSamplerCreate(
+    pi_context Context, const pi_sampler_properties *SamplerProperties,
+    pi_sampler *RetSampler) {
+  return pi2ur::piSamplerCreate(Context, SamplerProperties, RetSampler);
+}
+
+__SYCL_EXPORT pi_result piSamplerGetInfo(pi_sampler Sampler,
+                                         pi_sampler_info ParamName,
+                                         size_t ParamValueSize,
+                                         void *ParamValue,
+                                         size_t *ParamValueSizeRet) {
+  return pi2ur::piSamplerGetInfo(Sampler, ParamName, ParamValueSize, ParamValue,
+                                 ParamValueSizeRet);
+}
+
+__SYCL_EXPORT pi_result piSamplerRetain(pi_sampler Sampler) {
+  return pi2ur::piSamplerRetain(Sampler);
+}
+
+__SYCL_EXPORT pi_result piSamplerRelease(pi_sampler Sampler) {
+  return pi2ur::piSamplerRelease(Sampler);
+}
+
+__SYCL_EXPORT pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName,
+                                          size_t ParamValueSize,
+                                          void *ParamValue,
+                                          size_t *ParamValueSizeRet) {
+  return pi2ur::piMemImageGetInfo(Image, ParamName, ParamValueSize, ParamValue,
+                                  ParamValueSizeRet);
+}
+
+/// USM 2D Memcpy API
+///
+/// \param queue is the queue to submit to
+/// \param blocking is whether this operation should block the host
+/// \param dst_ptr is the location the data will be copied to
+/// \param dst_pitch is the total width of the destination memory including
+/// padding
+/// \param src_ptr is the data to be copied
+/// \param src_pitch is the total width of the source memory including padding
+/// \param width is the width in bytes of each row to be copied
+/// \param height is the height in rows to be copied
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueMemcpy2D(
+    pi_queue Queue, pi_bool Blocking, void *DstPtr, size_t DstPitch,
+    const void *SrcPtr, size_t SrcPitch, size_t Width, size_t Height,
+    pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList,
+    pi_event *Event) {
+
+  return pi2ur::piextUSMEnqueueMemcpy2D(
+      Queue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height,
+      NumEventsInWaitList, EventsWaitList, Event);
+}
+
+/// API for writing data from host to a device global variable.
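+/// For illustration only (Queue, Prog and HostVal below are hypothetical
+/// handles/values, not defined by this plugin): writing a 4-byte integer into
+/// a device global named "DeviceGlobalVar" might look roughly like
+///   pi_event Done = nullptr;
+///   int HostVal = 42;
+///   pi_result Res = piextEnqueueDeviceGlobalVariableWrite(
+///       Queue, Prog, "DeviceGlobalVar", /*BlockingWrite=*/PI_TRUE,
+///       sizeof(HostVal), /*Offset=*/0, &HostVal,
+///       /*NumEventsInWaitList=*/0, nullptr, &Done);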
+/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingWrite is true if the write should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Src is a pointer to where the data must be copied from +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +pi_result piextEnqueueDeviceGlobalVariableWrite( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite, + size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + return pi2ur::piextEnqueueDeviceGlobalVariableWrite( + Queue, Program, Name, BlockingWrite, Count, Offset, Src, + NumEventsInWaitList, EventsWaitList, Event); +} + +/// API reading data from a device global variable to host. +/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingRead is true if the read should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Dst is a pointer to where the data must be copied to +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +pi_result piextEnqueueDeviceGlobalVariableRead( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead, + size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + + return pi2ur::piextEnqueueDeviceGlobalVariableRead( + Queue, Program, Name, BlockingRead, Count, Offset, Dst, + NumEventsInWaitList, EventsWaitList, Event); +} + +__SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, + uint64_t *DeviceTime, + uint64_t *HostTime) { + return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } // This interface is not in Unified Runtime currently -__SYCL_EXPORT pi_result piTearDown(void *) { return PI_SUCCESS; } +__SYCL_EXPORT pi_result piTearDown(void *PluginParameter) { + return pi2ur::piTearDown(PluginParameter); +} // This interface is not in Unified Runtime currently __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { @@ -107,6 +998,116 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piDeviceGetInfo) _PI_API(piDevicePartition) _PI_API(piextDeviceSelectBinary) + _PI_API(piGetDeviceAndHostTimer) + _PI_API(piextPlatformGetNativeHandle) + _PI_API(piextPlatformCreateWithNativeHandle) + _PI_API(piextDeviceGetNativeHandle) + _PI_API(piextDeviceCreateWithNativeHandle) + + _PI_API(piContextCreate) + _PI_API(piContextRelease) + _PI_API(piContextRetain) + _PI_API(piextContextSetExtendedDeleter) + _PI_API(piextContextGetNativeHandle) + _PI_API(piextContextCreateWithNativeHandle) + + _PI_API(piQueueCreate) + _PI_API(piQueueRelease) + _PI_API(piextQueueCreate) + _PI_API(piQueueFinish) + _PI_API(piQueueGetInfo) + _PI_API(piQueueRetain) + _PI_API(piQueueFlush) + _PI_API(piextQueueGetNativeHandle) + _PI_API(piextQueueCreateWithNativeHandle) + + _PI_API(piProgramCreate) + _PI_API(piProgramBuild) + 
_PI_API(piextProgramGetNativeHandle) + _PI_API(piextProgramCreateWithNativeHandle) + _PI_API(piextProgramSetSpecializationConstant) + _PI_API(piProgramLink) + _PI_API(piKernelCreate) + _PI_API(piextKernelSetArgMemObj) + _PI_API(piextKernelCreateWithNativeHandle) + _PI_API(piProgramRetain) + _PI_API(piKernelSetExecInfo) + _PI_API(piKernelGetInfo) + _PI_API(piKernelSetArg) + _PI_API(piKernelGetGroupInfo) + _PI_API(piKernelRetain) + _PI_API(piKernelRelease) + _PI_API(piProgramRelease) + _PI_API(piextKernelSetArgPointer) + _PI_API(piextKernelSetArgSampler) + _PI_API(piKernelGetSubGroupInfo) + _PI_API(piProgramCreateWithBinary) + _PI_API(piclProgramCreateWithSource) + _PI_API(piProgramGetInfo) + _PI_API(piProgramCompile) + _PI_API(piProgramGetBuildInfo) + _PI_API(piextGetDeviceFunctionPointer) + + _PI_API(piMemBufferCreate) + _PI_API(piMemGetInfo) + _PI_API(piMemBufferPartition) + _PI_API(piEnqueueMemImageCopy) + _PI_API(piextMemGetNativeHandle) + _PI_API(piextMemCreateWithNativeHandle) + _PI_API(piMemRetain) + _PI_API(piextUSMGetMemAllocInfo) + _PI_API(piextUSMEnqueuePrefetch) + _PI_API(piextUSMEnqueueFill2D) + _PI_API(piextUSMEnqueueMemset2D) + _PI_API(piextUSMEnqueueMemAdvise) + _PI_API(piMemRelease) + _PI_API(piMemImageCreate) + _PI_API(piMemImageGetInfo) + _PI_API(piextUSMEnqueueMemcpy2D) + _PI_API(piextEnqueueDeviceGlobalVariableWrite) + _PI_API(piextEnqueueDeviceGlobalVariableRead) + + _PI_API(piextUSMHostAlloc) + _PI_API(piextUSMDeviceAlloc) + _PI_API(piextUSMSharedAlloc) + _PI_API(piextUSMFree) + + _PI_API(piEnqueueKernelLaunch) + _PI_API(piEnqueueMemImageWrite) + _PI_API(piEnqueueMemImageRead) + _PI_API(piEnqueueMemBufferMap) + _PI_API(piEnqueueMemUnmap) + _PI_API(piEnqueueMemBufferFill) + _PI_API(piextUSMEnqueueMemset) + _PI_API(piEnqueueMemBufferCopyRect) + _PI_API(piEnqueueMemBufferCopy) + _PI_API(piextUSMEnqueueMemcpy) + _PI_API(piEnqueueMemBufferWriteRect) + _PI_API(piEnqueueMemBufferWrite) + _PI_API(piEnqueueMemBufferReadRect) + _PI_API(piEnqueueMemBufferRead) + _PI_API(piEnqueueEventsWaitWithBarrier) + _PI_API(piEnqueueEventsWait) + _PI_API(piEnqueueNativeKernel) + _PI_API(piEnqueueMemImageFill) + + _PI_API(piEventSetCallback) + _PI_API(piEventSetStatus) + _PI_API(piEventRetain) + _PI_API(piEventRelease) + _PI_API(piextEventCreateWithNativeHandle) + _PI_API(piEventsWait) + _PI_API(piEventGetInfo) + _PI_API(piextEventGetNativeHandle) + _PI_API(piEventGetProfilingInfo) + _PI_API(piEventCreate) + + _PI_API(piSamplerCreate) + _PI_API(piSamplerGetInfo) + _PI_API(piSamplerRetain) + _PI_API(piSamplerRelease) + + _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) return PI_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index ae7ae6375bea0..51fe4cf9c475b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -15,1572 +15,3 @@ // Define the static class field std::mutex ZeCall::GlobalLock; - -// Trace a call to Level-Zero RT -#define ZE_CALL(ZeName, ZeArgs) \ - { \ - ze_result_t ZeResult = ZeName ZeArgs; \ - if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \ - return ze2urResult(Result); \ - } - -ur_result_t _ur_platform_handle_t::initialize() { - // Cache driver properties - ZeStruct ZeDriverProperties; - ZE_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); - uint32_t DriverVersion = ZeDriverProperties.driverVersion; - // Intel 
Level-Zero GPU driver stores version as: - // | 31 - 24 | 23 - 16 | 15 - 0 | - // | Major | Minor | Build | - auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24); - auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16); - auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF); - ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild; - - ZE_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion)); - ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." + - std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); - - // Cache driver extension properties - uint32_t Count = 0; - ZE_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr)); - - std::vector ZeExtensions(Count); - - ZE_CALL(zeDriverGetExtensionProperties, - (ZeDriver, &Count, ZeExtensions.data())); - - for (auto &extension : ZeExtensions) { - // Check if global offset extension is available - if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, - strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) { - if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) { - ZeDriverGlobalOffsetExtensionFound = true; - } - } - // Check if extension is available for "static linking" (compiling multiple - // SPIR-V modules together into one Level Zero module). - if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME, - strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) { - if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) { - ZeDriverModuleProgramExtensionFound = true; - } - } - zeDriverExtensionMap[extension.name] = extension.version; - } - - // Check if import user ptr into USM feature has been requested. - // If yes, then set up L0 API pointers if the platform supports it. - ZeUSMImport.setZeUSMImport(this); - - return UR_RESULT_SUCCESS; -} - -ur_result_t urPlatformGet( - uint32_t NumEntries, ///< [in] the number of platforms to be added to - ///< phPlatforms. If phPlatforms is not NULL, then - ///< NumEntries should be greater than zero, otherwise - ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. - ur_platform_handle_t - *Platforms, ///< [out][optional][range(0, NumEntries)] array of handle - ///< of platforms. If NumEntries is less than the number of - ///< platforms available, then - ///< ::urPlatformGet shall only retrieve that number of - ///< platforms. - uint32_t *NumPlatforms ///< [out][optional] returns the total number of - ///< platforms available. -) { - static std::once_flag ZeCallCountInitialized; - try { - std::call_once(ZeCallCountInitialized, []() { - if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { - ZeCallCount = new std::map; - } - }); - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - // Setting these environment variables before running zeInit will enable the - // validation layer in the Level Zero loader. - if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - // Enable SYSMAN support for obtaining the PCI address - // and maximum memory bandwidth. - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - setEnvVar("ZES_ENABLE_SYSMAN", "1"); - } - - // TODO: We can still safely recover if something goes wrong during the init. - // Implement handling segfault using sigaction. - - // We must only initialize the driver once, even if piPlatformsGet() is called - // multiple times. Declaring the return value as "static" ensures it's only - // called once. 
- static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); - - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - PI_ASSERT(NumEntries != 0, UR_RESULT_ERROR_INVALID_VALUE); - if (NumPlatforms) - *NumPlatforms = 0; - return UR_RESULT_SUCCESS; - } - - if (ZeResult != ZE_RESULT_SUCCESS) { - urPrint("zeInit: Level Zero initialization failure\n"); - return ze2urResult(ZeResult); - } - - // Cache pi_platforms for reuse in the future - // It solves two problems; - // 1. sycl::platform equality issue; we always return the same pi_platform. - // 2. performance; we can save time by immediately return from cache. - // - - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - if (!PiPlatformCachePopulated) { - try { - // Level Zero does not have concept of Platforms, but Level Zero driver is - // the closest match. - uint32_t ZeDriverCount = 0; - ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); - if (ZeDriverCount == 0) { - PiPlatformCachePopulated = true; - } else { - std::vector ZeDrivers; - ZeDrivers.resize(ZeDriverCount); - - ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); - for (uint32_t I = 0; I < ZeDriverCount; ++I) { - auto Platform = new ur_platform_handle_t_(ZeDrivers[I]); - // Save a copy in the cache for future uses. - PiPlatformsCache->push_back(Platform); - - ur_result_t Result = Platform->initialize(); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - PiPlatformCachePopulated = true; - } - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - } - - // Populate returned platforms from the cache. - if (Platforms) { - PI_ASSERT(NumEntries <= PiPlatformsCache->size(), - UR_RESULT_ERROR_INVALID_PLATFORM); - std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); - } - - if (NumPlatforms) { - if (*NumPlatforms == 0) - *NumPlatforms = PiPlatformsCache->size(); - else - *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)NumEntries); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urPlatformGetInfo( - ur_platform_handle_t Platform, ///< [in] handle of the platform - ur_platform_info_t ParamName, ///< [in] type of the info to retrieve - size_t Size, ///< [in] the number of bytes pointed to by pPlatformInfo. - void *ParamValue, ///< [out][optional] array of bytes holding the info. - ///< If Size is not equal to or greater to the real number - ///< of bytes needed to return the info then the - ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and - ///< pPlatformInfo is not used. - size_t *pSizeRet ///< [out][optional] pointer to the actual number of bytes - ///< being queried by pPlatformInfo. -) { - - PI_ASSERT(Platform, UR_RESULT_ERROR_INVALID_PLATFORM); - UrReturnHelper ReturnValue(Size, ParamValue, pSizeRet); - - switch (ParamName) { - case UR_PLATFORM_INFO_NAME: - // TODO: Query Level Zero driver when relevant info is added there. - return ReturnValue("Intel(R) oneAPI Unified Runtime over Level-Zero"); - case UR_PLATFORM_INFO_VENDOR_NAME: - // TODO: Query Level Zero driver when relevant info is added there. - return ReturnValue("Intel(R) Corporation"); - case UR_PLATFORM_INFO_EXTENSIONS: - // Convention adopted from OpenCL: - // "Returns a space-separated list of extension names (the extension - // names themselves do not contain any spaces) supported by the platform. - // Extensions defined here must be supported by all devices associated - // with this platform." 
- // - // TODO: Check the common extensions supported by all connected devices and - // return them. For now, hardcoding some extensions we know are supported by - // all Level Zero devices. - return ReturnValue(ZE_SUPPORTED_EXTENSIONS); - case UR_PLATFORM_INFO_PROFILE: - // TODO: figure out what this means and how is this used - return ReturnValue("FULL_PROFILE"); - case UR_PLATFORM_INFO_VERSION: - // TODO: this should query to zeDriverGetDriverVersion - // but we don't yet have the driver handle here. - // - // From OpenCL 2.1: "This version string has the following format: - // OpenCL. Follow the same notation here. - // - return ReturnValue(Platform->ZeDriverApiVersion.c_str()); - case UR_PLATFORM_INFO_BACKEND: - return ReturnValue(UR_PLATFORM_BACKEND_LEVEL_ZERO); - default: - urPrint("piPlatformGetInfo: unrecognized ParamName\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceGet( - ur_platform_handle_t Platform, ///< [in] handle of the platform instance - ur_device_type_t DeviceType, ///< [in] the type of the devices. - uint32_t NumEntries, ///< [in] the number of devices to be added to - ///< phDevices. If phDevices in not NULL then - ///< NumEntries should be greater than zero, otherwise - ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. - ur_device_handle_t - *Devices, ///< [out][optional][range(0, NumEntries)] array of handle of - ///< devices. If NumEntries is less than the number of devices - ///< available, then platform shall only retrieve that number - ///< of devices. - uint32_t *NumDevices ///< [out][optional] pointer to the number of devices. - ///< pNumDevices will be updated with the total number - ///< of devices available. - -) { - - PI_ASSERT(Platform, UR_RESULT_ERROR_INVALID_PLATFORM); - - auto Res = Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - // Filter available devices based on input DeviceType. - std::vector MatchedDevices; - std::shared_lock Lock(Platform->PiDevicesCacheMutex); - for (auto &D : Platform->PiDevicesCache) { - // Only ever return root-devices from piDevicesGet, but the - // devices cache also keeps sub-devices. - if (D->isSubDevice()) - continue; - - bool Matched = false; - switch (DeviceType) { - case UR_DEVICE_TYPE_ALL: - Matched = true; - break; - case UR_DEVICE_TYPE_GPU: - case UR_DEVICE_TYPE_DEFAULT: - Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU); - break; - case UR_DEVICE_TYPE_CPU: - Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU); - break; - case UR_DEVICE_TYPE_FPGA: - Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA; - break; - case UR_DEVICE_TYPE_MCA: - Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA; - break; - default: - Matched = false; - urPrint("Unknown device type"); - break; - } - if (Matched) - MatchedDevices.push_back(D.get()); - } - - uint32_t ZeDeviceCount = MatchedDevices.size(); - - auto N = std::min(ZeDeviceCount, NumEntries); - if (Devices) - std::copy_n(MatchedDevices.begin(), N, Devices); - - if (NumDevices) { - if (*NumDevices == 0) - *NumDevices = ZeDeviceCount; - else - *NumDevices = N; - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceGetInfo( - ur_device_handle_t Device, ///< [in] handle of the device instance - ur_device_info_t ParamName, ///< [in] type of the info to retrieve - size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. - void *ParamValue, ///< [out][optional] array of bytes holding the info. 
- ///< If propSize is not equal to or greater than the real - ///< number of bytes needed to return the info then the - ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and - ///< pDeviceInfo is not used. - size_t *pSize ///< [out][optional] pointer to the actual size in bytes of - ///< the queried infoType. -) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, ParamValue, pSize); - - ze_device_handle_t ZeDevice = Device->ZeDevice; - - switch ((int)ParamName) { - case UR_DEVICE_INFO_TYPE: { - switch (Device->ZeDeviceProperties->type) { - case ZE_DEVICE_TYPE_GPU: - return ReturnValue(UR_DEVICE_TYPE_GPU); - case ZE_DEVICE_TYPE_CPU: - return ReturnValue(UR_DEVICE_TYPE_CPU); - case ZE_DEVICE_TYPE_FPGA: - return ReturnValue(UR_DEVICE_TYPE_FPGA); - default: - urPrint("This device type is not supported\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } - case UR_DEVICE_INFO_PARENT_DEVICE: - return ReturnValue(Device->RootDevice); - case UR_DEVICE_INFO_PLATFORM: - return ReturnValue(Device->Platform); - case UR_DEVICE_INFO_VENDOR_ID: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->vendorId}); - case UR_DEVICE_INFO_UUID: { - // Intel extension for device UUID. This returns the UUID as - // std::array. For details about this extension, - // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md. - const auto &UUID = Device->ZeDeviceProperties->uuid.id; - return ReturnValue(UUID, sizeof(UUID)); - } - case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); - case UR_DEVICE_INFO_EXTENSIONS: { - // Convention adopted from OpenCL: - // "Returns a space separated list of extension names (the extension - // names themselves do not contain any spaces) supported by the device." - // - // TODO: Use proper mechanism to get this information from Level Zero after - // it is added to Level Zero. - // Hardcoding the few we know are supported by the current hardware. - // - // - std::string SupportedExtensions; - - // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core - // feature in >OpenCL 2.1 - // cl_khr_subgroups - Extension adds support for implementation-controlled - // subgroups. - // cl_intel_subgroups - Extension adds subgroup features, defined by Intel. - // cl_intel_subgroups_short - Extension adds subgroup functions described in - // the cl_intel_subgroups extension to support 16-bit integer data types - // for performance. - // cl_intel_required_subgroup_size - Extension to allow programmers to - // optionally specify the required subgroup size for a kernel function. - // cl_khr_fp16 - Optional half floating-point support. - // cl_khr_fp64 - Support for double floating-point precision. - // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional - // extensions that implement atomic operations on 64-bit signed and - // unsigned integers to locations in __global and __local memory. - // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory - // objects. - // - // Hardcoding some extensions we know are supported by all Level Zero - // devices. 
- SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS); - if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16) - SupportedExtensions += ("cl_khr_fp16 "); - if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64) - SupportedExtensions += ("cl_khr_fp64 "); - if (Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS) - // int64AtomicsSupported indicates support for both. - SupportedExtensions += - ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics "); - if (Device->ZeDeviceImageProperties->maxImageDims3D > 0) - // Supports reading and writing of images. - SupportedExtensions += ("cl_khr_3d_image_writes "); - - // L0 does not tell us if bfloat16 is supported. - // For now, assume ATS and PVC support it. - // TODO: change the way we detect bfloat16 support. - if ((Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 || - (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0) - SupportedExtensions += ("cl_intel_bfloat16_conversions "); - - return ReturnValue(SupportedExtensions.c_str()); - } - case UR_DEVICE_INFO_NAME: - return ReturnValue(Device->ZeDeviceProperties->name); - // zeModuleCreate allows using root device module for sub-devices: - // > The application must only use the module for the device, or its - // > sub-devices, which was provided during creation. - case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: - return ReturnValue(uint32_t{0}); - case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(uint32_t{1}); - case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(uint32_t{1}); - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - uint32_t MaxComputeUnits = - Device->ZeDeviceProperties->numEUsPerSubslice * - Device->ZeDeviceProperties->numSubslicesPerSlice * - Device->ZeDeviceProperties->numSlices; - - bool RepresentsCSlice = - Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeIndex >= 0; - if (RepresentsCSlice) - MaxComputeUnits /= Device->RootDevice->SubDevices.size(); - - return ReturnValue(uint32_t{MaxComputeUnits}); - } - case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: - // Level Zero spec defines only three dimensions - return ReturnValue(uint32_t{3}); - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); - case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - struct { - size_t Arr[3]; - } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; - return ReturnValue(MaxGroupSize); - } - case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - struct { - size_t Arr[3]; - } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, - Device->ZeDeviceComputeProperties->maxGroupCountY, - Device->ZeDeviceComputeProperties->maxGroupCountZ}}; - return ReturnValue(MaxGroupCounts); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->coreClockRate}); - case UR_DEVICE_INFO_ADDRESS_BITS: { - // TODO: To confirm with spec. - return ReturnValue(uint32_t{64}); - } - case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - uint64_t GlobalMemSize = 0; - // Support to read physicalSize depends on kernel, - // so fallback into reading totalSize if physicalSize - // is not available. 
- for (const auto &ZeDeviceMemoryExtProperty : - Device->ZeDeviceMemoryProperties->second) { - GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize; - } - if (GlobalMemSize == 0) { - for (const auto &ZeDeviceMemoryProperty : - Device->ZeDeviceMemoryProperties->first) { - GlobalMemSize += ZeDeviceMemoryProperty.totalSize; - } - } - return ReturnValue(uint64_t{GlobalMemSize}); - } - case UR_DEVICE_INFO_LOCAL_MEM_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); - case UR_DEVICE_INFO_IMAGE_SUPPORTED: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); - case UR_DEVICE_INFO_AVAILABLE: - return ReturnValue(uint32_t{ZeDevice ? true : false}); - case UR_DEVICE_INFO_VENDOR: - // TODO: Level-Zero does not return vendor's name at the moment - // only the ID. - return ReturnValue("Intel(R) Corporation"); - case UR_DEVICE_INFO_DRIVER_VERSION: - return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); - case UR_DEVICE_INFO_VERSION: - return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); - case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - return ReturnValue((uint32_t)Device->SubDevices.size()); - } - case UR_DEVICE_INFO_REFERENCE_COUNT: - return ReturnValue(uint32_t{Device->RefCount.load()}); - case UR_DEVICE_INFO_PARTITION_PROPERTIES: { - // SYCL spec says: if this SYCL device cannot be partitioned into at least - // two sub devices then the returned vector must be empty. - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - uint32_t ZeSubDeviceCount = Device->SubDevices.size(); - if (ZeSubDeviceCount < 2) { - return ReturnValue((ur_device_partition_property_t)0); - } - bool PartitionedByCSlice = Device->SubDevices[0]->isCCS(); - - auto ReturnHelper = [&](auto... Partitions) { - struct { - ur_device_partition_property_t Arr[sizeof...(Partitions) + 1]; - } PartitionProperties = { - {Partitions..., ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - }; - - if (ExposeCSliceInAffinityPartitioning) { - if (PartitionedByCSlice) - return ReturnHelper(UR_DEVICE_PARTITION_BY_CSLICE, - UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - - else - return ReturnHelper(UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } else { - return ReturnHelper(PartitionedByCSlice - ? UR_DEVICE_PARTITION_BY_CSLICE - : UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } - break; - } - case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: - return ReturnValue(ur_device_affinity_domain_flag_t( - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA | - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE)); - case UR_DEVICE_INFO_PARTITION_TYPE: { - // For root-device there is no partitioning to report. 
- if (!Device->isSubDevice()) - return ReturnValue(ur_device_partition_property_t(0)); - - if (Device->isCCS()) { - struct { - ur_device_partition_property_t Arr[2]; - } PartitionProperties = { - {UR_DEVICE_PARTITION_BY_CSLICE, ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - } - - struct { - ur_device_partition_property_t Arr[3]; - } PartitionProperties = { - {UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - (ur_device_partition_property_t) - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE, - ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - } - - // Everything under here is not supported yet - - case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: - return ReturnValue(""); - case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(uint32_t{true}); - case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); - case UR_DEVICE_INFO_PROFILE: - return ReturnValue("FULL_PROFILE"); - case UR_DEVICE_INFO_BUILT_IN_KERNELS: - // TODO: To find out correct value - return ReturnValue(""); - case UR_DEVICE_INFO_QUEUE_PROPERTIES: - return ReturnValue( - ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | - UR_QUEUE_FLAG_PROFILING_ENABLE)); - case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: - return ReturnValue(ur_device_exec_capability_flag_t{ - UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); - case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(uint32_t{true}); - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC}); - case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); - case UR_DEVICE_INFO_LOCAL_MEM_TYPE: - return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); - case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: - return ReturnValue(uint32_t{64}); - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: - return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: - return ReturnValue( - // TODO[1.0]: how to query cache line-size? - uint32_t{1}); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceCacheProperties->cacheSize}); - case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize}); - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: - // SYCL/OpenCL spec is vague on what this means exactly, but seems to - // be for "alignment requirement (in bits) for sub-buffer offsets." - // An OpenCL implementation returns 8*128, but Level Zero can do just 8, - // meaning unaligned access for values of types larger than 8 bits. 
- return ReturnValue(uint32_t{8}); - case UR_DEVICE_INFO_MAX_SAMPLERS: - return ReturnValue(uint32_t{Device->ZeDeviceImageProperties->maxSamplers}); - case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxReadImageArgs}); - case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); - case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - uint64_t SingleFPValue = 0; - ze_device_fp_flags_t ZeSingleFPCapabilities = - Device->ZeDeviceModuleProperties->fp32flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - } - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { - SingleFPValue |= - UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{SingleFPValue}); - } - case UR_DEVICE_INFO_HALF_FP_CONFIG: { - uint64_t HalfFPValue = 0; - ze_device_fp_flags_t ZeHalfFPCapabilities = - Device->ZeDeviceModuleProperties->fp16flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - } - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{HalfFPValue}); - } - case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - uint64_t DoubleFPValue = 0; - ze_device_fp_flags_t ZeDoubleFPCapabilities = - Device->ZeDeviceModuleProperties->fp64flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - 
} - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { - DoubleFPValue |= - UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{DoubleFPValue}); - } - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); - case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceImageProperties->maxImageArraySlices}); - // Handle SIMD widths. - // TODO: can we do better than this? - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); - uint32_t MinSubGroupSize = - Device->ZeDeviceComputeProperties->subGroupSizes[0]; - for (uint32_t I = 1; - I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) { - if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I]) - MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I]; - } - return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize / - MinSubGroupSize); - } - case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // TODO: Not supported yet. Needs to be updated after support is added. - return ReturnValue(uint32_t{false}); - } - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the - // expected return is size_t datatype. size_t can be 8 bytes of data. - return ReturnValue.template operator()( - Device->ZeDeviceComputeProperties->subGroupSizes, - Device->ZeDeviceComputeProperties->numSubGroupSizes); - } - case UR_DEVICE_INFO_IL_VERSION: { - // Set to a space separated list of IL version strings of the form - // _.. 
- // "SPIR-V" is a required IL prefix when the cl_khr_il_program extension is
- // reported.
- uint32_t SpirvVersion =
- Device->ZeDeviceModuleProperties->spirvVersionSupported;
- uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion);
- uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion);
-
- char SpirvVersionString[50];
- int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor,
- SpirvVersionMinor);
- // The returned string contains only Len characters.
- std::string ILVersion(SpirvVersionString, Len);
- return ReturnValue(ILVersion.c_str());
- }
- case UR_DEVICE_INFO_USM_HOST_SUPPORT:
- case UR_DEVICE_INFO_USM_DEVICE_SUPPORT:
- case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
- case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
- case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
- auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) {
- uint64_t Capabilities = 0;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW)
- Capabilities |= UR_EXT_USM_CAPS_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC)
- Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT)
- Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC)
- Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS;
- return Capabilities;
- };
- auto &Props = Device->ZeDeviceMemoryAccessProperties;
- switch (ParamName) {
- case UR_DEVICE_INFO_USM_HOST_SUPPORT:
- return ReturnValue(MapCaps(Props->hostAllocCapabilities));
- case UR_DEVICE_INFO_USM_DEVICE_SUPPORT:
- return ReturnValue(MapCaps(Props->deviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities));
- default:
- die("piDeviceGetInfo: unexpected ParamName.");
- }
- }
-
- // intel extensions for GPU information
- case UR_DEVICE_INFO_DEVICE_ID:
- return ReturnValue(uint32_t{Device->ZeDeviceProperties->deviceId});
- case UR_DEVICE_INFO_PCI_ADDRESS: {
- if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
- urPrint("Set ZES_ENABLE_SYSMAN=1 to obtain PCI data.\n");
- return UR_RESULT_ERROR_INVALID_VALUE;
- }
- ZesStruct<zes_pci_properties_t> ZeDevicePciProperties;
- ZE_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties));
- constexpr size_t AddressBufferSize = 13;
- char AddressBuffer[AddressBufferSize];
- std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x",
- ZeDevicePciProperties.address.domain,
- ZeDevicePciProperties.address.bus,
- ZeDevicePciProperties.address.device,
- ZeDevicePciProperties.address.function);
- return ReturnValue(AddressBuffer);
- }
-
- case UR_EXT_DEVICE_INFO_FREE_MEMORY: {
- if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
- setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory",
- UR_RESULT_SUCCESS);
- return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR;
- }
- // Only report device memory which zeMemAllocDevice can allocate from.
- // Currently this is only the one enumerated with ordinal 0.
- uint64_t FreeMemory = 0;
- uint32_t MemCount = 0;
- ZE_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr));
- if (MemCount != 0) {
- std::vector<zes_mem_handle_t> ZesMemHandles(MemCount);
- ZE_CALL(zesDeviceEnumMemoryModules,
- (ZeDevice, &MemCount, ZesMemHandles.data()));
- for (auto &ZesMemHandle : ZesMemHandles) {
- ZesStruct<zes_mem_properties_t> ZesMemProperties;
- ZE_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties));
- // For the root-device report memory from all memory modules since that
- // is what is totally available in the default implicit scaling mode.
- // For sub-devices only report memory local to them.
- if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId ==
- ZesMemProperties.subdeviceId) {
-
- ZesStruct<zes_mem_state_t> ZesMemState;
- ZE_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState));
- FreeMemory += ZesMemState.free;
- }
- }
- }
- return ReturnValue(FreeMemory);
- }
- case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
- // If there are no memory modules, then return 0.
- if (Device->ZeDeviceMemoryProperties->first.empty())
- return ReturnValue(uint32_t{0});
-
- // If there are multiple memory modules on the device, then we have to
- // report the value of the slowest memory.
- auto Comp = [](const ze_device_memory_properties_t &A,
- const ze_device_memory_properties_t &B) -> bool {
- return A.maxClockRate < B.maxClockRate;
- };
- auto MinIt =
- std::min_element(Device->ZeDeviceMemoryProperties->first.begin(),
- Device->ZeDeviceMemoryProperties->first.end(), Comp);
- return ReturnValue(uint32_t{MinIt->maxClockRate});
- }
- case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: {
- // If there are no memory modules, then return 0.
- if (Device->ZeDeviceMemoryProperties->first.empty())
- return ReturnValue(uint32_t{0});
-
- // If there are multiple memory modules on the device, then we have to
- // report the value of the slowest memory.
- auto Comp = [](const ze_device_memory_properties_t &A,
- const ze_device_memory_properties_t &B) -> bool {
- return A.maxBusWidth < B.maxBusWidth;
- };
- auto MinIt =
- std::min_element(Device->ZeDeviceMemoryProperties->first.begin(),
- Device->ZeDeviceMemoryProperties->first.end(), Comp);
- return ReturnValue(uint32_t{MinIt->maxBusWidth});
- }
- case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: {
- if (Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
- .ZeIndex >= 0)
- // Sub-sub-device represents a particular compute index already.
- return ReturnValue(int32_t{1}); - - auto ZeDeviceNumIndices = - Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties.numQueues; - return ReturnValue(int32_t(ZeDeviceNumIndices)); - } break; - case UR_DEVICE_INFO_GPU_EU_COUNT: { - uint32_t count = Device->ZeDeviceProperties->numEUsPerSubslice * - Device->ZeDeviceProperties->numSubslicesPerSlice * - Device->ZeDeviceProperties->numSlices; - return ReturnValue(uint32_t{count}); - } - case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - return ReturnValue( - uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - case UR_EXT_DEVICE_INFO_GPU_SLICES: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); - case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - return ReturnValue( - uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); - case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); - case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); - case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: - // currently not supported in level zero runtime - return UR_RESULT_ERROR_INVALID_VALUE; - case UR_DEVICE_INFO_BFLOAT16: { - // bfloat16 math functions are not yet supported on Intel GPUs. - return ReturnValue(bool{false}); - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_scope_capability_flags_t result = - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; - - return ReturnValue(result); - } - case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_order_capability_flags_t result = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | - UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; - - return ReturnValue(result); - } - case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_scope_capability_flags_t result = - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; - - return ReturnValue(result); - } - - case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - ur_memory_order_capability_flags_t capabilities = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | - UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; - return ReturnValue(capabilities); - } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); - case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(pi_bool{false}); - - // TODO: Implement. 
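// A minimal usage sketch of the two-call getInfo protocol that the switch
// above serves: callers first ask only for the required size, then pass a
// buffer of that size. It assumes the urDeviceGetInfo(device, param, size,
// value, size_ret) entry point declared in ur_api.h; the helper name is
// illustrative only.
#include <string>
#include <vector>
#include <ur_api.h>

static std::string queryDeviceName(ur_device_handle_t Device) {
  size_t Size = 0;
  // First call: query the size needed for UR_DEVICE_INFO_NAME.
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_NAME, 0, nullptr, &Size) !=
      UR_RESULT_SUCCESS)
    return {};
  std::vector<char> Name(Size);
  // Second call: fill the caller-provided buffer.
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_NAME, Size, Name.data(),
                      nullptr) != UR_RESULT_SUCCESS)
    return {};
  return std::string(Name.data());
}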
- default: - urPrint("Unsupported ParamName in piGetDeviceInfo\n"); - urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - return UR_RESULT_SUCCESS; -} - -// UR_L0_USE_COPY_ENGINE can be set to an integer value, or -// a pair of integer values of the form "lower_index:upper_index". -// Here, the indices point to copy engines in a list of all available copy -// engines. -// This functions returns this pair of indices. -// If the user specifies only a single integer, a value of 0 indicates that -// the copy engines will not be used at all. A value of 1 indicates that all -// available copy engines can be used. -const std::pair -getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); - static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - // If the environment variable is not set, no copy engines are used when - // immediate commandlists are being used. For standard commandlists all are - // used. - if (!EnvVar) { - if (Device->useImmediateCommandLists()) - return std::pair(0, 0); // Only main copy engine will be used. - return std::pair(0, INT_MAX); // All copy engines will be used. - } - std::string CopyEngineRange = EnvVar; - // Environment variable can be a single integer or a pair of integers - // separated by ":" - auto pos = CopyEngineRange.find(":"); - if (pos == std::string::npos) { - bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0); - if (UseCopyEngine) - return std::pair(0, INT_MAX); // All copy engines can be used. - return std::pair(-1, -1); // No copy engines will be used. - } - int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos)); - int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); - if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || - (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { - urPrint("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " - "default set.\n"); - LowerCopyEngineIndex = 0; - UpperCopyEngineIndex = INT_MAX; - } - return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex); -} - -bool CopyEngineRequested(const ur_device_handle_t &Device) { - int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first; - int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second; - return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1)); -} - -// Whether immediate commandlists will be used for kernel launches and copies. -// The default is standard commandlists. Setting 1 or 2 specifies use of -// immediate commandlists. - -// Get value of immediate commandlists env var setting or -1 if unset. -_ur_device_handle_t::ImmCmdlistMode -_ur_device_handle_t::useImmediateCommandLists() { - // If immediate commandlist setting is not explicitly set, then use the device - // default. - static const int ImmediateCommandlistsSetting = [] { - char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); - char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); - const char *ImmediateCommandlistsSettingStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (!ImmediateCommandlistsSettingStr) - return -1; - return std::stoi(ImmediateCommandlistsSettingStr); - }(); - - if (ImmediateCommandlistsSetting == -1) - // Immediate command lists will be used by default only on Linux PVC. -#ifdef _WIN32 - return NotUsed; -#else - return isPVC() ? 
PerQueue : NotUsed; -#endif - - switch (ImmediateCommandlistsSetting) { - case 0: - return NotUsed; - case 1: - return PerQueue; - case 2: - return PerThreadPerQueue; - default: - return NotUsed; - } -} - -// Get value of device scope events env var setting or default setting -static const EventsScope DeviceEventsSetting = [] { - char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); - char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); - const char *DeviceEventsSettingStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (DeviceEventsSettingStr) { - // Override the default if user has explicitly chosen the events scope. - switch (std::stoi(DeviceEventsSettingStr)) { - case 0: - return AllHostVisible; - case 1: - return OnDemandHostVisibleProxy; - case 2: - return LastCommandInBatchHostVisible; - default: - // fallthrough to default setting - break; - } - } - // This is our default setting, which is expected to be the fastest - // with the modern GPU drivers. - return AllHostVisible; -}(); - -ur_result_t _ur_device_handle_t::initialize(int SubSubDeviceOrdinal, - int SubSubDeviceIndex) { - - // Maintain various device properties cache. - // Note that we just describe here how to compute the data. - // The real initialization is upon first access. - // - auto ZeDevice = this->ZeDevice; - ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceComputeProperties.Compute = - [ZeDevice](ze_device_compute_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceImageProperties.Compute = - [ZeDevice](ze_device_image_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceModuleProperties.Compute = - [ZeDevice](ze_device_module_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceMemoryProperties.Compute = - [ZeDevice]( - std::pair>, - std::vector>> - &Properties) { - uint32_t Count = 0; - ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, - (ZeDevice, &Count, nullptr)); - - auto &PropertiesVector = Properties.first; - auto &PropertiesExtVector = Properties.second; - - PropertiesVector.resize(Count); - PropertiesExtVector.resize(Count); - // Request for extended memory properties be read in - for (uint32_t I = 0; I < Count; ++I) - PropertiesVector[I].pNext = (void *)&PropertiesExtVector[I]; - - ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, - (ZeDevice, &Count, PropertiesVector.data())); - }; - - ZeDeviceMemoryAccessProperties.Compute = - [ZeDevice](ze_device_memory_access_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties, - (ZeDevice, &Properties)); - }; - - ZeDeviceCacheProperties.Compute = - [ZeDevice](ze_device_cache_properties_t &Properties) { - // TODO: Since v1.0 there can be multiple cache properties. - // For now remember the first one, if any. 
- uint32_t Count = 0; - ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, - (ZeDevice, &Count, nullptr)); - if (Count > 0) - Count = 1; - ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, - (ZeDevice, &Count, &Properties)); - }; - - ImmCommandListUsed = this->useImmediateCommandLists(); - - if (ImmCommandListUsed == ImmCmdlistMode::NotUsed) { - ZeEventsScope = DeviceEventsSetting; - } - - uint32_t numQueueGroups = 0; - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); - std::vector> - QueueGroupProperties(numQueueGroups); - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); - - // Initialize ordinal and compute queue group properties - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - i; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties = QueueGroupProperties[i]; - break; - } - } - - // Reinitialize a sub-sub-device with its own ordinal, index. - // Our sub-sub-device representation is currently [Level-Zero sub-device - // handle + Level-Zero compute group/engine index]. Only the specified - // index queue will be used to submit work to the sub-sub-device. - if (SubSubDeviceOrdinal >= 0) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - SubSubDeviceOrdinal; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = - SubSubDeviceIndex; - } else { // Proceed with initialization for root and sub-device - // How is it possible that there are no "compute" capabilities? - if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < - 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - - if (CopyEngineRequested((ur_device_handle_t)this)) { - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (((QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && - (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { - if (QueueGroupProperties[i].numQueues == 1) { - QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::MainCopy].ZeProperties = - QueueGroupProperties[i]; - } else { - QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = - QueueGroupProperties[i]; - break; - } - } - } - if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) - urPrint("NOTE: main blitter/copy engine is not available\n"); - else - urPrint("NOTE: main blitter/copy engine is available\n"); - - if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) - urPrint("NOTE: link blitter/copy engines are not available\n"); - else - urPrint("NOTE: link blitter/copy engines are available\n"); - } - } - - return UR_RESULT_SUCCESS; -} - -// Get the cached PI device created for the L0 device handle. -// Return NULL if no such PI device found. 
-ur_device_handle_t -_ur_platform_handle_t::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { - - ur_result_t Res = populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return nullptr; - } - - // TODO: our sub-sub-device representation is currently [Level-Zero device - // handle + Level-Zero compute group/engine index], so there is now no 1:1 - // mapping from L0 device handle to PI device assumed in this function. Until - // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we - // filter out PI sub-sub-devices. - std::shared_lock Lock(PiDevicesCacheMutex); - auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(), - [&](std::unique_ptr &D) { - return D.get()->ZeDevice == ZeDevice && - (D.get()->RootDevice == nullptr || - D.get()->RootDevice->RootDevice == nullptr); - }); - if (it != PiDevicesCache.end()) { - return (*it).get(); - } - return nullptr; -} - -// Check the device cache and load it if necessary. -ur_result_t _ur_platform_handle_t::populateDeviceCacheIfNeeded() { - std::scoped_lock Lock(PiDevicesCacheMutex); - - if (DeviceCachePopulated) { - return UR_RESULT_SUCCESS; - } - - uint32_t ZeDeviceCount = 0; - ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr)); - - try { - std::vector ZeDevices(ZeDeviceCount); - ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data())); - - for (uint32_t I = 0; I < ZeDeviceCount; ++I) { - std::unique_ptr Device( - new ur_device_handle_t_(ZeDevices[I], (ur_platform_handle_t)this)); - auto Result = Device->initialize(); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - - // Additionally we need to cache all sub-devices too, such that they - // are readily visible to the piextDeviceCreateWithNativeHandle. - // - uint32_t SubDevicesCount = 0; - ZE_CALL(zeDeviceGetSubDevices, - (Device->ZeDevice, &SubDevicesCount, nullptr)); - - auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount]; - ZE_CALL(zeDeviceGetSubDevices, - (Device->ZeDevice, &SubDevicesCount, ZeSubdevices)); - - // Wrap the Level Zero sub-devices into PI sub-devices, and add them to - // cache. - for (uint32_t I = 0; I < SubDevicesCount; ++I) { - std::unique_ptr PiSubDevice( - new ur_device_handle_t_(ZeSubdevices[I], (ur_platform_handle_t)this, - Device.get())); - auto Result = PiSubDevice->initialize(); - if (Result != UR_RESULT_SUCCESS) { - delete[] ZeSubdevices; - return Result; - } - - // collect all the ordinals for the sub-sub-devices - std::vector Ordinals; - - uint32_t numQueueGroups = 0; - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (PiSubDevice->ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - std::vector QueueGroupProperties( - numQueueGroups); - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (PiSubDevice->ZeDevice, &numQueueGroups, - QueueGroupProperties.data())); - - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE && - QueueGroupProperties[i].numQueues > 1) { - Ordinals.push_back(i); - } - } - - // If isn't PVC, then submissions to different CCS can be executed on - // the same EUs still, so we cannot treat them as sub-sub-devices. - if (PiSubDevice->isPVC() || ExposeCSliceInAffinityPartitioning) { - // Create PI sub-sub-devices with the sub-device for all the ordinals. - // Each {ordinal, index} points to a specific CCS which constructs - // a sub-sub-device at this point. 
- // - // FIXME: Level Zero creates multiple PiDevices for a single physical - // device when sub-device is partitioned into sub-sub-devices. - // Sub-sub-device is technically a command queue and we should not - // build program for each command queue. PiDevice is probably not the - // right abstraction for a Level Zero command queue. - for (uint32_t J = 0; J < Ordinals.size(); ++J) { - for (uint32_t K = 0; - K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { - std::unique_ptr PiSubSubDevice( - new ur_device_handle_t_(ZeSubdevices[I], - (ur_platform_handle_t)this, - PiSubDevice.get())); - auto Result = PiSubSubDevice->initialize(Ordinals[J], K); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - - // save pointers to sub-sub-devices for quick retrieval in the - // future. - PiSubDevice->SubDevices.push_back(PiSubSubDevice.get()); - PiDevicesCache.push_back(std::move(PiSubSubDevice)); - } - } - } - - // save pointers to sub-devices for quick retrieval in the future. - Device->SubDevices.push_back(PiSubDevice.get()); - PiDevicesCache.push_back(std::move(PiSubDevice)); - } - delete[] ZeSubdevices; - - // Save the root device in the cache for future uses. - PiDevicesCache.push_back(std::move(Device)); - } - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - DeviceCachePopulated = true; - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceRetain(ur_device_handle_t Device) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - - // The root-device ref-count remains unchanged (always 1). - if (Device->isSubDevice()) { - Device->RefCount.increment(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceRelease(ur_device_handle_t Device) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - - // Root devices are destroyed during the piTearDown process. - if (Device->isSubDevice()) { - if (Device->RefCount.decrementAndTest()) { - delete Device; - } - } - - return UR_RESULT_SUCCESS; -} - -void ZeUSMImportExtension::setZeUSMImport(_ur_platform_handle_t *Platform) { - // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting - // host ptr import during buffer creation. - const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); - if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0) - return; - - // Check if USM hostptr import feature is available. - ze_driver_handle_t DriverHandle = Platform->ZeDriver; - if (ZE_CALL_NOCHECK( - zeDriverGetExtensionFunctionAddress, - (DriverHandle, "zexDriverImportExternalPointer", - reinterpret_cast(&zexDriverImportExternalPointer))) == 0) { - ZE_CALL_NOCHECK( - zeDriverGetExtensionFunctionAddress, - (DriverHandle, "zexDriverReleaseImportedPointer", - reinterpret_cast(&zexDriverReleaseImportedPointer))); - // Hostptr import/release is turned on because it has been requested - // by the env var, and this platform supports the APIs. - Enabled = true; - // Hostptr import is only possible if piMemBufferCreate receives a - // hostptr as an argument. The SYCL runtime passes a host ptr - // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on. 
- setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1"); - } -} -void ZeUSMImportExtension::doZeUSMImport(ze_driver_handle_t DriverHandle, - void *HostPtr, size_t Size) { - ZE_CALL_NOCHECK(zexDriverImportExternalPointer, - (DriverHandle, HostPtr, Size)); -} -void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, - void *HostPtr) { - ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); -} - -ur_result_t urDevicePartition( - ur_device_handle_t Device, ///< [in] handle of the device to partition. - const ur_device_partition_property_t - *Properties, ///< [in] null-terminated array of <$_device_partition_t - ///< enum, value> pairs. - uint32_t NumDevices, ///< [in] the number of sub-devices. - ur_device_handle_t - *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle - ///< of devices. If NumDevices is less than the number of - ///< sub-devices available, then the function shall only - ///< retrieve that number of sub-devices. - uint32_t *pNumDevicesRet ///< [out][optional] pointer to the number of - ///< sub-devices the device can be partitioned into - ///< according to the partitioning property. -) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Other partitioning ways are not supported by Level Zero - if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { - if ((Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && - Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { - if (Properties[1] != 0) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Devices cache is normally created in piDevicesGet but still make - // sure that cache is populated. - // - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { - if (Device->SubDevices.size() == 0) - return 0; - - // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. - // However, if - // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that - // still expose CSlices in partitioning by affinity domain for compatibility - // reasons. - if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && - !ExposeCSliceInAffinityPartitioning) { - if (Device->isSubDevice()) { - return 0; - } - } - if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { - // Not a CSlice-based partitioning. - if (!Device->SubDevices[0]->isCCS()) { - return 0; - } - } - - return Device->SubDevices.size(); - }(); - - // TODO: Consider support for partitioning to <= total sub-devices. - // Currently supported partitioning (by affinity domain/numa) would always - // partition to all sub-devices. 
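// A small sketch of the call sequence urDevicePartition expects: a
// zero-terminated <property, value> array, one call to learn the sub-device
// count, then a second call to fetch the handles (each one is retained by the
// implementation). The wrapper name is illustrative; the entry-point
// signature is the one declared in this hunk.
#include <vector>
#include <ur_api.h>

static ur_result_t partitionByAffinity(ur_device_handle_t Device,
                                       std::vector<ur_device_handle_t> &Subs) {
  const ur_device_partition_property_t Props[] = {
      UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
      (ur_device_partition_property_t)
          UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE,
      ur_device_partition_property_t(0)}; // zero terminator

  uint32_t Count = 0;
  // First call: only query how many sub-devices this partitioning yields.
  ur_result_t Res = urDevicePartition(Device, Props, 0, nullptr, &Count);
  if (Res != UR_RESULT_SUCCESS || Count == 0)
    return Res;

  Subs.resize(Count);
  // Second call: retrieve the sub-device handles.
  return urDevicePartition(Device, Props, Count, Subs.data(), nullptr);
}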
- // - if (NumDevices != 0) - PI_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); - - for (uint32_t I = 0; I < NumDevices; I++) { - OutDevices[I] = Device->SubDevices[I]; - // reusing the same pi_device needs to increment the reference count - urDeviceRetain(OutDevices[I]); - } - - if (pNumDevicesRet) { - *pNumDevicesRet = EffectiveNumDevices; - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urInit([[maybe_unused]] ur_device_init_flags_t device_flags) { - return UR_RESULT_SUCCESS; -} - -ur_result_t urTearDown([[maybe_unused]] void *pParams) { - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp index ed815806a2258..5095e168a4a3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp @@ -21,190 +21,12 @@ #include #include "ur_level_zero_common.hpp" - -struct _ur_platform_handle_t; -// using ur_platform_handle_t = _ur_platform_handle_t *; -struct _ur_device_handle_t; -// using ur_device_handle_t = _ur_device_handle_t *; - -struct _ur_platform_handle_t : public _ur_platform { - _ur_platform_handle_t(ze_driver_handle_t Driver) : ZeDriver{Driver} {} - // Performs initialization of a newly constructed PI platform. - ur_result_t initialize(); - - // Level Zero lacks the notion of a platform, but there is a driver, which is - // a pretty good fit to keep here. - ze_driver_handle_t ZeDriver; - - // Cache versions info from zeDriverGetProperties. - std::string ZeDriverVersion; - std::string ZeDriverApiVersion; - ze_api_version_t ZeApiVersion; - - // Cache driver extensions - std::unordered_map zeDriverExtensionMap; - - // Flags to tell whether various Level Zero platform extensions are available. - bool ZeDriverGlobalOffsetExtensionFound{false}; - bool ZeDriverModuleProgramExtensionFound{false}; - - // Cache UR devices for reuse - std::vector> PiDevicesCache; - ur_shared_mutex PiDevicesCacheMutex; - bool DeviceCachePopulated = false; - - // Check the device cache and load it if necessary. - ur_result_t populateDeviceCacheIfNeeded(); - - // Return the PI device from cache that represents given native device. - // If not found, then nullptr is returned. - ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); -}; - -enum EventsScope { - // All events are created host-visible. - AllHostVisible, - // All events are created with device-scope and only when - // host waits them or queries their status that a proxy - // host-visible event is created and set to signal after - // original event signals. - OnDemandHostVisibleProxy, - // All events are created with device-scope and only - // when a batch of commands is submitted for execution a - // last command in that batch is added to signal host-visible - // completion of each command in this batch (the default mode). - LastCommandInBatchHostVisible -}; - -struct _ur_device_handle_t : _ur_object { - _ur_device_handle_t(ze_device_handle_t Device, ur_platform_handle_t Plt, - ur_device_handle_t ParentDevice = nullptr) - : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, - ZeDeviceProperties{}, ZeDeviceComputeProperties{} { - // NOTE: one must additionally call initialize() to complete - // UR device creation. - } - - // The helper structure that keeps info about a command queue groups of the - // device. It is not changed after it is initialized. 
- struct queue_group_info_t { - enum type { - MainCopy, - LinkCopy, - Compute, - Size // must be last - }; - - // Keep the ordinal of the commands group as returned by - // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that - // there is no such queue group available in the Level Zero runtime. - int32_t ZeOrdinal{-1}; - - // Keep the index of the specific queue in this queue group where - // all the command enqueues of the corresponding type should go to. - // The value of "-1" means that no hard binding is defined and - // implementation can choose specific queue index on its own. - int32_t ZeIndex{-1}; - - // Keeps the queue group properties. - ZeStruct ZeProperties; - }; - - std::vector QueueGroup = - std::vector(queue_group_info_t::Size); - - // This returns "true" if a main copy engine is available for use. - bool hasMainCopyEngine() const { - return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0; - } - - // This returns "true" if a link copy engine is available for use. - bool hasLinkCopyEngine() const { - return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0; - } - - // This returns "true" if a main or link copy engine is available for use. - bool hasCopyEngine() const { - return hasMainCopyEngine() || hasLinkCopyEngine(); - } - - // Initialize the entire UR device. - // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute - // command queue ordinal and index respectively, used to initialize - // sub-sub-devices. - ur_result_t initialize(int SubSubDeviceOrdinal = -1, - int SubSubDeviceIndex = -1); - - // Level Zero device handle. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - const ze_device_handle_t ZeDevice; - - // Keep the subdevices that are partitioned from this ur_device_handle_t for - // reuse The order of sub-devices in this vector is repeated from the - // ze_device_handle_t array that are returned from zeDeviceGetSubDevices() - // call, which will always return sub-devices in the fixed same order. - std::vector SubDevices; - - // PI platform to which this device belongs. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - ur_platform_handle_t Platform; - - // Root-device of a sub-device, null if this is not a sub-device. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - const ur_device_handle_t RootDevice; - - enum ImmCmdlistMode { - // Immediate commandlists are not used. - NotUsed = 0, - // One set of compute and copy immediate commandlists per queue. - PerQueue, - // One set of compute and copy immediate commandlists per host thread that - // accesses the queue. - PerThreadPerQueue - }; - // Read env settings to select immediate commandlist mode. - ImmCmdlistMode useImmediateCommandLists(); - - // Returns whether immediate command lists are used on this device. - ImmCmdlistMode ImmCommandListUsed{}; - - // Scope of events used for events on the device - // Can be adjusted with UR_DEVICE_SCOPE_EVENTS - // for non-immediate command lists - EventsScope ZeEventsScope = AllHostVisible; - - bool isSubDevice() { return RootDevice != nullptr; } - - // Is this a Data Center GPU Max series (aka PVC)? 
- // TODO: change to use - // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t - // when that is stable. - bool isPVC() { - return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0 || - (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; - } - - // Does this device represent a single compute slice? - bool isCCS() const { - return QueueGroup[_ur_device_handle_t::queue_group_info_t::Compute] - .ZeIndex >= 0; - } - - // Cache of the immutable device properties. - ZeCache> ZeDeviceProperties; - ZeCache> ZeDeviceComputeProperties; - ZeCache> ZeDeviceImageProperties; - ZeCache> ZeDeviceModuleProperties; - ZeCache>, - std::vector>>> - ZeDeviceMemoryProperties; - ZeCache> - ZeDeviceMemoryAccessProperties; - ZeCache> ZeDeviceCacheProperties; -}; +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_device.hpp" +#include "ur_level_zero_event.hpp" +#include "ur_level_zero_kernel.hpp" +#include "ur_level_zero_mem.hpp" +#include "ur_level_zero_platform.hpp" +#include "ur_level_zero_program.hpp" +#include "ur_level_zero_queue.hpp" +#include "ur_level_zero_sampler.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 16b4c1ef4e582..a26e3412fadca 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -8,9 +8,13 @@ #pragma once #include -#include +#include #include +#include +#include +#include #include +#include #include #include @@ -19,15 +23,6 @@ #include #include "ur/usm_allocator_config.hpp" -#include "ur_level_zero_context.hpp" -#include "ur_level_zero_device.hpp" -#include "ur_level_zero_event.hpp" -#include "ur_level_zero_mem.hpp" -#include "ur_level_zero_module.hpp" -#include "ur_level_zero_platform.hpp" -#include "ur_level_zero_program.hpp" -#include "ur_level_zero_queue.hpp" -#include "ur_level_zero_sampler.hpp" struct _ur_platform_handle_t; @@ -298,6 +293,30 @@ template struct ZesStruct : public T { } }; +// Trace an internal PI call; returns in case of an error. +#define UR_CALL(Call) \ + { \ + if (PrintTrace) \ + fprintf(stderr, "UR ---> %s\n", #Call); \ + ur_result_t Result = (Call); \ + if (PrintTrace) \ + fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + if (Result != UR_RESULT_SUCCESS) \ + return Result; \ + } + +// This function will ensure compatibility with both Linux and Windows for +// setting environment variables. +bool setEnvVar(const char *name, const char *value); + +// Prints to stderr if UR_L0_DEBUG allows it +void urPrint(const char *Format, ...); + +// Helper for one-liner validation +#define UR_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); @@ -316,14 +335,14 @@ ur_result_t ze2urResult(ze_result_t ZeResult); // Record for a memory allocation. This structure is used to keep information // for each memory allocation. struct MemAllocRecord : _ur_object { - MemAllocRecord(pi_context Context, bool OwnZeMemHandle = true) + MemAllocRecord(ur_context_handle_t Context, bool OwnZeMemHandle = true) : Context(Context), OwnZeMemHandle(OwnZeMemHandle) {} // Currently kernel can reference memory allocations from different contexts // and we need to know the context of a memory allocation when we release it // in piKernelRelease. 
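// A brief sketch of how the UR_CALL and UR_ASSERT helpers defined above are
// meant to be used inside adapter entry points: validate arguments up front,
// then let UR_CALL trace and propagate any failing sub-call. It assumes a
// translation unit that includes this header; the wrapper itself is
// illustrative, while urContextRetain/urContextRelease are entry points added
// elsewhere in this patch.
static ur_result_t touchContext(ur_context_handle_t Context) {
  UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
  UR_CALL(urContextRetain(Context));  // RefCount +1
  UR_CALL(urContextRelease(Context)); // RefCount -1
  return UR_RESULT_SUCCESS;
}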
// TODO: this should go away when memory isolation issue is fixed in the Level // Zero runtime. - pi_context Context; + ur_context_handle_t Context; // Indicates if we own the native memory handle or it came from interop that // asked to not transfer the ownership to SYCL RT. @@ -341,6 +360,130 @@ const bool IndirectAccessTrackingEnabled = [] { return RetVal; }(); +extern const bool UseUSMAllocator; + +// The getInfo*/ReturnHelper facilities provide shortcut way of +// writing return bytes for the various getInfo APIs. +template +ur_result_t urL0getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, + size_t value_size, Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return UR_RESULT_SUCCESS; +} + +template +ur_result_t urL0getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + std::ignore = value_size; + *static_cast(param_value) = value; + }; + + return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, + value, sizeof(T), assignment); +} + +template +ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, + value, array_length * sizeof(T), memcpy); +} + +template +ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + if (param_value) { + memset(param_value, 0, param_value_size); + for (uint32_t I = 0; I < array_length; I++) + ((RetType *)param_value)[I] = (RetType)value[I]; + } + if (param_value_size_ret) + *param_value_size_ret = array_length * sizeof(RetType); + return UR_RESULT_SUCCESS; +} + +template <> +inline ur_result_t +urL0getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, const char *value) { + return urL0getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +class UrL0ReturnHelperBase { +public: + UrL0ReturnHelperBase(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + param_value_size_ret(param_value_size_ret) {} + + // A version where in/out info size is represented by a single pointer + // to a value which is updated on return + UrL0ReturnHelperBase(size_t *param_value_size, void *param_value) + : param_value_size(*param_value_size), param_value(param_value), + param_value_size_ret(param_value_size) {} + + // Scalar return value + template ur_result_t operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + + // Array return value + template ur_result_t operator()(const T *t, size_t s) { + return urL0getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + + // Array return value where element type is differrent from T + template + ur_result_t operator()(const T *t, size_t s) { + return urL0getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + +protected: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +// A 
version of return helper that returns pi_result and not ur_result_t +class UrL0ReturnHelper : public UrL0ReturnHelperBase { +public: + using UrL0ReturnHelperBase::UrL0ReturnHelperBase; + + template ur_result_t operator()(const T &t) { + return UrL0ReturnHelperBase::operator()(t); + } + // Array return value + template ur_result_t operator()(const T *t, size_t s) { + return UrL0ReturnHelperBase::operator()(t, s); + } + // Array return value where element type is differrent from T + template + ur_result_t operator()(const T *t, size_t s) { + return UrL0ReturnHelperBase::operator()(t, s); + } +}; + const bool ExposeCSliceInAffinityPartitioning = [] { char *UrRet = std::getenv("UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); char *PiRet = @@ -366,7 +509,7 @@ class ZeUSMImportExtension { ZeUSMImportExtension() : Enabled{false} {} - void setZeUSMImport(_ur_platform_handle_t *Platform); + void setZeUSMImport(ur_platform_handle_t_ *Platform); void doZeUSMImport(ze_driver_handle_t DriverHandle, void *HostPtr, size_t Size); void doZeUSMRelease(ze_driver_handle_t DriverHandle, void *HostPtr); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 43b8d9981b039..815a1a5db06cf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -6,4 +6,688 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + uint32_t DeviceCount, ///< [in] the number of devices given in phDevices + const ur_device_handle_t + *Devices, ///< [in][range(0, DeviceCount)] array of handle of devices. + const ur_context_properties_t + *Properties, ///< [in][optional] pointer to context creation properties. + ur_context_handle_t + *RetContext ///< [out] pointer to handle of context object created +) { + std::ignore = Properties; + + ur_platform_handle_t Platform = Devices[0]->Platform; + ZeStruct ContextDesc{}; + + ze_context_handle_t ZeContext{}; + ZE2UR_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext)); + try { + ur_context_handle_t_ *Context = + new ur_context_handle_t_(ZeContext, DeviceCount, Devices, true); + + Context->initialize(); + *RetContext = reinterpret_cast(Context); + if (IndirectAccessTrackingEnabled) { + std::scoped_lock Lock(Platform->ContextsMutex); + Platform->Contexts.push_back(*RetContext); + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRetain( + ur_context_handle_t + Context ///< [in] handle of the context to get a reference of. +) { + Context->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( + ur_context_handle_t Context ///< [in] handle of the context to release. 
+) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) + ContextsLock.lock(); + + return ContextReleaseHelper(Context); +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t Context, ///< [in] handle of the context + ur_context_info_t ContextInfoType, ///< [in] type of the info to retrieve + size_t PropSize, ///< [in] the number of bytes of memory pointed to by + ///< pContextInfo. + void *ContextInfo, ///< [out][optional] array of bytes holding the info. + ///< if propSize is not equal to or greater than the + ///< real number of bytes needed to return the info then + ///< the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pContextInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by ContextInfoType. +) { + std::shared_lock Lock(Context->Mutex); + UrReturnHelper ReturnValue(PropSize, ContextInfo, PropSizeRet); + switch ( + (uint32_t)ContextInfoType) { // cast to avoid warnings on EXT enum values + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(&Context->Devices[0], Context->Devices.size()); + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(uint32_t(Context->Devices.size())); + case UR_EXT_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Context->RefCount.load()}); + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(pi_bool{true}); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM fill is not supported. + return ReturnValue(pi_bool{false}); + case UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(Capabilities); + } + default: + // TODO: implement other parameters + die("urGetContextInfo: unsuppported ParamName."); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( + ur_context_handle_t Context, ///< [in] handle of the context. + ur_native_handle_t *NativeContext ///< [out] a pointer to the native + ///< handle of the context. +) { + *NativeContext = reinterpret_cast(Context->ZeContext); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t + NativeContext, ///< [in] the native handle of the context. + ur_context_handle_t *Context ///< [out] pointer to the handle of the + ///< context object created. +) { + try { + ze_context_handle_t ZeContext = + reinterpret_cast(NativeContext); + ur_context_handle_t_ *UrContext = new ur_context_handle_t_(ZeContext); + UrContext->initialize(); + *Context = reinterpret_cast(UrContext); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( + ur_context_handle_t Context, ///< [in] handle of the context. + ur_context_extended_deleter_t + Deleter, ///< [in] Function pointer to extended deleter. + void *UserData ///< [in][out][optional] pointer to data to be passed to + ///< callback. 
+) { + std::ignore = Context; + std::ignore = Deleter; + std::ignore = UserData; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_context_handle_t_::initialize() { + + // Helper lambda to create various USM allocators for a device. + // Note that the CCS devices and their respective subdevices share a + // common ze_device_handle and therefore, also share USM allocators. + auto createUSMAllocators = [this](ur_device_handle_t Device) { + SharedMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMSharedMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Shared])); + + SharedReadOnlyMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMSharedReadOnlyMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance + .Configs[usm_settings::MemType::SharedReadOnly])); + + DeviceMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMDeviceMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Device])); + }; + + // Recursive helper to call createUSMAllocators for all sub-devices + std::function createUSMAllocatorsRecursive; + createUSMAllocatorsRecursive = + [createUSMAllocators, + &createUSMAllocatorsRecursive](ur_device_handle_t Device) -> void { + createUSMAllocators(Device); + for (auto &SubDevice : Device->SubDevices) + createUSMAllocatorsRecursive(SubDevice); + }; + + // Create USM allocator context for each pair (device, context). + // + for (auto &Device : Devices) { + createUSMAllocatorsRecursive(Device); + } + // Create USM allocator context for host. Device and Shared USM allocations + // are device-specific. Host allocations are not device-dependent therefore + // we don't need a map with device as key. + HostMemAllocContext = std::make_unique( + std::unique_ptr( + new USMHostMemoryAlloc(reinterpret_cast(this))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Host]); + + // We may allocate memory to this root device so create allocators. + if (SingleRootDevice && + DeviceMemAllocContexts.find(SingleRootDevice->ZeDevice) == + DeviceMemAllocContexts.end()) { + createUSMAllocators(SingleRootDevice); + } + + // Create the immediate command list to be used for initializations. + // Created as synchronous so level-zero performs implicit synchronization and + // there is no need to query for completion in the plugin + // + // TODO: we use Device[0] here as the single immediate command-list + // for buffer creation and migration. Initialization is in + // in sync and is always performed to Devices[0] as well but + // D2D migartion, if no P2P, is broken since it should use + // immediate command-list for the specfic devices, and this single one. + // + ur_device_handle_t Device = SingleRootDevice ? SingleRootDevice : Devices[0]; + + // Prefer to use copy engine for initialization copies, + // if available and allowed (main copy engine with index 0). 
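
The recursive helper above is how the adapter reaches every sub-device in the hierarchy so that each level gets its own USM allocator entry. As an illustration of that traversal pattern on its own, a minimal standalone sketch using a simplified stand-in device type (DemoDevice below is purely illustrative, not an adapter type):

#include <functional>
#include <iostream>
#include <vector>

// Simplified stand-in for a device that may have sub-devices.
struct DemoDevice {
  int Id;
  std::vector<DemoDevice *> SubDevices;
};

int main() {
  DemoDevice Sub0{1, {}}, Sub1{2, {}};
  DemoDevice Root{0, {&Sub0, &Sub1}};

  // Leaf action applied to every device, analogous to createUSMAllocators.
  auto CreateAllocators = [](DemoDevice *D) {
    std::cout << "create allocators for device " << D->Id << "\n";
  };

  // std::function lets the lambda refer to itself, enabling recursion
  // over the whole device tree.
  std::function<void(DemoDevice *)> CreateRecursive = [&](DemoDevice *D) {
    CreateAllocators(D);
    for (auto *Sub : D->SubDevices)
      CreateRecursive(Sub);
  };

  CreateRecursive(&Root);
  return 0;
}
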
+ ZeStruct ZeCommandQueueDesc; + const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); + ZeCommandQueueDesc.ordinal = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeOrdinal; + if (Range.first >= 0 && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeOrdinal != -1) + ZeCommandQueueDesc.ordinal = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeOrdinal; + + ZeCommandQueueDesc.index = 0; + ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + ZE2UR_CALL( + zeCommandListCreateImmediate, + (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit)); + return UR_RESULT_SUCCESS; +} + +ur_device_handle_t ur_context_handle_t_::getRootDevice() const { + assert(Devices.size() > 0); + + if (Devices.size() == 1) + return Devices[0]; + + // Check if we have context with subdevices of the same device (context + // may include root device itself as well) + ur_device_handle_t ContextRootDevice = + Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0]; + + // For context with sub subdevices, the ContextRootDevice might still + // not be the root device. + // Check whether the ContextRootDevice is the subdevice or root device. + if (ContextRootDevice->isSubDevice()) { + ContextRootDevice = ContextRootDevice->RootDevice; + } + + for (auto &Device : Devices) { + if ((!Device->RootDevice && Device != ContextRootDevice) || + (Device->RootDevice && Device->RootDevice != ContextRootDevice)) { + ContextRootDevice = nullptr; + break; + } + } + return ContextRootDevice; +} + +// Helper function to release the context, a caller must lock the platform-level +// mutex guarding the container with contexts because the context can be removed +// from the list of tracked contexts. +ur_result_t ContextReleaseHelper(ur_context_handle_t Context) { + + if (!Context->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + if (IndirectAccessTrackingEnabled) { + ur_platform_handle_t Plt = Context->getPlatform(); + auto &Contexts = Plt->Contexts; + auto It = std::find(Contexts.begin(), Contexts.end(), Context); + if (It != Contexts.end()) + Contexts.erase(It); + } + ze_context_handle_t DestroyZeContext = + Context->OwnZeContext ? Context->ZeContext : nullptr; + + // Clean up any live memory associated with Context + ur_result_t Result = Context->finalize(); + + // We must delete Context first and then destroy zeContext because + // Context deallocation requires ZeContext in some member deallocation of + // pi_context. + delete Context; + + // Destruction of some members of pi_context uses L0 context + // and therefore it must be valid at that point. + // Technically it should be placed to the destructor of pi_context + // but this makes API error handling more complex. + if (DestroyZeContext) { + auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestroyZeContext)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + + return Result; +} + +ur_platform_handle_t ur_context_handle_t_::getPlatform() const { + return Devices[0]->Platform; +} + +ur_result_t ur_context_handle_t_::finalize() { + // This function is called when pi_context is deallocated, piContextRelease. + // There could be some memory that may have not been deallocated. + // For example, event and event pool caches would be still alive. 
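
ContextReleaseHelper above captures a recurring teardown order in this adapter: decrement the reference count, let only the last release run finalization, delete the C++ wrapper while the native handle is still alive, and destroy the Level Zero handle last, and only if it is owned. A minimal sketch of that order with generic types (DemoObject and demoRelease are illustrative names, not adapter code):

#include <atomic>
#include <cstdio>

// Simplified stand-in for an adapter object wrapping a native handle.
struct DemoObject {
  std::atomic<unsigned> RefCount{1};
  void *NativeHandle = nullptr; // would be e.g. a ze_context_handle_t
  bool OwnsNativeHandle = true;
};

// Returns true when the object was actually destroyed.
bool demoRelease(DemoObject *Obj) {
  // Only the caller that drops the count to zero performs the teardown.
  if (Obj->RefCount.fetch_sub(1) != 1)
    return false;

  // Remember the native handle before deleting the wrapper, because member
  // destruction may still need the handle to be valid.
  void *ToDestroy = Obj->OwnsNativeHandle ? Obj->NativeHandle : nullptr;
  delete Obj;

  // Destroy the native handle last, and only if we own it.
  if (ToDestroy)
    std::printf("destroy native handle %p\n", ToDestroy);
  return true;
}

int main() {
  auto *Obj = new DemoObject();
  Obj->RefCount.fetch_add(1); // a second owner
  demoRelease(Obj);           // first release: object stays alive
  demoRelease(Obj);           // second release: teardown runs
  return 0;
}
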
+ + if (!DisableEventsCaching) { + std::scoped_lock Lock(EventCacheMutex); + for (auto &EventCache : EventCaches) { + for (auto &Event : EventCache) { + auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + delete Event; + } + EventCache.clear(); + } + } + { + std::scoped_lock Lock(ZeEventPoolCacheMutex); + for (auto &ZePoolCache : ZeEventPoolCache) { + for (auto &ZePool : ZePoolCache) { + auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + ZePoolCache.clear(); + } + } + + // Destroy the command list used for initializations + auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + + std::scoped_lock Lock(ZeCommandListCacheMutex); + for (auto &List : ZeComputeCommandListCache) { + for (ze_command_list_handle_t &ZeCommandList : List.second) { + if (ZeCommandList) + if (ZeCommandList) { + auto ZeResult = + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + } + for (auto &List : ZeCopyCommandListCache) { + for (ze_command_list_handle_t &ZeCommandList : List.second) { + if (ZeCommandList) { + auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( + ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, + bool ProfilingEnabled) { + // Lock while updating event pool machinery. + std::scoped_lock Lock(ZeEventPoolCacheMutex); + + std::list *ZePoolCache = + getZeEventPoolCache(HostVisible, ProfilingEnabled); + + if (!ZePoolCache->empty()) { + if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { + if (DisableEventsCaching) { + // Remove full pool from the cache if events caching is disabled. + ZePoolCache->erase(ZePoolCache->begin()); + } else { + // If event caching is enabled then we don't destroy events so there is + // no need to remove pool from the cache and add it back when it has + // available slots. Just keep it in the tail of the cache so that all + // pools can be destroyed during context destruction. + ZePoolCache->push_front(nullptr); + } + } + } + if (ZePoolCache->empty()) { + ZePoolCache->push_back(nullptr); + } + + // We shall be adding an event to the front pool. 
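
Every native destroy in finalize() goes through ZE_CALL_NOCHECK and treats ZE_RESULT_ERROR_UNINITIALIZED as benign, because the Level Zero library may already be unloaded when the context is torn down at process exit. A sketch of that check factored into a single helper (destroyIgnoringUnload is a hypothetical name; the patch inlines the check at each call site):

#include <level_zero/ze_api.h>

// Call a Level Zero destroy entry point and ignore the error reported when
// the driver library has already been unloaded at process teardown.
template <typename Handle>
ze_result_t destroyIgnoringUnload(ze_result_t (*Destroy)(Handle), Handle H) {
  ze_result_t Result = Destroy(H);
  return Result == ZE_RESULT_ERROR_UNINITIALIZED ? ZE_RESULT_SUCCESS : Result;
}

// Possible usage, mirroring the inlined checks above:
//   destroyIgnoringUnload(zeEventPoolDestroy, ZePool);
//   destroyIgnoringUnload(zeCommandListDestroy, ZeCommandList);
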
+ ze_event_pool_handle_t *ZePool = &ZePoolCache->front(); + Index = 0; + // Create one event ZePool per MaxNumEventsPerPool events + if (*ZePool == nullptr) { + ZeStruct ZeEventPoolDesc; + ZeEventPoolDesc.count = MaxNumEventsPerPool; + ZeEventPoolDesc.flags = 0; + if (HostVisible) + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + if (ProfilingEnabled) + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); + + std::vector ZeDevices; + std::for_each( + Devices.begin(), Devices.end(), + [&](const ur_device_handle_t &D) { ZeDevices.push_back(D->ZeDevice); }); + + ZE2UR_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, + ZeDevices.size(), &ZeDevices[0], ZePool)); + NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1; + NumEventsUnreleasedInEventPool[*ZePool] = 1; + } else { + Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool]; + --NumEventsAvailableInEventPool[*ZePool]; + ++NumEventsUnreleasedInEventPool[*ZePool]; + } + Pool = *ZePool; + return UR_RESULT_SUCCESS; +} + +ur_event_handle_t +ur_context_handle_t_::getEventFromContextCache(bool HostVisible, + bool WithProfiling) { + std::scoped_lock Lock(EventCacheMutex); + auto Cache = getEventCache(HostVisible, WithProfiling); + if (Cache->empty()) + return nullptr; + + auto It = Cache->begin(); + ur_event_handle_t Event = *It; + Cache->erase(It); + // We have to reset event before using it. + Event->reset(); + return Event; +} + +void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { + std::scoped_lock Lock(EventCacheMutex); + auto Cache = + getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); + Cache->emplace_back(Event); +} + +ur_result_t +ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { + std::shared_lock EventLock(Event->Mutex, std::defer_lock); + std::scoped_lock> LockAll( + ZeEventPoolCacheMutex, EventLock); + if (!Event->ZeEventPool) { + // This must be an interop event created on a users's pool. + // Do nothing. + return UR_RESULT_SUCCESS; + } + + std::list *ZePoolCache = + getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); + + // Put the empty pool to the cache of the pools. + if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) + die("Invalid event release: event pool doesn't have unreleased events"); + if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { + if (ZePoolCache->front() != Event->ZeEventPool) { + ZePoolCache->push_back(Event->ZeEventPool); + } + NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; + } + + return UR_RESULT_SUCCESS; +} + +// Get value of the threshold for number of events in immediate command lists. +// If number of events in the immediate command list exceeds this threshold then +// cleanup process for those events is executed. +static const size_t ImmCmdListsEventCleanupThreshold = [] { + const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( + "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + static constexpr int Default = 20; + if (!ImmCmdListsEventCleanupThresholdStr) + return Default; + + int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr); + + // Basically disable threshold if negative value is provided. + if (Threshold < 0) + return INT_MAX; + + return Threshold; +}(); + +// Get value of the threshold for number of active command lists allowed before +// we start heuristically cleaning them up. 
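
Both cleanup thresholds follow the same convention: an unset variable falls back to a built-in default, a negative value effectively disables the limit, and any other value is used as-is. A standalone sketch of that convention as one reusable helper (thresholdFromEnv is a hypothetical name; the patch keeps a separate lambda per variable):

#include <climits>
#include <cstdlib>

// Read an integer threshold from the environment:
// - unset    -> Default
// - negative -> INT_MAX (the threshold is effectively disabled)
// - else     -> the parsed value
static int thresholdFromEnv(const char *Name, int Default) {
  const char *Value = std::getenv(Name);
  if (!Value)
    return Default;
  int Threshold = std::atoi(Value);
  return Threshold < 0 ? INT_MAX : Threshold;
}

// Usage matching the variables defined above:
//   static const size_t CleanupThreshold = thresholdFromEnv(
//       "SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD", 20);
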
+static const size_t CmdListsCleanupThreshold = [] { + const char *CmdListsCleanupThresholdStr = + std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); + static constexpr int Default = 20; + if (!CmdListsCleanupThresholdStr) + return Default; + + int Threshold = std::atoi(CmdListsCleanupThresholdStr); + + // Basically disable threshold if negative value is provided. + if (Threshold < 0) + return INT_MAX; + + return Threshold; +}(); + +// Retrieve an available command list to be used in a PI call. +ur_result_t ur_context_handle_t_::getAvailableCommandList( + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, bool AllowBatching, + ze_command_queue_handle_t *ForcedCmdQueue) { + // Immediate commandlists have been pre-allocated and are always available. + if (Queue->Device->ImmCommandListUsed) { + CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); + if (CommandList->second.EventList.size() > + ImmCmdListsEventCleanupThreshold) { + std::vector EventListToCleanup; + Queue->resetCommandList(CommandList, false, EventListToCleanup); + CleanupEventListFromResetCmdList(EventListToCleanup, true); + } + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + return Res; + return UR_RESULT_SUCCESS; + } else { + // Cleanup regular command-lists if there are too many. + // It handles the case that the queue is not synced to the host + // for a long time and we want to reclaim the command-lists for + // use by other queues. + if (Queue->CommandListMap.size() > CmdListsCleanupThreshold) { + resetCommandLists(Queue); + } + } + + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Handle batching of commands + // First see if there is an command-list open for batching commands + // for this queue. + if (Queue->hasOpenCommandList(UseCopyEngine)) { + if (AllowBatching) { + CommandList = CommandBatch.OpenCommandList; + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + return UR_RESULT_SUCCESS; + } + // If this command isn't allowed to be batched or doesn't match the forced + // command queue, then we need to go ahead and execute what is already in + // the batched list, and then go on to process this. On exit from + // executeOpenCommandList OpenCommandList will be invalidated. + if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) + return Res; + // Note that active barriers do not need to be inserted here as they will + // have been enqueued into the command-list when they were created. + } + + // Create/Reuse the command list, because in Level Zero commands are added to + // the command lists, and later are then added to the command queue. + // Each command list is paired with an associated fence to track when the + // command list is available for reuse. + ur_result_t pi_result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + + // Initally, we need to check if a command list has already been created + // on this device that is available for use. If so, then reuse that + // Level-Zero Command List and Fence for this PI call. + { + // Make sure to acquire the lock before checking the size, or there + // will be a race condition. + std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for every + // unique ZeDevice. + auto &ZeCommandListCache = + UseCopyEngine + ? 
Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + + for (auto ZeCommandListIt = ZeCommandListCache.begin(); + ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { + auto &ZeCommandList = *ZeCommandListIt; + auto it = Queue->CommandListMap.find(ZeCommandList); + if (it != Queue->CommandListMap.end()) { + if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) + continue; + CommandList = it; + if (CommandList->second.ZeFence != nullptr) + CommandList->second.ZeFenceInUse = true; + } else { + // If there is a command list available on this context, but it + // wasn't yet used in this queue then create a new entry in this + // queue's map to hold the fence and other associated command + // list information. + auto &QGroup = Queue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal; + auto &ZeCommandQueue = ForcedCmdQueue + ? *ForcedCmdQueue + : QGroup.getZeQueue(&QueueGroupOrdinal); + if (ForcedCmdQueue) + QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); + + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + CommandList = Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, false, + ZeCommandQueue, + QueueGroupOrdinal}) + .first; + } + ZeCommandListCache.erase(ZeCommandListIt); + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) + return Res; + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + return Res; + return UR_RESULT_SUCCESS; + } + } + + // If there are no available command lists in the cache, then we check for + // command lists that have already signalled, but have not been added to the + // available list yet. Each command list has a fence associated which tracks + // if a command list has completed dispatch of its commands and is ready for + // reuse. If a command list is found to have been signalled, then the + // command list & fence are reset and we return. + for (auto it = Queue->CommandListMap.begin(); + it != Queue->CommandListMap.end(); ++it) { + // Make sure this is the command list type needed. + if (UseCopyEngine != it->second.isCopy(Queue)) + continue; + + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); + if (ZeResult == ZE_RESULT_SUCCESS) { + std::vector EventListToCleanup; + Queue->resetCommandList(it, false, EventListToCleanup); + CleanupEventListFromResetCmdList(EventListToCleanup, + true /* QueueLocked */); + CommandList = it; + CommandList->second.ZeFenceInUse = true; + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) + return Res; + return UR_RESULT_SUCCESS; + } + } + + // If there are no available command lists nor signalled command lists, + // then we must create another command list. 
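
The loop above recycles command lists by polling their fences: a list whose fence has signalled is reset and handed back out instead of allocating a new one. A reduced sketch of that polling step using plain Level Zero calls (the adapter's resetCommandList additionally collects the events recorded on the list, which is omitted here):

#include <level_zero/ze_api.h>

// Non-blocking check whether a previously submitted command list has finished,
// so its list/fence pair can be reset and reused. Assumes valid handles.
static bool tryRecycle(ze_command_list_handle_t CmdList,
                       ze_fence_handle_t Fence) {
  // ZE_RESULT_NOT_READY means the commands are still executing.
  if (zeFenceQueryStatus(Fence) != ZE_RESULT_SUCCESS)
    return false;
  // The fence has signalled: reset both objects for the next submission.
  if (zeFenceReset(Fence) != ZE_RESULT_SUCCESS)
    return false;
  return zeCommandListReset(CmdList) == ZE_RESULT_SUCCESS;
}
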
+ pi_result = Queue->createCommandList(UseCopyEngine, CommandList); + CommandList->second.ZeFenceInUse = true; + return pi_result; +} + +bool ur_context_handle_t_::isValidDevice(ur_device_handle_t Device) const { + while (Device) { + if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end()) + return true; + Device = Device->RootDevice; + } + return false; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 28b4bf599b8a0..8cb8a94124b6a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -7,8 +7,235 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_queue.hpp" +#include + +struct ur_context_handle_t_ : _ur_object { + ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices, + const ur_device_handle_t *Devs, bool OwnZeContext) + : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices}, + OwnZeContext{OwnZeContext} {} + + ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} + + // A L0 context handle is primarily used during creation and management of + // resources that may be used by multiple devices. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + const ze_context_handle_t ZeContext{}; + + // Keep the PI devices this PI context was created for. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + // const std::vector Devices; + std::vector Devices; + + // Indicates if we own the ZeContext or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnZeContext = false; + + // Immediate Level Zero command list for the device in this context, to be + // used for initializations. To be created as: + // - Immediate command list: So any command appended to it is immediately + // offloaded to the device. + // - Synchronous: So implicit synchronization is made inside the level-zero + // driver. + // There will be a list of immediate command lists (for each device) when + // support of the multiple devices per context will be added. + ze_command_list_handle_t ZeCommandListInit{}; + + // Mutex for the immediate command list. Per the Level Zero spec memory copy + // operations submitted to an immediate command list are not allowed to be + // called from simultaneous threads. + ur_mutex ImmediateCommandListMutex; + + // Mutex Lock for the Command List Cache. This lock is used to control both + // compute and copy command list caches. + ur_mutex ZeCommandListCacheMutex; + + // If context contains one device or sub-devices of the same device, we want + // to save this device. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + ur_device_handle_t SingleRootDevice = nullptr; + + // Cache of all currently available/completed command/copy lists. + // Note that command-list can only be re-used on the same device. 
+ // + // TODO: explore if we should use root-device for creating command-lists + // as spec says that in that case any sub-device can re-use it: "The + // application must only use the command list for the device, or its + // sub-devices, which was provided during creation." + // + std::unordered_map> + ZeComputeCommandListCache; + std::unordered_map> + ZeCopyCommandListCache; + + // Store USM allocator context(internal allocator structures) + // for USM shared and device allocations. There is 1 allocator context + // per each pair of (context, device) per each memory type. + std::unordered_map + DeviceMemAllocContexts; + std::unordered_map + SharedMemAllocContexts; + std::unordered_map + SharedReadOnlyMemAllocContexts; + + // Since L0 native runtime does not distinguisg "shared device_read_only" + // vs regular "shared" allocations, we have keep track of it to use + // proper USMAllocContext when freeing allocations. + std::unordered_set SharedReadOnlyAllocs; + + // Store the host allocator context. It does not depend on any device. + std::unique_ptr HostMemAllocContext; + + // We need to store all memory allocations in the context because there could + // be kernels with indirect access. Kernels with indirect access start to + // reference all existing memory allocations at the time when they are + // submitted to the device. Referenced memory allocations can be released only + // when kernel has finished execution. + std::unordered_map MemAllocs; + + // Following member variables are used to manage assignment of events + // to event pools. + // + // TODO: Create pi_event_pool class to encapsulate working with pools. + // This will avoid needing the use of maps below, and cleanup the + // pi_context overall. + // -struct _ur_context_handle_t : _ur_object { - _ur_context_handle_t() {} + // The cache of event pools from where new events are allocated from. + // The head event pool is where the next event would be added to if there + // is still some room there. If there is no room in the head then + // the following event pool is taken (guranteed to be empty) and made the + // head. In case there is no next pool, a new pool is created and made the + // head. + // + // Cache of event pools to which host-visible events are added to. + std::vector> ZeEventPoolCache{4}; + + // This map will be used to determine if a pool is full or not + // by storing number of empty slots available in the pool. + std::unordered_map + NumEventsAvailableInEventPool; + // This map will be used to determine number of unreleased events in the pool. + // We use separate maps for number of event slots available in the pool from + // the number of events unreleased in the pool. + // This will help when we try to make the code thread-safe. + std::unordered_map + NumEventsUnreleasedInEventPool; + + // Mutex to control operations on event pool caches and the helper maps + // holding the current pool usage counts. + ur_mutex ZeEventPoolCacheMutex; + + // Mutex to control operations on event caches. + ur_mutex EventCacheMutex; + + // Caches for events. + std::vector> EventCaches{4}; + + // Initialize the PI context. + ur_result_t initialize(); + + // If context contains one device then return this device. + // If context contains sub-devices of the same device, then return this parent + // device. Return nullptr if context consists of several devices which are not + // sub-devices of the same device. We call returned device the root device of + // a context. 
+ // TODO: get rid of this when contexts with multiple devices are supported for + // images. + ur_device_handle_t getRootDevice() const; + + // Finalize the PI context + ur_result_t finalize(); + + // Return the Platform, which is the same for all devices in the context + ur_platform_handle_t getPlatform() const; + + // Get index of the free slot in the available pool. If there is no available + // pool then create new one. The HostVisible parameter tells if we need a + // slot for a host-visible event. The ProfilingEnabled tells is we need a + // slot for an event with profiling capabilities. + ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, + bool HostVisible, + bool ProfilingEnabled); + + // Get pi_event from cache. + ur_event_handle_t getEventFromContextCache(bool HostVisible, + bool WithProfiling); + + // Add pi_event to cache. + void addEventToContextCache(ur_event_handle_t); + + auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { + if (HostVisible) + return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; + else + return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + } + + // Decrement number of events living in the pool upon event destroy + // and return the pool to the cache if there are no unreleased events. + ur_result_t decrementUnreleasedEventsInPool(ur_event_handle_t Event); + + // Retrieves a command list for executing on this device along with + // a fence to be used in tracking the execution of this command list. + // If a command list has been created on this device which has + // completed its commands, then that command list and its associated fence + // will be reused. Otherwise, a new command list and fence will be created for + // running on this device. L0 fences are created on a L0 command queue so the + // caller must pass a command queue to create a new fence for the new command + // list if a command list/fence pair is not available. All Command Lists & + // associated fences are destroyed at Device Release. + // If UseCopyEngine is true, the command will eventually be executed in a + // copy engine. Otherwise, the command will be executed in a compute engine. + // If AllowBatching is true, then the command list returned may already have + // command in it, if AllowBatching is false, any open command lists that + // already exist in Queue will be closed and executed. + // If ForcedCmdQueue is not nullptr, the resulting command list must be tied + // to the contained command queue. This option is ignored if immediate + // command lists are used. + // When using immediate commandlists, retrieves an immediate command list + // for executing on this device. Immediate commandlists are created only + // once for each SYCL Queue and after that they are reused. + ur_result_t + getAvailableCommandList(ur_queue_handle_t Queue, + ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, bool AllowBatching = false, + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + + // Checks if Device is covered by this context. + // For that the Device or its root devices need to be in the context. + bool isValidDevice(ur_device_handle_t Device) const; + +private: + // Get the cache of events for a provided scope and profiling mode. + auto getEventCache(bool HostVisible, bool WithProfiling) { + if (HostVisible) + return WithProfiling ? &EventCaches[0] : &EventCaches[1]; + else + return WithProfiling ? 
&EventCaches[2] : &EventCaches[3]; + } }; + +// Helper function to release the context, a caller must lock the platform-level +// mutex guarding the container with contexts because the context can be removed +// from the list of tracked contexts. +ur_result_t ContextReleaseHelper(ur_context_handle_t Context); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 4ef5d989ca953..8983835ad0811 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -7,3 +7,1259 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_device.hpp" +#include +#include +#include + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + ur_device_type_t DeviceType, ///< [in] the type of the devices. + uint32_t NumEntries, ///< [in] the number of devices to be added to + ///< phDevices. If phDevices in not NULL then + ///< NumEntries should be greater than zero, otherwise + ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. + ur_device_handle_t + *Devices, ///< [out][optional][range(0, NumEntries)] array of handle of + ///< devices. If NumEntries is less than the number of devices + ///< available, then platform shall only retrieve that number + ///< of devices. + uint32_t *NumDevices ///< [out][optional] pointer to the number of devices. + ///< pNumDevices will be updated with the total number + ///< of devices available. + +) { + + auto Res = Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + // Filter available devices based on input DeviceType. + std::vector MatchedDevices; + std::shared_lock Lock(Platform->PiDevicesCacheMutex); + for (auto &D : Platform->PiDevicesCache) { + // Only ever return root-devices from piDevicesGet, but the + // devices cache also keeps sub-devices. + if (D->isSubDevice()) + continue; + + bool Matched = false; + switch (DeviceType) { + case UR_DEVICE_TYPE_ALL: + Matched = true; + break; + case UR_DEVICE_TYPE_GPU: + case UR_DEVICE_TYPE_DEFAULT: + Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU); + break; + case UR_DEVICE_TYPE_CPU: + Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU); + break; + case UR_DEVICE_TYPE_FPGA: + Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA; + break; + case UR_DEVICE_TYPE_MCA: + Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA; + break; + default: + Matched = false; + urPrint("Unknown device type"); + break; + } + if (Matched) + MatchedDevices.push_back(D.get()); + } + + uint32_t ZeDeviceCount = MatchedDevices.size(); + + auto N = std::min(ZeDeviceCount, NumEntries); + if (Devices) + std::copy_n(MatchedDevices.begin(), N, Devices); + + if (NumDevices) { + if (*NumDevices == 0) + *NumDevices = ZeDeviceCount; + else + *NumDevices = N; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( + ur_device_handle_t Device, ///< [in] handle of the device instance + ur_device_info_t ParamName, ///< [in] type of the info to retrieve + size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. + void *ParamValue, ///< [out][optional] array of bytes holding the info. 
+ ///< If propSize is not equal to or greater than the real + ///< number of bytes needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pDeviceInfo is not used. + size_t *pSize ///< [out][optional] pointer to the actual size in bytes of + ///< the queried infoType. +) { + UrReturnHelper ReturnValue(propSize, ParamValue, pSize); + + ze_device_handle_t ZeDevice = Device->ZeDevice; + + switch ((int)ParamName) { + case UR_DEVICE_INFO_TYPE: { + switch (Device->ZeDeviceProperties->type) { + case ZE_DEVICE_TYPE_GPU: + return ReturnValue(UR_DEVICE_TYPE_GPU); + case ZE_DEVICE_TYPE_CPU: + return ReturnValue(UR_DEVICE_TYPE_CPU); + case ZE_DEVICE_TYPE_FPGA: + return ReturnValue(UR_DEVICE_TYPE_FPGA); + default: + urPrint("This device type is not supported\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + case UR_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(Device->RootDevice); + case UR_DEVICE_INFO_PLATFORM: + return ReturnValue(Device->Platform); + case UR_DEVICE_INFO_VENDOR_ID: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->vendorId}); + case UR_DEVICE_INFO_UUID: { + // Intel extension for device UUID. This returns the UUID as + // std::array. For details about this extension, + // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md. + const auto &UUID = Device->ZeDeviceProperties->uuid.id; + return ReturnValue(UUID, sizeof(UUID)); + } + case UR_DEVICE_INFO_ATOMIC_64: + return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); + case UR_DEVICE_INFO_EXTENSIONS: { + // Convention adopted from OpenCL: + // "Returns a space separated list of extension names (the extension + // names themselves do not contain any spaces) supported by the device." + // + // TODO: Use proper mechanism to get this information from Level Zero after + // it is added to Level Zero. + // Hardcoding the few we know are supported by the current hardware. + // + // + std::string SupportedExtensions; + + // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core + // feature in >OpenCL 2.1 + // cl_khr_subgroups - Extension adds support for implementation-controlled + // subgroups. + // cl_intel_subgroups - Extension adds subgroup features, defined by Intel. + // cl_intel_subgroups_short - Extension adds subgroup functions described in + // the cl_intel_subgroups extension to support 16-bit integer data types + // for performance. + // cl_intel_required_subgroup_size - Extension to allow programmers to + // optionally specify the required subgroup size for a kernel function. + // cl_khr_fp16 - Optional half floating-point support. + // cl_khr_fp64 - Support for double floating-point precision. + // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional + // extensions that implement atomic operations on 64-bit signed and + // unsigned integers to locations in __global and __local memory. + // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory + // objects. + // + // Hardcoding some extensions we know are supported by all Level Zero + // devices. 
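
Like the other info queries in this adapter, variable-length results such as the extension string are returned through the usual two-call convention: query the required size with a null buffer, then allocate and query the data. A minimal caller-side sketch for UR_DEVICE_INFO_EXTENSIONS, assuming a valid ur_device_handle_t and the ur_api.h header (the exact header path may differ in-tree):

#include <string>
#include <vector>
#include <ur_api.h>

// Query the space-separated extension list of a device using the two-call
// pattern (size query, then data query).
static std::string queryExtensions(ur_device_handle_t Device) {
  size_t Size = 0;
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_EXTENSIONS, 0, nullptr, &Size) !=
          UR_RESULT_SUCCESS ||
      Size == 0)
    return {};

  std::vector<char> Buffer(Size);
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_EXTENSIONS, Buffer.size(),
                      Buffer.data(), nullptr) != UR_RESULT_SUCCESS)
    return {};
  return std::string(Buffer.data());
}
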
+ SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS); + if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16) + SupportedExtensions += ("cl_khr_fp16 "); + if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64) + SupportedExtensions += ("cl_khr_fp64 "); + if (Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS) + // int64AtomicsSupported indicates support for both. + SupportedExtensions += + ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics "); + if (Device->ZeDeviceImageProperties->maxImageDims3D > 0) + // Supports reading and writing of images. + SupportedExtensions += ("cl_khr_3d_image_writes "); + + // L0 does not tell us if bfloat16 is supported. + // For now, assume ATS and PVC support it. + // TODO: change the way we detect bfloat16 support. + if ((Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 || + (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0) + SupportedExtensions += ("cl_intel_bfloat16_conversions "); + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_NAME: + return ReturnValue(Device->ZeDeviceProperties->name); + // zeModuleCreate allows using root device module for sub-devices: + // > The application must only use the module for the device, or its + // > sub-devices, which was provided during creation. + case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: + return ReturnValue(uint32_t{0}); + case UR_DEVICE_INFO_COMPILER_AVAILABLE: + return ReturnValue(uint32_t{1}); + case UR_DEVICE_INFO_LINKER_AVAILABLE: + return ReturnValue(uint32_t{1}); + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + uint32_t MaxComputeUnits = + Device->ZeDeviceProperties->numEUsPerSubslice * + Device->ZeDeviceProperties->numSubslicesPerSlice * + Device->ZeDeviceProperties->numSlices; + + bool RepresentsCSlice = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0; + if (RepresentsCSlice) + MaxComputeUnits /= Device->RootDevice->SubDevices.size(); + + return ReturnValue(uint32_t{MaxComputeUnits}); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: + // Level Zero spec defines only three dimensions + return ReturnValue(uint32_t{3}); + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t Arr[3]; + } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, + Device->ZeDeviceComputeProperties->maxGroupSizeY, + Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; + return ReturnValue(MaxGroupSize); + } + case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t Arr[3]; + } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, + Device->ZeDeviceComputeProperties->maxGroupCountY, + Device->ZeDeviceComputeProperties->maxGroupCountZ}}; + return ReturnValue(MaxGroupCounts); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->coreClockRate}); + case UR_DEVICE_INFO_ADDRESS_BITS: { + // TODO: To confirm with spec. 
+ return ReturnValue(uint32_t{64}); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + uint64_t GlobalMemSize = 0; + for (const auto &ZeDeviceMemoryExtProperty : + Device->ZeDeviceMemoryProperties->second) { + GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize; + } + return ReturnValue(uint64_t{GlobalMemSize}); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); + case UR_DEVICE_INFO_IMAGE_SUPPORTED: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: + return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); + case UR_DEVICE_INFO_AVAILABLE: + return ReturnValue(uint32_t{ZeDevice ? true : false}); + case UR_DEVICE_INFO_VENDOR: + // TODO: Level-Zero does not return vendor's name at the moment + // only the ID. + return ReturnValue("Intel(R) Corporation"); + case UR_DEVICE_INFO_DRIVER_VERSION: + return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); + case UR_DEVICE_INFO_VERSION: + return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + return ReturnValue((uint32_t)Device->SubDevices.size()); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Device->RefCount.load()}); + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + // SYCL spec says: if this SYCL device cannot be partitioned into at least + // two sub devices then the returned vector must be empty. + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + uint32_t ZeSubDeviceCount = Device->SubDevices.size(); + if (ZeSubDeviceCount < 2) { + return ReturnValue((ur_device_partition_property_t)0); + } + bool PartitionedByCSlice = Device->SubDevices[0]->isCCS(); + + auto ReturnHelper = [&](auto... Partitions) { + struct { + ur_device_partition_property_t Arr[sizeof...(Partitions) + 1]; + } PartitionProperties = { + {Partitions..., ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + }; + + if (ExposeCSliceInAffinityPartitioning) { + if (PartitionedByCSlice) + return ReturnHelper(UR_DEVICE_PARTITION_BY_CSLICE, + UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + + else + return ReturnHelper(UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else { + return ReturnHelper(PartitionedByCSlice + ? UR_DEVICE_PARTITION_BY_CSLICE + : UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } + break; + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: + return ReturnValue(ur_device_affinity_domain_flag_t( + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA | + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE)); + case UR_DEVICE_INFO_PARTITION_TYPE: { + // For root-device there is no partitioning to report. 
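
The partition-properties cases above return a zero-terminated array by wrapping it in a local struct, so the return helper can copy it by value like any scalar. A standalone sketch of that wrapping trick with plain integers (returnValue below is a heavily simplified stand-in for the adapter's return helper, with no size checking):

#include <cstdio>
#include <cstring>

// Copy a value of known size into a caller-provided buffer, the way the
// return helper does (greatly simplified).
template <typename T> void returnValue(void *Dst, const T &Value) {
  std::memcpy(Dst, &Value, sizeof(T));
}

int main() {
  // Wrapping a fixed-size, zero-terminated array in a struct lets it be
  // returned by value through the same templated path as scalars.
  struct {
    int Arr[3];
  } Properties = {{42, 7, 0}}; // trailing 0 terminates the list

  int Out[3] = {};
  returnValue(Out, Properties);
  std::printf("%d %d %d\n", Out[0], Out[1], Out[2]);
  return 0;
}
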
+ if (!Device->isSubDevice()) + return ReturnValue(ur_device_partition_property_t(0)); + + if (Device->isCCS()) { + struct { + ur_device_partition_property_t Arr[2]; + } PartitionProperties = { + {UR_DEVICE_PARTITION_BY_CSLICE, ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + } + + struct { + ur_device_partition_property_t Arr[3]; + } PartitionProperties = { + {UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + (ur_device_partition_property_t) + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE, + ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + } + + // Everything under here is not supported yet + + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: + return ReturnValue(""); + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: + return ReturnValue(uint32_t{true}); + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); + case UR_DEVICE_INFO_PROFILE: + return ReturnValue("FULL_PROFILE"); + case UR_DEVICE_INFO_BUILT_IN_KERNELS: + // TODO: To find out correct value + return ReturnValue(""); + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: + return ReturnValue(ur_device_exec_capability_flag_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); + case UR_DEVICE_INFO_ENDIAN_LITTLE: + return ReturnValue(uint32_t{true}); + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_ECC}); + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: + return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: + return ReturnValue(uint32_t{64}); + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: + return ReturnValue( + // TODO[1.0]: how to query cache line-size? + uint32_t{1}); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: + return ReturnValue(uint64_t{Device->ZeDeviceCacheProperties->cacheSize}); + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize}); + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: + // SYCL/OpenCL spec is vague on what this means exactly, but seems to + // be for "alignment requirement (in bits) for sub-buffer offsets." + // An OpenCL implementation returns 8*128, but Level Zero can do just 8, + // meaning unaligned access for values of types larger than 8 bits. 
+ return ReturnValue(uint32_t{8}); + case UR_DEVICE_INFO_MAX_SAMPLERS: + return ReturnValue(uint32_t{Device->ZeDeviceImageProperties->maxSamplers}); + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxReadImageArgs}); + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + uint64_t SingleFPValue = 0; + ze_device_fp_flags_t ZeSingleFPCapabilities = + Device->ZeDeviceModuleProperties->fp32flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{SingleFPValue}); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + uint64_t HalfFPValue = 0; + ze_device_fp_flags_t ZeHalfFPCapabilities = + Device->ZeDeviceModuleProperties->fp16flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{HalfFPValue}); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + uint64_t DoubleFPValue = 0; + ze_device_fp_flags_t ZeDoubleFPCapabilities = + Device->ZeDeviceModuleProperties->fp64flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { + DoubleFPValue |= 
UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{DoubleFPValue}); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceImageProperties->maxImageArraySlices}); + // Handle SIMD widths. + // TODO: can we do better than this? + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); + uint32_t MinSubGroupSize = + Device->ZeDeviceComputeProperties->subGroupSizes[0]; + for (uint32_t I = 1; + I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) { + if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I]) + MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I]; + } + return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize / + MinSubGroupSize); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // TODO: Not supported yet. Needs to be updated after support is added. + return ReturnValue(uint32_t{false}); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the + // expected return is size_t datatype. size_t can be 8 bytes of data. + return ReturnValue.template operator()( + Device->ZeDeviceComputeProperties->subGroupSizes, + Device->ZeDeviceComputeProperties->numSubGroupSizes); + } + case UR_DEVICE_INFO_IL_VERSION: { + // Set to a space separated list of IL version strings of the form + // _.. + // "SPIR-V" is a required IL prefix when cl_khr_il_progam extension is + // reported. 
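
The single-, half- and double-precision cases above repeat the same translation from ze_device_fp_flags_t bits to UR capability flags. A sketch of that translation factored into one helper; the flag names are the ones used above, while the factoring itself is only an illustration and not part of the patch:

#include <cstdint>
#include <level_zero/ze_api.h>
#include <ur_api.h>

// Translate Level Zero FP capability bits into UR FP capability flags.
static uint64_t mapFpFlags(ze_device_fp_flags_t Ze) {
  uint64_t Ur = 0;
  if (Ze & ZE_DEVICE_FP_FLAG_DENORM)
    Ur |= UR_FP_CAPABILITY_FLAG_DENORM;
  if (Ze & ZE_DEVICE_FP_FLAG_INF_NAN)
    Ur |= UR_FP_CAPABILITY_FLAG_INF_NAN;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_INF)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF;
  if (Ze & ZE_DEVICE_FP_FLAG_FMA)
    Ur |= UR_FP_CAPABILITY_FLAG_FMA;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT)
    Ur |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT;
  return Ur;
}

// Possible usage in the switch above:
//   return ReturnValue(mapFpFlags(Device->ZeDeviceModuleProperties->fp32flags));
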
+ uint32_t SpirvVersion = + Device->ZeDeviceModuleProperties->spirvVersionSupported; + uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion); + uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion); + + char SpirvVersionString[50]; + int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor, + SpirvVersionMinor); + // returned string to contain only len number of characters. + std::string ILVersion(SpirvVersionString, Len); + return ReturnValue(ILVersion.c_str()); + } + case UR_DEVICE_INFO_USM_HOST_SUPPORT: + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { + uint64_t Capabilities = 0; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) + Capabilities |= UR_EXT_USM_CAPS_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) + Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT) + Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC) + Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS; + return Capabilities; + }; + auto &Props = Device->ZeDeviceMemoryAccessProperties; + switch (ParamName) { + case UR_DEVICE_INFO_USM_HOST_SUPPORT: + return ReturnValue(MapCaps(Props->hostAllocCapabilities)); + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + return ReturnValue(MapCaps(Props->deviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities)); + default: + die("urDeviceGetInfo: unexpected ParamName."); + } + } + + // intel extensions for GPU information + case UR_DEVICE_INFO_DEVICE_ID: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->deviceId}); + case UR_DEVICE_INFO_PCI_ADDRESS: { + if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { + urPrint("Set SYCL_ENABLE_PCI=1 to obtain PCI data.\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + ZesStruct ZeDevicePciProperties; + ZE2UR_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties)); + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x", + ZeDevicePciProperties.address.domain, + ZeDevicePciProperties.address.bus, + ZeDevicePciProperties.address.device, + ZeDevicePciProperties.address.function); + return ReturnValue(AddressBuffer); + } + + case UR_EXT_DEVICE_INFO_FREE_MEMORY: { + if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { + setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", + UR_RESULT_SUCCESS); + return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR; + } + // Only report device memory which zeMemAllocDevice can allocate from. + // Currently this is only the one enumerated with ordinal 0. 
+ uint64_t FreeMemory = 0; + uint32_t MemCount = 0; + ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr)); + if (MemCount != 0) { + std::vector ZesMemHandles(MemCount); + ZE2UR_CALL(zesDeviceEnumMemoryModules, + (ZeDevice, &MemCount, ZesMemHandles.data())); + for (auto &ZesMemHandle : ZesMemHandles) { + ZesStruct ZesMemProperties; + ZE2UR_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties)); + // For root-device report memory from all memory modules since that + // is what totally available in the default implicit scaling mode. + // For sub-devices only report memory local to them. + if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId == + ZesMemProperties.subdeviceId) { + + ZesStruct ZesMemState; + ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState)); + FreeMemory += ZesMemState.free; + } + } + } + return ReturnValue(FreeMemory); + } + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + // If there are not any memory modules then return 0. + if (Device->ZeDeviceMemoryProperties->first.empty()) + return ReturnValue(uint32_t{0}); + + // If there are multiple memory modules on the device then we have to report + // the value of the slowest memory. + auto Comp = [](const ze_device_memory_properties_t &A, + const ze_device_memory_properties_t &B) -> bool { + return A.maxClockRate < B.maxClockRate; + }; + auto MinIt = + std::min_element(Device->ZeDeviceMemoryProperties->first.begin(), + Device->ZeDeviceMemoryProperties->first.end(), Comp); + return ReturnValue(uint32_t{MinIt->maxClockRate}); + } + case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: { + // If there are not any memory modules then return 0. + if (Device->ZeDeviceMemoryProperties->first.empty()) + return ReturnValue(uint32_t{0}); + + // If there are multiple memory modules on the device then we have to report + // the value of the slowest memory. + auto Comp = [](const ze_device_memory_properties_t &A, + const ze_device_memory_properties_t &B) -> bool { + return A.maxBusWidth < B.maxBusWidth; + }; + auto MinIt = + std::min_element(Device->ZeDeviceMemoryProperties->first.begin(), + Device->ZeDeviceMemoryProperties->first.end(), Comp); + return ReturnValue(uint32_t{MinIt->maxBusWidth}); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + if (Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0) + // Sub-sub-device represents a particular compute index already. 
+ return ReturnValue(int32_t{1}); + + auto ZeDeviceNumIndices = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.numQueues; + return ReturnValue(int32_t(ZeDeviceNumIndices)); + } break; + case UR_DEVICE_INFO_GPU_EU_COUNT: { + uint32_t count = Device->ZeDeviceProperties->numEUsPerSubslice * + Device->ZeDeviceProperties->numSubslicesPerSlice * + Device->ZeDeviceProperties->numSlices; + return ReturnValue(uint32_t{count}); + } + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + return ReturnValue( + uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); + case UR_EXT_DEVICE_INFO_GPU_SLICES: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + return ReturnValue( + uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); + case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); + case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); + case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: + // currently not supported in level zero runtime + return UR_RESULT_ERROR_INVALID_VALUE; + case UR_DEVICE_INFO_BFLOAT16: { + // bfloat16 math functions are not yet supported on Intel GPUs. + return ReturnValue(bool{false}); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_scope_capability_flags_t result = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; + + return ReturnValue(result); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_order_capability_flags_t result = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + + return ReturnValue(result); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_scope_capability_flags_t result = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; + + return ReturnValue(result); + } + + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(capabilities); + } + + default: + urPrint("Unsupported ParamName in urGetDeviceInfo\n"); + urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or +// a pair of integer values of the form "lower_index:upper_index". +// Here, the indices point to copy engines in a list of all available copy +// engines. 
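+// For example (illustrative settings): SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+// allows all copy engines, =0 disables them, and =1:3 restricts the plugin to
+// the copy engines with indices 1 through 3 in that list.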
+// This function returns this pair of indices.
+// If the user specifies only a single integer, a value of 0 indicates that
+// the copy engines will not be used at all. A value of 1 indicates that all
+// available copy engines can be used.
+const std::pair<int, int>
+getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) {
+  static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
+  // If the environment variable is not set, no copy engines are used when
+  // immediate commandlists are being used. For standard commandlists all are
+  // used.
+  if (!EnvVar) {
+    if (Device->ImmCommandListUsed)
+      return std::pair(-1, -1);   // No copy engines can be used.
+    return std::pair(0, INT_MAX); // All copy engines will be used.
+  }
+  std::string CopyEngineRange = EnvVar;
+  // The environment variable can be a single integer or a pair of integers
+  // separated by ":".
+  auto pos = CopyEngineRange.find(":");
+  if (pos == std::string::npos) {
+    bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0);
+    if (UseCopyEngine)
+      return std::pair(0, INT_MAX); // All copy engines can be used.
+    return std::pair(-1, -1);       // No copy engines will be used.
+  }
+  int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos));
+  int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1));
+  if ((LowerCopyEngineIndex > UpperCopyEngineIndex) ||
+      (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) {
+    urPrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
+            "default set.\n");
+    LowerCopyEngineIndex = 0;
+    UpperCopyEngineIndex = INT_MAX;
+  }
+  return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex);
+}
+
+bool CopyEngineRequested(const ur_device_handle_t &Device) {
+  int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first;
+  int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second;
+  return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1));
+}
+
+// Whether immediate commandlists will be used for kernel launches and copies.
+// The default is standard commandlists. Setting 1 or 2 specifies use of
+// immediate commandlists. Note: when immediate commandlists are used then
+// device-only events must be either AllHostVisible or OnDemandHostVisibleProxy.
+// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS).
+
+// Get value of immediate commandlists env var setting or -1 if unset
+ur_device_handle_t_::ImmCmdlistMode
+ur_device_handle_t_::useImmediateCommandLists() {
+  // If the immediate commandlist setting is not explicitly set, then use the
+  // device default.
+  static const int ImmediateCommandlistsSetting = [] {
+    const char *ImmediateCommandlistsSettingStr =
+        std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
+    if (!ImmediateCommandlistsSettingStr)
+      return -1;
+    return std::stoi(ImmediateCommandlistsSettingStr);
+  }();
+
+  if (ImmediateCommandlistsSetting == -1)
+    // Change this to PerQueue as default after more testing.
+ return NotUsed; + switch (ImmediateCommandlistsSetting) { + case 0: + return NotUsed; + case 1: + return PerQueue; + case 2: + return PerThreadPerQueue; + default: + return NotUsed; + } +} + +ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, + int SubSubDeviceIndex) { + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); + std::vector> + QueueGroupProperties(numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); + + // Initialize ordinal and compute queue group properties + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + i; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties = QueueGroupProperties[i]; + break; + } + } + + // Reinitialize a sub-sub-device with its own ordinal, index. + // Our sub-sub-device representation is currently [Level-Zero sub-device + // handle + Level-Zero compute group/engine index]. Only the specified + // index queue will be used to submit work to the sub-sub-device. + if (SubSubDeviceOrdinal >= 0) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + SubSubDeviceOrdinal; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = + SubSubDeviceIndex; + } else { // Proceed with initialization for root and sub-device + // How is it possible that there are no "compute" capabilities? + if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < + 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (CopyEngineRequested((ur_device_handle_t)this)) { + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (((QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && + (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { + if (QueueGroupProperties[i].numQueues == 1) { + QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::MainCopy].ZeProperties = + QueueGroupProperties[i]; + } else { + QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = + QueueGroupProperties[i]; + break; + } + } + } + if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) + urPrint("NOTE: main blitter/copy engine is not available\n"); + else + urPrint("NOTE: main blitter/copy engine is available\n"); + + if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) + urPrint("NOTE: link blitter/copy engines are not available\n"); + else + urPrint("NOTE: link blitter/copy engines are available\n"); + } + } + + // Maintain various device properties cache. + // Note that we just describe here how to compute the data. + // The real initialization is upon first access. 
+ // + auto ZeDevice = this->ZeDevice; + ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceComputeProperties.Compute = + [ZeDevice](ze_device_compute_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceImageProperties.Compute = + [ZeDevice](ze_device_image_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceModuleProperties.Compute = + [ZeDevice](ze_device_module_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceMemoryProperties.Compute = + [ZeDevice]( + std::pair>, + std::vector>> + &Properties) { + uint32_t Count = 0; + ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, + (ZeDevice, &Count, nullptr)); + + auto &PropertiesVector = Properties.first; + auto &PropertiesExtVector = Properties.second; + + PropertiesVector.resize(Count); + PropertiesExtVector.resize(Count); + // Request for extended memory properties be read in + for (uint32_t I = 0; I < Count; ++I) + PropertiesVector[I].pNext = (void *)&PropertiesExtVector[I]; + + ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, + (ZeDevice, &Count, PropertiesVector.data())); + }; + + ZeDeviceMemoryAccessProperties.Compute = + [ZeDevice](ze_device_memory_access_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties, + (ZeDevice, &Properties)); + }; + + ZeDeviceCacheProperties.Compute = + [ZeDevice](ze_device_cache_properties_t &Properties) { + // TODO: Since v1.0 there can be multiple cache properties. + // For now remember the first one, if any. + uint32_t Count = 0; + ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, + (ZeDevice, &Count, nullptr)); + if (Count > 0) + Count = 1; + ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, + (ZeDevice, &Count, &Properties)); + }; + + ImmCommandListUsed = this->useImmediateCommandLists(); + + if (ImmCommandListUsed == ImmCmdlistMode::NotUsed) { + ZeEventsScope = DeviceEventsSetting; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRetain(ur_device_handle_t Device) { + + // The root-device ref-count remains unchanged (always 1). + if (Device->isSubDevice()) { + Device->RefCount.increment(); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRelease(ur_device_handle_t Device) { + // Root devices are destroyed during the piTearDown process. + if (Device->isSubDevice()) { + if (Device->RefCount.decrementAndTest()) { + delete Device; + } + } + + return UR_RESULT_SUCCESS; +} + +void ZeUSMImportExtension::setZeUSMImport(ur_platform_handle_t_ *Platform) { + // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting + // host ptr import during buffer creation. + const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); + if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0) + return; + + // Check if USM hostptr import feature is available. 
+ ze_driver_handle_t DriverHandle = Platform->ZeDriver; + if (ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (DriverHandle, "zexDriverImportExternalPointer", + reinterpret_cast(&zexDriverImportExternalPointer))) == 0) { + ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (DriverHandle, "zexDriverReleaseImportedPointer", + reinterpret_cast(&zexDriverReleaseImportedPointer))); + // Hostptr import/release is turned on because it has been requested + // by the env var, and this platform supports the APIs. + Enabled = true; + // Hostptr import is only possible if piMemBufferCreate receives a + // hostptr as an argument. The SYCL runtime passes a host ptr + // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on. + setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1"); + } +} +void ZeUSMImportExtension::doZeUSMImport(ze_driver_handle_t DriverHandle, + void *HostPtr, size_t Size) { + ZE_CALL_NOCHECK(zexDriverImportExternalPointer, + (DriverHandle, HostPtr, Size)); +} +void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, + void *HostPtr) { + ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( + ur_device_handle_t Device, ///< [in] handle of the device to partition. + const ur_device_partition_property_t + *Properties, ///< [in] null-terminated array of <$_device_partition_t + ///< enum, value> pairs. + uint32_t NumDevices, ///< [in] the number of sub-devices. + ur_device_handle_t + *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle + ///< of devices. If NumDevices is less than the number of + ///< sub-devices available, then the function shall only + ///< retrieve that number of sub-devices. + uint32_t *NumDevicesRet ///< [out][optional] pointer to the number of + ///< sub-devices the device can be partitioned into + ///< according to the partitioning property. +) { + // Other partitioning ways are not supported by Level Zero + if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + if ((Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && + Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { + if (Properties[1] != 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Devices cache is normally created in piDevicesGet but still make + // sure that cache is populated. + // + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { + if (Device->SubDevices.size() == 0) + return 0; + + // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. + // However, if + // SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // still expose CSlices in partitioning by affinity domain for compatibility + // reasons. + if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && + !ExposeCSliceInAffinityPartitioning) { + if (Device->isSubDevice()) { + return 0; + } + } + if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { + // Not a CSlice-based partitioning. + if (!Device->SubDevices[0]->isCCS()) { + return 0; + } + } + + return Device->SubDevices.size(); + }(); + + // TODO: Consider support for partitioning to <= total sub-devices. 
+ // Currently supported partitioning (by affinity domain/numa) would always + // partition to all sub-devices. + // + if (NumDevices != 0) + UR_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); + + for (uint32_t I = 0; I < NumDevices; I++) { + OutDevices[I] = Device->SubDevices[I]; + // reusing the same pi_device needs to increment the reference count + urDeviceRetain(OutDevices[I]); + } + + if (NumDevicesRet) { + *NumDevicesRet = EffectiveNumDevices; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t + Device, ///< [in] handle of the device to select binary for. + const uint8_t **BinaryArray, ///< [in] the array of binaries to select from. + uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. + ///< Must greater than or equal to zero otherwise + ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. + uint32_t + *SelectedBinary ///< [out] the index of the selected binary in the input + ///< array of binaries. If a suitable binary was not + ///< found the function returns ${X}_INVALID_BINARY. +) { + // TODO: this is a bare-bones implementation for choosing a device image + // that would be compatible with the targeted device. An AOT-compiled + // image is preferred over SPIR-V for known devices (i.e. Intel devices) + // The implementation makes no effort to differentiate between multiple images + // for the given device, and simply picks the first one compatible. + // + // Real implementation will use the same mechanism OpenCL ICD dispatcher + // uses. Something like: + // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); + // return context->dispatch->piextDeviceSelectIR( + // ctx, images, num_images, selected_image); + // where context->dispatch is set to the dispatch table provided by PI + // plugin for platform/device the ctx was created for. + + // Look for GEN binary, which we known can only be handled by Level-Zero now. + const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; + + pi_device_binary *Binaries = + reinterpret_cast(const_cast(BinaryArray)); + + uint32_t *SelectedBinaryInd = SelectedBinary; + + // Find the appropriate device image, fallback to spirv if not found + constexpr uint32_t InvalidInd = std::numeric_limits::max(); + uint32_t Spirv = InvalidInd; + + for (uint32_t i = 0; i < NumBinaries; ++i) { + if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { + *SelectedBinaryInd = i; + return UR_RESULT_SUCCESS; + } + if (strcmp(Binaries[i]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + Spirv = i; + } + // Points to a spirv image, if such indeed was found + if ((*SelectedBinaryInd = Spirv) != InvalidInd) + return UR_RESULT_SUCCESS; + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( + ur_device_handle_t Device, ///< [in] handle of the device. + ur_native_handle_t + *NativeDevice ///< [out] a pointer to the native handle of the device. +) { + *NativeDevice = reinterpret_cast(Device->ZeDevice); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( + ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + ur_device_handle_t + *Device ///< [out] pointer to the handle of the device object created. 
+) { + auto ZeDevice = ur_cast(NativeDevice); + + // The SYCL spec requires that the set of devices must remain fixed for the + // duration of the application's execution. We assume that we found all of the + // Level Zero devices when we initialized the platforms/devices cache, so the + // "NativeHandle" must already be in the cache. If it is not, this must not be + // a valid Level Zero device. + // + // TODO: maybe we should populate cache of platforms if it wasn't already. + // For now assert that is was populated. + UR_ASSERT(PiPlatformCachePopulated, UR_RESULT_ERROR_INVALID_VALUE); + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + + ur_device_handle_t Dev = nullptr; + for (ur_platform_handle_t ThePlatform : *PiPlatformsCache) { + Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); + if (Dev) { + // Check that the input Platform, if was given, matches the found one. + UR_ASSERT(!Platform || Platform == ThePlatform, + UR_RESULT_ERROR_INVALID_PLATFORM); + break; + } + } + + if (Dev == nullptr) + return UR_RESULT_ERROR_INVALID_VALUE; + + *Device = Dev; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( + ur_device_handle_t Device, ///< [in] handle of the device instance + uint64_t *DeviceTimestamp, ///< [out][optional] pointer to the Device's + ///< global timestamp that correlates with the + ///< Host's global timestamp value + uint64_t *HostTimestamp ///< [out][optional] pointer to the Host's global + ///< timestamp that correlates with the Device's + ///< global timestamp value +) { + const uint64_t &ZeTimerResolution = + Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxCount = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + uint64_t DeviceClockCount, Dummy; + + ZE2UR_CALL(zeDeviceGetGlobalTimestamps, + (Device->ZeDevice, + HostTimestamp == nullptr ? &Dummy : HostTimestamp, + &DeviceClockCount)); + + if (DeviceTimestamp != nullptr) { + *DeviceTimestamp = + (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index ecbc9cc6a21d2..09e942a6441b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -7,4 +7,160 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" + +enum EventsScope { + // All events are created host-visible. + AllHostVisible, + // All events are created with device-scope and only when + // host waits them or queries their status that a proxy + // host-visible event is created and set to signal after + // original event signals. + OnDemandHostVisibleProxy, + // All events are created with device-scope and only + // when a batch of commands is submitted for execution a + // last command in that batch is added to signal host-visible + // completion of each command in this batch (the default mode). 
+ LastCommandInBatchHostVisible +}; + +struct ur_device_handle_t_ : _ur_object { + ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, + ur_device_handle_t ParentDevice = nullptr) + : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, + ZeDeviceProperties{}, ZeDeviceComputeProperties{} { + // NOTE: one must additionally call initialize() to complete + // UR device creation. + } + + // The helper structure that keeps info about a command queue groups of the + // device. It is not changed after it is initialized. + struct queue_group_info_t { + enum type { + MainCopy, + LinkCopy, + Compute, + Size // must be last + }; + + // Keep the ordinal of the commands group as returned by + // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that + // there is no such queue group available in the Level Zero runtime. + int32_t ZeOrdinal{-1}; + + // Keep the index of the specific queue in this queue group where + // all the command enqueues of the corresponding type should go to. + // The value of "-1" means that no hard binding is defined and + // implementation can choose specific queue index on its own. + int32_t ZeIndex{-1}; + + // Keeps the queue group properties. + ZeStruct ZeProperties; + }; + + std::vector QueueGroup = + std::vector(queue_group_info_t::Size); + + // This returns "true" if a main copy engine is available for use. + bool hasMainCopyEngine() const { + return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0; + } + + // This returns "true" if a link copy engine is available for use. + bool hasLinkCopyEngine() const { + return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0; + } + + // This returns "true" if a main or link copy engine is available for use. + bool hasCopyEngine() const { + return hasMainCopyEngine() || hasLinkCopyEngine(); + } + + // Initialize the entire UR device. + // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute + // command queue ordinal and index respectively, used to initialize + // sub-sub-devices. + ur_result_t initialize(int SubSubDeviceOrdinal = -1, + int SubSubDeviceIndex = -1); + + // Level Zero device handle. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + const ze_device_handle_t ZeDevice; + + // Keep the subdevices that are partitioned from this ur_device_handle_t for + // reuse The order of sub-devices in this vector is repeated from the + // ze_device_handle_t array that are returned from zeDeviceGetSubDevices() + // call, which will always return sub-devices in the fixed same order. + std::vector SubDevices; + + // PI platform to which this device belongs. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + ur_platform_handle_t Platform; + + // Root-device of a sub-device, null if this is not a sub-device. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + const ur_device_handle_t RootDevice; + + enum ImmCmdlistMode { + // Immediate commandlists are not used. + NotUsed = 0, + // One set of compute and copy immediate commandlists per queue. + PerQueue, + // One set of compute and copy immediate commandlists per host thread that + // accesses the queue. 
+ PerThreadPerQueue + }; + // Read env settings to select immediate commandlist mode. + ImmCmdlistMode useImmediateCommandLists(); + + // Returns whether immediate command lists are used on this device. + ImmCmdlistMode ImmCommandListUsed{}; + + // Scope of events used for events on the device + // Can be adjusted with SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS + // for non-immediate command lists + EventsScope ZeEventsScope = AllHostVisible; + + bool isSubDevice() { return RootDevice != nullptr; } + + // Is this a Data Center GPU Max series (aka PVC). + bool isPVC() { return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; } + + // Does this device represent a single compute slice? + bool isCCS() const { + return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0; + } + + // Cache of the immutable device properties. + ZeCache> ZeDeviceProperties; + ZeCache> ZeDeviceComputeProperties; + ZeCache> ZeDeviceImageProperties; + ZeCache> ZeDeviceModuleProperties; + ZeCache>, + std::vector>>> + ZeDeviceMemoryProperties; + ZeCache> + ZeDeviceMemoryAccessProperties; + ZeCache> ZeDeviceCacheProperties; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 2889db7884b0e..318a931d608f3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -6,4 +6,1171 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" +#include + +void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { + urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); + + for (uint32_t I = 0; I < UrZeEventList.Length; I++) { + urPrint(" %#llx", ur_cast(UrZeEventList.ZeEventList[I])); + } + + urPrint("\n"); +} + +// This is an experimental option that allows the use of multiple command lists +// when submitting barriers. The default is 0. +static const bool UseMultipleCmdlistBarriers = [] { + const char *UseMultipleCmdlistBarriersFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + if (!UseMultipleCmdlistBarriersFlag) + return true; + return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; +}(); + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + if (EventWaitList) { + bool UseCopyEngine = false; + + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList = {}; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &WaitList = (*Event)->WaitList; + auto ZeCommandList = CommandList->first; + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + // Execute command list asynchronously as the event will be used + // to track down its completion. + return Queue->executeCommandList(CommandList); + } + + { + // If wait-list is empty, then this particular command should wait until + // all previous enqueued commands to the command-queue have completed. + // + // TODO: find a way to do that without blocking the host. + + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + if (OutEvent) { + UR_CALL(createEventAndAssociateQueue(Queue, OutEvent, + UR_EXT_COMMAND_TYPE_USER, + Queue->CommandListMap.end(), + /* IsInternal */ false)); + } + + Queue->synchronize(); + + if (OutEvent) { + Queue->LastCommandEvent = reinterpret_cast(*OutEvent); + + ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent)); + (*OutEvent)->Completed = true; + } + } + + if (!Queue->Device->ImmCommandListUsed) { + std::unique_lock Lock(Queue->Mutex); + resetCommandLists(Queue); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + // Helper function for appending a barrier to a command list. + auto insertBarrierIntoCmdList = + [&Queue](ur_command_list_ptr_t CmdList, + const _ur_ze_event_list_t &EventWaitList, + ur_event_handle_t &Event, bool IsInternal) { + UR_CALL(createEventAndAssociateQueue( + Queue, &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, IsInternal)); + + Event->WaitList = EventWaitList; + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + return UR_RESULT_SUCCESS; + }; + + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + // Indicator for whether batching is allowed. 
This may be changed later in + // this function, but allow it by default. + bool OkToBatch = true; + + // If we have a list of events to make the barrier from, then we can create a + // barrier on these and use the resulting event as our future barrier. + // We use the same approach if + // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a + // positive value. + // We use the same approach if we have in-order queue because every command + // depends on previous one, so we don't need to insert barrier to multiple + // command lists. + if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || + Queue->isInOrderQueue()) { + // Retain the events as they will be owned by the result event. + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); + + // Get an arbitrary command-list in the queue. + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + + // Insert the barrier into the command-list and execute. + UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)); + + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + + // Because of the dependency between commands in the in-order queue we don't + // need to keep track of any active barriers if we have in-order queue. + if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { + auto UREvent = reinterpret_cast(*Event); + Queue->ActiveBarriers.add(UREvent); + } + return UR_RESULT_SUCCESS; + } + + // Since there are no events to explicitly create a barrier for, we are + // inserting a queue-wide barrier. + + // Command list(s) for putting barriers. + std::vector CmdLists; + + // There must be at least one L0 queue. + auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); + auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); + UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), + UR_RESULT_ERROR_INVALID_QUEUE); + + size_t NumQueues = 0; + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + NumQueues += QueueGroup.second.ZeQueues.size(); + + OkToBatch = true; + // Get an available command list tied to each command queue. We need + // these so a queue-wide barrier can be inserted into each command + // queue. + CmdLists.reserve(NumQueues); + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) { + bool UseCopyEngine = + QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; + if (Queue->Device->ImmCommandListUsed) { + // If immediate command lists are being used, each will act as their own + // queue, so we must insert a barrier into each. + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + if (ImmCmdList != Queue->CommandListMap.end()) + CmdLists.push_back(ImmCmdList); + } else { + for (auto ZeQueue : QueueGroup.second.ZeQueues) { + if (ZeQueue) { + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)); + CmdLists.push_back(CmdList); + } + } + } + } + + // If no activity has occurred on the queue then there will be no cmdlists. + // We need one for generating an Event, so create one. + if (CmdLists.size() == 0) { + // Get any available command list. 
+ ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + CmdLists.push_back(CmdList); + } + + if (CmdLists.size() > 1) { + // Insert a barrier into each unique command queue using the available + // command-lists. + std::vector EventWaitVector(CmdLists.size()); + for (size_t I = 0; I < CmdLists.size(); ++I) { + UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{}, + EventWaitVector[I], + true /*IsInternal*/)); + } + // If there were multiple queues we need to create a "convergence" event to + // be our active barrier. This convergence event is signalled by a barrier + // on all the events from the barriers we have inserted into each queue. + // Use the first command list as our convergence command list. + ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; + + // Create an event list. It will take ownership over all relevant events so + // we relinquish ownership and let it keep all events it needs. + _ur_ze_event_list_t BaseWaitList; + UR_CALL(BaseWaitList.createAndRetainUrZeEventList( + EventWaitVector.size(), + reinterpret_cast(EventWaitVector.data()), + Queue, ConvergenceCmdList->second.isCopy(Queue))); + + // Insert a barrier with the events from each command-queue into the + // convergence command list. The resulting event signals the convergence of + // all barriers. + UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event, + IsInternal)); + } else { + // If there is only a single queue then insert a barrier and the single + // result event can be used as our active barrier and used as the return + // event. Take into account whether output event is discarded or not. + UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event, + IsInternal)); + } + + // Execute each command list so the barriers can be encountered. + for (ur_command_list_ptr_t &CmdList : CmdLists) + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + + UR_CALL(Queue->ActiveBarriers.clear()); + auto UREvent = reinterpret_cast(*Event); + Queue->ActiveBarriers.add(UREvent); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( + ur_event_handle_t Event, ///< [in] handle of the event object + ur_event_info_t PropName, ///< [in] the name of the event property to query + size_t PropValueSize, ///< [in] size in bytes of the event property value + void *PropValue, ///< [out][optional] value of the event property + size_t + *PropValueSizeRet ///< [out][optional] bytes returned in event property +) { + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + + switch (PropName) { + case UR_EVENT_INFO_COMMAND_QUEUE: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_queue_handle_t{Event->UrQueue}); + } + case UR_EVENT_INFO_CONTEXT: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_context_handle_t{Event->Context}); + } + case UR_EVENT_INFO_COMMAND_TYPE: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_cast(Event->CommandType)); + } + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: { + // Check to see if the event's Queue has an open command list due to + // batching. If so, go ahead and close and submit it, because it is + // possible that this is trying to query some event's status that + // is part of the batch. This isn't strictly required, but it seems + // like a reasonable thing to do. 
+ auto UrQueue = Event->UrQueue; + if (UrQueue) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(UrQueue->Mutex); + const auto &OpenCommandList = UrQueue->eventOpenCommandList(Event); + if (OpenCommandList != UrQueue->CommandListMap.end()) { + UR_CALL(UrQueue->executeOpenCommandList( + OpenCommandList->second.isCopy(UrQueue))); + } + } + + // Level Zero has a much more explicit notion of command submission than + // OpenCL. It doesn't happen unless the user submits a command list. We've + // done it just above so the status is at least PI_EVENT_RUNNING. + uint32_t Result = ur_cast(UR_EVENT_STATUS_RUNNING); + + // Make sure that we query a host-visible event only. + // If one wasn't yet created then don't create it here as well, and + // just conservatively return that event is not yet completed. + std::shared_lock EventLock(Event->Mutex); + auto HostVisibleEvent = Event->HostVisibleEvent; + if (Event->Completed) { + Result = UR_EVENT_STATUS_COMPLETE; + } else if (HostVisibleEvent) { + ze_result_t ZeResult; + ZeResult = + ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent)); + if (ZeResult == ZE_RESULT_SUCCESS) { + Result = UR_EVENT_STATUS_COMPLETE; + } + } + return ReturnValue(Result); + return UR_RESULT_SUCCESS; + } + case UR_EVENT_INFO_REFERENCE_COUNT: { + return ReturnValue(Event->RefCount.load()); + } + default: + urPrint("Unsupported ParamName in urEventGetInfo: ParamName=%d(%x)\n", + PropName, PropName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t Event, ///< [in] handle of the event object + ur_profiling_info_t + PropName, ///< [in] the name of the profiling property to query + size_t + PropValueSize, ///< [in] size in bytes of the profiling property value + void *PropValue, ///< [out][optional] value of the profiling property + size_t *PropValueSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes returned in propValue +) { + std::shared_lock EventLock(Event->Mutex); + if (Event->UrQueue && + (Event->UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + ur_device_handle_t Device = + Event->UrQueue ? Event->UrQueue->Device : Event->Context->Devices[0]; + + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + + ze_kernel_timestamp_result_t tsResult; + + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_START: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); + uint64_t ContextStartTime = + (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution; + return ReturnValue(ContextStartTime); + } + case UR_PROFILING_INFO_COMMAND_END: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); + + uint64_t ContextStartTime = + (tsResult.global.kernelStart & TimestampMaxValue); + uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue); + + // + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
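+    // For example (illustrative numbers): with 32 valid timestamp bits,
+    // TimestampMaxValue is 2^32 - 1; a masked start of 0xFFFFFF00 and a
+    // masked end of 0x100 would otherwise give END < START, so adding
+    // TimestampMaxValue to the end value restores the ordering before the
+    // result is scaled by the timer resolution.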
+ // + if (ContextEndTime <= ContextStartTime) { + ContextEndTime += TimestampMaxValue; + } + ContextEndTime *= ZeTimerResolution; + return ReturnValue(ContextEndTime); + } + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No users for this case + // TODO: Implement commmand submission time when needed, + // by recording device timestamp (using zeDeviceGetGlobalTimestamps) + // before submitting command to device + return ReturnValue(uint64_t{0}); + default: + urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( + ze_event_handle_t &ZeHostVisibleEvent) { + + std::scoped_lock Lock(UrQueue->Mutex, + this->Mutex); + + if (!HostVisibleEvent) { + if (UrQueue->Device->ZeEventsScope != OnDemandHostVisibleProxy) + die("getOrCreateHostVisibleEvent: missing host-visible event"); + + // Submit the command(s) signalling the proxy event to the queue. + // We have to first submit a wait for the device-only event for which this + // proxy is created. + // + // Get a new command list to be used on this call + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + ur_command_list_ptr_t CommandList{}; + UR_CALL(UrQueue->Context->getAvailableCommandList( + UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch)) + + // Create a "proxy" host-visible event. + UR_CALL(createEventAndAssociateQueue( + UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + + UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) + } + + ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventWait( + uint32_t NumEvents, ///< [in] number of events in the event list + const ur_event_handle_t + *EventWaitList ///< [in][range(0, numEvents)] pointer to a list of + ///< events to wait for completion +) { + for (uint32_t I = 0; I < NumEvents; I++) { + if (EventWaitList[I]->UrQueue->Device->ZeEventsScope == + OnDemandHostVisibleProxy) { + // Make sure to add all host-visible "proxy" event signals if needed. + // This ensures that all signalling commands are submitted below and + // thus proxy events can be waited without a deadlock. + // + ur_event_handle_t_ *Event = + ur_cast(EventWaitList[I]); + if (!Event->hasExternalRefs()) + die("urEventsWait must not be called for an internal event"); + + ze_event_handle_t ZeHostVisibleEvent; + if (auto Res = Event->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) + return Res; + } + } + // Submit dependent open command lists for execution, if any + for (uint32_t I = 0; I < NumEvents; I++) { + ur_event_handle_t_ *Event = ur_cast(EventWaitList[I]); + auto UrQueue = Event->UrQueue; + if (UrQueue) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(UrQueue->Mutex); + + UR_CALL(UrQueue->executeAllOpenCommandLists()); + } + } + std::unordered_set Queues; + for (uint32_t I = 0; I < NumEvents; I++) { + { + ur_event_handle_t_ *Event = + ur_cast(EventWaitList[I]); + { + std::shared_lock EventLock(Event->Mutex); + if (!Event->hasExternalRefs()) + die("piEventsWait must not be called for an internal event"); + + if (!Event->Completed) { + auto HostVisibleEvent = Event->HostVisibleEvent; + if (!HostVisibleEvent) + die("The host-visible proxy event missing"); + + ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; + urPrint("ZeEvent = %#llx\n", ur_cast(ZeEvent)); + ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + Event->Completed = true; + } + } + if (auto Q = Event->UrQueue) { + if (Q->Device->ImmCommandListUsed && Q->isInOrderQueue()) + // Use information about waited event to cleanup completed events in + // the in-order queue. + CleanupEventsInImmCmdLists( + Event->UrQueue, false /* QueueLocked */, false /* QueueSynced */, + reinterpret_cast(Event)); + else { + // NOTE: we are cleaning up after the event here to free resources + // sooner in case run-time is not calling piEventRelease soon enough. + CleanupCompletedEvent(reinterpret_cast(Event)); + // For the case when we have out-of-order queue or regular command + // lists its more efficient to check fences so put the queue in the + // set to cleanup later. + Queues.insert(Q); + } + } + } + } + + // We waited some events above, check queue for signaled command lists and + // reset them. + for (auto &Q : Queues) { + std::unique_lock Lock(Q->Mutex); + resetCommandLists(Q); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain( + ur_event_handle_t Event ///< [in] handle of the event object +) { + Event->RefCountExternal++; + Event->RefCount.increment(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease( + ur_event_handle_t Event ///< [in] handle of the event object +) { + Event->RefCountExternal--; + UR_CALL(urEventReleaseInternal(Event)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t Event, ///< [in] handle of the event. + ur_native_handle_t + *NativeEvent ///< [out] a pointer to the native handle of the event. +) { + { + std::shared_lock Lock(Event->Mutex); + auto *ZeEvent = ur_cast(NativeEvent); + *ZeEvent = Event->ZeEvent; + } + // Event can potentially be in an open command-list, make sure that + // it is submitted for execution to avoid potential deadlock if + // interop app is going to wait for it. + auto Queue = Event->UrQueue; + if (Queue) { + std::scoped_lock lock(Queue->Mutex); + const auto &OpenCommandList = Queue->eventOpenCommandList(Event); + if (OpenCommandList != Queue->CommandListMap.end()) { + UR_CALL( + Queue->executeOpenCommandList(OpenCommandList->second.isCopy(Queue))); + } + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_event_handle_t + *Event ///< [out] pointer to the handle of the event object created. +) { + UR_CALL(EventCreate(Context, nullptr, true, Event)); + + (*Event)->RefCountExternal++; + ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t NativeEvent, ///< [in] the native handle of the event. 
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    ur_event_handle_t
+        *Event ///< [out] pointer to the handle of the event object created.
+) {
+
+  // We don't have urEventCreate, so use this check for now to know that
+  // the call comes from piEventCreate().
+  if (NativeEvent == nullptr) {
+    UR_CALL(EventCreate(Context, nullptr, true, Event));
+
+    (*Event)->RefCountExternal++;
+    ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
+    return UR_RESULT_SUCCESS;
+  }
+
+  auto ZeEvent = ur_cast<ze_event_handle_t>(NativeEvent);
+  ur_event_handle_t_ *UrEvent{};
+  try {
+    UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */,
+                                     Context, UR_EXT_COMMAND_TYPE_USER, true);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  // Assume the native event is host-visible, or otherwise we'd
+  // need to create a host-visible proxy for it.
+  UrEvent->HostVisibleEvent = reinterpret_cast<ur_event_handle_t>(UrEvent);
+
+  // Unlike regular events managed by the SYCL RT, we don't have to wait for
+  // interop event completion, and don't need to do their `cleanup()`. This in
+  // particular guarantees that the extra `piEventRelease` is not called on
+  // them. That release is needed to match the `piEventRetain` of regular
+  // events made for waiting for event completion, but not this interop event.
+  UrEvent->CleanedUp = true;
+
+  *Event = reinterpret_cast<ur_event_handle_t>(UrEvent);
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(
+    ur_event_handle_t Event, ///< [in] handle of the event object
+    ur_execution_info_t ExecStatus, ///< [in] execution status of the event
+    ur_event_callback_t Notify, ///< [in] execution status of the event
+    void *UserData ///< [in][out][optional] pointer to data to be passed to
+                   ///< callback.
+) {
+  std::ignore = Event;
+  std::ignore = ExecStatus;
+  std::ignore = Notify;
+  std::ignore = UserData;
+  urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__);
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
+  if (!Event->RefCount.decrementAndTest())
+    return UR_RESULT_SUCCESS;
+
+  if (Event->CommandType == UR_COMMAND_MEM_UNMAP && Event->CommandData) {
+    // Free the memory allocated in the piEnqueueMemBufferMap.
+    if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData))
+      return Res;
+    Event->CommandData = nullptr;
+  }
+  if (Event->OwnNativeHandle) {
+    if (DisableEventsCaching) {
+      auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
+      // Gracefully handle the case that L0 was already unloaded.
+      if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
+        return ze2urResult(ZeResult);
+      auto Context = Event->Context;
+      if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
+        return Res;
+    }
+  }
+  // It is possible that the host-visible event was never created.
+  // In case it was, check whether it is different from this same event
+  // and release a reference to it.
+  if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) {
+    // Decrement ref-count of the host-visible proxy event.
+    UR_CALL(urEventReleaseInternal(Event->HostVisibleEvent));
+  }
+
+  // Save a pointer to the queue before deleting/resetting the event.
+  // When we add an event to the cache we need to check whether profiling is
+  // enabled or not, so we access properties of the queue and that's why the
+  // queue must be released later.
+ auto Queue = Event->UrQueue; + if (DisableEventsCaching || !Event->OwnNativeHandle) { + delete Event; + } else { + Event->Context->addEventToContextCache(Event); + } + + // We intentionally incremented the reference counter when an event is + // created so that we can avoid pi_queue is released before the associated + // pi_event is released. Here we have to decrement it so pi_queue + // can be released successfully. + if (Queue) { + UR_CALL(urQueueReleaseInternal(Queue)); + } + + return UR_RESULT_SUCCESS; +} + +// Helper function to implement zeHostSynchronize. +// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. +// This allows for a much more responsive debugging of hangs. +// +template +ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) { + if (!UrL0Debug) { + return Api(Handle, UINT64_MAX); + } + + ze_result_t R; + while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY) + ; + return R; +} + +// Template function to do various types of host synchronizations. +// This is intended to be used instead of direct calls to specific +// Level-Zero synchronization APIs. +// +template ze_result_t zeHostSynchronize(T Handle); +template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) { + return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle); +} +template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) { + return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle); +} + +// Perform any necessary cleanup after an event has been signalled. +// This currently makes sure to release any kernel that may have been used by +// the event, updates the last command event in the queue and cleans up all dep +// events of the event. +// If the caller locks queue mutex then it must pass 'true' to QueueLocked. +ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked) { + ur_kernel_handle_t AssociatedKernel = nullptr; + // List of dependent events. + std::list EventsToBeReleased; + ur_queue_handle_t AssociatedQueue = nullptr; + { + std::scoped_lock EventLock(Event->Mutex); + // Exit early of event was already cleanedup. + if (Event->CleanedUp) + return UR_RESULT_SUCCESS; + + AssociatedQueue = Event->UrQueue; + + // Remember the kernel associated with this event if there is one. We are + // going to release it later. + if (Event->CommandType == UR_COMMAND_KERNEL_LAUNCH && Event->CommandData) { + AssociatedKernel = + reinterpret_cast(Event->CommandData); + Event->CommandData = nullptr; + } + + // Make a list of all the dependent events that must have signalled + // because this event was dependent on them. + Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + + Event->CleanedUp = true; + } + + auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) { + if (IndirectAccessTrackingEnabled) { + // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as + // kernel execution has finished. This is the place where we need to + // release memory allocations. If kernel is not in use (not submitted by + // some other thread) then release referenced memory allocations. As a + // result, memory can be deallocated and context can be removed from + // container in the platform. That's why we need to lock a mutex here. + ur_platform_handle_t Plt = Kernel->Program->Context->getPlatform(); + std::scoped_lock ContextsLock(Plt->ContextsMutex); + + if (--Kernel->SubmissionsCount == 0) { + // Kernel is not submitted for execution, release referenced memory + // allocations. 
+ for (auto &MemAlloc : Kernel->MemAllocs) { + // std::pair *, Hash + USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, + MemAlloc->second.OwnZeMemHandle); + } + Kernel->MemAllocs.clear(); + } + } + }; + + // We've reset event data members above, now cleanup resources. + if (AssociatedKernel) { + ReleaseIndirectMem(AssociatedKernel); + UR_CALL(urKernelRelease(AssociatedKernel)); + } + + if (AssociatedQueue) { + { + // Lock automatically releases when this goes out of scope. + std::unique_lock QueueLock(AssociatedQueue->Mutex, + std::defer_lock); + if (!QueueLocked) + QueueLock.lock(); + + // If this event was the LastCommandEvent in the queue, being used + // to make sure that commands were executed in-order, remove this. + // If we don't do this, the event can get released and freed leaving + // a dangling pointer to this event. It could also cause unneeded + // already finished events to show up in the wait list. + if (AssociatedQueue->LastCommandEvent == Event) { + AssociatedQueue->LastCommandEvent = nullptr; + } + } + + // Release this event since we explicitly retained it on creation and + // association with queue. Events which don't have associated queue doesn't + // require this release because it means that they are not created using + // createEventAndAssociateQueue, i.e. additional retain was not made. + UR_CALL(urEventReleaseInternal(Event)); + } + + // The list of dependent events will be appended to as we walk it so that this + // algorithm doesn't go recursive due to dependent events themselves being + // dependent on other events forming a potentially very deep tree, and deep + // recursion. That turned out to be a significant problem with the recursive + // code that preceded this implementation. + while (!EventsToBeReleased.empty()) { + ur_event_handle_t DepEvent = EventsToBeReleased.front(); + DepEvent->Completed = true; + EventsToBeReleased.pop_front(); + + ur_kernel_handle_t DepEventKernel = nullptr; + { + std::scoped_lock DepEventLock(DepEvent->Mutex); + DepEvent->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + if (IndirectAccessTrackingEnabled) { + // DepEvent has finished, we can release the associated kernel if there + // is one. This is the earliest place we can do this and it can't be + // done twice, so it is safe. Lock automatically releases when this goes + // out of scope. + // TODO: this code needs to be moved out of the guard. + if (DepEvent->CommandType == UR_COMMAND_KERNEL_LAUNCH && + DepEvent->CommandData) { + DepEventKernel = + reinterpret_cast(DepEvent->CommandData); + DepEvent->CommandData = nullptr; + } + } + } + if (DepEventKernel) { + ReleaseIndirectMem(DepEventKernel); + // UR_CALL(piKernelRelease(DepEventKernel)); + } + UR_CALL(urEventReleaseInternal(DepEvent)); + } + + return UR_RESULT_SUCCESS; +} + +// Helper function for creating a PI event. +// The "Queue" argument specifies the PI queue where a command is submitted. +// The "HostVisible" argument specifies if event needs to be allocated from +// a host-visible pool. 
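// A minimal stand-alone sketch of the raw Level Zero calls that the EventCreate
// helper below wraps via the context's event-pool cache. The hContext/hDevice
// handles are assumed to have been created elsewhere; error checking is omitted.
#include <level_zero/ze_api.h>

void createHostVisibleEventSketch(ze_context_handle_t hContext,
                                  ze_device_handle_t hDevice) {
  // One pool provides a fixed number of event slots.
  ze_event_pool_desc_t PoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
                                   ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
                                   /*count*/ 256};
  ze_event_pool_handle_t Pool;
  zeEventPoolCreate(hContext, &PoolDesc, 1, &hDevice, &Pool);

  // Each event occupies one index in the pool; host signal scope makes its
  // completion visible to the host without a proxy event.
  ze_event_desc_t EventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
                               /*index*/ 0,
                               /*signal scope*/ ZE_EVENT_SCOPE_FLAG_HOST,
                               /*wait scope*/ 0};
  ze_event_handle_t Event;
  zeEventCreate(Pool, &EventDesc, &Event);
}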
+// +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool HostVisible, ur_event_handle_t *RetEvent) { + + bool ProfilingEnabled = + !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + + if (auto CachedEvent = + Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { + *RetEvent = CachedEvent; + return UR_RESULT_SUCCESS; + } + + ze_event_handle_t ZeEvent; + ze_event_pool_handle_t ZeEventPool = {}; + + size_t Index = 0; + + if (auto Res = Context->getFreeSlotInExistingOrNewPool( + ZeEventPool, Index, HostVisible, ProfilingEnabled)) + return Res; + + ZeStruct ZeEventDesc; + ZeEventDesc.index = Index; + ZeEventDesc.wait = 0; + + if (HostVisible) { + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + } else { + // + // Set the scope to "device" for every event. This is sufficient for + // global device access and peer device access. If needed to be seen on + // the host we are doing special handling, see EventsScope options. + // + // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be + // used in some circumstances. + // + ZeEventDesc.signal = 0; + } + + ZE2UR_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent)); + + try { + *RetEvent = new ur_event_handle_t_( + ZeEvent, ZeEventPool, reinterpret_cast(Context), + UR_EXT_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (HostVisible) + (*RetEvent)->HostVisibleEvent = + reinterpret_cast(*RetEvent); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_event_handle_t_::reset() { + UrQueue = nullptr; + CleanedUp = false; + Completed = false; + CommandData = nullptr; + CommandType = UR_EXT_COMMAND_TYPE_USER; + WaitList = {}; + RefCountExternal = 0; + RefCount.reset(); + CommandList = std::nullopt; + + if (!isHostVisible()) + HostVisibleEvent = nullptr; + + ZE2UR_CALL(zeEventHostReset, (ZeEvent)); + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( + uint32_t EventListLength, const ur_event_handle_t *EventList, + ur_queue_handle_t CurQueue, bool UseCopyEngine) { + this->Length = 0; + this->ZeEventList = nullptr; + this->UrEventList = nullptr; + + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (CurQueue->Device->ImmCommandListUsed) { + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { + // If queue is in-order with discarded events and if + // new command list is different from the last used command list then + // signal new event from the last immediate command list. We are going + // to insert a barrier in the new command list waiting for that event. + auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal, QueueIndex; + auto NextIndex = + QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, + /*QueryOnly */ true); + auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; + if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && + CurQueue->LastUsedCommandList != NextImmCmdList) { + CurQueue->signalEventFromCmdListIfLastEventDiscarded( + CurQueue->LastUsedCommandList); + } + } + } else { + // Ensure LastCommandEvent's batch is submitted if it is differrent + // from the one this command is going to. If we reuse discarded events + // then signalEventFromCmdListIfLastEventDiscarded will be called at batch + // close if needed. 
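// The in-order handling in this function boils down to: when the queue is
// in-order, also wait on the event of the previously enqueued command. A
// self-contained sketch of that wait-list construction (the types are
// simplified stand-ins for the adapter's own):
#include <vector>

struct SketchEvent {};

struct SketchQueue {
  bool IsInOrder = true;
  SketchEvent *LastCommandEvent = nullptr; // event of the last enqueued command
};

// Build the wait list for a new command: the user-provided events plus, for an
// in-order queue, the event of the previously enqueued command.
std::vector<SketchEvent *>
buildWaitList(SketchQueue &Q, const std::vector<SketchEvent *> &UserWaitList) {
  std::vector<SketchEvent *> WaitList = UserWaitList;
  if (Q.IsInOrder && Q.LastCommandEvent)
    WaitList.push_back(Q.LastCommandEvent); // enforce in-order execution
  return WaitList;
}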
+ const auto &OpenCommandList = + CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); + if (OpenCommandList != CurQueue->CommandListMap.end() && + OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { + + if (auto Res = CurQueue->executeOpenCommandList( + OpenCommandList->second.isCopy(CurQueue))) + return Res; + } + } + } + + // For in-order queues, every command should be executed only after the + // previous command has finished. The event associated with the last + // enqueued command is added into the waitlist to ensure in-order semantics. + bool IncludeLastCommandEvent = + CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; + + // If the last event is discarded then we already have a barrier waiting for + // that event, so must not include the last command event into the wait + // list because it will cause waiting for event which was reset. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && + CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) + IncludeLastCommandEvent = false; + + try { + uint32_t TmpListLength = 0; + + if (IncludeLastCommandEvent) { + this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; + this->UrEventList = new ur_event_handle_t[EventListLength + 1]; + std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); + this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; + this->UrEventList[0] = CurQueue->LastCommandEvent; + TmpListLength = 1; + } else if (EventListLength > 0) { + this->ZeEventList = new ze_event_handle_t[EventListLength]; + this->UrEventList = new ur_event_handle_t[EventListLength]; + } + + if (EventListLength > 0) { + for (uint32_t I = 0; I < EventListLength; I++) { + { + std::shared_lock Lock(EventList[I]->Mutex); + if (EventList[I]->Completed) + continue; + + // Poll of the host-visible events. + auto HostVisibleEvent = EventList[I]->HostVisibleEvent; + if (FilterEventWaitList && HostVisibleEvent) { + auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, + (HostVisibleEvent->ZeEvent)); + if (Res == ZE_RESULT_SUCCESS) { + // Event has already completed, don't put it into the list + continue; + } + } + } + + auto Queue = EventList[I]->UrQueue; + if (Queue) { + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. Additionally lock the Queue if it + // is different from CurQueue. + // TODO: rework this to avoid deadlock when another thread is + // locking the same queues but in a different order. + auto Lock = ((Queue == CurQueue) + ? std::unique_lock() + : std::unique_lock(Queue->Mutex)); + + // If the event that is going to be waited is in an open batch + // different from where this next command is going to be added, + // then we have to force execute of that open command-list + // to avoid deadlocks. + // + const auto &OpenCommandList = + Queue->eventOpenCommandList(EventList[I]); + if (OpenCommandList != Queue->CommandListMap.end()) { + + if (Queue == CurQueue && + OpenCommandList->second.isCopy(Queue) == UseCopyEngine) { + // Don't force execute the batch yet since the new command + // is going to the same open batch as the dependent event. + } else { + if (auto Res = Queue->executeOpenCommandList( + OpenCommandList->second.isCopy(Queue))) + return Res; + } + } + } else { + // There is a dependency on an interop-event. + // Similarily to the above to avoid dead locks ensure that + // execution of all prior commands in the current command- + // batch is visible to the host. 
This may not be the case + // when we intended to have only last command in the batch + // produce host-visible event, e.g. + // + // event0 = interop event + // event1 = command1 (already in batch, no deps) + // event2 = command2 (is being added, dep on event0) + // event3 = signal host-visible event for the batch + // event1.wait() + // event0.signal() + // + // Make sure that event1.wait() will wait for a host-visible + // event that is signalled before the command2 is enqueued. + if (CurQueue->Device->ZeEventsScope != AllHostVisible) { + CurQueue->executeAllOpenCommandLists(); + } + } + + std::shared_lock Lock(EventList[I]->Mutex); + this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; + this->UrEventList[TmpListLength] = EventList[I]; + TmpListLength += 1; + } + } + + this->Length = TmpListLength; + + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + for (uint32_t I = 0; I < this->Length; I++) { + this->UrEventList[I]->RefCount.increment(); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( + std::list &EventsToBeReleased) { + // acquire a lock before reading the length and list fields. + // Acquire the lock, copy the needed data locally, and reset + // the fields, then release the lock. + // Only then do we do the actual actions to release and destroy, + // holding the lock for the minimum time necessary. + uint32_t LocLength = 0; + ze_event_handle_t *LocZeEventList = nullptr; + ur_event_handle_t *LocPiEventList = nullptr; + + { + // acquire the lock and copy fields locally + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(this->UrZeEventListMutex); + + LocLength = Length; + LocZeEventList = ZeEventList; + LocPiEventList = UrEventList; + + Length = 0; + ZeEventList = nullptr; + UrEventList = nullptr; + + // release lock by ending scope. + } + + for (uint32_t I = 0; I < LocLength; I++) { + // Add the event to be released to the list + EventsToBeReleased.push_back(LocPiEventList[I]); + } + + if (LocZeEventList != nullptr) { + delete[] LocZeEventList; + } + if (LocPiEventList != nullptr) { + delete[] LocPiEventList; + } + + return UR_RESULT_SUCCESS; +} + +// Tells if this event is with profiling capabilities. +bool ur_event_handle_t_::isProfilingEnabled() const { + return !UrQueue || // tentatively assume user events are profiling enabled + (UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 64443b6d5575c..6acbd7459ef83 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -7,8 +7,265 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_queue.hpp" + +extern "C" { +ur_result_t urEventReleaseInternal(ur_event_handle_t Event); +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool HostVisible, ur_event_handle_t *RetEvent); +} // extern "C" + +// This is an experimental option that allows to disable caching of events in +// the context. 
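// The event-related controls defined next are all initialized with the same
// idiom: an immediately-invoked lambda reads an environment variable once and
// converts it to a typed default. A minimal sketch of the idiom (the variable
// name here is only an example, not one the adapter reads):
#include <cstdlib>

static const bool ExampleControl = [] {
  const char *Flag = std::getenv("EXAMPLE_CONTROL"); // hypothetical variable
  if (!Flag)
    return false;              // default when the variable is not set
  return std::atoi(Flag) != 0; // any non-zero value enables the control
}();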
+const bool DisableEventsCaching = [] { + const char *DisableEventsCachingFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); + if (!DisableEventsCachingFlag) + return false; + return std::stoi(DisableEventsCachingFlag) != 0; +}(); + +// This is an experimental option that allows reset and reuse of uncompleted +// events in the in-order queue with discard_events property. +const bool ReuseDiscardedEvents = [] { + const char *ReuseDiscardedEventsFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); + if (!ReuseDiscardedEventsFlag) + return true; + return std::stoi(ReuseDiscardedEventsFlag) > 0; +}(); + +// Maximum number of events that can be present in an event ZePool is captured +// here. Setting it to 256 gave best possible performance for several +// benchmarks. +const uint32_t MaxNumEventsPerPool = [] { + const auto MaxNumEventsPerPoolEnv = + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + uint32_t Result = + MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + if (Result <= 0) + Result = 256; + return Result; +}(); + +const bool FilterEventWaitList = [] { + const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); + const bool RetVal = Ret ? std::stoi(Ret) : 1; + return RetVal; +}(); + +struct _ur_ze_event_list_t { + // List of level zero events for this event list. + ze_event_handle_t *ZeEventList = {nullptr}; + + // List of pi_events for this event list. + ur_event_handle_t *UrEventList = {nullptr}; + + // length of both the lists. The actual allocation of these lists + // may be longer than this length. This length is the actual number + // of elements in the above arrays that are valid. + uint32_t Length = {0}; + + // A mutex is needed for destroying the event list. + // Creation is already thread-safe because we only create the list + // when an event is initially created. However, it might be + // possible to have multiple threads racing to destroy the list, + // so this will be used to make list destruction thread-safe. + ur_mutex UrZeEventListMutex; + + // Initialize this using the array of events in EventList, and retain + // all the pi_events in the created data structure. + // CurQueue is the pi_queue that the command with this event wait + // list is going to be added to. That is needed to flush command + // batches for wait events that are in other queues. + // UseCopyEngine indicates if the next command (the one that this + // event wait-list is for) is going to go to copy or compute + // queue. This is used to properly submit the dependent open + // command-lists. + ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength, + const ur_event_handle_t *EventList, + ur_queue_handle_t CurQueue, + bool UseCopyEngine); + + // Add all the events in this object's UrEventList to the end + // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data + // structure fields making it look empty. + ur_result_t collectEventsForReleaseAndDestroyPiZeEventList( + std::list &EventsToBeReleased); + + // Had to create custom assignment operator because the mutex is + // not assignment copyable. Just field by field copy of the other + // fields. 
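// Why the hand-written operator= that follows is needed: the contained mutex
// makes the struct non-copy-assignable, so only the data fields are copied and
// each object keeps its own mutex. The same idea in a self-contained form
// (illustrative names only):
#include <mutex>

struct ListWithMutex {
  int *Data = nullptr;
  unsigned Length = 0;
  std::mutex M; // non-copyable, so the default operator= would be deleted

  ListWithMutex &operator=(const ListWithMutex &Other) {
    if (this != &Other) {
      Data = Other.Data;     // shallow-copy only the payload fields,
      Length = Other.Length; // leaving this object's own mutex untouched
    }
    return *this;
  }
};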
+ _ur_ze_event_list_t &operator=(const _ur_ze_event_list_t &other) { + if (this != &other) { + this->ZeEventList = other.ZeEventList; + this->UrEventList = other.UrEventList; + this->Length = other.Length; + } + return *this; + } +}; + +void printZeEventList(const _ur_ze_event_list_t &PiZeEventList); + +struct ur_event_handle_t_ : _ur_object { + ur_event_handle_t_(ze_event_handle_t ZeEvent, + ze_event_pool_handle_t ZeEventPool, + ur_context_handle_t Context, ur_command_t CommandType, + bool OwnZeEvent) + : ZeEvent{ZeEvent}, ZeEventPool{ZeEventPool}, Context{Context}, + CommandType{CommandType}, CommandData{nullptr} { + OwnNativeHandle = OwnZeEvent; + } + + // Level Zero event handle. + ze_event_handle_t ZeEvent; + + // Level Zero event pool handle. + ze_event_pool_handle_t ZeEventPool; -struct _ur_event_handle_t : _ur_object { - _ur_event_handle_t() {} + // In case we use device-only events this holds their host-visible + // counterpart. If this event is itself host-visble then HostVisibleEvent + // points to this event. If this event is not host-visible then this field can + // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI + // event created internally that host will actually be redirected + // to wait/query instead of this PI event. + // + // The HostVisibleEvent is a reference counted PI event and can be used more + // than by just this one event, depending on the mode (see EventsScope). + // + ur_event_handle_t HostVisibleEvent = {nullptr}; + bool isHostVisible() const { + return this == + const_cast( + reinterpret_cast(HostVisibleEvent)); + } + + // Provide direct access to Context, instead of going via queue. + // Not every PI event has a queue, and we need a handle to Context + // to get to event pool related information. + ur_context_handle_t Context; + + // Keeps the command-queue and command associated with the event. + // These are NULL for the user events. + ur_queue_handle_t UrQueue = {nullptr}; + ur_command_t CommandType; + + // Opaque data to hold any data needed for CommandType. + void *CommandData; + + // Command list associated with the pi_event. + std::optional CommandList; + + // List of events that were in the wait list of the command that will + // signal this event. These events must be retained when the command is + // enqueued, and must then be released when this event has signalled. + // This list must be destroyed once the event has signalled. + _ur_ze_event_list_t WaitList; + + // Tracks if the needed cleanup was already performed for + // a completed event. This allows to control that some cleanup + // actions are performed only once. + // + bool CleanedUp = {false}; + + // Indicates that this PI event had already completed in the sense + // that no other synchromization is needed. Note that the underlying + // L0 event (if any) is not guranteed to have been signalled, or + // being visible to the host at all. + bool Completed = {false}; + + // Indicates that this event is discarded, i.e. it is not visible outside of + // plugin. + bool IsDiscarded = {false}; + + // Besides each PI object keeping a total reference count in + // _ur_object::RefCount we keep special track of the event *external* + // references. This way we are able to tell when the event is not referenced + // externally anymore, i.e. it can't be passed as a dependency event to + // piEnqueue* functions and explicitly waited meaning that we can do some + // optimizations: + // 1. 
For in-order queues we can reset and reuse event even if it was not yet + // completed by submitting a reset command to the queue (since there are no + // external references, we know that nobody can wait this event somewhere in + // parallel thread or pass it as a dependency which may lead to hang) + // 2. We can avoid creating host proxy event. + // This counter doesn't track the lifetime of an event object. Even if it + // reaches zero an event object may not be destroyed and can be used + // internally in the plugin. + std::atomic RefCountExternal{0}; + + bool hasExternalRefs() { return RefCountExternal != 0; } + + // Reset _pi_event object. + ur_result_t reset(); + + // Tells if this event is with profiling capabilities. + bool isProfilingEnabled() const; + + // Get the host-visible event or create one and enqueue its signal. + ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); }; + +// Helper function to implement zeHostSynchronize. +// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. +// This allows for a much more responsive debugging of hangs. +// +template +ze_result_t zeHostSynchronizeImpl(Func Api, T Handle); + +// Template function to do various types of host synchronizations. +// This is intended to be used instead of direct calls to specific +// Level-Zero synchronization APIs. +// +template ze_result_t zeHostSynchronize(T Handle); +template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle); +template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle); + +// Perform any necessary cleanup after an event has been signalled. +// This currently makes sure to release any kernel that may have been used by +// the event, updates the last command event in the queue and cleans up all dep +// events of the event. +// If the caller locks queue mutex then it must pass 'true' to QueueLocked. +ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, + bool QueueLocked = false); + +// Get value of device scope events env var setting or default setting +static const EventsScope DeviceEventsSetting = [] { + const char *DeviceEventsSettingStr = + std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); + if (DeviceEventsSettingStr) { + // Override the default if user has explicitly chosen the events scope. + switch (std::stoi(DeviceEventsSettingStr)) { + case 0: + return AllHostVisible; + case 1: + return OnDemandHostVisibleProxy; + case 2: + return LastCommandInBatchHostVisible; + default: + // fallthrough to default setting + break; + } + } + // This is our default setting, which is expected to be the fastest + // with the modern GPU drivers. + return AllHostVisible; +}(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp new file mode 100644 index 0000000000000..2a69a905c8e84 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -0,0 +1,771 @@ +//===--------- ur_level_zero_kernel.cpp - Level Zero Adapter ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "ur_level_zero_kernel.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify + ///< the global and work-group work-items + const size_t + *GlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned + ///< values that specify the offset used to + ///< calculate the global ID of a work-item + const size_t *GlobalWorkSize, ///< [in] pointer to an array of workDim + ///< unsigned values that specify the number + ///< of global work-items in workDim that + ///< will execute the kernel function + const size_t + *LocalWorkSize, ///< [in][optional] pointer to an array of workDim + ///< unsigned values that specify the number of local + ///< work-items forming a work-group that will execute + ///< the kernel function. If nullptr, the runtime + ///< implementation will choose the work-group size. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock( + Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); + if (GlobalWorkOffset != NULL) { + if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) { + urPrint("No global offset extension found on this driver\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL(zeKernelSetGlobalOffsetExp, + (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + GlobalWorkOffset[2])); + } + + // If there are any pending arguments set them now. + for (auto &Arg : Kernel->PendingArguments) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. 
+ char **ZeHandlePtr = nullptr; + if (Arg.Value) { + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, + Queue->Device)); + } + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + } + Kernel->PendingArguments.clear(); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + + // global_work_size of unused dimensions must be set to 1 + UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + if (LocalWorkSize) { + // L0 + UR_ASSERT(LocalWorkSize[0] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[1] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[2] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[0] = static_cast(LocalWorkSize[0]); + WG[1] = static_cast(LocalWorkSize[1]); + WG[2] = static_cast(LocalWorkSize[2]); + } else { + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], + GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. + uint32_t GroupSize[] = { + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]); + while (GlobalWorkSize[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { + urPrint("urEnqueueKernelLaunch: can't find a WG size " + "suitable for global work size > UINT32_MAX\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + urPrint("urEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n", + WG[0], WG[1], WG[2]); + } + } + + // TODO: assert if sizes do not fit into 32-bit? + + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + static_cast(GlobalWorkSize[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + urPrint("urEnqueueKernelLaunch: unsupported work_dim\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. 
The range is not a " + "multiple of the group size in the 1st dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. The range is not a " + "multiple of the group size in the 2nd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. The range is not a " + "multiple of the group size in the 3rd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, true /* AllowBatching */)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, + CommandList, IsInternal)); + + ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. + (*OutEvent)->CommandData = (void *)Kernel; + + // Increment the reference count of the Kernel and indicate that the Kernel is + // in use. Once the event has been signalled, the code in + // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the + // reference count on the kernel, using the kernel saved in CommandData. + UR_CALL(urKernelRetain(Kernel)); + + // Add to list of kernels to be submitted + if (IndirectAccessTrackingEnabled) + Queue->KernelsToBeSubmitted.push_back(Kernel); + + if (Queue->Device->ImmCommandListUsed && IndirectAccessTrackingEnabled) { + // If using immediate commandlists then gathering of indirect + // references and appending to the queue (which means submission) + // must be done together. + std::unique_lock ContextsLock( + Queue->Device->Platform->ContextsMutex, std::defer_lock); + // We are going to submit kernels for execution. If indirect access flag is + // set for a kernel then we need to make a snapshot of existing memory + // allocations in all contexts in the platform. We need to lock the mutex + // guarding the list of contexts in the platform to prevent creation of new + // memory alocations in any context before we submit the kernel for + // execution. + ContextsLock.lock(); + Queue->CaptureIndirectAccesses(); + // Add the command to the command list, which implies submission. + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + } else { + // Add the command to the command list for later submission. + // No lock is needed here, unlike the immediate commandlist case above, + // because the kernels are not actually submitted yet. Kernels will be + // submitted only when the comamndlist is closed. Then, a lock is held. 
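// The ZeThreadGroupDimensions passed to zeCommandListAppendLaunchKernel below
// were computed earlier as GlobalWorkSize / WG, with an exact-divisibility
// check per dimension. A self-contained sketch of that computation:
#include <cstddef>
#include <cstdint>

// Returns false when the global size is not a multiple of the group size in
// some dimension, mirroring the UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE path.
bool computeGroupCounts(const size_t GlobalWorkSize[3], const uint32_t WG[3],
                        uint32_t GroupCount[3]) {
  for (int I = 0; I < 3; ++I) {
    if (WG[I] == 0 || GlobalWorkSize[I] % WG[I] != 0)
      return false;
    GroupCount[I] = static_cast<uint32_t>(GlobalWorkSize[I] / WG[I]);
  }
  return true;
}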
+ ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + } + + urPrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList((*OutEvent)->WaitList); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList, false, true)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + ur_program_handle_t Program, ///< [in] handle of the program containing the + ///< device global variable. + const char + *Name, ///< [in] the unique identifier for the device global variable. + bool BlockingWrite, ///< [in] indicates if this operation should block. + size_t Count, ///< [in] the number of bytes to copy. + size_t Offset, ///< [in] the byte offset into the device global variable to + ///< start copying. + const void *Src, ///< [in] pointer to where the data must be copied from. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list. + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::scoped_lock lock(Queue->Mutex); + + // Find global variable pointer + size_t GlobalVarSize = 0; + void *GlobalVarPtr = nullptr; + ZE2UR_CALL(zeModuleGetGlobalPointer, + (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + if (GlobalVarSize < Offset + Count) { + setErrorMessage("Write device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_UNKNOWN; + } + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper(UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE, Queue, + ur_cast(GlobalVarPtr) + Offset, + BlockingWrite, Count, Src, NumEventsInWaitList, + EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + ur_program_handle_t Program, ///< [in] handle of the program containing the + ///< device global variable. + const char + *Name, ///< [in] the unique identifier for the device global variable. + bool BlockingRead, ///< [in] indicates if this operation should block. + size_t Count, ///< [in] the number of bytes to copy. + size_t Offset, ///< [in] the byte offset into the device global variable to + ///< start copying. + void *Dst, ///< [in] pointer to where the data must be copied to. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list. + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. 
If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + + std::scoped_lock lock(Queue->Mutex); + + // Find global variable pointer + size_t GlobalVarSize = 0; + void *GlobalVarPtr = nullptr; + ZE2UR_CALL(zeModuleGetGlobalPointer, + (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + if (GlobalVarSize < Offset + Count) { + setErrorMessage("Read from device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_UNKNOWN; + } + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper( + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ, Queue, Dst, BlockingRead, Count, + ur_cast(GlobalVarPtr) + Offset, NumEventsInWaitList, + EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t Program, ///< [in] handle of the program instance + const char *KernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *RetKernel ///< [out] pointer to handle of kernel object created. +) { + std::shared_lock Guard(Program->Mutex); + if (Program->State != ur_program_handle_t_::state::Exe) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } + + ZeStruct ZeKernelDesc; + ZeKernelDesc.flags = 0; + ZeKernelDesc.pKernelName = KernelName; + + ze_kernel_handle_t ZeKernel; + ZE2UR_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel)); + + try { + ur_kernel_handle_t_ *UrKernel = + new ur_kernel_handle_t_(ZeKernel, true, Program); + *RetKernel = reinterpret_cast(UrKernel); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + UR_CALL((*RetKernel)->initialize()); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize, ///< [in] size of argument type + const void + *PArgValue ///< [in] argument value represented as matching arg type. +) { + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. 
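// A hedged sketch of the normalization this function performs: a pointer-sized
// argument whose pointed-to value is null is forwarded to the driver as a
// plain null argument (the helper name below is illustrative only).
#include <cstddef>

const void *normalizeKernelArgValue(size_t ArgSize, const void *ArgValue) {
  if (ArgSize == sizeof(void *) && ArgValue &&
      *static_cast<void *const *>(ArgValue) == nullptr)
    return nullptr; // treat a pointer-to-null as a null argument
  return ArgValue;
}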
+ if (ArgSize == sizeof(void *) && PArgValue && + *(void **)(const_cast(PArgValue)) == nullptr) { + PArgValue = nullptr; + } + + std::scoped_lock Guard(Kernel->Mutex); + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, ArgIndex, ArgSize, PArgValue)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize ///< [in] size of the local buffer to be allocated by the + ///< runtime +) { + std::ignore = Kernel; + std::ignore = ArgIndex; + std::ignore = ArgSize; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_kernel_info_t ParamName, ///< [in] name of the Kernel property to query + size_t PropSize, ///< [in] the size of the Kernel property value. + void *KernelInfo, ///< [in,out][optional] array of bytes holding the kernel + ///< info property. If propSize is not equal to or + ///< greater than the real number of bytes needed to + ///< return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pKernelInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. +) { + + UrL0ReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + switch (ParamName) { + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(ur_context_handle_t{Kernel->Program->Context}); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(ur_program_handle_t{Kernel->Program}); + case UR_KERNEL_INFO_FUNCTION_NAME: + try { + std::string &KernelName = *Kernel->ZeKernelName.operator->(); + return ReturnValue(static_cast(KernelName.c_str())); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->numKernelArgs}); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Kernel->RefCount.load()}); + case UR_KERNEL_INFO_ATTRIBUTES: + try { + uint32_t Size; + ZE2UR_CALL(zeKernelGetSourceAttributes, + (Kernel->ZeKernel, &Size, nullptr)); + char *attributes = new char[Size]; + ZE2UR_CALL(zeKernelGetSourceAttributes, + (Kernel->ZeKernel, &Size, &attributes)); + auto Res = ReturnValue(attributes); + delete[] attributes; + return Res; + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + default: + urPrint("Unsupported ParamName in urKernelGetInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_kernel_group_info_t + ParamName, ///< [in] name of the work Group property to query + size_t + ParamValueSize, ///< [in] size of the Kernel Work Group property value + void *ParamValue, ///< [in,out][optional][range(0, propSize)] value of the + ///< Kernel Work Group property. 
+ size_t *ParamValueSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. +) { + UrL0ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + switch (ParamName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + // TODO: To revisit after level_zero/issues/262 is resolved + struct { + size_t Arr[3]; + } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, + Device->ZeDeviceComputeProperties->maxGroupSizeY, + Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; + return ReturnValue(WorkSize); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + // As of right now, L0 is missing API to query kernel and device specific + // max work group size. + return ReturnValue( + pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + struct { + size_t Arr[3]; + } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX, + Kernel->ZeKernelProperties->requiredGroupSizeY, + Kernel->ZeKernelProperties->requiredGroupSizeZ}}; + return ReturnValue(WgSize); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->localMemSize}); + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize}); + } + default: { + urPrint("Unknown ParamName in urKernelGetGroupInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSubGroupInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_kernel_sub_group_info_t + PropName, ///< [in] name of the SubGroup property to query + size_t PropSize, ///< [in] size of the Kernel SubGroup property value + void *PropValue, ///< [in,out][range(0, propSize)][optional] value of the + ///< Kernel SubGroup property. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. 
+) { + std::ignore = Device; + + UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + if (PropName == UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize}); + } else { + die("urKernelGetSubGroupInfo: parameter not implemented"); + return {}; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain( + ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to retain +) { + Kernel->RefCount.increment(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( + ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to release +) { + if (!Kernel->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + auto KernelProgram = Kernel->Program; + if (Kernel->OwnNativeHandle) { + auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + if (IndirectAccessTrackingEnabled) { + UR_CALL(urContextRelease(KernelProgram->Context)); + } + // do a release on the program this kernel was part of + UR_CALL(urProgramRelease(KernelProgram)); + delete Kernel; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + const void *ArgValue ///< [in][optional] SVM pointer to memory location + ///< holding the argument value. If null then argument + ///< value is considered null. +) { + std::ignore = Kernel; + std::ignore = ArgIndex; + std::ignore = ArgValue; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize, ///< [in] size of argument type + const void *ArgValue ///< [in][optional] SVM pointer to memory location + ///< holding the argument value. If null then argument + ///< value is considered null. +) { + UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, ArgSize, ArgValue)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + ur_kernel_exec_info_t PropName, ///< [in] name of the execution attribute + size_t PropSize, ///< [in] size in byte the attribute value + const void *PropValue ///< [in][range(0, propSize)] pointer to memory + ///< location holding the property value. +) { + std::scoped_lock Guard(Kernel->Mutex); + if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && + *(static_cast(PropValue)) == PI_TRUE) { + // The whole point for users really was to not need to know anything + // about the types of allocations kernel uses. 
So in DPC++ we always + // just set all 3 modes for each kernel. + ze_kernel_indirect_access_flags_t IndirectFlags = + ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags)); + } else if (PropName == UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) { + ze_cache_config_flag_t ZeCacheConfig{}; + auto CacheConfig = *(static_cast(PropValue)); + if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; + else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA; + else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT) + ZeCacheConfig = static_cast(0); + else + // Unexpected cache configuration value. + return UR_RESULT_ERROR_INVALID_VALUE; + ZE2UR_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig);); + } else { + urPrint("urKernelSetExecInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + ur_sampler_handle_t ArgValue ///< [in] handle of Sampler object. +) { + std::scoped_lock Guard(Kernel->Mutex); + ZE2UR_CALL(zeKernelSetArgumentValue, + (ur_cast(Kernel->ZeKernel), ArgIndex, + sizeof(void *), &ArgValue->ZeSampler)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + ur_mem_handle_t ArgValue ///< [in][optional] handle of Memory object. +) { + std::scoped_lock Guard(Kernel->Mutex); + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); + + auto Arg = UrMem ? UrMem : nullptr; + Kernel->PendingArguments.push_back( + {ArgIndex, sizeof(void *), Arg, ur_mem_handle_t_::read_write}); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel. + ur_native_handle_t + *NativeKernel ///< [out] a pointer to the native handle of the kernel. +) { + std::shared_lock Guard(Kernel->Mutex); + + *NativeKernel = reinterpret_cast(Kernel->ZeKernel); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_kernel_handle_t * + RetKernel ///< [out] pointer to the handle of the kernel object created. +) { + ze_kernel_handle_t ZeKernel = ur_cast(NativeKernel); + ur_kernel_handle_t_ *Kernel = nullptr; + try { + Kernel = new ur_kernel_handle_t_(ZeKernel, + false, // OwnZeKernel + Context); + *RetKernel = reinterpret_cast(Kernel); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + UR_CALL(Kernel->initialize()); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_kernel_handle_t_::initialize() { + // Retain the program and context to show it's used by this kernel. 
+ UR_CALL(urProgramRetain(Program)); + + if (IndirectAccessTrackingEnabled) + // TODO: do piContextRetain without the guard + UR_CALL(urContextRetain(Program->Context)); + + // Set up how to obtain kernel properties when needed. + ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) { + ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties)); + }; + + // Cache kernel name. + ZeKernelName.Compute = [this](std::string &Name) { + size_t Size = 0; + ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr)); + char *KernelName = new char[Size]; + ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName)); + Name = KernelName; + delete[] KernelName; + }; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in] array of specialization constant value + ///< descriptions +) { + std::ignore = Kernel; + std::ignore = Count; + std::ignore = SpecConstants; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp new file mode 100644 index 0000000000000..db7b87a6f6f82 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp @@ -0,0 +1,97 @@ +//===--------- ur_level_zero_kernel.hpp - Level Zero Adapter ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "ur_level_zero_common.hpp" +#include "ur_level_zero_mem.hpp" +#include + +struct ur_kernel_handle_t_ : _ur_object { + ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle, + ur_program_handle_t Program) + : Program{Program}, ZeKernel{Kernel}, SubmissionsCount{0}, MemAllocs{} { + OwnNativeHandle = OwnZeHandle; + } + + ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle, + ur_context_handle_t Context) + : Context{Context}, ZeKernel{Kernel}, SubmissionsCount{0}, MemAllocs{} { + OwnNativeHandle = OwnZeHandle; + } + + // Keep the program of the kernel. + ur_context_handle_t Context; + + // Keep the program of the kernel. + ur_program_handle_t Program; + + // Level Zero function handle. + ze_kernel_handle_t ZeKernel; + + // Counter to track the number of submissions of the kernel. + // When this value is zero, it means that kernel is not submitted for an + // execution - at this time we can release memory allocations referenced by + // this kernel. We can do this when RefCount turns to 0 but it is too late + // because kernels are cached in the context by SYCL RT and they are released + // only during context object destruction. Regular RefCount is not usable to + // track submissions because user/SYCL RT can retain kernel object any number + // of times. And that's why there is no value of RefCount which can mean zero + // submissions. + std::atomic SubmissionsCount; + + // Returns true if kernel has indirect access, false otherwise. 
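// The ZeCache members declared at the end of this struct are lazily computed:
// initialize() (shown earlier) installs a Compute callback, and the cached
// value is produced on first access. A stand-alone sketch of that idea
// (ZeCache itself is defined in the adapter's common headers; this is only an
// illustration):
#include <functional>
#include <optional>

template <typename T> struct LazyCacheSketch {
  std::function<void(T &)> Compute; // set once, runs on first access
  std::optional<T> Value;

  T &get() {
    if (!Value) {
      Value.emplace(); // default-construct the cached value
      Compute(*Value); // fill it exactly once
    }
    return *Value;
  }
};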
+ bool hasIndirectAccess() { + // Currently indirect access flag is set for all kernels and there is no API + // to check if kernel actually indirectly access smth. + return true; + } + + // Hash function object for the unordered_set below. + struct Hash { + size_t operator()(const std::pair *P) const { + return std::hash()(P->first); + } + }; + + // If kernel has indirect access we need to make a snapshot of all existing + // memory allocations to defer deletion of these memory allocations to the + // moment when kernel execution has finished. + // We store pointers to the elements because pointers are not invalidated by + // insert/delete for std::unordered_map (iterators are invalidated). We need + // to take a snapshot instead of just reference-counting the allocations, + // because picture of active allocations can change during kernel execution + // (new allocations can be added) and we need to know which memory allocations + // were retained by this kernel to release them (and don't touch new + // allocations) at kernel completion. Same kernel may be submitted several + // times and retained allocations may be different at each submission. That's + // why we have a set of memory allocations here and increase ref count only + // once even if kernel is submitted many times. We don't want to know how many + // times and which allocations were retained by each submission. We release + // all allocations in the set only when SubmissionsCount == 0. + std::unordered_set *, Hash> MemAllocs; + + // Completed initialization of PI kernel. Must be called after construction. + ur_result_t initialize(); + + // Keeps info about an argument to the kernel enough to set it with + // zeKernelSetArgumentValue. + struct ArgumentInfo { + uint32_t Index; + size_t Size; + // const ur_mem_handle_t_ *Value; + ur_mem_handle_t_ *Value; + ur_mem_handle_t_::access_mode_t AccessMode{ur_mem_handle_t_::unknown}; + }; + // Arguments that still need to be set (with zeKernelSetArgumentValue) + // before kernel is enqueued. + std::vector PendingArguments; + + // Cache of the kernel properties. + ZeCache> ZeKernelProperties; + ZeCache ZeKernelName; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 080cb2eb5d201..e2b0b597eb2b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -6,7 +6,2530 @@ // //===-----------------------------------------------------------------===// -#include "ur_level_zero_mem.hpp" +#include +#include +#include + +#include "ur_level_zero.hpp" +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_event.hpp" +#include + +// Default to using compute engine for fill operation, but allow to +// override this with an environment variable. +static bool PreferCopyEngine = [] { + const char *Env = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return Env ? std::stoi(Env) != 0 : false; +}(); + +// Helper function to check if a pointer is a device pointer. 
+bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); +} + +// Shared by all memory read/write/copy PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. +ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, + const void *Src, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, + bool PreferCopyEngine) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + urPrint("calling zeCommandListAppendMemoryCopy() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length, + WaitList.ZeEventList)); + + UR_CALL(Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// Shared by all memory read/write/copy rect PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. +ur_result_t enqueueMemCopyRectHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, bool PreferCopyEngine) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + urPrint("calling zeCommandListAppendMemoryCopy() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + uint32_t SrcOriginX = ur_cast(SrcOrigin.x); + uint32_t SrcOriginY = ur_cast(SrcOrigin.y); + uint32_t SrcOriginZ = ur_cast(SrcOrigin.z); + + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = ur_cast(Region.width); + + if (SrcSlicePitch == 0) + SrcSlicePitch = ur_cast(Region.height) * SrcPitch; + + uint32_t DstOriginX = ur_cast(DstOrigin.x); + uint32_t DstOriginY = ur_cast(DstOrigin.y); + uint32_t DstOriginZ = ur_cast(DstOrigin.z); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = ur_cast(Region.width); + + if (DstSlicePitch == 0) + DstSlicePitch = ur_cast(Region.height) * DstPitch; + + uint32_t Width = ur_cast(Region.width); + uint32_t Height = ur_cast(Region.height); + uint32_t Depth = ur_cast(Region.depth); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, + SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, + WaitList.Length, WaitList.ZeEventList)); + + urPrint("calling zeCommandListAppendMemoryCopyRegion()\n"); + + ZE2UR_CALL(zeCommandListAppendBarrier, (ZeCommandList, ZeEvent, 0, nullptr)); + + urPrint("calling zeCommandListAppendBarrier() with Event %#llx\n", + ur_cast(ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, Blocking, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// PI interfaces must have queue's and buffer's mutexes locked on entry. +static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Ptr, + const void *Pattern, size_t PatternSize, + size_t Size, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent) { + // Pattern size must be a power of two. + UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), + UR_RESULT_ERROR_INVALID_VALUE); + auto &Device = Queue->Device; + + // Make sure that pattern size matches the capability of the copy queues. + // Check both main and link groups as we don't known which one will be used. + // + if (PreferCopyEngine && Device->hasCopyEngine()) { + if (Device->hasMainCopyEngine() && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeProperties.maxMemoryFillPatternSize < PatternSize) { + PreferCopyEngine = false; + } + if (Device->hasLinkCopyEngine() && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::LinkCopy] + .ZeProperties.maxMemoryFillPatternSize < PatternSize) { + PreferCopyEngine = false; + } + } + + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + if (!UseCopyEngine) { + // Pattern size must fit the compute queue capabilities. 
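// Aside: the pattern-size rules enforced around this fill path reduce to two
// checks, shown here as a self-contained sketch. The helper names are
// hypothetical, not adapter APIs.
#include <cstddef>

// The fill path above only accepts a non-zero, power-of-two pattern size.
static bool isValidFillPatternSize(size_t PatternSize) {
  return PatternSize > 0 && (PatternSize & (PatternSize - 1)) == 0;
}

// A queue group can only be used when its maxMemoryFillPatternSize is large
// enough; otherwise the code falls back to another engine.
static bool engineCanHandlePattern(size_t PatternSize,
                                   size_t MaxMemoryFillPatternSize) {
  return PatternSize <= MaxMemoryFillPatternSize;
}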
+ UR_ASSERT( + PatternSize <= + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + } + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + ur_command_list_ptr_t CommandList{}; + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, + WaitList.Length, WaitList.ZeEventList)); + + urPrint("calling zeCommandListAppendMemoryFill() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is enabled then performs reference counting, +// otherwise just calls zeMemAllocHost. +static ur_result_t ZeHostMemAllocHelper(void **ResultPtr, + ur_context_handle_t UrContext, + size_t Size) { + ur_platform_handle_t Plt = UrContext->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. 
+ UR_CALL(urContextRetain(UrContext)); + } + + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (UrContext->ZeContext, &ZeDesc, Size, 1, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + UrContext->MemAllocs.emplace( + std::piecewise_construct, std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple( + reinterpret_cast(UrContext))); + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t getImageRegionHelper(_ur_image *Mem, + ur_rect_offset_t *Origin, + ur_rect_region_t *Region, + ze_image_region_t &ZeRegion) { + UR_ASSERT(Mem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(Origin, UR_RESULT_ERROR_INVALID_VALUE); + + auto UrImage = static_cast<_ur_image *>(Mem); + ze_image_desc_t &ZeImageDesc = UrImage->ZeImageDesc; + +#ifndef NDEBUG + UR_ASSERT(Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && + Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), + UR_RESULT_ERROR_INVALID_VALUE); + + UR_ASSERT(Region->width && Region->height && Region->depth, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT( + (ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region->height == 1 && + Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), + UR_RESULT_ERROR_INVALID_VALUE); +#endif // !NDEBUG + + uint32_t OriginX = ur_cast(Origin->x); + uint32_t OriginY = ur_cast(Origin->y); + uint32_t OriginZ = ur_cast(Origin->z); + + uint32_t Width = ur_cast(Region->width); + uint32_t Height = ur_cast(Region->height); + uint32_t Depth = ur_cast(Region->depth); + + ZeRegion = {OriginX, OriginY, OriginZ, Width, Height, Depth}; + + return UR_RESULT_SUCCESS; +} + +// Helper function to implement image read/write/copy. +// PI interfaces must have queue's and destination image's mutexes locked for +// exclusive use and source image's mutex locked for shared use on entry. +static ur_result_t enqueueMemImageCommandHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, + const void *Src, // image or ptr + void *Dst, // image or ptr + pi_bool IsBlocking, ur_rect_offset_t *SrcOrigin, + ur_rect_offset_t *DstOrigin, ur_rect_region_t *Region, size_t RowPitch, + size_t SlicePitch, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, + bool PreferCopyEngine = false) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + if (CommandType == UR_COMMAND_MEM_IMAGE_READ) { + _ur_image *SrcMem = ur_cast<_ur_image *>(const_cast(Src)); + + ze_image_region_t ZeSrcRegion; + UR_CALL(getImageRegionHelper(SrcMem, SrcOrigin, Region, ZeSrcRegion)); + + // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. + std::ignore = RowPitch; + std::ignore = SlicePitch; + UR_ASSERT(SrcMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto SrcImage = SrcMem; + const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; + UR_ASSERT( + RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeSrcRegion.width), + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcMem->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, + (ZeCommandList, Dst, ur_cast(ZeHandleSrc), + &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); + } else if (CommandType == UR_COMMAND_MEM_IMAGE_WRITE) { + _ur_image *DstMem = ur_cast<_ur_image *>(Dst); + ze_image_region_t ZeDstRegion; + UR_CALL(getImageRegionHelper(DstMem, DstOrigin, Region, ZeDstRegion)); + + // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. 
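// Aside: the RowPitch/SlicePitch asserts used for image reads and writes in
// this file accept either zero (meaning "use the default") or the tightly
// packed value for the image's pixel size. A self-contained sketch of that
// rule, with hypothetical names:
#include <cstddef>
#include <cstdint>

static bool pitchesMatchPackedImage(size_t RowPitch, size_t SlicePitch,
                                    uint32_t BytesPerPixel, uint32_t Width,
                                    uint32_t Height) {
  const size_t PackedRowPitch = static_cast<size_t>(BytesPerPixel) * Width;
  const bool RowPitchOk = (RowPitch == 0) || (RowPitch == PackedRowPitch);
  const bool SlicePitchOk =
      (SlicePitch == 0) || (SlicePitch == RowPitch * Height);
  return RowPitchOk && SlicePitchOk;
}
// For example, a 2D RGBA float image (16 bytes per pixel) of width W only
// passes with RowPitch == 0 or RowPitch == 16 * W.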
+ UR_ASSERT(DstMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto DstImage = static_cast<_ur_image *>(DstMem); + const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; + UR_ASSERT( + RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeDstRegion.width), + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + + char *ZeHandleDst = nullptr; + UR_CALL(DstMem->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, + (ZeCommandList, ur_cast(ZeHandleDst), Src, + &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); + } else if (CommandType == UR_COMMAND_MEM_IMAGE_COPY) { + _ur_image *SrcImage = ur_cast<_ur_image *>(const_cast(Src)); + _ur_image *DstImage = ur_cast<_ur_image *>(Dst); + + ze_image_region_t ZeSrcRegion; + UR_CALL(getImageRegionHelper(SrcImage, SrcOrigin, Region, ZeSrcRegion)); + ze_image_region_t ZeDstRegion; + UR_CALL(getImageRegionHelper(DstImage, DstOrigin, Region, ZeDstRegion)); + + char *ZeHandleSrc = nullptr; + char *ZeHandleDst = nullptr; + UR_CALL(SrcImage->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + UR_CALL(DstImage->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyRegion, + (ZeCommandList, ur_cast(ZeHandleDst), + ur_cast(ZeHandleSrc), &ZeDstRegion, + &ZeSrcRegion, ZeEvent, 0, nullptr)); + } else { + urPrint("enqueueMemImageUpdate: unsupported image command type\n"); + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + UR_CALL(Queue->executeCommandList(CommandList, IsBlocking, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being read + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + ur_mem_handle_t_ *Src = ur_cast(hBuffer); + + std::shared_lock SrcLock(Src->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, Queue->Mutex); + + char *ZeHandleSrc = nullptr; + UR_CALL(Src->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_READ, Queue, pDst, + blockingRead, size, ZeHandleSrc + offset, + numEventsInWaitList, phEventWaitList, phEvent, + true /* PreferCopyEngine */); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being written + const void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_WRITE, Queue, + ZeHandleDst + offset, // dst + blockingWrite, size, + pSrc, // src + numEventsInWaitList, phEventWaitList, phEvent, + true /* PreferCopyEngine */); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOffset, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOffset, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t bufferRowPitch, ///< [in] length of each row in bytes in the buffer + ///< object + size_t bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< buffer object being read + size_t hostRowPitch, ///< [in] length of each row in bytes in the host + ///< memory region pointed by dst + size_t hostSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< host memory region pointed by dst + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::shared_lock SrcLock(Buffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, Queue->Mutex); + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, pDst, bufferOffset, + hostOffset, region, bufferRowPitch, hostRowPitch, bufferSlicePitch, + hostSlicePitch, blockingRead, numEventsInWaitList, phEventWaitList, + phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOffset, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOffset, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t bufferRowPitch, ///< [in] length of each row in bytes in the buffer + ///< object + size_t bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< buffer object being written + size_t hostRowPitch, ///< [in] length of each row in bytes in the host + ///< memory region pointed by src + size_t hostSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< host memory region pointed by src + void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< points to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, Queue, + const_cast(static_cast(pSrc)), ZeHandleDst, + hostOffset, bufferOffset, region, hostRowPitch, bufferRowPitch, + hostSlicePitch, bufferSlicePitch, blockingWrite, numEventsInWaitList, + phEventWaitList, phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t BufferSrc, ///< [in] handle of the src buffer object + ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object + size_t SrcOffset, ///< [in] offset into hBufferSrc to begin copying from + size_t DstOffset, ///< [in] offset info hBufferDst to begin copying into + size_t Size, ///< [in] size in bytes of data being copied + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. 
+ ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); + _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); + + UR_ASSERT(!SrcBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!DstBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + char *ZeHandleDst = nullptr; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + return enqueueMemCopyHelper( + UR_COMMAND_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset, + false, // blocking + Size, ZeHandleSrc + SrcOffset, NumEventsInWaitList, EventWaitList, + OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t BufferSrc, ///< [in] handle of the source buffer object + ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object + ur_rect_offset_t SrcOrigin, ///< [in] 3D offset in the source buffer + ur_rect_offset_t DstOrigin, ///< [in] 3D offset in the destination buffer + ur_rect_region_t SrcRegion, ///< [in] source 3D rectangular region + ///< descriptor: width, height, depth + size_t SrcRowPitch, ///< [in] length of each row in bytes in the source + ///< buffer object + size_t SrcSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< source buffer object + size_t DstRowPitch, ///< [in] length of each row in bytes in the destination + ///< buffer object + size_t DstSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< destination buffer object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); + _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); + + UR_ASSERT(!SrcBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!DstBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. 
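// Aside: the engine-selection heuristic used for buffer copies, condensed
// into a single hypothetical helper. This is a sketch of the logic in the
// surrounding code, not an adapter API.
static bool preferCopyEngineForCopy(bool SrcOnHost, bool DstOnHost,
                                    bool ForceCopyEngineForD2DCopy) {
  // Host <-> device transfers tend to benefit from the copy engine, while
  // device-to-device copies usually run faster on the compute engine unless
  // an experimental override asks for the copy engine anyway.
  return SrcOnHost || DstOnHost || ForceCopyEngineForD2DCopy;
}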
+ bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + char *ZeHandleDst = nullptr; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst, + SrcOrigin, DstOrigin, SrcRegion, SrcRowPitch, DstRowPitch, SrcSlicePitch, + DstSlicePitch, + false, // blocking + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buffer, ///< [in] handle of the buffer object + const void *Pattern, ///< [in] pointer to the fill pattern + size_t PatternSize, ///< [in] size in bytes of the pattern + size_t Offset, ///< [in] offset into the buffer + size_t Size, ///< [in] fill size in bytes, must be a multiple of patternSize + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper(UR_COMMAND_MEM_BUFFER_FILL, Queue, + ZeHandleDst + Offset, Pattern, PatternSize, Size, + NumEventsInWaitList, EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object + bool BlockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in + ///< the 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + size_t RowPitch, ///< [in] length of each row in bytes + size_t SlicePitch, ///< [in] length of each 2D slice of the 3D image + void *Dst, ///< [in] pointer to host memory where image is to be read into + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + std::scoped_lock Lock(Queue->Mutex, + Image->Mutex); + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_READ, Queue, Image, Dst, BlockingRead, &Origin, + nullptr, &Region, RowPitch, SlicePitch, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object + bool + BlockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in + ///< the 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + size_t RowPitch, ///< [in] length of each row in bytes + size_t SlicePitch, ///< [in] length of each 2D slice of the 3D image + void *Src, ///< [in] pointer to host memory where image is to be read into + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock Lock(Queue->Mutex, + Image->Mutex); + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_WRITE, Queue, Src, Image, BlockingWrite, nullptr, + &Origin, &Region, RowPitch, SlicePitch, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t ImageSrc, ///< [in] handle of the src image object + ur_mem_handle_t ImageDst, ///< [in] handle of the dest image object + ur_rect_offset_t SrcOrigin, ///< [in] defines the (x,y,z) offset in pixels + ///< in the source 1D, 2D, or 3D image + ur_rect_offset_t DstOrigin, ///< [in] defines the (x,y,z) offset in pixels + ///< in the destination 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::shared_lock SrcLock(ImageSrc->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, ImageDst->Mutex, Queue->Mutex); + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + // Images are always allocated on device. 
+ bool PreferCopyEngine = false; + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_COPY, Queue, ImageSrc, ImageDst, + false, // is_blocking + &SrcOrigin, &DstOrigin, &Region, + 0, // row pitch + 0, // slice pitch + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buf, ///< [in] handle of the buffer object + bool BlockingMap, ///< [in] indicates blocking (true), non-blocking (false) + ur_map_flags_t MapFlags, ///< [in] flags for read, write, readwrite mapping + size_t Offset, ///< [in] offset in bytes of the buffer region being mapped + size_t Size, ///< [in] size in bytes of the buffer region being mapped + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent, ///< [in,out][optional] return an event object that + ///< identifies this particular command instance. + void **RetMap ///< [in,out] return mapped pointer. TODO: move it before + ///< numEventsInWaitList? +) { + + auto Buffer = ur_cast<_ur_buffer *>(Buf); + + UR_ASSERT(!Buffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + ze_event_handle_t ZeEvent = nullptr; + + bool UseCopyEngine = false; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + UR_CALL( + createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, + Queue->CommandListMap.end(), IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } + + // Translate the host access mode info. + ur_mem_handle_t_::access_mode_t AccessMode = ur_mem_handle_t_::unknown; + if (MapFlags & UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION) + AccessMode = ur_mem_handle_t_::write_only; + else { + if (MapFlags & UR_MAP_FLAG_READ) { + AccessMode = ur_mem_handle_t_::read_only; + if (MapFlags & UR_MAP_FLAG_WRITE) + AccessMode = ur_mem_handle_t_::read_write; + } else if (MapFlags & UR_MAP_FLAG_WRITE) + AccessMode = ur_mem_handle_t_::write_only; + } + + UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown, + UR_RESULT_ERROR_INVALID_VALUE); + + // TODO: Level Zero is missing the memory "mapping" capabilities, so we are + // left to doing new memory allocation and a copy (read) on discrete devices. + // For integrated devices, we have allocated the buffer in host memory so no + // actions are needed here except for synchronizing on incoming events. + // A host-to-host copy is done if a host pointer had been supplied during + // buffer creation on integrated devices. + // + // TODO: for discrete, check if the input buffer is already allocated + // in shared memory and thus is accessible from the host as is. + // Can we get SYCL RT to predict/allocate in shared memory + // from the beginning? + + // For integrated devices the buffer has been allocated in host memory. 
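// Aside: a simplified, self-contained sketch of the two mapping strategies
// discussed above. FakeBuffer and mapRegion are illustrative stand-ins, and
// the "device" storage here is ordinary host memory so the example stays
// runnable; the real code uses Level Zero allocations and command lists.
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct FakeBuffer {
  char *Storage; // the buffer's backing allocation
  bool OnHost;   // true when the allocation is already host-accessible
};

static void *mapRegion(FakeBuffer &Buf, size_t Offset, size_t Size,
                       bool HostWillRead) {
  if (Buf.OnHost)
    return Buf.Storage + Offset; // integrated case: hand back a direct pointer

  // Discrete case: stage through a host allocation and copy only if the host
  // is actually going to read the mapped region.
  void *Staging = std::malloc(Size);
  if (Staging && HostWillRead)
    std::memcpy(Staging, Buf.Storage + Offset, Size);
  return Staging; // must be written back and freed at unmap time
}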
+ if (Buffer->OnHost) { + // Wait on incoming events before doing the copy + if (NumEventsInWaitList > 0) + UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + + if (Queue->isInOrderQueue()) + UR_CALL(urQueueFinish(Queue)); + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Guard(Buffer->Mutex); + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + + if (Buffer->MapHostPtr) { + *RetMap = Buffer->MapHostPtr + Offset; + if (ZeHandleSrc != Buffer->MapHostPtr && + AccessMode != ur_mem_handle_t_::write_only) { + memcpy(*RetMap, ZeHandleSrc + Offset, Size); + } + } else { + *RetMap = ZeHandleSrc + Offset; + } + + auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); + // False as the second value in pair means that mapping was not inserted + // because mapping already exists. + if (!Res.second) { + urPrint("urEnqueueMemBufferMap: duplicate mapping detected\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Signal this event + ZE2UR_CALL(zeEventHostSignal, (ZeEvent)); + (*Event)->Completed = true; + return UR_RESULT_SUCCESS; + } + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + if (Buffer->MapHostPtr) { + *RetMap = Buffer->MapHostPtr + Offset; + } else { + // TODO: use USM host allocator here + // TODO: Do we even need every map to allocate new host memory? + // In the case when the buffer is "OnHost" we use single allocation. + UR_CALL(ZeHostMemAllocHelper(RetMap, Queue->Context, Size)); + } + + // Take a shortcut if the host is not going to read buffer's data. + if (AccessMode == ur_mem_handle_t_::write_only) { + (*Event)->Completed = true; + } else { + // For discrete devices we need a command list + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // Add the event to the command list. + CommandList->second.append(reinterpret_cast(*Event)); + (*Event)->RefCount.increment(); + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, + WaitList.Length, WaitList.ZeEventList)); + + UR_CALL(Queue->executeCommandList(CommandList, BlockingMap)); + } + + auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); + // False as the second value in pair means that mapping was not inserted + // because mapping already exists. + if (!Res.second) { + urPrint("urEnqueueMemBufferMap: duplicate mapping detected\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Mem, ///< [in] handle of the memory (buffer or image) object + void *MappedPtr, ///< [in] mapped host address + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. 
+ ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + UR_ASSERT(!Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto Buffer = ur_cast<_ur_buffer *>(Mem); + + bool UseCopyEngine = false; + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, + Queue->CommandListMap.end(), + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } + + _ur_buffer::Mapping MapInfo = {}; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Guard(Buffer->Mutex); + auto It = Buffer->Mappings.find(MappedPtr); + if (It == Buffer->Mappings.end()) { + urPrint("urEnqueueMemUnmap: unknown memory mapping\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + MapInfo = It->second; + Buffer->Mappings.erase(It); + + // NOTE: we still have to free the host memory allocated/returned by + // piEnqueueMemBufferMap, but can only do so after the above copy + // is completed. Instead of waiting for It here (blocking), we shall + // do so in piEventRelease called for the pi_event tracking the unmap. + // In the case of an integrated device, the map operation does not allocate + // any memory, so there is nothing to free. This is indicated by a nullptr. + (*Event)->CommandData = + (Buffer->OnHost ? nullptr : (Buffer->MapHostPtr ? nullptr : MappedPtr)); + } + + // For integrated devices the buffer is allocated in host memory. + if (Buffer->OnHost) { + // Wait on incoming events before doing the copy + if (NumEventsInWaitList > 0) + UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + + if (Queue->isInOrderQueue()) + UR_CALL(urQueueFinish(Queue)); + + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + std::scoped_lock Guard(Buffer->Mutex); + if (Buffer->MapHostPtr) + memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size); + + // Signal this event + ZE2UR_CALL(zeEventHostSignal, (ZeEvent)); + (*Event)->Completed = true; + return UR_RESULT_SUCCESS; + } + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + reinterpret_cast(Queue), CommandList, UseCopyEngine)); + + CommandList->second.append(reinterpret_cast(*Event)); + (*Event)->RefCount.increment(); + + const auto &ZeCommandList = CommandList->first; + + // TODO: Level Zero is missing the memory "mapping" capabilities, so we are + // left to doing copy (write back to the device). + // + // NOTE: Keep this in sync with the implementation of + // piEnqueueMemBufferMap. 
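// Aside: the unmap counterpart of the earlier mapping sketch, again with
// illustrative types and names only. At unmap time a staged host copy is
// written back into the buffer and the staging allocation is freed; a buffer
// that was mapped in place needs no copy.
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct FakeMapping {
  size_t Offset;
  size_t Size;
};

static void unmapRegion(char *BufferStorage, bool MappedInPlace,
                        void *MappedPtr, const FakeMapping &Map) {
  if (!MappedInPlace) {
    std::memcpy(BufferStorage + Map.Offset, MappedPtr, Map.Size); // write back
    std::free(MappedPtr); // release the staging allocation made at map time
  }
  // Event signaling and command-list handling are omitted from this sketch.
}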
+ + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, + MapInfo.Size, ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemset( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + void *Ptr, ///< [in] pointer to USM memory object + int8_t ByteValue, ///< [in] byte value to fill + size_t Count, ///< [in] size in bytes to be set + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t *Event ///< [in,out][optional] return an event object that + ///< identifies this particular command instance. +) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = ByteValue; + std::ignore = Count; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking copy + void *Dst, ///< [in] pointer to the destination USM memory object + const void *Src, ///< [in] pointer to the source USM memory object + size_t Size, ///< [in] size in bytes to be copied + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock lock(Queue->Mutex); + + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper( // TODO: do we need a new command type for this? 
+ UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src, + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be fetched + ur_usm_migration_flags_t Flags, ///< [in] USM prefetch flags + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + bool UseCopyEngine = false; + + // Please note that the following code should be run before the + // subsequent getAvailableCommandList() call so that there is no + // dead-lock from waiting unsubmitted events in an open batch. + // The createAndRetainUrZeEventList() has the proper side-effect + // of submitting batches with dependent events. + // + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + // TODO: Change UseCopyEngine argument to 'true' once L0 backend + // support is added + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // TODO: do we need to create a unique command type for this? + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &WaitList = (*Event)->WaitList; + const auto &ZeCommandList = CommandList->first; + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Mem, Size)); + + // TODO: Level Zero does not have a completion "event" with the prefetch API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemAdvise( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be advised + ur_mem_advice_t Advice, ///< [in] USM memory advice + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + auto ZeAdvice = ur_cast(Advice); + + bool UseCopyEngine = false; + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue, + UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + // UseCopyEngine is set to 'false' here. + // TODO: Additional analysis is required to check if this operation will + // run faster on copy engines. + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // TODO: do we need to create a unique command type for this? + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (ZeCommandList, Queue->Device->ZeDevice, Mem, Size, ZeAdvice)); + + // TODO: Level Zero does not have a completion "event" with the advise API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + Queue->executeCommandList(CommandList, false); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + void *Mem, ///< [in] pointer to memory to be filled. + size_t Pitch, ///< [in] the total width of the destination memory including + ///< padding. + size_t PatternSize, ///< [in] the size in bytes of the pattern. + const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. + size_t Width, ///< [in] the width in bytes of each row to fill. + size_t Height, ///< [in] the height of the columns to fill. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::ignore = Queue; + std::ignore = Mem; + std::ignore = Pitch; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = OutEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemset2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + void *Mem, ///< [in] pointer to memory to be filled. + size_t Pitch, ///< [in] the total width of the destination memory including + ///< padding. + int Value, ///< [in] the value to fill into the region in pMem. + size_t Width, ///< [in] the width in bytes of each row to set. + size_t Height, ///< [in] the height of the columns to set. 
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::ignore = Queue; + std::ignore = Mem; + std::ignore = Pitch; + std::ignore = Value; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = OutEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + bool Blocking, ///< [in] indicates if this operation should block the host. + void *Dst, ///< [in] pointer to memory where data will be copied. + size_t DstPitch, ///< [in] the total width of the source memory including + ///< padding. + const void *Src, ///< [in] pointer to memory to be copied. + size_t SrcPitch, ///< [in] the total width of the source memory including + ///< padding. + size_t Width, ///< [in] the width in bytes of each row to be copied. + size_t Height, ///< [in] the height of columns to be copied. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + + ur_rect_offset_t ZeroOffset{0, 0, 0}; + ur_rect_region_t Region{Width, Height, 0}; + + std::scoped_lock lock(Queue->Mutex); + + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyRectHelper( // TODO: do we need a new command type for + // this? 
+ UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset, + Region, SrcPitch, DstPitch, 0, /*SrcSlicePitch=*/ + 0, /*DstSlicePitch=*/ + Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description + void *Host, ///< [in] pointer to the buffer data + ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created +) { + ze_image_format_type_t ZeImageFormatType; + size_t ZeImageFormatTypeSize; + switch (ImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_FLOAT: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 8; + break; + } + default: + urPrint("urMemImageCreate: unsupported image data type: data type = %d\n", + ImageFormat->channelType); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // TODO: populate the layout mapping + ze_image_format_layout_t ZeImageFormatLayout; + switch (ImageFormat->channelOrder) { + case UR_IMAGE_CHANNEL_ORDER_RGBA: { + switch (ZeImageFormatTypeSize) { + case 8: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; + break; + case 16: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; + break; + case 32: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; + break; + default: + urPrint("urMemImageCreate: unexpected data type Size\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + break; + } + default: + urPrint("format layout = %d\n", ImageFormat->channelOrder); + die("urMemImageCreate: unsupported image format layout\n"); + break; + } + + ze_image_format_t ZeFormatDesc = { + ZeImageFormatLayout, ZeImageFormatType, + // TODO: are swizzles deducted from image_format->image_channel_order? 
+      ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
+      ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
+
+  ze_image_type_t ZeImageType;
+  switch (ImageDesc->type) {
+  case UR_MEM_TYPE_IMAGE1D:
+    ZeImageType = ZE_IMAGE_TYPE_1D;
+    break;
+  case UR_MEM_TYPE_IMAGE2D:
+    ZeImageType = ZE_IMAGE_TYPE_2D;
+    break;
+  case UR_MEM_TYPE_IMAGE3D:
+    ZeImageType = ZE_IMAGE_TYPE_3D;
+    break;
+  case UR_MEM_TYPE_IMAGE1D_ARRAY:
+    ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
+    break;
+  case UR_MEM_TYPE_IMAGE2D_ARRAY:
+    ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
+    break;
+  default:
+    urPrint("urMemImageCreate: unsupported image type\n");
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  }
+
+  ZeStruct<ze_image_desc_t> ZeImageDesc;
+  ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
+  ZeImageDesc.type = ZeImageType;
+  ZeImageDesc.format = ZeFormatDesc;
+  ZeImageDesc.width = ur_cast<uint64_t>(ImageDesc->width);
+  ZeImageDesc.height = ur_cast<uint32_t>(ImageDesc->height);
+  ZeImageDesc.depth = ur_cast<uint32_t>(ImageDesc->depth);
+  ZeImageDesc.arraylevels = ur_cast<uint32_t>(ImageDesc->arraySize);
+  ZeImageDesc.miplevels = ImageDesc->numMipLevel;
+
+  std::shared_lock<ur_shared_mutex> Lock(Context->Mutex);
+
+  // Currently, in a context with multiple root devices, the "0" device owns
+  // the image.
+  // TODO: Implement explicit copying for accessing the image from other
+  // devices in the context.
+  ur_device_handle_t Device = Context->SingleRootDevice
+                                  ? Context->SingleRootDevice
+                                  : Context->Devices[0];
+  ze_image_handle_t ZeImage;
+  ZE2UR_CALL(zeImageCreate,
+             (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage));
+
+  try {
+    auto UrImage =
+        new _ur_image(ur_cast<ur_context_handle_t>(Context), ZeImage);
+    *Mem = reinterpret_cast<ur_mem_handle_t>(UrImage);
+
+#ifndef NDEBUG
+    UrImage->ZeImageDesc = ZeImageDesc;
+#endif // !NDEBUG
+
+    if ((Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0 ||
+        (Flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) != 0) {
+      // Initialize the image synchronously with immediate offload.
+      // zeCommandListAppendImageCopyFromMemory must not be called from
+      // simultaneous threads with the same command list handle, so we need an
+      // exclusive lock.
+      std::scoped_lock<ur_mutex> Lock(Context->ImmediateCommandListMutex);
+      ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory,
+                 (Context->ZeCommandListInit, ZeImage, Host, nullptr, nullptr,
+                  0, nullptr));
+    }
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    ur_mem_flags_t Flags, ///< [in] allocation and usage information flags
+    size_t Size, ///< [in] size in bytes of the memory object to be allocated
+    void *Host,  ///< [in][optional] pointer to the buffer data
+    ur_mem_handle_t
+        *RetBuffer ///< [out] pointer to handle of the memory buffer created
+) {
+  if (Flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
+    // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for a buffer requires allocation of
+    // pinned host memory, see:
+    // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc
+    // We are however missing such functionality in Level Zero, so we just
+    // ignore the flag for now.
+    //
+  }
+
+  // If the USM Import feature is enabled and a hostptr is supplied,
+  // import the hostptr if it is not already imported into USM.
+  // Data transfer rate is maximized when both source and destination
+  // are USM pointers. Promotion of the host pointer to USM thus
+  // optimizes data transfer performance.
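+  // Illustrative usage (hypothetical application code; the names Ctx, HostBuf
+  // and Buf are made up): a regular malloc'ed pointer passed with
+  // UR_MEM_FLAG_USE_HOST_POINTER is a candidate for this import path, e.g.
+  //   void *HostBuf = malloc(Size);
+  //   urMemBufferCreate(Ctx, UR_MEM_FLAG_USE_HOST_POINTER, Size, HostBuf, &Buf);
+  // When USM import is enabled, HostBuf may be promoted to USM host memory so
+  // that later copies between it and device allocations take the faster
+  // USM-to-USM path; the promotion is undone when the buffer is released.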
+ bool HostPtrImported = false; + if (ZeUSMImport.Enabled && Host != nullptr && + (Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0) { + // Query memory type of the host pointer + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Host, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // If not shared of any type, we can import the ptr + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { + // Promote the host ptr to USM host memory + ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver; + ZeUSMImport.doZeUSMImport(driverHandle, Host, Size); + HostPtrImported = true; + } + } + + _ur_buffer *Buffer = nullptr; + auto HostPtrOrNull = (Flags & UR_MEM_FLAG_USE_HOST_POINTER) + ? reinterpret_cast(Host) + : nullptr; + try { + Buffer = new _ur_buffer(Context, Size, HostPtrOrNull, HostPtrImported); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Initialize the buffer with user data + if (Host) { + if ((Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0 || + (Flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) != 0) { + + // We don't yet know which device needs this buffer, so make the first + // device in the context be the master, and hold the initial valid + // allocation. + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Context->Devices[0])); + if (Buffer->OnHost) { + // Do a host to host copy. + // For an imported HostPtr the copy is unneeded. + if (!HostPtrImported) + memcpy(ZeHandleDst, Host, Size); + } else { + // Initialize the buffer synchronously with immediate offload + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (Context->ZeCommandListInit, ZeHandleDst, Host, Size, + nullptr, 0, nullptr)); + } + } else if (Flags == 0 || (Flags == UR_MEM_FLAG_READ_WRITE)) { + // Nothing more to do. + } else + die("urMemBufferCreate: not implemented"); + } + + *RetBuffer = reinterpret_cast(Buffer); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain( + ur_mem_handle_t Mem ///< [in] handle of the memory object to get access +) { + Mem->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( + ur_mem_handle_t Mem ///< [in] handle of the memory object to release +) { + if (!Mem->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + if (Mem->isImage()) { + char *ZeHandleImage; + UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); + auto ZeResult = ZE_CALL_NOCHECK( + zeImageDestroy, (ur_cast(ZeHandleImage))); + // Gracefully handle the case that L0 was already unloaded. 
+ if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } else { + auto Buffer = reinterpret_cast<_ur_buffer *>(Mem); + Buffer->free(); + } + delete Mem; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t + Buffer, ///< [in] handle of the buffer object to allocate from + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + ur_buffer_create_type_t BufferCreateType, ///< [in] buffer creation type + ur_buffer_region_t + *BufferCreateInfo, ///< [in] pointer to buffer create region information + ur_mem_handle_t + *RetMem ///< [out] pointer to the handle of sub buffer created +) { + UR_ASSERT(Buffer && !Buffer->isImage() && + !(static_cast<_ur_buffer *>(Buffer))->isSubBuffer(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock Guard(Buffer->Mutex); + + if (Flags != UR_MEM_FLAG_READ_WRITE) { + die("urMemBufferPartition: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + + try { + auto partitionedBuffer = + new _ur_buffer(static_cast<_ur_buffer *>(Buffer), + BufferCreateInfo->origin, BufferCreateInfo->size); + *RetMem = reinterpret_cast(partitionedBuffer); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( + ur_mem_handle_t Mem, ///< [in] handle of the mem. + ur_native_handle_t + *NativeMem ///< [out] a pointer to the native handle of the mem. +) { + std::shared_lock Guard(Mem->Mutex); + char *ZeHandle = nullptr; + UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write)); + *NativeMem = ur_cast(ZeHandle); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_handle_t + *Mem ///< [out] pointer to the handle of the mem object created. +) { + std::shared_lock Lock(Context->Mutex); + + // TODO: Get OwnNativeHandle from the output parameter while we get it in + // interface + bool OwnNativeHandle = (*Mem)->OwnNativeHandle; + + // Get base of the allocation + void *Base = nullptr; + size_t Size = 0; + void *Ptr = ur_cast(NativeMem); + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size)); + UR_ASSERT(Ptr == Base, UR_RESULT_ERROR_INVALID_VALUE); + + ZeStruct ZeMemProps; + ze_device_handle_t ZeDevice = nullptr; + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice)); + + // Check type of the allocation + switch (ZeMemProps.type) { + case ZE_MEMORY_TYPE_HOST: + case ZE_MEMORY_TYPE_SHARED: + case ZE_MEMORY_TYPE_DEVICE: + break; + case ZE_MEMORY_TYPE_UNKNOWN: + // Memory allocation is unrelated to the context + return UR_RESULT_ERROR_INVALID_CONTEXT; + default: + die("Unexpected memory type"); + } + + ur_device_handle_t Device{}; + if (ZeDevice) { + Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice); + UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_CONTEXT); + } + + _ur_buffer *Buffer = nullptr; + try { + Buffer = new _ur_buffer(Context, Device, Size); + *Mem = reinterpret_cast(Buffer); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + // If we don't own the native handle then we can't control deallocation of + // that memory so there is no point of keeping track of the memory + // allocation for deferred memory release in the mode when indirect access + // tracking is enabled. + if (IndirectAccessTrackingEnabled && OwnNativeHandle) { + // We need to keep track of all memory allocations in the context + ContextsLock.lock(); + // Retain context to be sure that it is released after all memory + // allocations in this context are released. + UR_CALL(urContextRetain(Context)); + + Context->MemAllocs.emplace( + std::piecewise_construct, std::forward_as_tuple(Ptr), + std::forward_as_tuple(Context, + true /*ownNativeHandle, how do we pass it here? or + do we move all this logic to pi2ur? */ + )); + } + + if (Device) { + // If this allocation is on a device, then we re-use it for the buffer. + // Nothing to do. + } else if (Buffer->OnHost) { + // If this is host allocation and buffer always stays on host there + // nothing more to do. + } else { + // In all other cases (shared allocation, or host allocation that cannot + // represent the buffer in this context) copy the data to a newly + // created device allocation. + char *ZeHandleDst; + UR_CALL( + Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Device)); + + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0, + nullptr)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( + ur_mem_handle_t Memory, ///< [in] handle to the memory object being queried. + ur_mem_info_t MemInfoType, ///< [in] type of the info to retrieve. + size_t PropSize, ///< [in] the number of bytes of memory pointed to by + ///< pMemInfo. + void *MemInfo, ///< [out][optional] array of bytes holding the info. + ///< If propSize is less than the real number of bytes + ///< needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pMemInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by pMemInfo. +) { + UR_ASSERT(!Memory->isImage(), UR_RESULT_ERROR_INVALID_VALUE); + + auto Buffer = reinterpret_cast<_ur_buffer *>(Memory); + std::shared_lock Lock(Buffer->Mutex); + UrReturnHelper ReturnValue(PropSize, MemInfo, PropSizeRet); + + switch (MemInfoType) { + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(Buffer->UrContext); + } + case UR_MEM_INFO_SIZE: { + // Get size of the allocation + return ReturnValue(size_t{Buffer->Size}); + } + default: { + die("urMemGetInfo: Parameter is not implemented"); + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( + ur_mem_handle_t Memory, ///< [in] handle to the image object being queried. + ur_image_info_t ImgInfoType, ///< [in] type of image info to retrieve. + size_t PropSize, ///< [in] the number of bytes of memory pointer to by + ///< pImgInfo. + void *ImgInfo, ///< [out][optional] array of bytes holding the info. 
+ ///< If propSize is less than the real number of bytes + ///< needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pImgInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by pImgInfo. +) { + std::ignore = Memory; + std::ignore = ImgInfoType; + std::ignore = PropSize; + std::ignore = ImgInfo; + std::ignore = PropSizeRet; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Align, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM host memory object +) { + std::ignore = Pool; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Align > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_usm_mem_flags_t *USMFlag = &USMDesc->flags; + std::ignore = USMFlag; + + ur_platform_handle_t Plt = Context->getPlatform(); + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Align & (Align - 1)) != 0)) { + ur_usm_mem_flags_t Properties{}; + ur_result_t Res = + USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + // There is a single allocator for Host USM allocations, so we don't need to + // find the allocator depending on context as we do for Shared and Device + // allocations. 
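+  // For contrast (see urUSMDeviceAlloc/urUSMSharedAlloc below), device and
+  // shared allocations go through per-device allocator maps keyed by the
+  // native device handle, roughly:
+  //   auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice);
+  //   *RetMem = It->second.allocate(Size, Alignment);
+  // while all host allocations come from the single HostMemAllocContext used
+  // in the try-block below.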
+ try { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Alignment, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM device memory object +) { + std::ignore = Pool; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_usm_mem_flags_t *USMProp = &USMDesc->flags; + std::ignore = USMProp; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. 
+ ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = + USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Alignment, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM shared memory object +) { + std::ignore = Pool; + + ur_usm_mem_flags_t *Properties = &USMDesc->flags; + + // See if the memory is going to be read-only on the device. + bool DeviceReadOnly = false; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + if (IndirectAccessTrackingEnabled) { + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. 
+ ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, Properties, + Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (DeviceReadOnly) { + Context->SharedReadOnlyAllocs.insert(*RetMem); + } + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t Context, ///< [in] handle of the context object + void *Mem ///< [in] pointer to USM memory object +) { + ur_platform_handle_t Plt = Context->getPlatform(); + + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + return USMFreeHelper(Context, Mem); +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( + ur_context_handle_t Context, ///< [in] handle of the context object + const void *Ptr, ///< [in] pointer to USM memory object + ur_usm_alloc_info_t + PropName, ///< [in] the name of the USM allocation property to query + size_t PropValueSize, ///< [in] size in bytes of the USM allocation property + ///< value + void *PropValue, ///< [out][optional] value of the USM allocation property + size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM + ///< allocation property +) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + switch (PropName) { + case UR_USM_ALLOC_INFO_TYPE: { + pi_usm_type MemAllocaType; + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + MemAllocaType = PI_MEM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + MemAllocaType = PI_MEM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + MemAllocaType = PI_MEM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + MemAllocaType = PI_MEM_TYPE_SHARED; + break; + default: + urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(MemAllocaType); + } + case UR_USM_ALLOC_INFO_DEVICE: + if (ZeDeviceHandle) { + auto Platform = Context->getPlatform(); + auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + return Device ? 
ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + case UR_USM_ALLOC_INFO_BASE_PTR: { + void *Base; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); + return ReturnValue(Base); + } + case UR_USM_ALLOC_INFO_SIZE: { + size_t Size; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); + return ReturnValue(Size); + } + default: + urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + return UR_RESULT_SUCCESS; +} + +void *USMMemoryAllocBase::allocate(size_t Size) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + + return Ptr; +} + +void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, Alignment); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + return Ptr; +} + +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } +} + +ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, + size_t Size, + uint32_t Alignment) { + ur_usm_mem_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, + Alignment); +} + +ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); +} + +enum class USMAllocationForceResidencyType { + // [Default] Do not force memory residency at allocation time. + None = 0, + // Force memory resident on the device of allocation at allocation time. + // For host allocation force residency on all devices in a context. + Device = 1, + // Force memory resident on all devices in the context with P2P + // access to the device of allocation. + // For host allocation force residency on all devices in a context. 
+ P2PDevices = 2 +}; + +// Returns the desired USM residency setting +static USMAllocationForceResidencyType USMAllocationForceResidency = [] { + const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + if (!Str) + return USMAllocationForceResidencyType::None; + switch (std::atoi(Str)) { + case 1: + return USMAllocationForceResidencyType::Device; + case 2: + return USMAllocationForceResidencyType::P2PDevices; + default: + return USMAllocationForceResidencyType::None; + }; +}(); + +// Make USM allocation resident as requested +static ur_result_t USMAllocationMakeResident( + ur_context_handle_t Context, + ur_device_handle_t Device, // nullptr for host allocation + void *Ptr, size_t Size) { + + std::list Devices; + + if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) + return UR_RESULT_SUCCESS; + else if (!Device) { + // Host allocation, make it resident on all devices in the context + Devices.insert(Devices.end(), Context->Devices.begin(), + Context->Devices.end()); + } else { + Devices.push_back(Device); + if (USMAllocationForceResidency == + USMAllocationForceResidencyType::P2PDevices) { + ze_bool_t P2P; + for (const auto &D : Context->Devices) { + if (D == Device) + continue; + // TODO: Cache P2P devices for a context + ZE2UR_CALL(zeDeviceCanAccessPeer, + (D->ZeDevice, Device->ZeDevice, &P2P)); + if (P2P) + Devices.push_back(D); + } + } + } + for (const auto &D : Devices) { + ZE2UR_CALL(zeContextMakeMemoryResident, + (Context->ZeContext, D->ZeDevice, Ptr, Size)); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = PoolDesc; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, + Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + return UR_RESULT_SUCCESS; +} + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_mem_flags_t *, + size_t Size, uint32_t Alignment) { + + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + 
ZeHostDesc.flags = 0; + ZeStruct ZeDevDesc; + ZeDevDesc.flags = 0; + ZeDevDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDevDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, + Size, Alignment, Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + + // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. + return UR_RESULT_SUCCESS; +} + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + ContextsLock.lock(); + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + + return UR_RESULT_SUCCESS; +} bool ShouldUseUSMAllocator() { // Enable allocator by default if it's not explicitly disabled @@ -15,4 +2538,535 @@ bool ShouldUseUSMAllocator() { const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return Ret == nullptr; } -const bool UseUSMAllocator = ShouldUseUSMAllocator(); \ No newline at end of file + +const bool UseUSMAllocator = ShouldUseUSMAllocator(); + +// Helper function to deallocate USM memory, if indirect access support is +// enabled then a caller must lock the platform-level mutex guarding the +// container with contexts because deallocating the memory can turn RefCount of +// a context to 0 and as a result the context being removed from the list of +// tracked contexts. +// If indirect access tracking is not enabled then caller must lock Context +// mutex. +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle) { + if (!OwnZeMemHandle) { + // Memory should not be freed + return UR_RESULT_SUCCESS; + } + + if (IndirectAccessTrackingEnabled) { + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. 
+ return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + if (!UseUSMAllocator) { + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; + } + + // Query the device of the allocation to determine the right allocator context + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer we're freeing to determine the correct + // way to do it(directly or via an allocator) + auto ZeResult = + ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // Handle the case that L0 RT was already unloaded + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } else if (ZeResult) { + return ze2urResult(ZeResult); + } + + // If memory type is host release from host pool + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { + try { + Context->HostMemAllocContext->deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } + + // Points out an allocation in SharedReadOnlyMemAllocContexts + auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); + + if (!ZeDeviceHandle) { + // The only case where it is OK not have device identified is + // if the memory is not known to the driver. We should not ever get + // this either, probably. + UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_DEVICE); + } else { + ur_device_handle_t Device; + // All context member devices or their descendants are of the same platform. + auto Platform = Context->getPlatform(); + Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); + + auto DeallocationHelper = + [Context, Device, + Ptr](std::unordered_map + &AllocContextMap) { + try { + auto It = AllocContextMap.find(Device->ZeDevice); + if (It == AllocContextMap.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + // The right context is found, deallocate the pointer + It->second.deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + }; + + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_SHARED: + // Distinguish device_read_only allocations since they have own pool. + SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); + return DeallocationHelper(SharedReadOnlyAllocsIterator != + Context->SharedReadOnlyAllocs.end() + ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + case ZE_MEMORY_TYPE_DEVICE: + return DeallocationHelper(Context->DeviceMemAllocContexts); + default: + // Handled below + break; + } + } + + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { + Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; +} + +// If indirect access tracking is enabled then performs reference counting, +// otherwise just calls zeMemAllocDevice. +static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, + ur_context_handle_t Context, + ur_device_handle_t Device, + size_t Size) { + ur_platform_handle_t Plt = Device->Platform; + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. + UR_CALL(urContextRetain(Context)); + } + + ze_device_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, 1, + Device->ZeDevice, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple(Context)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, + ur_device_handle_t Device) { + + // NOTE: There might be no valid allocation at all yet and we get + // here from piEnqueueKernelLaunch that would be doing the buffer + // initialization. In this case the Device is not null as kernel + // launch is always on a specific device. + if (!Device) + Device = LastDeviceWithValidAllocation; + // If the device is still not selected then use the first one in + // the context of the buffer. + if (!Device) + Device = UrContext->Devices[0]; + + auto &Allocation = Allocations[Device]; + + // Sub-buffers don't maintain own allocations but rely on parent buffer. + if (isSubBuffer()) { + UR_CALL(SubBuffer.Parent->getZeHandle(ZeHandle, AccessMode, Device)); + ZeHandle += SubBuffer.Origin; + // Still store the allocation info in the PI sub-buffer for + // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to + // be given a pointer to the allocation handle rather than its value. + // + Allocation.ZeHandle = ZeHandle; + Allocation.ReleaseAction = allocation_t::keep; + LastDeviceWithValidAllocation = Device; + return UR_RESULT_SUCCESS; + } + + // First handle case where the buffer is represented by only + // a single host allocation. + if (OnHost) { + auto &HostAllocation = Allocations[nullptr]; + // The host allocation may already exists, e.g. with imported + // host ptr, or in case of interop buffer. 
+    if (!HostAllocation.ZeHandle) {
+      if (USMAllocatorConfigInstance.EnableBuffers) {
+        HostAllocation.ReleaseAction = allocation_t::free;
+        ur_usm_desc_t USMDesc{};
+        ur_usm_pool_handle_t Pool{};
+        UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, getAlignment(),
+                               reinterpret_cast<void **>(&ZeHandle)));
+      } else {
+        HostAllocation.ReleaseAction = allocation_t::free_native;
+        UR_CALL(ZeHostMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
+                                     UrContext, Size));
+      }
+      HostAllocation.ZeHandle = ZeHandle;
+      HostAllocation.Valid = true;
+    }
+    Allocation = HostAllocation;
+    Allocation.ReleaseAction = allocation_t::keep;
+    ZeHandle = Allocation.ZeHandle;
+    LastDeviceWithValidAllocation = Device;
+    return UR_RESULT_SUCCESS;
+  }
+  // Reads the user setting on how to deal with buffers in contexts where
+  // all devices have the same root-device. Returns "true" if the
+  // preference is to allocate on each [sub-]device and migrate
+  // normally (copy) to other sub-devices as needed. Returns "false"
+  // if the preference is to have single root-device allocations
+  // serve the needs of all [sub-]devices, meaning potentially more
+  // cross-tile traffic.
+  //
+  static const bool SingleRootDeviceBufferMigration = [] {
+    const char *EnvStr =
+        std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
+    if (EnvStr)
+      return (std::stoi(EnvStr) != 0);
+    // The default is to migrate normally, which may not always be the
+    // best option (depends on buffer access patterns), but is an
+    // overall win on the set of the available benchmarks.
+    return true;
+  }();
+
+  // Perform the actual device allocation as needed.
+  if (!Allocation.ZeHandle) {
+    if (!SingleRootDeviceBufferMigration && UrContext->SingleRootDevice &&
+        UrContext->SingleRootDevice != Device) {
+      // If all devices in the context are sub-devices of the same device
+      // then we reuse the root-device allocation for all sub-devices in the
+      // context.
+      // TODO: we can probably generalize this and share root-device
+      // allocations with their own sub-devices even if not all other
+      // devices in the context have the same root.
+      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice));
+      Allocation.ReleaseAction = allocation_t::keep;
+      Allocation.ZeHandle = ZeHandle;
+      Allocation.Valid = true;
+      return UR_RESULT_SUCCESS;
+    } else { // Create device allocation
+      if (USMAllocatorConfigInstance.EnableBuffers) {
+        Allocation.ReleaseAction = allocation_t::free;
+        ur_usm_desc_t USMDesc{};
+        ur_usm_pool_handle_t Pool{};
+        UR_CALL(urUSMDeviceAlloc(UrContext, Device, &USMDesc, Pool, Size,
+                                 getAlignment(),
+                                 reinterpret_cast<void **>(&ZeHandle)));
+      } else {
+        Allocation.ReleaseAction = allocation_t::free_native;
+        UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
+                                       UrContext, Device, Size));
+      }
+    }
+    Allocation.ZeHandle = ZeHandle;
+  } else {
+    ZeHandle = Allocation.ZeHandle;
+  }
+
+  // If some prior access invalidated this allocation then make it valid again.
+  if (!Allocation.Valid) {
+    // LastDeviceWithValidAllocation should always have a valid allocation.
+    if (Device == LastDeviceWithValidAllocation)
+      die("getZeHandle: last used allocation is not valid");
+
+    // For write-only access the allocation contents are not going to be used,
+    // so don't do anything to make them "valid".
+    bool NeedCopy = AccessMode != ur_mem_handle_t_::write_only;
+    // It's also possible that the buffer doesn't have a valid allocation
+    // yet, presumably when it is passed to a kernel that will perform
+    // its initialization.
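+    // Worked example (hypothetical devices A and B in the same context): the
+    // buffer is first written by a kernel on A, so A holds the only valid
+    // allocation. A later read_write access on B takes this path: NeedCopy is
+    // true and the data is copied from A's allocation either directly (if
+    // zeDeviceCanAccessPeer reports P2P access) or staged through the host
+    // allocation; B then becomes LastDeviceWithValidAllocation and A's copy
+    // is invalidated further below.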
+    if (NeedCopy && !LastDeviceWithValidAllocation) {
+      NeedCopy = false;
+    }
+    char *ZeHandleSrc = nullptr;
+    if (NeedCopy) {
+      UR_CALL(getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
+                          LastDeviceWithValidAllocation));
+      // It's possible with single root-device contexts that
+      // the buffer is represented by the single root-device
+      // allocation, in which case we skip the copy to itself.
+      if (ZeHandleSrc == ZeHandle)
+        NeedCopy = false;
+    }
+
+    if (NeedCopy) {
+      // Copy valid buffer data to this allocation.
+      // TODO: see if we should better use the peer's device allocation
+      // directly, if that capability is reported with zeDeviceCanAccessPeer,
+      // instead of maintaining a separate allocation and performing
+      // explicit copies.
+      //
+      // zeCommandListAppendMemoryCopy must not be called from simultaneous
+      // threads with the same command list handle, so we need an exclusive
+      // lock.
+      ze_bool_t P2P = false;
+      ZE2UR_CALL(
+          zeDeviceCanAccessPeer,
+          (Device->ZeDevice, LastDeviceWithValidAllocation->ZeDevice, &P2P));
+      if (!P2P) {
+        // P2P copy is not possible, so copy through the host.
+        auto &HostAllocation = Allocations[nullptr];
+        // The host allocation may already exist, e.g. with an imported
+        // host ptr, or in case of an interop buffer.
+        if (!HostAllocation.ZeHandle) {
+          void *ZeHandleHost;
+          if (USMAllocatorConfigInstance.EnableBuffers) {
+            HostAllocation.ReleaseAction = allocation_t::free;
+            ur_usm_desc_t USMDesc{};
+            ur_usm_pool_handle_t Pool{};
+            UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size,
+                                   getAlignment(), &ZeHandleHost));
+          } else {
+            HostAllocation.ReleaseAction = allocation_t::free_native;
+            UR_CALL(ZeHostMemAllocHelper(&ZeHandleHost, UrContext, Size));
+          }
+          HostAllocation.ZeHandle = reinterpret_cast<char *>(ZeHandleHost);
+          HostAllocation.Valid = false;
+        }
+        std::scoped_lock<ur_mutex> Lock(UrContext->ImmediateCommandListMutex);
+        if (!HostAllocation.Valid) {
+          ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                     (UrContext->ZeCommandListInit, HostAllocation.ZeHandle,
+                      ZeHandleSrc, Size, nullptr, 0, nullptr));
+          // Mark the host allocation data as valid so it can be reused.
+          // It will be invalidated below if the current access is not
+          // read-only.
+          HostAllocation.Valid = true;
+        }
+        ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                   (UrContext->ZeCommandListInit, ZeHandle,
+                    HostAllocation.ZeHandle, Size, nullptr, 0, nullptr));
+      } else {
+        // Perform P2P copy.
+        std::scoped_lock<ur_mutex> Lock(UrContext->ImmediateCommandListMutex);
+        ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                   (UrContext->ZeCommandListInit, ZeHandle, ZeHandleSrc, Size,
+                    nullptr, 0, nullptr));
+      }
+    }
+    Allocation.Valid = true;
+    LastDeviceWithValidAllocation = Device;
+  }
+
+  // Invalidate other allocations that would become not valid if
+  // this access is not read-only.
+  if (AccessMode != ur_mem_handle_t_::read_only) {
+    for (auto &Alloc : Allocations) {
+      if (Alloc.first != LastDeviceWithValidAllocation)
+        Alloc.second.Valid = false;
+    }
+  }
+
+  urPrint("getZeHandle(pi_device{%p}) = %p\n", (void *)Device,
+          (void *)Allocation.ZeHandle);
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t _ur_buffer::free() {
+  for (auto &Alloc : Allocations) {
+    auto &ZeHandle = Alloc.second.ZeHandle;
+    // It is possible that the real allocation wasn't made if the buffer
+    // wasn't really used in this location.
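+    // For reference, the ReleaseAction values handled below correspond to how
+    // the allocation was created: keep (sub-buffer or non-owned interop
+    // handle), unimport (host pointer imported in urMemBufferCreate), free
+    // (pooled USM allocation from urUSMHostAlloc/urUSMDeviceAlloc) and
+    // free_native (direct ZeHostMemAllocHelper/ZeDeviceMemAllocHelper
+    // allocation).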
+ if (!ZeHandle) + continue; + + switch (Alloc.second.ReleaseAction) { + case allocation_t::keep: + break; + case allocation_t::free: { + ur_platform_handle_t Plt = UrContext->getPlatform(); + std::scoped_lock Lock(IndirectAccessTrackingEnabled + ? Plt->ContextsMutex + : UrContext->Mutex); + + UR_CALL(USMFreeHelper(reinterpret_cast(UrContext), + ZeHandle)); + break; + } + case allocation_t::free_native: + UR_CALL(ZeMemFreeHelper(UrContext, ZeHandle)); + break; + case allocation_t::unimport: + ZeUSMImport.doZeUSMRelease(UrContext->getPlatform()->ZeDriver, ZeHandle); + break; + default: + die("_ur_buffer::free(): Unhandled release action"); + } + ZeHandle = nullptr; // don't leave hanging pointers + } + return UR_RESULT_SUCCESS; +} + +// Buffer constructor +_ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size, char *HostPtr, + bool ImportedHostPtr = false) + : ur_mem_handle_t_(Context), Size(Size), SubBuffer{nullptr, 0} { + + // We treat integrated devices (physical memory shared with the CPU) + // differently from discrete devices (those with distinct memories). + // For integrated devices, allocating the buffer in the host memory + // enables automatic access from the device, and makes copying + // unnecessary in the map/unmap operations. This improves performance. + OnHost = Context->Devices.size() == 1 && + Context->Devices[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; + + // Fill the host allocation data. + if (HostPtr) { + MapHostPtr = HostPtr; + // If this host ptr is imported to USM then use this as a host + // allocation for this buffer. + if (ImportedHostPtr) { + Allocations[nullptr].ZeHandle = HostPtr; + Allocations[nullptr].Valid = true; + Allocations[nullptr].ReleaseAction = _ur_buffer::allocation_t::unimport; + } + } + + // This initialization does not end up with any valid allocation yet. + LastDeviceWithValidAllocation = nullptr; +} + +_ur_buffer::_ur_buffer(ur_context_handle_t Context, ur_device_handle_t Device, + size_t Size) + : ur_mem_handle_t_(Context, Device), Size(Size) {} + +// Interop-buffer constructor +_ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size, + ur_device_handle_t Device, char *ZeMemHandle, + bool OwnZeMemHandle) + : ur_mem_handle_t_(Context, Device), Size(Size), SubBuffer{nullptr, 0} { + + // Device == nullptr means host allocation + Allocations[Device].ZeHandle = ZeMemHandle; + Allocations[Device].Valid = true; + Allocations[Device].ReleaseAction = + OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep; + + // Check if this buffer can always stay on host + OnHost = false; + if (!Device) { // Host allocation + if (Context->Devices.size() == 1 && + Context->Devices[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) { + OnHost = true; + MapHostPtr = ZeMemHandle; // map to this allocation + } + } + LastDeviceWithValidAllocation = Device; +} + +ur_result_t _ur_buffer::getZeHandlePtr(char **&ZeHandlePtr, + access_mode_t AccessMode, + ur_device_handle_t Device) { + char *ZeHandle; + UR_CALL(getZeHandle(ZeHandle, AccessMode, Device)); + ZeHandlePtr = &Allocations[Device].ZeHandle; + return UR_RESULT_SUCCESS; +} + +size_t _ur_buffer::getAlignment() const { + // Choose an alignment that is at most 64 and is the next power of 2 + // for sizes less than 64. 
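+  // For example, a 5-byte buffer gets an 8-byte alignment, a 100-byte buffer
+  // gets the 64-byte maximum, and a 1-byte buffer keeps alignment 1.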
+ auto Alignment = Size; + if (Alignment > 32UL) + Alignment = 64UL; + else if (Alignment > 16UL) + Alignment = 32UL; + else if (Alignment > 8UL) + Alignment = 16UL; + else if (Alignment > 4UL) + Alignment = 8UL; + else if (Alignment > 2UL) + Alignment = 4UL; + else if (Alignment > 1UL) + Alignment = 2UL; + else + Alignment = 1UL; + return Alignment; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + void *Ptr, ///< [in] pointer to USM memory object + size_t PatternSize, ///< [in] the size in bytes of the pattern. Must be a + ///< power of 2 and less than or equal to width. + const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. + size_t Size, ///< [in] size in bytes to be set. Must be a multiple of + ///< patternSize. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t *Event ///< [out][optional] return an event object that + ///< identifies this particular command instance. +) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Size; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index f8b2231909604..56b0c4a9dbaa6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -8,7 +8,296 @@ #pragma once #include "ur_level_zero_common.hpp" +#include +#include +#include +#include +#include +#include +#include -struct _ur_mem_handle_t : _ur_object { - _ur_mem_handle_t() {} +#include +#include +#include +#include +#include + +#include "ur_level_zero.hpp" + +struct ur_device_handle_t_; + +bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); + +// This is an experimental option to test performance of device to device copy +// operations on copy engines (versus compute engine) +const bool UseCopyEngineForD2DCopy = [] { + const char *CopyEngineForD2DCopy = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); +}(); + +// Shared by all memory read/write/copy PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. 
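+//
+// Illustrative call-site pattern (a sketch only; the lock template arguments
+// assume the ur_shared_mutex type this adapter uses for object mutexes):
+//   std::scoped_lock<ur_shared_mutex, ur_shared_mutex> Lock(Queue->Mutex,
+//                                                           DstBuffer->Mutex);
+//   std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex);
+//   UR_CALL(enqueueMemCopyHelper(CommandType, Queue, Dst,
+//                                /*BlockingWrite=*/false, Size, Src,
+//                                NumEventsInWaitList, EventWaitList, OutEvent,
+//                                PreferCopyEngine));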
+ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, + const void *Src, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, + bool PreferCopyEngine); + +ur_result_t enqueueMemCopyRectHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); + +ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr); + +// Exception type to pass allocation errors +class UsmAllocationException { + const ur_result_t Error; + +public: + UsmAllocationException(ur_result_t Err) : Error{Err} {} + ur_result_t getError() const { return Error; } }; + +struct ur_mem_handle_t_ : _ur_object { + // Keeps the PI context of this memory handle. + ur_context_handle_t UrContext; + + // Keeps device of this memory handle + ur_device_handle_t UrDevice; + + // Enumerates all possible types of accesses. + enum access_mode_t { unknown, read_write, read_only, write_only }; + + // Interface of the _ur_mem object + + // Get the Level Zero handle of the current memory object + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t Device = nullptr) = 0; + + // Get a pointer to the Level Zero handle of the current memory object + virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t Device = nullptr) = 0; + + // Method to get type of the derived object (image or buffer) + virtual bool isImage() const = 0; + + virtual ~ur_mem_handle_t_() = default; + +protected: + ur_mem_handle_t_(ur_context_handle_t Context) : UrContext{Context} {} + + ur_mem_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) + : UrContext{Context}, UrDevice(Device) {} +}; + +struct _ur_buffer final : ur_mem_handle_t_ { + // Buffer constructor + _ur_buffer(ur_context_handle_t Context, ur_device_handle_t UrDevice, + size_t Size); + + _ur_buffer(ur_context_handle_t Context, size_t Size, char *HostPtr, + bool ImportedHostPtr); + + // Sub-buffer constructor + _ur_buffer(_ur_buffer *Parent, size_t Origin, size_t Size) + : ur_mem_handle_t_(Parent->UrContext), Size(Size), + SubBuffer{Parent, Origin} {} + + // Interop-buffer constructor + _ur_buffer(ur_context_handle_t Context, size_t Size, + ur_device_handle_t Device, char *ZeMemHandle, bool OwnZeMemHandle); + + // Returns a pointer to the USM allocation representing this PI buffer + // on the specified Device. If Device is nullptr then the returned + // USM allocation is on the device where this buffer was used the latest. + // The returned allocation is always valid, i.e. its contents is + // up-to-date and any data copies needed for that are performed under + // the hood. + // + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t Device = nullptr) override; + virtual ur_result_t + getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t Device = nullptr) override; + + bool isImage() const override { return false; } + + bool isSubBuffer() const { return SubBuffer.Parent != nullptr; } + + // Frees all allocations made for the buffer. 
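To make the getZeHandle contract above concrete: callers name the device they are about to touch (or pass nullptr for the host allocation / last-used device), and the buffer lazily creates or refreshes the per-device USM allocation. A hedged sketch of what an in-adapter call site looks like; deviceAddressOf is a hypothetical helper, and UR_CALL is the error-propagation macro used throughout this patch. The free() declaration announced above follows right after this sketch.

// Hypothetical helper (illustration only): get the USM pointer backing
// Buffer on Device, letting the buffer migrate data if its copy there
// is stale.
ur_result_t deviceAddressOf(_ur_buffer *Buffer, ur_device_handle_t Device,
                            void **Out) {
  char *ZeHandle = nullptr;
  // read_only: contents are made current on Device before the pointer is
  // handed back; presumably a write_only access lets the buffer skip that copy.
  UR_CALL(Buffer->getZeHandle(ZeHandle, _ur_buffer::read_only, Device));
  *Out = ZeHandle; // plain USM pointer, suitable for a Level Zero copy
  return UR_RESULT_SUCCESS;
}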
+ ur_result_t free(); + + // Information about a single allocation representing this buffer. + struct allocation_t { + // Level Zero memory handle is really just a naked pointer. + // It is just convenient to have it char * to simplify offset arithmetics. + char *ZeHandle{nullptr}; + // Indicates if this allocation's data is valid. + bool Valid{false}; + // Specifies the action that needs to be taken for this + // allocation at buffer destruction. + enum { + keep, // do nothing, the allocation is not owned by us + unimport, // release of the imported allocation + free, // free from the pooling context (default) + free_native // free with a native call + } ReleaseAction{free}; + }; + + // We maintain multiple allocations on possibly all devices in the context. + // The "nullptr" device identifies a host allocation representing buffer. + // Sub-buffers don't maintain own allocations but rely on parent buffer. + std::unordered_map Allocations; + ur_device_handle_t LastDeviceWithValidAllocation{nullptr}; + + // Flag to indicate that this memory is allocated in host memory. + // Integrated device accesses this memory. + bool OnHost{false}; + + // Tells the host allocation to use for buffer map operations. + char *MapHostPtr{nullptr}; + + // Supplementary data to keep track of the mappings of this buffer + // created with piEnqueueMemBufferMap. + struct Mapping { + // The offset in the buffer giving the start of the mapped region. + size_t Offset; + // The size of the mapped region. + size_t Size; + }; + + // The key is the host pointer representing an active mapping. + // The value is the information needed to maintain/undo the mapping. + std::unordered_map Mappings; + + // The size and alignment of the buffer + size_t Size; + size_t getAlignment() const; + + struct { + _ur_buffer *Parent; + size_t Origin; // only valid if Parent != nullptr + } SubBuffer; +}; + +struct _ur_image final : ur_mem_handle_t_ { + // Image constructor + _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} + + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t = nullptr) override { + ZeHandle = reinterpret_cast(ZeImage); + return UR_RESULT_SUCCESS; + } + virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t = nullptr) override { + ZeHandlePtr = reinterpret_cast(&ZeImage); + return UR_RESULT_SUCCESS; + } + + bool isImage() const override { return true; } + +#ifndef NDEBUG + // Keep the descriptor of the image (for debugging purposes) + ZeStruct ZeImageDesc; +#endif // !NDEBUG + + // Level Zero image handle. + ze_image_handle_t ZeImage; +}; + +// Implements memory allocation via L0 RT for USM allocator interface. 
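The USMMemoryAllocBase hierarchy declared next follows a template-method shape: the base class owns the public allocate()/deallocate() interface, and each memory kind (host, device, shared, shared read-only) only supplies allocateImpl(). The corresponding definitions live in the .cpp and are not part of this hunk, so the following is only a guess at the shape, with simplified stand-in types; the real code may differ in details such as how errors are reported:

#include <cstddef>
#include <cstdint>

enum class Result { Success, OutOfMemory }; // stand-in for ur_result_t

class AllocationError { // stand-in for UsmAllocationException above
public:
  explicit AllocationError(Result Err) : Error{Err} {}
  Result getError() const { return Error; }

private:
  Result Error;
};

// Template-method pattern: the public interface is fixed here, the actual
// allocation call is supplied by each derived memory type.
class MemoryAllocBase {
protected:
  virtual Result allocateImpl(void **ResultPtr, size_t Size,
                              uint32_t Alignment) = 0;

public:
  void *allocate(size_t Size, size_t Alignment) {
    void *Ptr = nullptr;
    Result Res = allocateImpl(&Ptr, Size, static_cast<uint32_t>(Alignment));
    if (Res != Result::Success)
      throw AllocationError(Res); // assumed: failures travel as exceptions
    return Ptr;
  }
  virtual ~MemoryAllocBase() = default;
};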
+class USMMemoryAllocBase : public SystemMemory { +protected: + ur_context_handle_t Context; + ur_device_handle_t Device; + // Internal allocation routine which must be implemented for each allocation + // type + virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) = 0; + +public: + USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : Context{Ctx}, Device{Dev} {} + void *allocate(size_t Size) override final; + void *allocate(size_t Size, size_t Alignment) override final; + void deallocate(void *Ptr) override final; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for shared memory type that is only modified from host. +class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for device memory type +class USMDeviceMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for host memory type +class USMHostMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMHostMemoryAlloc(ur_context_handle_t Ctx) + : USMMemoryAllocBase(Ctx, nullptr) {} +}; + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment); + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_mem_flags_t *, + size_t Size, uint32_t Alignment); + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment); + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); + +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle = true); + +bool ShouldUseUSMAllocator(); + +extern const bool UseUSMAllocator; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp deleted file mode 100644 index 22476938ac884..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp +++ /dev/null @@ -1,9 +0,0 @@ -//===--------- ur_level_zero_module.cpp - Level Zero Adapter ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-----------------------------------------------------------------===//
-
-#include "ur_level_zero_module.hpp"
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp
deleted file mode 100644
index 8ff81196df096..0000000000000
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===--------- ur_level_zero_module.hpp - Level Zero Adapter ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-----------------------------------------------------------------===//
-#pragma once
-
-#include "ur_level_zero_common.hpp"
-
-struct _ur_module_handle_t : _ur_object {
-  _ur_module_handle_t() {}
-};
-
-struct _ur_kernel_handle_t : _ur_object {
-  _ur_kernel_handle_t() {}
-};
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
index 99fab2d48dc16..1f2430274e6f4 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
@@ -7,3 +7,534 @@
 //===-----------------------------------------------------------------===//
 
 #include "ur_level_zero_platform.hpp"
+#include
+
+UR_APIEXPORT ur_result_t UR_APICALL urInit(
+    ur_device_init_flags_t
+        DeviceFlags ///< [in] device initialization flags.
+                    ///< must be 0 (default) or a combination of
+                    ///< ::ur_device_init_flag_t.
+) {
+  std::ignore = DeviceFlags;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urTearDown(
+    void *Params ///< [in] pointer to tear down parameters
+) {
+  // Reclaim pi_platform objects here since we don't have piPlatformRelease.
+  for (ur_platform_handle_t Platform : *PiPlatformsCache) {
+    delete Platform;
+  }
+  delete PiPlatformsCache;
+  delete PiPlatformsCacheMutex;
+
+  bool LeakFound = false;
+  // Print the balance of various create/destroy native calls.
+  // The idea is to verify that the numbers of create(+) and destroy(-) calls
+  // match.
+  if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) {
+    // clang-format off
+    //
+    // The format of this table is such that each row accounts for a
+    // specific type of objects, and all elements in the row except the last
+    // one are allocating objects of that type, while the last element is known
+    // to deallocate objects of that type.
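Read against the sample output further below: for each row the create counters are summed, the final destroy counter is subtracted, and any positive remainder is flagged as a leak. A tiny self-contained sketch of that bookkeeping for the command-list row (matching the "LEAK = 1" case in the sample):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Counts as they might appear in ZeCallCount after a run.
  std::map<std::string, int> Calls = {{"zeCommandListCreateImmediate", 1},
                                      {"zeCommandListCreate", 1},
                                      {"zeCommandListDestroy", 1}};
  // One row of the table: every entry except the last allocates, the last
  // one deallocates.
  std::vector<std::string> Row = {"zeCommandListCreateImmediate",
                                  "zeCommandListCreate",
                                  "zeCommandListDestroy"};
  int Diff = 0;
  for (size_t I = 0; I < Row.size(); ++I) {
    if (I + 1 == Row.size())
      Diff -= Calls[Row[I]]; // the destroy call
    else
      Diff += Calls[Row[I]]; // the create calls
  }
  if (Diff)
    std::printf("LEAK = %d\n", Diff); // prints: LEAK = 1
  return 0;
}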
+ // + std::vector> CreateDestroySet = { + {"zeContextCreate", "zeContextDestroy"}, + {"zeCommandQueueCreate", "zeCommandQueueDestroy"}, + {"zeModuleCreate", "zeModuleDestroy"}, + {"zeKernelCreate", "zeKernelDestroy"}, + {"zeEventPoolCreate", "zeEventPoolDestroy"}, + {"zeCommandListCreateImmediate", "zeCommandListCreate", "zeCommandListDestroy"}, + {"zeEventCreate", "zeEventDestroy"}, + {"zeFenceCreate", "zeFenceDestroy"}, + {"zeImageCreate", "zeImageDestroy"}, + {"zeSamplerCreate", "zeSamplerDestroy"}, + {"zeMemAllocDevice", "zeMemAllocHost", "zeMemAllocShared", "zeMemFree"}, + }; + + // A sample output aimed below is this: + // ------------------------------------------------------------------------ + // zeContextCreate = 1 \---> zeContextDestroy = 1 + // zeCommandQueueCreate = 1 \---> zeCommandQueueDestroy = 1 + // zeModuleCreate = 1 \---> zeModuleDestroy = 1 + // zeKernelCreate = 1 \---> zeKernelDestroy = 1 + // zeEventPoolCreate = 1 \---> zeEventPoolDestroy = 1 + // zeCommandListCreateImmediate = 1 | + // zeCommandListCreate = 1 \---> zeCommandListDestroy = 1 ---> LEAK = 1 + // zeEventCreate = 2 \---> zeEventDestroy = 2 + // zeFenceCreate = 1 \---> zeFenceDestroy = 1 + // zeImageCreate = 0 \---> zeImageDestroy = 0 + // zeSamplerCreate = 0 \---> zeSamplerDestroy = 0 + // zeMemAllocDevice = 0 | + // zeMemAllocHost = 1 | + // zeMemAllocShared = 0 \---> zeMemFree = 1 + // + // clang-format on + + fprintf(stderr, "ZE_DEBUG=%d: check balance of create/destroy calls\n", + UR_L0_DEBUG_CALL_COUNT); + fprintf(stderr, + "----------------------------------------------------------\n"); + for (const auto &Row : CreateDestroySet) { + int diff = 0; + for (auto I = Row.begin(); I != Row.end();) { + const char *ZeName = *I; + const auto &ZeCount = (*ZeCallCount)[*I]; + + bool First = (I == Row.begin()); + bool Last = (++I == Row.end()); + + if (Last) { + fprintf(stderr, " \\--->"); + diff -= ZeCount; + } else { + diff += ZeCount; + if (!First) { + fprintf(stderr, " | \n"); + } + } + + fprintf(stderr, "%30s = %-5d", ZeName, ZeCount); + } + + if (diff) { + LeakFound = true; + fprintf(stderr, " ---> LEAK = %d", diff); + } + fprintf(stderr, "\n"); + } + + ZeCallCount->clear(); + delete ZeCallCount; + ZeCallCount = nullptr; + } + if (LeakFound) + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet( + uint32_t NumEntries, ///< [in] the number of platforms to be added to + ///< phPlatforms. If phPlatforms is not NULL, then + ///< NumEntries should be greater than zero, otherwise + ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. + ur_platform_handle_t + *Platforms, ///< [out][optional][range(0, NumEntries)] array of handle + ///< of platforms. If NumEntries is less than the number of + ///< platforms available, then + ///< ::urPlatformGet shall only retrieve that number of + ///< platforms. + uint32_t *NumPlatforms ///< [out][optional] returns the total number of + ///< platforms available. +) { + static std::once_flag ZeCallCountInitialized; + try { + std::call_once(ZeCallCountInitialized, []() { + if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { + ZeCallCount = new std::map; + } + }); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Setting these environment variables before running zeInit will enable the + // validation layer in the Level Zero loader. 
+ if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } + + // Enable SYSMAN support for obtaining the PCI address + // and maximum memory bandwidth. + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + setEnvVar("ZES_ENABLE_SYSMAN", "1"); + } + + // TODO: We can still safely recover if something goes wrong during the init. + // Implement handling segfault using sigaction. + + // We must only initialize the driver once, even if piPlatformsGet() is called + // multiple times. Declaring the return value as "static" ensures it's only + // called once. + static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); + + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + UR_ASSERT(NumEntries != 0, UR_RESULT_ERROR_INVALID_VALUE); + if (NumPlatforms) + *NumPlatforms = 0; + return UR_RESULT_SUCCESS; + } + + if (ZeResult != ZE_RESULT_SUCCESS) { + urPrint("zeInit: Level Zero initialization failure\n"); + return ze2urResult(ZeResult); + } + + // Cache pi_platforms for reuse in the future + // It solves two problems; + // 1. sycl::platform equality issue; we always return the same pi_platform. + // 2. performance; we can save time by immediately return from cache. + // + + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + if (!PiPlatformCachePopulated) { + try { + // Level Zero does not have concept of Platforms, but Level Zero driver is + // the closest match. + uint32_t ZeDriverCount = 0; + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); + if (ZeDriverCount == 0) { + PiPlatformCachePopulated = true; + } else { + std::vector ZeDrivers; + ZeDrivers.resize(ZeDriverCount); + + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); + for (uint32_t I = 0; I < ZeDriverCount; ++I) { + auto Platform = new ur_platform_handle_t_(ZeDrivers[I]); + // Save a copy in the cache for future uses. + PiPlatformsCache->push_back(Platform); + + UR_CALL(Platform->initialize()); + } + PiPlatformCachePopulated = true; + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + } + + // Populate returned platforms from the cache. + if (Platforms) { + UR_ASSERT(NumEntries <= PiPlatformsCache->size(), + UR_RESULT_ERROR_INVALID_PLATFORM); + std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); + } + + if (NumPlatforms) { + if (*NumPlatforms == 0) + *NumPlatforms = PiPlatformsCache->size(); + else + *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)NumEntries); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t Platform, ///< [in] handle of the platform + ur_platform_info_t ParamName, ///< [in] type of the info to retrieve + size_t Size, ///< [in] the number of bytes pointed to by pPlatformInfo. + void *ParamValue, ///< [out][optional] array of bytes holding the info. + ///< If Size is not equal to or greater to the real number + ///< of bytes needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pPlatformInfo is not used. + size_t *SizeRet ///< [out][optional] pointer to the actual number of bytes + ///< being queried by pPlatformInfo. +) { + UrReturnHelper ReturnValue(Size, ParamValue, SizeRet); + + switch (ParamName) { + case UR_PLATFORM_INFO_NAME: + // TODO: Query Level Zero driver when relevant info is added there. 
+ return ReturnValue("Intel(R) oneAPI Unified Runtime over Level-Zero"); + case UR_PLATFORM_INFO_VENDOR_NAME: + // TODO: Query Level Zero driver when relevant info is added there. + return ReturnValue("Intel(R) Corporation"); + case UR_PLATFORM_INFO_EXTENSIONS: + // Convention adopted from OpenCL: + // "Returns a space-separated list of extension names (the extension + // names themselves do not contain any spaces) supported by the platform. + // Extensions defined here must be supported by all devices associated + // with this platform." + // + // TODO: Check the common extensions supported by all connected devices and + // return them. For now, hardcoding some extensions we know are supported by + // all Level Zero devices. + return ReturnValue(ZE_SUPPORTED_EXTENSIONS); + case UR_PLATFORM_INFO_PROFILE: + // TODO: figure out what this means and how is this used + return ReturnValue("FULL_PROFILE"); + case UR_PLATFORM_INFO_VERSION: + // TODO: this should query to zeDriverGetDriverVersion + // but we don't yet have the driver handle here. + // + // From OpenCL 2.1: "This version string has the following format: + // OpenCL. Follow the same notation here. + // + return ReturnValue(Platform->ZeDriverApiVersion.c_str()); + default: + urPrint("urPlatformGetInfo: unrecognized ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t Driver, ///< [in] handle of the platform + ur_api_version_t *Version ///< [out] api version +) { + std::ignore = Driver; + std::ignore = Version; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( + ur_platform_handle_t Platform, ///< [in] handle of the platform. + ur_native_handle_t *NativePlatform ///< [out] a pointer to the native + ///< handle of the platform. +) { + // Extract the Level Zero driver handle from the given PI platform + *NativePlatform = reinterpret_cast(Platform->ZeDriver); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( + ur_native_handle_t + NativePlatform, ///< [in] the native handle of the platform. + ur_platform_handle_t *Platform ///< [out] pointer to the handle of the + ///< platform object created. +) { + auto ZeDriver = ur_cast(NativePlatform); + + uint32_t NumPlatforms = 0; + UR_CALL(urPlatformGet(0, nullptr, &NumPlatforms)); + + if (NumPlatforms) { + std::vector Platforms(NumPlatforms); + UR_CALL(urPlatformGet(NumPlatforms, Platforms.data(), nullptr)); + + // The SYCL spec requires that the set of platforms must remain fixed for + // the duration of the application's execution. We assume that we found all + // of the Level Zero drivers when we initialized the platform cache, so the + // "NativeHandle" must already be in the cache. If it is not, this must not + // be a valid Level Zero driver. + for (const ur_platform_handle_t &CachedPlatform : Platforms) { + if (CachedPlatform->ZeDriver == ZeDriver) { + *Platform = CachedPlatform; + return UR_RESULT_SUCCESS; + } + } + } + + return UR_RESULT_ERROR_INVALID_VALUE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urGetLastResult( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + const char **Message ///< [out] pointer to a string containing adapter + ///< specific result in string representation. 
+) { + std::ignore = Platform; + std::ignore = Message; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_platform_handle_t_::initialize() { + // Cache driver properties + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + // Intel Level-Zero GPU driver stores version as: + // | 31 - 24 | 23 - 16 | 15 - 0 | + // | Major | Minor | Build | + auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24); + auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16); + auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF); + ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild; + + ZE2UR_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion)); + ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." + + std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); + + // Cache driver extension properties + uint32_t Count = 0; + ZE2UR_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr)); + + std::vector ZeExtensions(Count); + + ZE2UR_CALL(zeDriverGetExtensionProperties, + (ZeDriver, &Count, ZeExtensions.data())); + + for (auto &extension : ZeExtensions) { + // Check if global offset extension is available + if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, + strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) { + if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) { + ZeDriverGlobalOffsetExtensionFound = true; + } + } + // Check if extension is available for "static linking" (compiling multiple + // SPIR-V modules together into one Level Zero module). + if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME, + strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) { + if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) { + ZeDriverModuleProgramExtensionFound = true; + } + } + zeDriverExtensionMap[extension.name] = extension.version; + } + + // Check if import user ptr into USM feature has been requested. + // If yes, then set up L0 API pointers if the platform supports it. + ZeUSMImport.setZeUSMImport(this); + + return UR_RESULT_SUCCESS; +} + +// Get the cached PI device created for the L0 device handle. +// Return NULL if no such PI device found. +ur_device_handle_t +ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { + + ur_result_t Res = populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return nullptr; + } + + // TODO: our sub-sub-device representation is currently [Level-Zero device + // handle + Level-Zero compute group/engine index], so there is now no 1:1 + // mapping from L0 device handle to PI device assumed in this function. Until + // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we + // filter out PI sub-sub-devices. + std::shared_lock Lock(PiDevicesCacheMutex); + auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(), + [&](std::unique_ptr &D) { + return D.get()->ZeDevice == ZeDevice && + (D.get()->RootDevice == nullptr || + D.get()->RootDevice->RootDevice == nullptr); + }); + if (it != PiDevicesCache.end()) { + return (*it).get(); + } + return nullptr; +} + +// Check the device cache and load it if necessary. 
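The bit layout documented in initialize() above packs the driver version as major in bits 31-24, minor in bits 23-16, and build in bits 15-0. A small worked example of the same extraction (illustration only; the input value is made up):

#include <cassert>
#include <cstdint>
#include <string>

// Decode an Intel Level Zero GPU driver version:
// | 31..24 major | 23..16 minor | 15..0 build |
std::string decodeDriverVersion(uint32_t DriverVersion) {
  auto Major = std::to_string((DriverVersion & 0xFF000000u) >> 24);
  auto Minor = std::to_string((DriverVersion & 0x00FF0000u) >> 16);
  auto Build = std::to_string(DriverVersion & 0x0000FFFFu);
  return Major + "." + Minor + "." + Build;
}

int main() {
  // 0x01030ABC -> major 1, minor 3, build 0x0ABC = 2748.
  assert(decodeDriverVersion(0x01030ABCu) == "1.3.2748");
  return 0;
}

populateDeviceCacheIfNeeded(), announced above, follows.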
+ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { + std::scoped_lock Lock(PiDevicesCacheMutex); + + if (DeviceCachePopulated) { + return UR_RESULT_SUCCESS; + } + + uint32_t ZeDeviceCount = 0; + ZE2UR_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr)); + + try { + std::vector ZeDevices(ZeDeviceCount); + ZE2UR_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data())); + + for (uint32_t I = 0; I < ZeDeviceCount; ++I) { + std::unique_ptr Device( + new ur_device_handle_t_(ZeDevices[I], (ur_platform_handle_t)this)); + UR_CALL(Device->initialize()); + + // Additionally we need to cache all sub-devices too, such that they + // are readily visible to the piextDeviceCreateWithNativeHandle. + // + uint32_t SubDevicesCount = 0; + ZE2UR_CALL(zeDeviceGetSubDevices, + (Device->ZeDevice, &SubDevicesCount, nullptr)); + + auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount]; + ZE2UR_CALL(zeDeviceGetSubDevices, + (Device->ZeDevice, &SubDevicesCount, ZeSubdevices)); + + // Wrap the Level Zero sub-devices into PI sub-devices, and add them to + // cache. + for (uint32_t I = 0; I < SubDevicesCount; ++I) { + std::unique_ptr UrSubDevice( + new ur_device_handle_t_(ZeSubdevices[I], (ur_platform_handle_t)this, + Device.get())); + auto Result = UrSubDevice->initialize(); + if (Result != UR_RESULT_SUCCESS) { + delete[] ZeSubdevices; + return Result; + } + + // collect all the ordinals for the sub-sub-devices + std::vector Ordinals; + + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (UrSubDevice->ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + std::vector QueueGroupProperties( + numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (UrSubDevice->ZeDevice, &numQueueGroups, + QueueGroupProperties.data())); + + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE && + QueueGroupProperties[i].numQueues > 1) { + Ordinals.push_back(i); + } + } + + // If isn't PVC, then submissions to different CCS can be executed on + // the same EUs still, so we cannot treat them as sub-sub-devices. + if (UrSubDevice->isPVC() || ExposeCSliceInAffinityPartitioning) { + // Create PI sub-sub-devices with the sub-device for all the ordinals. + // Each {ordinal, index} points to a specific CCS which constructs + // a sub-sub-device at this point. + // + // FIXME: Level Zero creates multiple PiDevices for a single physical + // device when sub-device is partitioned into sub-sub-devices. + // Sub-sub-device is technically a command queue and we should not + // build program for each command queue. PiDevice is probably not the + // right abstraction for a Level Zero command queue. + for (uint32_t J = 0; J < Ordinals.size(); ++J) { + for (uint32_t K = 0; + K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { + std::unique_ptr PiSubSubDevice( + new ur_device_handle_t_(ZeSubdevices[I], + (ur_platform_handle_t)this, + UrSubDevice.get())); + UR_CALL(PiSubSubDevice->initialize(Ordinals[J], K)); + + // save pointers to sub-sub-devices for quick retrieval in the + // future. + UrSubDevice->SubDevices.push_back(PiSubSubDevice.get()); + PiDevicesCache.push_back(std::move(PiSubSubDevice)); + } + } + } + + // save pointers to sub-devices for quick retrieval in the future. 
+ Device->SubDevices.push_back(UrSubDevice.get()); + PiDevicesCache.push_back(std::move(UrSubDevice)); + } + delete[] ZeSubdevices; + + // Save the root device in the cache for future uses. + PiDevicesCache.push_back(std::move(Device)); + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + DeviceCachePopulated = true; + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp index 40f5b961b8df0..2894de7139619 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp @@ -8,3 +8,47 @@ #pragma once #include "ur_level_zero_common.hpp" + +struct ur_device_handle_t_; + +struct ur_platform_handle_t_ : public _ur_platform { + ur_platform_handle_t_(ze_driver_handle_t Driver) : ZeDriver{Driver} {} + // Performs initialization of a newly constructed PI platform. + ur_result_t initialize(); + + // Level Zero lacks the notion of a platform, but there is a driver, which is + // a pretty good fit to keep here. + ze_driver_handle_t ZeDriver; + + // Cache versions info from zeDriverGetProperties. + std::string ZeDriverVersion; + std::string ZeDriverApiVersion; + ze_api_version_t ZeApiVersion; + + // Cache driver extensions + std::unordered_map zeDriverExtensionMap; + + // Flags to tell whether various Level Zero platform extensions are available. + bool ZeDriverGlobalOffsetExtensionFound{false}; + bool ZeDriverModuleProgramExtensionFound{false}; + + // Cache UR devices for reuse + std::vector> PiDevicesCache; + ur_shared_mutex PiDevicesCacheMutex; + bool DeviceCachePopulated = false; + + // Check the device cache and load it if necessary. + ur_result_t populateDeviceCacheIfNeeded(); + + // Return the PI device from cache that represents given native device. + // If not found, then nullptr is returned. + ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); + + // Keep track of all contexts in the platform. This is needed to manage + // a lifetime of memory allocations in each context when there are kernels + // with indirect access. + // TODO: should be deleted when memory isolation in the context is implemented + // in the driver. + std::list Contexts; + ur_shared_mutex ContextsMutex; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index ff45091ce6795..f9e32aa395084 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -7,3 +7,761 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_program.hpp" +#include + +extern "C" { +// Check to see if a Level Zero module has any unresolved symbols. +// +// @param ZeModule The module handle to check. +// @param ZeBuildLog If there are unresolved symbols, this build log handle is +// modified to receive information telling which symbols +// are unresolved. +// +// @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved +// symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other +// value indicates there was an error and we cannot tell if symbols are +// resolved. 
+static ze_result_t +checkUnresolvedSymbols(ze_module_handle_t ZeModule, + ze_module_build_log_handle_t *ZeBuildLog) { + + // First check to see if the module has any imported symbols. If there are + // no imported symbols, it's not possible to have any unresolved symbols. We + // do this check first because we assume it's faster than the call to + // zeModuleDynamicLink below. + ZeStruct ZeModuleProps; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps)); + if (ZeResult != ZE_RESULT_SUCCESS) + return ZeResult; + + // If there are imported symbols, attempt to "link" the module with itself. + // As a side effect, this will return the error + // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols. + if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) { + return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog)); + } + return ZE_RESULT_SUCCESS; +} +} // extern "C" + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( + ur_context_handle_t Context, ///< [in] handle of the context instance + const void *IL, ///< [in] pointer to IL binary. + size_t Length, ///< [in] length of `pIL` in bytes. + const ur_program_properties_t + *Properties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *Program ///< [out] pointer to handle of program object created. +) { + try { + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::IL, Context, IL, Length); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t Context, ///< [in] handle of the context instance + ur_device_handle_t + Device, ///< [in] handle to device associated with binary. + size_t Size, ///< [in] size in bytes. + const uint8_t *Binary, ///< [in] pointer to binary. + const ur_program_properties_t + *Properties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *Program ///< [out] pointer to handle of Program object created. +) { + // In OpenCL, clCreateProgramWithBinary() can be used to load any of the + // following: "program executable", "compiled program", or "library of + // compiled programs". In addition, the loaded program can be either + // IL (SPIR-v) or native device code. For now, we assume that + // piProgramCreateWithBinary() is only used to load a "program executable" + // as native device code. + // If we wanted to support all the same cases as OpenCL, we would need to + // somehow examine the binary image to distinguish the cases. Alternatively, + // we could change the PI interface and have the caller pass additional + // information to distinguish the cases. + + try { + ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( + ur_program_handle_t_::Native, Context, Binary, Size); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild( + ur_context_handle_t Context, ///< [in] handle of the context instance. + ur_program_handle_t Program, ///< [in] Handle of the program to build. + const char *Options ///< [in][optional] pointer to build options + ///< null-terminated string. 
+) { + // TODO + // Check if device belongs to associated context. + // UR_ASSERT(Program->Context, UR_RESULT_ERROR_INVALID_PROGRAM); + // UR_ASSERT(Program->Context->isValidDevice(Devices[0]), + // UR_RESULT_ERROR_INVALID_VALUE); + + // We should have either IL or native device code. + UR_ASSERT(Program->Code, UR_RESULT_ERROR_INVALID_PROGRAM); + + // It is legal to build a program created from either IL or from native + // device code. + if (Program->State != ur_program_handle_t_::IL && + Program->State != ur_program_handle_t_::Native) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + std::scoped_lock Guard(Program->Mutex); + + // Ask Level Zero to build and load the native code onto the device. + ZeStruct ZeModuleDesc; + ur_program_handle_t_::SpecConstantShim Shim(Program); + ZeModuleDesc.format = (Program->State == ur_program_handle_t_::IL) + ? ZE_MODULE_FORMAT_IL_SPIRV + : ZE_MODULE_FORMAT_NATIVE; + ZeModuleDesc.inputSize = Program->CodeLength; + ZeModuleDesc.pInputModule = Program->Code.get(); + ZeModuleDesc.pBuildFlags = Options; + ZeModuleDesc.pConstants = Shim.ze(); + + ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice; + ze_context_handle_t ZeContext = Program->Context->ZeContext; + ze_module_handle_t ZeModule = nullptr; + + ur_result_t Result = UR_RESULT_SUCCESS; + Program->State = ur_program_handle_t_::Exe; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModule, &Program->ZeBuildLog)); + if (ZeResult != ZE_RESULT_SUCCESS) { + // We adjust pi_program below to avoid attempting to release zeModule when + // RT calls piProgramRelease(). + Program->State = ur_program_handle_t_::Invalid; + Result = ze2urResult(ZeResult); + if (Program->ZeBuildLog) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); + Program->ZeBuildLog = nullptr; + } + if (ZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + ZeModule = nullptr; + } + } else { + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via a + // call to zeModuleDynamicLink. However, modules created with + // piProgramBuild are supposed to be fully linked and ready to use. + // Therefore, do an extra check now for unresolved symbols. + ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog); + if (ZeResult != ZE_RESULT_SUCCESS) { + Program->State = ur_program_handle_t_::Invalid; + Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) + ? UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE + : ze2urResult(ZeResult); + if (ZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + ZeModule = nullptr; + } + } + } + + // We no longer need the IL / native code. + Program->Code.reset(); + Program->ZeModule = ZeModule; + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( + ur_context_handle_t Context, ///< [in] handle of the context instance. + ur_program_handle_t + Program, ///< [in][out] handle of the program to compile. + const char *Options ///< [in][optional] pointer to build options + ///< null-terminated string. +) { + + std::scoped_lock Guard(Program->Mutex); + + // It's only valid to compile a program created from IL (we don't support + // programs created from source code). + // + // The OpenCL spec says that the header parameters are ignored when compiling + // IL programs, so we don't validate them. + if (Program->State != ur_program_handle_t_::IL) + return UR_RESULT_ERROR_INVALID_OPERATION; + + // We don't compile anything now. 
Instead, we delay compilation until + // piProgramLink, where we do both compilation and linking as a single step. + // This produces better code because the driver can do cross-module + // optimizations. Therefore, we just remember the compilation flags, so we + // can use them later. + if (Options) + Program->BuildFlags = Options; + Program->State = ur_program_handle_t_::Object; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramLink( + ur_context_handle_t Context, ///< [in] handle of the context instance. + uint32_t Count, ///< [in] number of program handles in `phPrograms`. + const ur_program_handle_t *Programs, ///< [in][range(0, count)] pointer to + ///< array of program handles. + const char *Options, ///< [in][optional] pointer to linker options + ///< null-terminated string. + ur_program_handle_t + *Program ///< [out] pointer to handle of program object created. +) { + // TODO + // UR_ASSERT(Context->isValidDevice(Context->Devices[0]), + // UR_RESULT_ERROR_INVALID_DEVICE); + + // We do not support any link flags at this time because the Level Zero API + // does not have any way to pass flags that are specific to linking. + if (Options && *Options != '\0') { + std::string ErrorMessage( + "Level Zero does not support kernel link flags: \""); + ErrorMessage.append(Options); + ErrorMessage.push_back('\"'); + ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( + ur_program_handle_t_::Invalid, Context, ErrorMessage); + *Program = reinterpret_cast(UrProgram); + return UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; + } + + ur_result_t UrResult = UR_RESULT_SUCCESS; + try { + // Acquire a "shared" lock on each of the input programs, and also validate + // that they are all in Object state. + // + // There is no danger of deadlock here even if two threads call + // piProgramLink simultaneously with the same input programs in a different + // order. If we were acquiring these with "exclusive" access, this could + // lead to a classic lock ordering deadlock. However, there is no such + // deadlock potential with "shared" access. There could also be a deadlock + // potential if there was some other code that holds more than one of these + // locks simultaneously with "exclusive" access. However, there is no such + // code like that, so this is also not a danger. + std::vector> Guards(Count); + for (uint32_t I = 0; I < Count; I++) { + std::shared_lock Guard(Programs[I]->Mutex); + Guards[I].swap(Guard); + if (Programs[I]->State != ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + } + + // Previous calls to piProgramCompile did not actually compile the SPIR-V. + // Instead, we postpone compilation until this point, when all the modules + // are linked together. By doing compilation and linking together, the JIT + // compiler is able see all modules and do cross-module optimizations. + // + // Construct a ze_module_program_exp_desc_t which contains information about + // all of the modules that will be linked together. 
+ ZeStruct ZeExtModuleDesc; + std::vector CodeSizes(Count); + std::vector CodeBufs(Count); + std::vector BuildFlagPtrs(Count); + std::vector SpecConstPtrs(Count); + std::vector SpecConstShims; + SpecConstShims.reserve(Count); + + for (uint32_t I = 0; I < Count; I++) { + ur_program_handle_t Program = Programs[I]; + CodeSizes[I] = Program->CodeLength; + CodeBufs[I] = Program->Code.get(); + BuildFlagPtrs[I] = Program->BuildFlags.c_str(); + SpecConstShims.emplace_back(Program); + SpecConstPtrs[I] = SpecConstShims[I].ze(); + } + + ZeExtModuleDesc.count = Count; + ZeExtModuleDesc.inputSizes = CodeSizes.data(); + ZeExtModuleDesc.pInputModules = CodeBufs.data(); + ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); + ZeExtModuleDesc.pConstants = SpecConstPtrs.data(); + + ZeStruct ZeModuleDesc; + ZeModuleDesc.pNext = &ZeExtModuleDesc; + ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; + + // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1", + // the driver does validation of the API calls, and it expects + // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This + // validation is wrong when using the "ze_module_program_exp_desc_t" + // extension because those fields are supposed to be ignored. As a + // workaround, set both fields to 1. + // + // TODO: Remove this workaround when the driver is fixed. + ZeModuleDesc.pInputModule = reinterpret_cast(1); + ZeModuleDesc.inputSize = 1; + + // We need a Level Zero extension to compile multiple programs together into + // a single Level Zero module. However, we don't need that extension if + // there happens to be only one input program. + // + // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the + // Level Zero driver. The driver's "ze_module_program_exp_desc_t" + // extension should work even in the case when there is just one input + // module. However, there is currently a bug in the driver that leads to a + // crash. As a workaround, do not use the extension when there is one + // input module. + // + // TODO: Remove this workaround when the driver is fixed. + if (!Context->Devices[0]->Platform->ZeDriverModuleProgramExtensionFound || + (Count == 1)) { + if (Count == 1) { + ZeModuleDesc.pNext = nullptr; + ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0]; + ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0]; + ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0]; + ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0]; + } else { + urPrint("urProgramLink: level_zero driver does not have static linking " + "support."); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + + // Call the Level Zero API to compile, link, and create the module. + ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice; + ze_context_handle_t ZeContext = Context->ZeContext; + ze_module_handle_t ZeModule = nullptr; + ze_module_build_log_handle_t ZeBuildLog = nullptr; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModule, &ZeBuildLog)); + + // We still create a ur_program_handle_t_ object even if there is a + // BUILD_FAILURE because we need the object to hold the ZeBuildLog. There + // is no build log created for other errors, so we don't create an object. 
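When zeModuleCreate or the follow-up link check fails, the ZeBuildLog held by the program object is the main diagnostic, and it is surfaced through urProgramGetBuildInfo (defined later in this patch). A hedged caller-side sketch; the size-query-then-fetch pattern is an assumption modeled on the usual UR/OpenCL convention, and the <ur_api.h> header name is likewise assumed:

#include <string>
#include <ur_api.h>

// Fetch the build/link log for Program on Device after a failed build
// (illustration only; error handling elided).
std::string getBuildLog(ur_program_handle_t Program,
                        ur_device_handle_t Device) {
  size_t LogSize = 0;
  // First call: query the required size.
  urProgramGetBuildInfo(Program, Device, UR_PROGRAM_BUILD_INFO_LOG, 0, nullptr,
                        &LogSize);
  std::string Log(LogSize, '\0');
  // Second call: fetch the log text itself.
  urProgramGetBuildInfo(Program, Device, UR_PROGRAM_BUILD_INFO_LOG, LogSize,
                        Log.data(), nullptr);
  return Log;
}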
+ UrResult = ze2urResult(ZeResult); + if (ZeResult != ZE_RESULT_SUCCESS && + ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { + return ze2urResult(ZeResult); + } + + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via a + // call to zeModuleDynamicLink. However, modules created with piProgramLink + // are supposed to be fully linked and ready to use. Therefore, do an extra + // check now for unresolved symbols. Note that we still create a + // ur_program_handle_t_ if there are unresolved symbols because the + // ZeBuildLog tells which symbols are unresolved. + if (ZeResult == ZE_RESULT_SUCCESS) { + ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); + if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { + UrResult = + UR_RESULT_ERROR_UNKNOWN; // TODO: + // UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + + ur_program_handle_t_::state State = (UrResult == UR_RESULT_SUCCESS) + ? ur_program_handle_t_::Exe + : ur_program_handle_t_::Invalid; + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(State, Context, ZeModule, ZeBuildLog); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UrResult; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain( + ur_program_handle_t Program ///< [in] handle for the Program to retain +) { + Program->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramRelease( + ur_program_handle_t Program ///< [in] handle for the Program to release +) { + if (!Program->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete Program; + + return UR_RESULT_SUCCESS; +} + +// Function gets characters between delimeter's in str +// then checks if they are equal to the sub_str. +// returns true if there is at least one instance +// returns false if there are no instances of the name +static bool is_in_separated_string(const std::string &str, char delimiter, + const std::string &sub_str) { + size_t beg = 0; + size_t length = 0; + for (const auto &x : str) { + if (x == delimiter) { + if (str.substr(beg, length) == sub_str) + return true; + + beg += length + 1; + length = 0; + continue; + } + length++; + } + if (length != 0) + if (str.substr(beg, length) == sub_str) + return true; + + return false; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t + Device, ///< [in] handle of the device to retrieve pointer for. + ur_program_handle_t + Program, ///< [in] handle of the program to search for function in. + ///< The program must already be built to the specified + ///< device, or otherwise + ///< ::UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE is returned. + const char *FunctionName, ///< [in] A null-terminates string denoting the + ///< mangled function name. + void **FunctionPointerRet ///< [out] Returns the pointer to the function if + ///< it is found in the program. 
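The helper above does an exact token match between delimiters, so a substring of a token does not count as a hit. The same semantics expressed with std::getline, for a quick mental model (illustration only, not the adapter's code):

#include <cassert>
#include <sstream>
#include <string>

// Equivalent formulation of is_in_separated_string() above.
static bool containsToken(const std::string &Str, char Delimiter,
                          const std::string &Token) {
  std::istringstream Stream(Str);
  std::string Current;
  while (std::getline(Stream, Current, Delimiter))
    if (Current == Token)
      return true;
  return false;
}

int main() {
  assert(containsToken("foo;bar;baz", ';', "bar"));  // full token: match
  assert(!containsToken("foo;bar;baz", ';', "ba"));  // substring only: no match
  assert(containsToken("foo;bar;baz", ';', "baz"));  // last token, no ';'
  return 0;
}

The body of urProgramGetFunctionPointer continues below.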
+) { + std::ignore = Device; + + std::shared_lock Guard(Program->Mutex); + if (Program->State != ur_program_handle_t_::Exe) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } + + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleGetFunctionPointer, + (Program->ZeModule, FunctionName, FunctionPointerRet)); + + // zeModuleGetFunctionPointer currently fails for all + // kernels regardless of if the kernel exist or not + // with ZE_RESULT_ERROR_INVALID_ARGUMENT + // TODO: remove when this is no longer the case + // If zeModuleGetFunctionPointer returns invalid argument, + // fallback to searching through kernel list and return + // PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE if the function exists + // or PI_ERROR_INVALID_KERNEL_NAME if the function does not exist. + // FunctionPointerRet should always be 0 + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + size_t Size; + *FunctionPointerRet = 0; + UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, + &Size)); + + std::string ClResult(Size, ' '); + UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, + ClResult.size(), &ClResult[0], nullptr)); + + // Get rid of the null terminator and search for kernel_name + // If function can be found return error code to indicate it + // exists + ClResult.pop_back(); + if (is_in_separated_string(ClResult, ';', std::string(FunctionName))) + return UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + + return UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } + + if (ZeResult == ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) { + *FunctionPointerRet = 0; + return UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } + + return ze2urResult(ZeResult); +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( + ur_program_handle_t Program, ///< [in] handle of the Program object + ur_program_info_t PropName, ///< [in] name of the Program property to query + size_t PropSize, ///< [in] the size of the Program property. + void *ProgramInfo, ///< [in,out][optional] array of bytes of holding the + ///< program info property. If propSize is not equal to + ///< or greater than the real number of bytes needed to + ///< return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned + ///< and pProgramInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data copied to propName. +) { + UrReturnHelper ReturnValue(PropSize, ProgramInfo, PropSizeRet); + + switch (PropName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Program->RefCount.load()}); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(Program->Context); + case UR_PROGRAM_INFO_NUM_DEVICES: + // TODO: return true number of devices this program exists for. + return ReturnValue(uint32_t{1}); + case UR_PROGRAM_INFO_DEVICES: + // TODO: return all devices this program exists for. + return ReturnValue(Program->Context->Devices[0]); + case UR_PROGRAM_INFO_BINARY_SIZES: { + std::shared_lock Guard(Program->Mutex); + size_t SzBinary; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + SzBinary = Program->CodeLength; + } else if (Program->State == ur_program_handle_t_::Exe) { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, nullptr)); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + // This is an array of 1 element, initialized as if it were scalar. 
+ return ReturnValue(size_t{SzBinary}); + } + case UR_PROGRAM_INFO_BINARIES: { + // The caller sets "ParamValue" to an array of pointers, one for each + // device. Since Level Zero supports only one device, there is only one + // pointer. If the pointer is NULL, we don't do anything. Otherwise, we + // copy the program's binary image to the buffer at that pointer. + uint8_t **PBinary = ur_cast(ProgramInfo); + if (!PBinary[0]) + break; + + std::shared_lock Guard(Program->Mutex); + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + } else if (Program->State == ur_program_handle_t_::Exe) { + size_t SzBinary = 0; + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, PBinary[0])); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + break; + } + case UR_PROGRAM_INFO_NUM_KERNELS: { + std::shared_lock Guard(Program->Mutex); + uint32_t NumKernels; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } else if (Program->State == ur_program_handle_t_::Exe) { + NumKernels = 0; + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &NumKernels, nullptr)); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return ReturnValue(size_t{NumKernels}); + } + case UR_PROGRAM_INFO_KERNEL_NAMES: + try { + std::shared_lock Guard(Program->Mutex); + std::string PINames{""}; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } else if (Program->State == ur_program_handle_t_::Exe) { + uint32_t Count = 0; + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, nullptr)); + std::unique_ptr PNames(new const char *[Count]); + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, PNames.get())); + for (uint32_t I = 0; I < Count; ++I) { + PINames += (I > 0 ? ";" : ""); + PINames += PNames[I]; + } + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return ReturnValue(PINames.c_str()); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + default: + die("urProgramGetInfo: not implemented"); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( + ur_program_handle_t Program, ///< [in] handle of the Program object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_program_build_info_t + PropName, ///< [in] name of the Program build info to query + size_t PropSize, ///< [in] size of the Program build info property. + void *PropValue, ///< [in,out][optional] value of the Program build + ///< property. If propSize is not equal to or greater than + ///< the real number of bytes needed to return the info + ///< then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pKernelInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. 
+) { + std::ignore = Device; + + std::shared_lock Guard(Program->Mutex); + UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); + if (PropName == UR_PROGRAM_BUILD_INFO_BINARY_TYPE) { + ur_program_binary_type_t Type = UR_PROGRAM_BINARY_TYPE_NONE; + if (Program->State == ur_program_handle_t_::Object) { + Type = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; + } else if (Program->State == ur_program_handle_t_::Exe) { + Type = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; + } + return ReturnValue(ur_program_binary_type_t{Type}); + } + if (PropName == UR_PROGRAM_BUILD_INFO_OPTIONS) { + // TODO: how to get module build options out of Level Zero? + // For the programs that we compiled we can remember the options + // passed with piProgramCompile/piProgramBuild, but what can we + // return for programs that were built outside and registered + // with piProgramRegister? + return ReturnValue(""); + } else if (PropName == UR_PROGRAM_BUILD_INFO_LOG) { + // Check first to see if the plugin code recorded an error message. + if (!Program->ErrorMessage.empty()) { + return ReturnValue(Program->ErrorMessage.c_str()); + } + + // Next check if there is a Level Zero build log. + if (Program->ZeBuildLog) { + size_t LogSize = PropSize; + ZE2UR_CALL(zeModuleBuildLogGetString, + (Program->ZeBuildLog, &LogSize, ur_cast(PropValue))); + if (PropSizeRet) { + *PropSizeRet = LogSize; + } + return UR_RESULT_SUCCESS; + } + + // Otherwise, there is no error. The OpenCL spec says to return an empty + // string if there ws no previous attempt to compile, build, or link the + // program. + return ReturnValue(""); + } else { + urPrint("urProgramGetBuildInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstant( + ur_program_handle_t Program, ///< [in] handle of the Program object + uint32_t SpecId, ///< [in] specification constant Id + size_t SpecSize, ///< [in] size of the specialization constant value + const void *SpecValue ///< [in] pointer to the specialization value bytes +) { + std::ignore = Program; + std::ignore = SpecId; + std::ignore = SpecSize; + std::ignore = SpecValue; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t Program, ///< [in] handle of the program. + ur_native_handle_t *NativeProgram ///< [out] a pointer to the native + ///< handle of the program. +) { + auto ZeModule = ur_cast(NativeProgram); + + std::shared_lock Guard(Program->Mutex); + switch (Program->State) { + case ur_program_handle_t_::Exe: { + *ZeModule = Program->ZeModule; + break; + } + + default: + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t + NativeProgram, ///< [in] the native handle of the program. + ur_context_handle_t Context, ///< [in] handle of the context instance + ur_program_handle_t *Program ///< [out] pointer to the handle of the + ///< program object created. +) { + auto ZeModule = ur_cast(NativeProgram); + + // We assume here that programs created from a native handle always + // represent a fully linked executable (state Exe) and not an unlinked + // executable (state Object). 
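Mirroring the platform interop entry points earlier in this patch, the pair urProgramGetNativeHandle / urProgramCreateWithNativeHandle lets the underlying ze_module_handle_t be exported and wrapped again. A hedged usage sketch; the <ur_api.h> header name is an assumption, and per the comment above the re-created program is treated as a fully linked executable:

#include <ur_api.h>

// Export the Level Zero module behind Program and wrap it in a new UR
// program object in Context (illustration only).
ur_result_t rewrapProgram(ur_context_handle_t Context,
                          ur_program_handle_t Program,
                          ur_program_handle_t *Clone) {
  ur_native_handle_t Native{};
  // Only programs in the Exe state expose their module this way.
  ur_result_t Res = urProgramGetNativeHandle(Program, &Native);
  if (Res != UR_RESULT_SUCCESS)
    return Res;
  // The new handle wraps the same ze_module_handle_t and, per the
  // constructor used here, also takes ownership of it.
  return urProgramCreateWithNativeHandle(Native, Context, Clone);
}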
+ + try { + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::Exe, Context, ZeModule); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +ur_program_handle_t_::~ur_program_handle_t_() { + // According to Level Zero Specification, all kernels and build logs + // must be destroyed before the Module can be destroyed. So, be sure + // to destroy build log before destroying the module. + // printf("ZeBuildLog %lx\n", (unsigned long int)ZeBuildLog); + if (ZeBuildLog) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); + } + + // printf("ZeModule %lx OwnZeModule %d\n", (unsigned long int)ZeModule, + // OwnZeModule); + if (ZeModule && OwnZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t Program, ///< [in] handle of the Program object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in][range(0, count)] array of specialization + ///< constant value descriptions +) { + std::scoped_lock Guard(Program->Mutex); + + // Remember the value of this specialization constant until the program is + // built. Note that we only save the pointer to the buffer that contains the + // value. The caller is responsible for maintaining storage for this buffer. + // + // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by + // SpecID. + for (uint32_t SpecIt = 0; SpecIt < Count; SpecIt++) { + uint32_t SpecId = SpecConstants[SpecIt].id; + Program->SpecConstants[SpecId] = SpecConstants[SpecIt].pValue; + } + return UR_RESULT_SUCCESS; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp index 9a2f9604f08c5..35cd9fe93ae1d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp @@ -9,6 +9,125 @@ #include "ur_level_zero_common.hpp" -struct _ur_program_handle_t : _ur_object { - _ur_program_handle_t() {} +struct ur_program_handle_t_ : _ur_object { + // ur_program_handle_t_() {} + + typedef enum { + // The program has been created from intermediate language (SPIR-V), but it + // is not yet compiled. + IL, + + // The program has been created by loading native code, but it has not yet + // been built. This is equivalent to an OpenCL "program executable" that + // is loaded via clCreateProgramWithBinary(). + Native, + + // The program was notionally compiled from SPIR-V form. However, since we + // postpone compilation until the module is linked, the internal state + // still represents the module as SPIR-V. + Object, + + // The program has been built or linked, and it is represented as a Level + // Zero module. + Exe, + + // An error occurred during piProgramLink, but we created a _pi_program + // object anyways in order to hold the ZeBuildLog. Note that the ZeModule + // may or may not be nullptr in this state, depending on the error. + Invalid + } state; + + // A utility class that converts specialization constants into the form + // required by the Level Zero driver. 
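+  // [Illustrative sketch, not part of this patch] The shim is intended to be
+  // instantiated right before module creation so that the id/value arrays it
+  // owns outlive the zeModuleCreate() call, along the lines of:
+  //
+  //   ur_program_handle_t_::SpecConstantShim Shim(Program);
+  //   ze_module_desc_t ZeModuleDesc{ZE_STRUCTURE_TYPE_MODULE_DESC};
+  //   ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
+  //   ZeModuleDesc.pInputModule = Program->Code.get();
+  //   ZeModuleDesc.inputSize = Program->CodeLength;
+  //   ZeModuleDesc.pConstants = Shim.ze();
+  //   zeModuleCreate(ZeContext, ZeDevice, &ZeModuleDesc, &ZeModule,
+  //                  &ZeBuildLog);
+  //
+  // which mirrors how the build/compile paths are expected to consume the
+  // SpecConstants map stored by urProgramSetSpecializationConstants.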
+ class SpecConstantShim { + public: + SpecConstantShim(ur_program_handle_t_ *Program) { + ZeSpecConstants.numConstants = Program->SpecConstants.size(); + ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants); + ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants); + + for (auto &SpecConstant : Program->SpecConstants) { + ZeSpecContantsIds.push_back(SpecConstant.first); + ZeSpecContantsValues.push_back(SpecConstant.second); + } + ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data(); + ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data(); + } + + const ze_module_constants_t *ze() { return &ZeSpecConstants; } + + private: + std::vector ZeSpecContantsIds; + std::vector ZeSpecContantsValues; + ze_module_constants_t ZeSpecConstants; + }; + + // Construct a program in IL or Native state. + ur_program_handle_t_(state St, ur_context_handle_t Context, const void *Input, + size_t Length) + : Context{Context}, OwnZeModule{true}, State{St}, + Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, + ZeBuildLog{nullptr} { + std::memcpy(Code.get(), Input, Length); + } + + // Construct a program in Exe or Invalid state. + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule, + ze_module_build_log_handle_t ZeBuildLog) + : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, + ZeBuildLog{ZeBuildLog} {} + + // Construct a program in Exe state (interop). + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule, bool OwnZeModule) + : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, + ZeModule{ZeModule}, ZeBuildLog{nullptr} {} + + // Construct a program from native handle + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule) + : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, + ZeBuildLog{nullptr} {} + + // Construct a program in Invalid state with a custom error message. + ur_program_handle_t_(state St, ur_context_handle_t Context, + const std::string &ErrorMessage) + : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, + State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} + + ~ur_program_handle_t_(); + + const ur_context_handle_t Context; // Context of the program. + + // Indicates if we own the ZeModule or it came from interop that + // asked to not transfer the ownership to SYCL RT. + const bool OwnZeModule; + + // This error message is used only in Invalid state to hold a custom error + // message from a call to piProgramLink. + const std::string ErrorMessage; + + state State; + + // In IL and Object states, this contains the SPIR-V representation of the + // module. In Native state, it contains the native code. + std::unique_ptr Code; // Array containing raw IL / native code. + size_t CodeLength{0}; // Size (bytes) of the array. + + // Used only in IL and Object states. Contains the SPIR-V specialization + // constants as a map from the SPIR-V "SpecID" to a buffer that contains the + // associated value. The caller of the PI layer is responsible for + // maintaining the storage of this buffer. + std::unordered_map SpecConstants; + + // Used only in Object state. Contains the build flags from the last call to + // piProgramCompile(). + std::string BuildFlags; + + // The Level Zero module handle. Used primarily in Exe state. + ze_module_handle_t ZeModule{}; + + // The Level Zero build log from the last call to zeModuleCreate(). 
+ ze_module_build_log_handle_t ZeBuildLog{}; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 8838bb74269bf..e3e21eb3e98e2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -6,4 +6,1786 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" +#include + +/// @brief Cleanup events in the immediate lists of the queue. +/// @param Queue Queue where events need to be cleaned up. +/// @param QueueLocked Indicates if the queue mutex is locked by caller. +/// @param QueueSynced 'true' if queue was synchronized before the +/// call and no other commands were submitted after synchronization, 'false' +/// otherwise. +/// @param CompletedEvent Hint providing an event which was synchronized before +/// the call, in case of in-order queue it allows to cleanup all preceding +/// events. +/// @return PI_SUCCESS if successful, PI error code otherwise. +ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, + bool QueueLocked, bool QueueSynced, + ur_event_handle_t CompletedEvent) { + // Handle only immediate command lists here. + if (!UrQueue || !UrQueue->Device->ImmCommandListUsed) + return UR_RESULT_SUCCESS; + + ur_event_handle_t_ *UrCompletedEvent = + reinterpret_cast(CompletedEvent); + + std::vector EventListToCleanup; + { + std::unique_lock QueueLock(UrQueue->Mutex, + std::defer_lock); + if (!QueueLocked) + QueueLock.lock(); + // If queue is locked and fully synchronized then cleanup all events. + // If queue is not locked then by this time there may be new submitted + // commands so we can't do full cleanup. + if (QueueLocked && + (QueueSynced || (UrQueue->isInOrderQueue() && + (reinterpret_cast( + UrCompletedEvent) == UrQueue->LastCommandEvent || + !UrQueue->LastCommandEvent)))) { + UrQueue->LastCommandEvent = nullptr; + for (auto &&It = UrQueue->CommandListMap.begin(); + It != UrQueue->CommandListMap.end(); ++It) { + UR_CALL(UrQueue->resetCommandList(It, true, EventListToCleanup, + false /* CheckStatus */)); + } + } else if (UrQueue->isInOrderQueue() && UrCompletedEvent) { + // If the queue is in-order and we have information about completed event + // then cleanup all events in the command list preceding to CompletedEvent + // including itself. + + // Check that the comleted event has associated command list. + if (!(UrCompletedEvent->CommandList && + UrCompletedEvent->CommandList.value() != + UrQueue->CommandListMap.end())) + return UR_RESULT_SUCCESS; + + auto &CmdListEvents = + UrCompletedEvent->CommandList.value()->second.EventList; + auto CompletedEventIt = std::find(CmdListEvents.begin(), + CmdListEvents.end(), UrCompletedEvent); + if (CompletedEventIt != CmdListEvents.end()) { + // We can cleanup all events prior to the completed event in this + // command list and completed event itself. + // TODO: we can potentially cleanup more events here by finding + // completed events on another command lists, but it is currently not + // implemented. + std::move(std::begin(CmdListEvents), CompletedEventIt + 1, + std::back_inserter(EventListToCleanup)); + CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1); + } + } else { + // Fallback to resetCommandList over all command lists. 
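+      // [Editorial note, not part of this patch] This is the conservative
+      // path: with no full-sync guarantee and no usable completed-event hint,
+      // every command list is re-scanned and only events that Level Zero
+      // reports as complete are collected, i.e.
+      //
+      //   resetCommandList(It, true, EventListToCleanup, /*CheckStatus=*/true)
+      //
+      // as opposed to the fully-synchronized branch above, which passes
+      // CheckStatus=false and drains the event lists wholesale.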
+      for (auto &&It = UrQueue->CommandListMap.begin();
+           It != UrQueue->CommandListMap.end(); ++It) {
+        UR_CALL(UrQueue->resetCommandList(It, true, EventListToCleanup,
+                                          true /* CheckStatus */));
+      }
+    }
+  }
+  UR_CALL(CleanupEventListFromResetCmdList(EventListToCleanup, QueueLocked));
+  return UR_RESULT_SUCCESS;
+}
+
+/// @brief Reset signalled command lists in the queue and return them to the
+/// cache of command lists. Also cleanup events associated with the signalled
+/// command lists. The queue must be locked by the caller for modification.
+/// @param Queue Queue where we look for signalled command lists and cleanup
+/// events.
+/// @return UR_RESULT_SUCCESS if successful, a UR error code otherwise.
+ur_result_t resetCommandLists(ur_queue_handle_t Queue) {
+  // Handle immediate command lists here; they don't need to be reset and we
+  // only need to cleanup events.
+  if (Queue->Device->ImmCommandListUsed) {
+    UR_CALL(CleanupEventsInImmCmdLists(Queue, true /*locked*/));
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Events need to be cleaned up outside of the scope where the queue is
+  // locked, because event cleanup requires the event to be locked, and nested
+  // locks are hard to control and can cause deadlocks if mutexes are locked
+  // in different order.
+  std::vector<ur_event_handle_t> EventListToCleanup;
+
+  // We check for command lists that have already been signalled, but have not
+  // been added to the available list yet. Each command list has an associated
+  // fence which tracks if the command list has completed dispatch of its
+  // commands and is ready for reuse. If a command list is found to have been
+  // signalled, then the command list & fence are reset and the command list is
+  // returned to the command list cache. All events associated with the command
+  // list are cleaned up if the command list was reset.
+  for (auto &&it = Queue->CommandListMap.begin();
+       it != Queue->CommandListMap.end(); ++it) {
+    // Immediate commandlists don't use a fence and are handled separately
+    // above.
+    assert(it->second.ZeFence != nullptr);
+    // It is possible that the fence was already noted as signalled and
+    // reset. In that case the ZeFenceInUse flag will be false.
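+    // [Illustrative sketch, not part of this patch] The fence-based recycling
+    // protocol this loop relies on is, in outline:
+    //
+    //   zeCommandQueueExecuteCommandLists(ZeQueue, 1, &ZeCmdList, ZeFence);
+    //   ...
+    //   if (zeFenceQueryStatus(ZeFence) == ZE_RESULT_SUCCESS) {
+    //     zeFenceReset(ZeFence);          // fence becomes reusable
+    //     zeCommandListReset(ZeCmdList);  // command list becomes reusable
+    //     // return the command list to the context cache
+    //   }
+    //
+    // Queue->resetCommandList() below performs the reset and re-caching.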
+ if (it->second.ZeFenceInUse) { + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); + if (ZeResult == ZE_RESULT_SUCCESS) + UR_CALL(Queue->resetCommandList(it, true, EventListToCleanup)); + } + } + CleanupEventListFromResetCmdList(EventListToCleanup, true /*locked*/); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_queue_info_t ParamName, ///< [in] name of the queue property to query + size_t ParamValueSize, ///< [in] size in bytes of the queue property value + ///< provided + void *ParamValue, ///< [out] value of the queue property + size_t *ParamValueSizeRet ///< [out] size in bytes returned in queue + ///< property value +) { + + std::shared_lock Lock(Queue->Mutex); + UrReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)ParamName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(Queue->Context); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(Queue->Device); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Queue->RefCount.load()}); + case UR_QUEUE_INFO_PROPERTIES: + die("UR_QUEUE_INFO_PROPERTIES in urQueueGetInfo not implemented\n"); + break; + case UR_QUEUE_INFO_SIZE: + die("UR_QUEUE_INFO_SIZE in urQueueGetInfo not implemented\n"); + break; + case UR_QUEUE_INFO_DEVICE_DEFAULT: + die("UR_QUEUE_INFO_DEVICE_DEFAULT in urQueueGetInfo not implemented\n"); + break; + case UR_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + // We can exit early if we have in-order queue. + if (Queue->isInOrderQueue()) { + if (!Queue->LastCommandEvent) + return ReturnValue(true); + + // We can check status of the event only if it isn't discarded otherwise + // it may be reset (because we are free to reuse such events) and + // zeEventQueryStatus will hang. + // TODO: use more robust way to check that ZeEvent is not owned by + // LastCommandEvent. + if (!Queue->LastCommandEvent->IsDiscarded) { + ze_result_t ZeResult = ZE_CALL_NOCHECK( + zeEventQueryStatus, (Queue->LastCommandEvent->ZeEvent)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + return ReturnValue(true); + } + // For immediate command lists we have to check status of the event + // because immediate command lists are not associated with level zero + // queue. Conservatively return false in this case because last event is + // discarded and we can't check its status. + if (Queue->Device->ImmCommandListUsed) + return ReturnValue(false); + } + + // If we have any open command list which is not empty then return false + // because it means that there are commands which are not even submitted for + // execution yet. + using IsCopy = bool; + if (Queue->hasOpenCommandList(IsCopy{true}) || + Queue->hasOpenCommandList(IsCopy{false})) + return ReturnValue(false); + + for (const auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) { + for (const auto &QueueGroup : QueueMap) { + if (Queue->Device->ImmCommandListUsed) { + // Immediate command lists are not associated with any Level Zero + // queue, that's why we have to check status of events in each + // immediate command list. Start checking from the end and exit early + // if some event is not completed. 
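+        // [Editorial note, not part of this patch] The per-event check below
+        // is the usual Level Zero polling idiom:
+        //
+        //   ze_result_t R = zeEventQueryStatus(Event->ZeEvent);
+        //   // ZE_RESULT_NOT_READY -> still pending, so the queue is not empty
+        //   // ZE_RESULT_SUCCESS   -> this event has signalled
+        //   // anything else       -> propagated as a UR error
+        //
+        // Iterating from crbegin() visits the most recently submitted (least
+        // likely completed) events first, so the loop can bail out early.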
+ for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) { + if (ImmCmdList == Queue->CommandListMap.end()) + continue; + + auto EventList = ImmCmdList->second.EventList; + for (auto It = EventList.crbegin(); It != EventList.crend(); It++) { + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + } + } else { + for (const auto &ZeQueue : QueueGroup.second.ZeQueues) { + if (!ZeQueue) + continue; + // Provide 0 as the timeout parameter to immediately get the status + // of the Level Zero queue. + ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize, + (ZeQueue, /* timeout */ 0)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + } + } + } + return ReturnValue(true); + } + default: + urPrint("Unsupported ParamName in urQueueGetInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +// Controls if we should choose doing eager initialization +// to make it happen on warmup paths and have the reportable +// paths be less likely affected. +// +static bool doEagerInit = [] { + const char *EagerInit = std::getenv("SYCL_EAGER_INIT"); + return EagerInit ? std::atoi(EagerInit) != 0 : false; +}(); + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_queue_property_t + *pProps, ///< [in] specifies a list of queue properties and their + ///< corresponding values. Each property name is immediately + ///< followed by the corresponding desired value. The list is + ///< terminated with a 0. If a property value is not specified, + ///< then its default value will be used. + ur_queue_handle_t + *phQueue ///< [out] pointer to handle of queue object created +) { + ur_context_handle_t Context = hContext; + ur_device_handle_t Device = hDevice; + ur_queue_handle_t_ **Queue = reinterpret_cast(phQueue); + + Context->Devices[0] = Device; + + const pi_queue_properties *Properties = + reinterpret_cast(pProps); + pi_queue_properties Flags = Properties[1]; + + auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX + ? static_cast(Properties[3]) + : -1; // Use default/round-robin. + + UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); + + // Create placeholder queues in the compute queue group. + // Actual L0 queues will be created at first use. + std::vector ZeComputeCommandQueues( + Device->QueueGroup[ur_queue_handle_t_::queue_type::Compute] + .ZeProperties.numQueues, + nullptr); + + // Create placeholder queues in the copy queue group (main and link + // native groups are combined into one group). + // Actual L0 queues will be created at first use. 
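+  // [Illustrative sketch, not part of this patch] "Created at first use"
+  // means the ze_command_queue_handle_t slots stay nullptr until a command
+  // is actually submitted, at which point something like
+  //
+  //   ze_command_queue_desc_t ZeCommandQueueDesc = {};
+  //   ZeCommandQueueDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+  //   ZeCommandQueueDesc.ordinal = QueueGroupOrdinal;
+  //   ZeCommandQueueDesc.index = QueueIndex;
+  //   zeCommandQueueCreate(Context->ZeContext, Device->ZeDevice,
+  //                        &ZeCommandQueueDesc, &ZeQueue);
+  //
+  // fills in the slot (see pi_queue_group_t::getZeQueue used elsewhere in
+  // this file). The copy-engine placeholders set up below follow the same
+  // pattern.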
+ size_t NumCopyGroups = 0; + if (Device->hasMainCopyEngine()) { + NumCopyGroups += + Device->QueueGroup[ur_queue_handle_t_::queue_type::MainCopy] + .ZeProperties.numQueues; + } + if (Device->hasLinkCopyEngine()) { + NumCopyGroups += + Device->QueueGroup[ur_queue_handle_t_::queue_type::LinkCopy] + .ZeProperties.numQueues; + } + std::vector ZeCopyCommandQueues(NumCopyGroups, + nullptr); + + try { + *Queue = + new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Flags, ForceComputeIndex); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Do eager initialization of Level Zero handles on request. + if (doEagerInit) { + ur_queue_handle_t Q = *phQueue; + // Creates said number of command-lists. + auto warmupQueueGroup = [Q](bool UseCopyEngine, + uint32_t RepeatCount) -> ur_result_t { + ur_command_list_ptr_t CommandList; + while (RepeatCount--) { + if (Q->Device->ImmCommandListUsed) { + CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); + } else { + // Heuristically create some number of regular command-list to reuse. + for (int I = 0; I < 10; ++I) { + UR_CALL(Q->createCommandList(UseCopyEngine, CommandList)); + // Immediately return them to the cache of available command-lists. + std::vector EventsUnused; + UR_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */, + EventsUnused)); + } + } + } + return UR_RESULT_SUCCESS; + }; + // Create as many command-lists as there are queues in the group. + // With this the underlying round-robin logic would initialize all + // native queues, and create command-lists and their fences. + // At this point only the thread creating the queue will have associated + // command-lists. Other threads have not accessed the queue yet. So we can + // only warmup the initial thread's command-lists. + auto QueueGroup = Q->ComputeQueueGroupsByTID.get(); + UR_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex - + QueueGroup.LowerIndex + 1)); + if (Q->useCopyEngine()) { + auto QueueGroup = Q->CopyQueueGroupsByTID.get(); + UR_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex - + QueueGroup.LowerIndex + 1)); + } + // TODO: warmup event pools. Both host-visible and device-only. + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain( + ur_queue_handle_t Queue ///< [in] handle of the queue object to get access +) { + { + std::scoped_lock Lock(Queue->Mutex); + Queue->RefCountExternal++; + } + Queue->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( + ur_queue_handle_t Queue ///< [in] handle of the queue object to release +) { + + std::vector EventListToCleanup; + { + std::scoped_lock Lock(Queue->Mutex); + + if ((--Queue->RefCountExternal) != 0) + return UR_RESULT_SUCCESS; + + // When external reference count goes to zero it is still possible + // that internal references still exists, e.g. command-lists that + // are not yet completed. So do full queue synchronization here + // and perform proper cleanup. + // + // It is possible to get to here and still have an open command list + // if no wait or finish ever occurred for this queue. + if (auto Res = Queue->executeAllOpenCommandLists()) + return Res; + + // Make sure all commands get executed. + Queue->synchronize(); + + // Destroy all the fences created associated with this queue. 
+ for (auto it = Queue->CommandListMap.begin(); + it != Queue->CommandListMap.end(); ++it) { + // This fence wasn't yet signalled when we polled it for recycling + // the command-list, so need to release the command-list too. + // For immediate commandlists we don't need to do an L0 reset of the + // commandlist but do need to do event cleanup which is also in the + // resetCommandList function. + // If the fence is a nullptr we are using immediate commandlists, + // otherwise regular commandlists which use a fence. + if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) { + Queue->resetCommandList(it, true, EventListToCleanup); + } + // TODO: remove "if" when the problem is fixed in the level zero + // runtime. Destroy only if a queue is healthy. Destroying a fence may + // cause a hang otherwise. + // If the fence is a nullptr we are using immediate commandlists. + if (Queue->Healthy && it->second.ZeFence != nullptr) { + auto ZeResult = ZE_CALL_NOCHECK(zeFenceDestroy, (it->second.ZeFence)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + Queue->CommandListMap.clear(); + } + + for (auto &Event : EventListToCleanup) { + // We don't need to synchronize the events since the queue + // synchronized above already does that. + { + std::scoped_lock EventLock(Event->Mutex); + Event->Completed = true; + } + UR_CALL(CleanupCompletedEvent(Event)); + // This event was removed from the command list, so decrement ref count + // (it was incremented when they were added to the command list). + UR_CALL(urEventReleaseInternal(reinterpret_cast(Event))); + } + UR_CALL(urQueueReleaseInternal(reinterpret_cast(Queue))); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( + ur_queue_handle_t Queue, ///< [in] handle of the queue. + ur_native_handle_t + *NativeQueue ///< [out] a pointer to the native handle of the queue. +) { + // Lock automatically releases when this goes out of scope. + std::shared_lock lock(Queue->Mutex); + + auto ZeQueue = ur_cast(NativeQueue); + + // Extract a Level Zero compute queue handle from the given PI queue + auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); + uint32_t QueueGroupOrdinalUnused; + *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_queue_handle_t + *RetQueue ///< [out] pointer to the handle of the queue object created. +) { + auto ZeQueue = ur_cast(NativeQueue); + // Assume this is the "0" index queue in the compute command-group. + std::vector ZeQueues{ZeQueue}; + + // TODO: see what we can do to correctly initialize PI queue for + // compute vs. copy Level-Zero queue. Currently we will send + // all commands to the "ZeQueue". + std::vector ZeroCopyQueues; + + // Get the device handle from first device in the platform + // Maybe this is not completely correct. 
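+  // [Illustrative sketch, not part of this patch] The interop round trip that
+  // exercises this entry point looks roughly like:
+  //
+  //   ur_native_handle_t Native{};
+  //   urQueueGetNativeHandle(UrQueue, &Native); // a ze_command_queue_handle_t
+  //   ur_queue_handle_t Wrapped{};
+  //   urQueueCreateWithNativeHandle(Native, UrContext, &Wrapped);
+  //
+  // Only the raw Level Zero queue handle is available here, so the device is
+  // recovered by taking the first GPU of the first platform below, which is
+  // why the comment above flags this as possibly incorrect on multi-device
+  // systems.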
+ uint32_t NumEntries = 1; + ur_platform_handle_t Platform{}; + UR_CALL(urPlatformGet(NumEntries, &Platform, nullptr)); + + ur_device_handle_t Device; + UR_CALL( + urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &Device, nullptr)); + + try { + ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, + Context, Device, false); + *RetQueue = reinterpret_cast(Queue); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( + ur_queue_handle_t hQueue ///< [in] handle of the queue to be finished. +) { + // _pi_queue *PiQueue = reinterpret_cast<_pi_queue *>(Queue); + // ur_queue_handle_t UrQueue = PiQueue->UrQueue; + ur_queue_handle_t_ *UrQueue = reinterpret_cast(hQueue); + + if (UrQueue->Device->ImmCommandListUsed) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(UrQueue->Mutex); + + UrQueue->synchronize(); + } else { + std::unique_lock Lock(UrQueue->Mutex); + std::vector ZeQueues; + + // execute any command list that may still be open. + UR_CALL(UrQueue->executeAllOpenCommandLists()); + + // Make a copy of queues to sync and release the lock. + for (auto &QueueMap : + {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + std::copy(QueueGroup.second.ZeQueues.begin(), + QueueGroup.second.ZeQueues.end(), + std::back_inserter(ZeQueues)); + + // Remember the last command's event. + auto LastCommandEvent = UrQueue->LastCommandEvent; + + // Don't hold a lock to the queue's mutex while waiting. + // This allows continue working with the queue from other threads. + // TODO: this currently exhibits some issues in the driver, so + // we control this with an env var. Remove this control when + // we settle one way or the other. + static bool HoldLock = + std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr; + if (!HoldLock) { + Lock.unlock(); + } + + for (auto &ZeQueue : ZeQueues) { + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } + + // Prevent unneeded already finished events to show up in the wait list. + // We can only do so if nothing else was submitted to the queue + // while we were synchronizing it. + if (!HoldLock) { + std::scoped_lock Lock(UrQueue->Mutex); + if (LastCommandEvent == UrQueue->LastCommandEvent) { + UrQueue->LastCommandEvent = nullptr; + } + } else { + UrQueue->LastCommandEvent = nullptr; + } + } + // Reset signalled command lists and return them back to the cache of + // available command lists. Events in the immediate command lists are cleaned + // up in synchronize(). + if (!UrQueue->Device->ImmCommandListUsed) { + std::unique_lock Lock(UrQueue->Mutex); + resetCommandLists(UrQueue); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush( + ur_queue_handle_t Queue ///< [in] handle of the queue to be flushed. +) { + // Flushing cross-queue dependencies is covered by + // createAndRetainUrZeEventList, so this can be left as a no-op. + std::ignore = Queue; + return UR_RESULT_SUCCESS; +} + +// Configuration of the command-list batching. +struct zeCommandListBatchConfig { + // Default value of 0. This specifies to use dynamic batch size adjustment. + // Other values will try to collect specified amount of commands. + uint32_t Size{0}; + + // If doing dynamic batching, specifies start batch size. 
+  uint32_t DynamicSizeStart{4};
+
+  // The maximum size for dynamic batch.
+  uint32_t DynamicSizeMax{64};
+
+  // The step size for dynamic batch increases.
+  uint32_t DynamicSizeStep{1};
+
+  // Thresholds for when to increase the batch size (number of closed early is
+  // small and number of closed full is high).
+  uint32_t NumTimesClosedEarlyThreshold{3};
+  uint32_t NumTimesClosedFullThreshold{8};
+
+  // Tells the starting size of a batch.
+  uint32_t startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
+  // Tells if we are doing dynamic batch size adjustment.
+  bool dynamic() const { return Size == 0; }
+};
+
+// Helper function to initialize the static variables that hold batch config
+// info for compute and copy command batching.
+static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
+  zeCommandListBatchConfig Config{}; // default initialize
+
+  // Default value of 0. This specifies to use dynamic batch size adjustment.
+  const auto BatchSizeStr =
+      (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
+               : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
+  if (BatchSizeStr) {
+    pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
+    // Level Zero may only support a limited number of commands per command
+    // list. The actual upper limit is not specified by the Level Zero
+    // Specification. For now we allow an arbitrary upper limit.
+    if (BatchSizeStrVal > 0) {
+      Config.Size = BatchSizeStrVal;
+    } else if (BatchSizeStrVal == 0) {
+      Config.Size = 0;
+      // We are requested to do dynamic batching. Collect specifics, if any.
+      // The extended format supported is ":" separated values.
+      //
+      // NOTE: these extra settings are experimental and are intended to
+      // be used only for finding a better default heuristic.
+      //
+      std::string BatchConfig(BatchSizeStr);
+      size_t Ord = 0;
+      size_t Pos = 0;
+      while (true) {
+        if (++Ord > 5)
+          break;
+
+        Pos = BatchConfig.find(":", Pos);
+        if (Pos == std::string::npos)
+          break;
+        ++Pos; // past the ":"
+
+        uint32_t Val;
+        try {
+          Val = std::stoi(BatchConfig.substr(Pos));
+        } catch (...) {
+          if (IsCopy)
+            urPrint(
+                "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
+          else
+            urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
+          break;
+        }
+        switch (Ord) {
+        case 1:
+          Config.DynamicSizeStart = Val;
+          break;
+        case 2:
+          Config.DynamicSizeMax = Val;
+          break;
+        case 3:
+          Config.DynamicSizeStep = Val;
+          break;
+        case 4:
+          Config.NumTimesClosedEarlyThreshold = Val;
+          break;
+        case 5:
+          Config.NumTimesClosedFullThreshold = Val;
+          break;
+        default:
+          die("Unexpected batch config");
+        }
+        if (IsCopy)
+          urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
+                  "#%d: %d\n",
+                  (int)Ord, (int)Val);
+        else
+          urPrint(
+              "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
+              (int)Ord, (int)Val);
+      }
+
+    } else {
+      // Negative batch sizes are silently ignored.
+      if (IsCopy)
+        urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
+      else
+        urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
+    }
+  }
+  return Config;
+}
+
+// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
+// which case all compute commands will be submitted to the command-queue
+// with the given index in the compute command group. If it is instead set
+// to negative then all available compute engines may be used.
+//
+// The default value is "0".
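+// [Editorial note, not part of this patch] Example settings and their effect
+// as implemented below:
+//
+//   SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE=2    -> range (2, 2): CCS 2 only
+//   SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE=-1   -> range (0, INT_MAX): allow
+//                                                 round-robin over all CCSs
+//   (unset)                                    -> range (0, 0): CCS 0 only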
+// +static const std::pair getRangeOfAllowedComputeEngines() { + static const char *EnvVar = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + // If the environment variable is not set only use "0" CCS for now. + // TODO: allow all CCSs when HW support is complete. + if (!EnvVar) + return std::pair(0, 0); + + auto EnvVarValue = std::atoi(EnvVar); + if (EnvVarValue >= 0) { + return std::pair(EnvVarValue, EnvVarValue); + } + + return std::pair(0, INT_MAX); +} + +// Static variable that holds batch config info for compute command batching. +static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{false}); +}(); + +// Static variable that holds batch config info for copy command batching. +static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{true}); +}(); + +ur_queue_handle_t_::ur_queue_handle_t_( + std::vector &ComputeQueues, + std::vector &CopyQueues, + ur_context_handle_t Context, ur_device_handle_t Device, + bool OwnZeCommandQueue, pi_queue_properties Properties, + int ForceComputeIndex) + : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, + Properties(Properties) { + // Compute group initialization. + // First, see if the queue's device allows for round-robin or it is + // fixed to one particular compute CCS (it is so for sub-sub-devices). + auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute]; + pi_queue_group_t ComputeQueueGroup{reinterpret_cast(this), + queue_type::Compute}; + ComputeQueueGroup.ZeQueues = ComputeQueues; + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + if (Device->ImmCommandListUsed) { + ComputeQueueGroup.ImmCmdLists = std::vector( + ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + if (ComputeQueueGroupInfo.ZeIndex >= 0) { + // Sub-sub-device + + // sycl::ext::intel::property::queue::compute_index works with any + // backend/device by allowing single zero index if multiple compute CCSes + // are not supported. Sub-sub-device falls into the same bucket. + assert(ForceComputeIndex <= 0); + ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex; + ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex; + ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex; + } else if (ForceComputeIndex >= 0) { + ComputeQueueGroup.LowerIndex = ForceComputeIndex; + ComputeQueueGroup.UpperIndex = ForceComputeIndex; + ComputeQueueGroup.NextIndex = ForceComputeIndex; + } else { + // Set-up to round-robin across allowed range of engines. + uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first; + uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second; + FilterUpperIndex = std::min((size_t)FilterUpperIndex, + FilterLowerIndex + ComputeQueues.size() - 1); + if (FilterLowerIndex <= FilterUpperIndex) { + ComputeQueueGroup.LowerIndex = FilterLowerIndex; + ComputeQueueGroup.UpperIndex = FilterUpperIndex; + ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex; + } else { + die("No compute queue available/allowed."); + } + } + if (Device->ImmCommandListUsed) { + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + ComputeQueueGroup.ImmCmdLists = std::vector( + ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + + ComputeQueueGroupsByTID.set(ComputeQueueGroup); + + // Copy group initialization. 
+ pi_queue_group_t CopyQueueGroup{reinterpret_cast(this), + queue_type::MainCopy}; + const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); + if (Range.first < 0 || Range.second < 0) { + // We are asked not to use copy engines, just do nothing. + // Leave CopyQueueGroup.ZeQueues empty, and it won't be used. + } else { + uint32_t FilterLowerIndex = Range.first; + uint32_t FilterUpperIndex = Range.second; + FilterUpperIndex = std::min((size_t)FilterUpperIndex, + FilterLowerIndex + CopyQueues.size() - 1); + if (FilterLowerIndex <= FilterUpperIndex) { + CopyQueueGroup.ZeQueues = CopyQueues; + CopyQueueGroup.LowerIndex = FilterLowerIndex; + CopyQueueGroup.UpperIndex = FilterUpperIndex; + CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex; + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + if (Device->ImmCommandListUsed) { + CopyQueueGroup.ImmCmdLists = std::vector( + CopyQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + } + } + CopyQueueGroupsByTID.set(CopyQueueGroup); + + // Initialize compute/copy command batches. + ComputeCommandBatch.OpenCommandList = CommandListMap.end(); + CopyCommandBatch.OpenCommandList = CommandListMap.end(); + ComputeCommandBatch.QueueBatchSize = + ZeCommandListBatchComputeConfig.startSize(); + CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); +} + +void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + uint32_t &QueueBatchSize = CommandBatch.QueueBatchSize; + // QueueBatchSize of 0 means never allow batching. + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) + return; + CommandBatch.NumTimesClosedFull += 1; + + // If the number of times the list has been closed early is low, and + // the number of times it has been closed full is high, then raise + // the batching size slowly. Don't raise it if it is already pretty + // high. + if (CommandBatch.NumTimesClosedEarly <= + ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold && + CommandBatch.NumTimesClosedFull > + ZeCommandListBatchConfig.NumTimesClosedFullThreshold) { + if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) { + QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep; + urPrint("Raising QueueBatchSize to %d\n", QueueBatchSize); + } + CommandBatch.NumTimesClosedEarly = 0; + CommandBatch.NumTimesClosedFull = 0; + } +} + +void ur_queue_handle_t_::adjustBatchSizeForPartialBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + uint32_t &QueueBatchSize = CommandBatch.QueueBatchSize; + // QueueBatchSize of 0 means never allow batching. + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) + return; + CommandBatch.NumTimesClosedEarly += 1; + + // If we are closing early more than about 3x the number of times + // it is closing full, lower the batch size to the value of the + // current open command list. This is trying to quickly get to a + // batch size that will be able to be closed full at least once + // in a while. 
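+  // [Worked example, not part of this patch] With NumTimesClosedFull == 2 the
+  // early-close counter must exceed (2 + 1) * 3 == 9 before the size is
+  // lowered; if the open command list holds 5 commands at that point,
+  // QueueBatchSize becomes 5 - 1 == 4 (clamped to at least 1) and both
+  // counters are reset.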
+  if (CommandBatch.NumTimesClosedEarly >
+      (CommandBatch.NumTimesClosedFull + 1) * 3) {
+    QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
+    if (QueueBatchSize < 1)
+      QueueBatchSize = 1;
+    urPrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
+    CommandBatch.NumTimesClosedEarly = 0;
+    CommandBatch.NumTimesClosedFull = 0;
+  }
+}
+
+ur_result_t
+ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
+                                       bool IsBlocking, bool OKToBatchCommand) {
+  // Do nothing if command list is already closed.
+  if (CommandList->second.IsClosed)
+    return UR_RESULT_SUCCESS;
+
+  bool UseCopyEngine =
+      CommandList->second.isCopy(reinterpret_cast<ur_queue_handle_t>(this));
+
+  // If the current LastCommandEvent is nullptr, then it means either that no
+  // command has ever been issued to the queue, or that the LastCommandEvent
+  // has been signalled and therefore this Queue is idle.
+  //
+  // NOTE: this behavior adds some flakiness to the batching
+  // since last command's event may or may not be completed by the
+  // time we get here depending on timings and system/gpu load.
+  // So, disable it for modes where we print PI traces. Printing
+  // traces incurs much different timings than real execution
+  // anyway, and many regression tests use it.
+  //
+  bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr;
+
+  // The list can be empty if the command-list only contains signals of proxy
+  // events. It is possible that executeCommandList is called twice for the
+  // same command list without a new appended command. We don't want to
+  // process the same last command event twice; that's why we additionally
+  // check that a new command was appended to the command list.
+  if (!CommandList->second.EventList.empty() &&
+      this->LastCommandEvent != CommandList->second.EventList.back()) {
+    this->LastCommandEvent = CommandList->second.EventList.back();
+    if (doReuseDiscardedEvents()) {
+      UR_CALL(resetDiscardedEvent(CommandList));
+    }
+  }
+
+  this->LastUsedCommandList = CommandList;
+
+  if (!Device->ImmCommandListUsed) {
+    // Batch if allowed to, but don't batch if we know there are no kernels
+    // from this queue that are currently executing. This is intended to get
+    // kernels started as soon as possible when there are no kernels from this
+    // queue awaiting execution, while allowing batching to occur when there
+    // are kernels already executing. Also, if we are using fixed size
+    // batching, as indicated by !ZeCommandListBatch.dynamic(), then just
+    // ignore CurrentlyEmpty as we want to strictly follow the batching the
+    // user specified.
+    auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
+    auto &ZeCommandListBatchConfig = UseCopyEngine
+                                         ?
ZeCommandListBatchCopyConfig + : ZeCommandListBatchComputeConfig; + if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) && + (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) { + + if (hasOpenCommandList(UseCopyEngine) && + CommandBatch.OpenCommandList != CommandList) + die("executeCommandList: OpenCommandList should be equal to" + "null or CommandList"); + + if (CommandList->second.size() < CommandBatch.QueueBatchSize) { + CommandBatch.OpenCommandList = CommandList; + return UR_RESULT_SUCCESS; + } + + adjustBatchSizeForFullBatch(UseCopyEngine); + CommandBatch.OpenCommandList = CommandListMap.end(); + } + } + + auto &ZeCommandQueue = CommandList->second.ZeQueue; + // Scope of the lock must be till the end of the function, otherwise new mem + // allocs can be created between the moment when we made a snapshot and the + // moment when command list is closed and executed. But mutex is locked only + // if indirect access tracking enabled, because std::defer_lock is used. + // unique_lock destructor at the end of the function will unlock the mutex + // if it was locked (which happens only if IndirectAccessTrackingEnabled is + // true). + std::unique_lock ContextsLock( + Device->Platform->ContextsMutex, std::defer_lock); + + if (IndirectAccessTrackingEnabled) { + // We are going to submit kernels for execution. If indirect access flag is + // set for a kernel then we need to make a snapshot of existing memory + // allocations in all contexts in the platform. We need to lock the mutex + // guarding the list of contexts in the platform to prevent creation of new + // memory alocations in any context before we submit the kernel for + // execution. + ContextsLock.lock(); + CaptureIndirectAccesses(); + } + + if (!Device->ImmCommandListUsed) { + // In this mode all inner-batch events have device visibility only, + // and we want the last command in the batch to signal a host-visible + // event that anybody waiting for any event in the batch will + // really be using. + // We need to create a proxy host-visible event only if the list of events + // in the command list is not empty, otherwise we are going to just create + // and remove proxy event right away and dereference deleted object + // afterwards. + if (Device->ZeEventsScope == LastCommandInBatchHostVisible && + !CommandList->second.EventList.empty()) { + // If there are only internal events in the command list then we don't + // need to create host proxy event. + auto Result = std::find_if( + CommandList->second.EventList.begin(), + CommandList->second.EventList.end(), + [](ur_event_handle_t E) { return E->hasExternalRefs(); }); + if (Result != CommandList->second.EventList.end()) { + // Create a "proxy" host-visible event. + // + ur_event_handle_t HostVisibleEvent; + auto Res = createEventAndAssociateQueue( + reinterpret_cast(this), &HostVisibleEvent, + UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ true); + if (Res) + return Res; + + // Update each command's event in the command-list to "see" this + // proxy event as a host-visible counterpart. + for (auto &Event : CommandList->second.EventList) { + std::scoped_lock EventLock(Event->Mutex); + // Internal event doesn't need host-visible proxy. 
+ if (!Event->hasExternalRefs()) + continue; + + if (!Event->HostVisibleEvent) { + Event->HostVisibleEvent = + reinterpret_cast(HostVisibleEvent); + HostVisibleEvent->RefCount.increment(); + } + } + + // Decrement the reference count of the event such that all the + // remaining references are from the other commands in this batch and + // from the command-list itself. This host-visible event will not be + // waited/released by SYCL RT, so it must be destroyed after all events + // in the batch are gone. We know that refcount is more than 2 because + // we check that EventList of the command list is not empty above, i.e. + // after createEventAndAssociateQueue ref count is 2 and then +1 for + // each event in the EventList. + UR_CALL(urEventReleaseInternal(HostVisibleEvent)); + + if (doReuseDiscardedEvents()) { + // If we have in-order queue with discarded events then we want to + // treat this event as regular event. We insert a barrier in the next + // command list to wait for this event. + LastCommandEvent = HostVisibleEvent; + } else { + // For all other queues treat this as a special event and indicate no + // cleanup is needed. + // TODO: always treat this host event as a regular event. + UR_CALL(urEventReleaseInternal(HostVisibleEvent)); + HostVisibleEvent->CleanedUp = true; + } + + // Finally set to signal the host-visible event at the end of the + // command-list after a barrier that waits for all commands + // completion. + if (doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded) { + // If we the last event is discarded then we already have a barrier + // inserted, so just signal the event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + } else { + ZE2UR_CALL( + zeCommandListAppendBarrier, + (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); + } + } else { + // If we don't have host visible proxy then signal event if needed. + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); + } + } else { + // If we don't have host visible proxy then signal event if needed. + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); + } + + // Close the command list and have it ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (CommandList->first)); + // Mark this command list as closed. + CommandList->second.IsClosed = true; + this->LastUsedCommandList = CommandListMap.end(); + // Offload command list to the GPU for asynchronous execution + auto ZeCommandList = CommandList->first; + auto ZeResult = ZE_CALL_NOCHECK( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); + if (ZeResult != ZE_RESULT_SUCCESS) { + this->Healthy = false; + if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) { + // Turn into a more informative end-user error. + return UR_RESULT_ERROR_UNKNOWN; + } + return ze2urResult(ZeResult); + } + } + + // Check global control to make every command blocking for debugging. + if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { + if (Device->ImmCommandListUsed) { + synchronize(); + } else { + // Wait until command lists attached to the command queue are executed. 
+ ZE2UR_CALL(zeHostSynchronize, (ZeCommandQueue)); + } + } + return UR_RESULT_SUCCESS; +} + +bool ur_queue_handle_t_::doReuseDiscardedEvents() { + return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); +} + +ur_result_t +ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { + if (LastCommandEvent && LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + ZE2UR_CALL(zeCommandListAppendEventReset, + (CommandList->first, LastCommandEvent->ZeEvent)); + + // Create new pi_event but with the same ze_event_handle_t. We are going + // to use this pi_event for the next command with discarded event. + ur_event_handle_t_ *PiEvent; + try { + PiEvent = new ur_event_handle_t_( + LastCommandEvent->ZeEvent, LastCommandEvent->ZeEventPool, + reinterpret_cast(Context), + UR_EXT_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (LastCommandEvent->isHostVisible()) + PiEvent->HostVisibleEvent = reinterpret_cast(PiEvent); + + UR_CALL(addEventToQueueCache(reinterpret_cast(PiEvent))); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { + auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; + Cache->emplace_back(Event); + return UR_RESULT_SUCCESS; +} + +void ur_queue_handle_t_::active_barriers::add(ur_event_handle_t &Event) { + Event->RefCount.increment(); + Events.push_back(Event); +} + +ur_result_t ur_queue_handle_t_::active_barriers::clear() { + for (const auto &Event : Events) + UR_CALL(urEventReleaseInternal(Event)); + Events.clear(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + if (!UrQueue->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + for (auto &Cache : UrQueue->EventCaches) + for (auto &Event : Cache) + UR_CALL(urEventReleaseInternal(Event)); + + if (UrQueue->OwnZeCommandQueue) { + for (auto &QueueMap : + {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) { + auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + + urPrint("urQueueRelease(compute) NumTimesClosedFull %d, " + "NumTimesClosedEarly %d\n", + UrQueue->ComputeCommandBatch.NumTimesClosedFull, + UrQueue->ComputeCommandBatch.NumTimesClosedEarly); + urPrint("urQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly " + "%d\n", + UrQueue->CopyCommandBatch.NumTimesClosedFull, + UrQueue->CopyCommandBatch.NumTimesClosedEarly); + + delete UrQueue; + + return UR_RESULT_SUCCESS; +} + +bool ur_queue_handle_t_::isBatchingAllowed(bool IsCopy) const { + auto &CommandBatch = IsCopy ? 
CopyCommandBatch : ComputeCommandBatch; + return (CommandBatch.QueueBatchSize > 0 && + ((UrL0Serialize & UrL0SerializeBlock) == 0)); +} + +bool ur_queue_handle_t_::isDiscardEvents() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); +} + +bool ur_queue_handle_t_::isPriorityLow() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); +} + +bool ur_queue_handle_t_::isPriorityHigh() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); +} + +bool ur_queue_handle_t_::isInOrderQueue() const { + // If out-of-order queue property is not set, then this is a in-order queue. + return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == + 0); +} + +// Helper function to perform the necessary cleanup of the events from reset cmd +// list. +ur_result_t CleanupEventListFromResetCmdList( + std::vector &EventListToCleanup, bool QueueLocked) { + for (auto &Event : EventListToCleanup) { + // We don't need to synchronize the events since the fence associated with + // the command list was synchronized. + { + std::scoped_lock EventLock(Event->Mutex); + Event->Completed = true; + } + UR_CALL(CleanupCompletedEvent(Event, QueueLocked)); + // This event was removed from the command list, so decrement ref count + // (it was incremented when they were added to the command list). + UR_CALL(urEventReleaseInternal(Event)); + } + return UR_RESULT_SUCCESS; +} + +// Wait on all operations in flight on this Queue. +// The caller is expected to hold a lock on the Queue. +// For standard commandlists sync the L0 queues directly. +// For immediate commandlists add barriers to all commandlists associated +// with the Queue. An alternative approach would be to wait on all Events +// associated with the in-flight operations. +// TODO: Event release in immediate commandlist mode is driven by the SYCL +// runtime. Need to investigate whether relase can be done earlier, at sync +// points such as this, to reduce total number of active Events. +ur_result_t ur_queue_handle_t_::synchronize() { + if (!Healthy) + return UR_RESULT_SUCCESS; + + auto syncImmCmdList = [](ur_queue_handle_t_ *Queue, + ur_command_list_ptr_t ImmCmdList) { + if (ImmCmdList == Queue->CommandListMap.end()) + return UR_RESULT_SUCCESS; + + ur_event_handle_t Event{}; + ur_result_t Res = createEventAndAssociateQueue( + reinterpret_cast(Queue), &Event, + UR_EXT_COMMAND_TYPE_USER, ImmCmdList, /* IsInternal */ false); + if (Res != UR_RESULT_SUCCESS) + return Res; + auto zeEvent = Event->ZeEvent; + ZE2UR_CALL(zeCommandListAppendBarrier, + (ImmCmdList->first, zeEvent, 0, nullptr)); + ZE2UR_CALL(zeHostSynchronize, (zeEvent)); + Event->Completed = true; + UR_CALL(urEventRelease(Event)); + // Cleanup all events from the synced command list. + auto EventListToCleanup = std::move(ImmCmdList->second.EventList); + ImmCmdList->second.EventList.clear(); + CleanupEventListFromResetCmdList(EventListToCleanup, true); + return UR_RESULT_SUCCESS; + }; + + for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } + } + LastCommandEvent = nullptr; + + // With the entire queue synchronized, the active barriers must be done so we + // can remove them. 
+ if (auto Res = ActiveBarriers.clear()) + return Res; + + return UR_RESULT_SUCCESS; +} + +ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { + auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; + + // If we don't have any events, return nullptr. + // If we have only a single event then it was used by the last command and we + // can't use it now because we have to enforce round robin between two events. + if (Cache->size() < 2) + return nullptr; + + // If there are two events then return an event from the beginning of the list + // since event of the last command is added to the end of the list. + auto It = Cache->begin(); + ur_event_handle_t RetEvent = *It; + Cache->erase(It); + return RetEvent; +} + +// This helper function creates a pi_event and associate a pi_queue. +// Note that the caller of this function must have acquired lock on the Queue +// that is passed in. +// \param Queue pi_queue to associate with a new event. +// \param Event a pointer to hold the newly created pi_event +// \param CommandType various command type determined by the caller +// \param CommandList is the command list where the event is added +// \param IsInternal tells if the event is internal, i.e. visible in the L0 +// plugin only. +// \param HostVisible tells if the event must be created in the +// host-visible pool. If not set then this function will decide. +ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, + ur_event_handle_t *Event, + ur_command_t CommandType, + ur_command_list_ptr_t CommandList, + bool IsInternal, + std::optional HostVisible) { + + if (!HostVisible.has_value()) { + // Internal/discarded events do not need host-scope visibility. + HostVisible = + IsInternal ? false : Queue->Device->ZeEventsScope == AllHostVisible; + } + + // If event is discarded then try to get event from the queue cache. + *Event = + IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; + + if (*Event == nullptr) + UR_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); + + (*Event)->UrQueue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->IsDiscarded = IsInternal; + (*Event)->CommandList = CommandList; + // Discarded event doesn't own ze_event, it is used by multiple pi_event + // objects. We destroy corresponding ze_event by releasing events from the + // events cache at queue destruction. Event in the cache owns the Level Zero + // event. + if (IsInternal) + (*Event)->OwnNativeHandle = false; + + // Append this Event to the CommandList, if any + if (CommandList != Queue->CommandListMap.end()) { + CommandList->second.append(*Event); + (*Event)->RefCount.increment(); + } + + // We need to increment the reference counter here to avoid pi_queue + // being released before the associated pi_event is released because + // piEventRelease requires access to the associated pi_queue. + // In piEventRelease, the reference counter of the Queue is decremented + // to release it. + Queue->RefCount.increment(); + + // SYCL RT does not track completion of the events, so it could + // release a PI event as soon as that's not being waited in the app. + // But we have to ensure that the event is not destroyed before + // it is really signalled, so retain it explicitly here and + // release in CleanupCompletedEvent(Event). + // If the event is internal then don't increment the reference count as this + // event will not be waited/released by SYCL RT, so it must be destroyed by + // EventRelease in resetCommandList. 
+ if (!IsInternal) + UR_CALL(urEventRetain(*Event)); + + return UR_RESULT_SUCCESS; +} + +void ur_queue_handle_t_::CaptureIndirectAccesses() { + for (auto &Kernel : KernelsToBeSubmitted) { + if (!Kernel->hasIndirectAccess()) + continue; + + auto &Contexts = Device->Platform->Contexts; + for (auto &Ctx : Contexts) { + for (auto &Elem : Ctx->MemAllocs) { + const auto &Pair = Kernel->MemAllocs.insert(&Elem); + // Kernel is referencing this memory allocation from now. + // If this memory allocation was already captured for this kernel, it + // means that kernel is submitted several times. Increase reference + // count only once because we release all allocations only when + // SubmissionsCount turns to 0. We don't want to know how many times + // allocation was retained by each submission. + if (Pair.second) + Elem.second.RefCount.increment(); + } + } + Kernel->SubmissionsCount++; + } + KernelsToBeSubmitted.clear(); +} + +ur_result_t ur_queue_handle_t_::signalEventFromCmdListIfLastEventDiscarded( + ur_command_list_ptr_t CommandList) { + // We signal new event at the end of command list only if we have queue with + // discard_events property and the last command event is discarded. + if (!(doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded)) + return UR_RESULT_SUCCESS; + + // NOTE: We create this "glue" event not as internal so it is not + // participating in the discarded events reset/reuse logic, but + // with no host-visibility since it is not going to be waited + // from the host. + ur_event_handle_t Event; + UR_CALL(createEventAndAssociateQueue( + reinterpret_cast(this), &Event, + UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ false)); + UR_CALL(urEventReleaseInternal(Event)); + LastCommandEvent = Event; + + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, Event->ZeEvent)); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::executeOpenCommandList(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + // If there are any commands still in the open command list for this + // queue, then close and execute that command list now. + if (hasOpenCommandList(IsCopy)) { + adjustBatchSizeForPartialBatch(IsCopy); + auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false); + CommandBatch.OpenCommandList = CommandListMap.end(); + return Res; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::resetCommandList( + ur_command_list_ptr_t CommandList, bool MakeAvailable, + std::vector &EventListToCleanup, bool CheckStatus) { + bool UseCopyEngine = CommandList->second.isCopy(this); + + // Immediate commandlists do not have an associated fence. + if (CommandList->second.ZeFence != nullptr) { + // Fence had been signalled meaning the associated command-list completed. + // Reset the fence and put the command list into a cache for reuse in PI + // calls. + ZE2UR_CALL(zeFenceReset, (CommandList->second.ZeFence)); + ZE2UR_CALL(zeCommandListReset, (CommandList->first)); + CommandList->second.ZeFenceInUse = false; + CommandList->second.IsClosed = false; + } + + auto &EventList = CommandList->second.EventList; + // Check if standard commandlist or fully synced in-order queue. + // If one of those conditions is met then we are sure that all events are + // completed so we don't need to check event status. 
+ if (!CheckStatus || CommandList->second.ZeFence != nullptr || + (isInOrderQueue() && !LastCommandEvent)) { + // Remember all the events in this command list which needs to be + // released/cleaned up and clear event list associated with command list. + std::move(std::begin(EventList), std::end(EventList), + std::back_inserter(EventListToCleanup)); + EventList.clear(); + } else if (!isDiscardEvents()) { + // For immediate commandlist reset only those events that have signalled. + // If events in the queue are discarded then we can't check their status. + for (auto it = EventList.begin(); it != EventList.end();) { + std::scoped_lock EventLock((*it)->Mutex); + ze_result_t ZeResult = + (*it)->Completed + ? ZE_RESULT_SUCCESS + : ZE_CALL_NOCHECK(zeEventQueryStatus, ((*it)->ZeEvent)); + // Break early as soon as we found first incomplete event because next + // events are submitted even later. We are not trying to find all + // completed events here because it may be costly. I.e. we are checking + // only elements which are most likely completed because they were + // submitted earlier. It is guaranteed that all events will be eventually + // cleaned up at queue sync/release. + if (ZeResult == ZE_RESULT_NOT_READY) + break; + + if (ZeResult != ZE_RESULT_SUCCESS) + return ze2urResult(ZeResult); + + EventListToCleanup.push_back(std::move((*it))); + it = EventList.erase(it); + } + } + + // Standard commandlists move in and out of the cache as they are recycled. + // Immediate commandlists are always available. + if (CommandList->second.ZeFence != nullptr && MakeAvailable) { + std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex); + auto &ZeCommandListCache = + UseCopyEngine + ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] + : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; + ZeCommandListCache.push_back(CommandList->first); + } + + return UR_RESULT_SUCCESS; +} + +bool pi_command_list_info_t::isCopy(ur_queue_handle_t Queue) const { + return ZeQueueGroupOrdinal != + (uint32_t)Queue->Device + ->QueueGroup + [ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal; +} + +ur_command_list_ptr_t +ur_queue_handle_t_::eventOpenCommandList(ur_event_handle_t Event) { + using IsCopy = bool; + + if (Device->ImmCommandListUsed) { + // When using immediate commandlists there are no open command lists. + return CommandListMap.end(); + } + + if (hasOpenCommandList(IsCopy{false})) { + const auto &ComputeEventList = + ComputeCommandBatch.OpenCommandList->second.EventList; + if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) != + ComputeEventList.end()) + return ComputeCommandBatch.OpenCommandList; + } + if (hasOpenCommandList(IsCopy{true})) { + const auto &CopyEventList = + CopyCommandBatch.OpenCommandList->second.EventList; + if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) != + CopyEventList.end()) + return CopyCommandBatch.OpenCommandList; + } + return CommandListMap.end(); +} + +ur_queue_handle_t_::pi_queue_group_t & +ur_queue_handle_t_::getQueueGroup(bool UseCopyEngine) { + auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID); + return Map.get(); +} + +// Return the index of the next queue to use based on a +// round robin strategy and the queue group ordinal. 
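getQueueIndex below advances a round-robin cursor through the inclusive [LowerIndex, UpperIndex] range before mapping it to a Level Zero ordinal/index pair. A minimal sketch of just the cursor update (hypothetical helper, illustrative only):

#include <cstdint>

// Advance a round-robin cursor within an inclusive [Lower, Upper] range.
uint32_t advanceRoundRobin(uint32_t Current, uint32_t Lower, uint32_t Upper) {
  return (Current >= Upper) ? Lower : Current + 1;
}
// With Lower = 0 and Upper = 2, repeated calls starting from 0 visit 1, 2, 0, 1, ...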
+uint32_t ur_queue_handle_t_::pi_queue_group_t::getQueueIndex( + uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, bool QueryOnly) { + auto CurrentIndex = NextIndex; + + if (!QueryOnly) { + ++NextIndex; + if (NextIndex > UpperIndex) + NextIndex = LowerIndex; + } + + // Find out the right queue group ordinal (first queue might be "main" or + // "link") + auto QueueType = Type; + if (QueueType != queue_type::Compute) + QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine()) + ? queue_type::MainCopy + : queue_type::LinkCopy; + + *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal; + // Adjust the index to the L0 queue group since we represent "main" and + // "link" + // L0 groups with a single copy group ("main" would take "0" index). + auto ZeCommandQueueIndex = CurrentIndex; + if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) { + ZeCommandQueueIndex -= 1; + } + *QueueIndex = ZeCommandQueueIndex; + + return CurrentIndex; +} + +// This function will return one of possibly multiple available native +// queues and the value of the queue group ordinal. +ze_command_queue_handle_t & +ur_queue_handle_t_::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { + + // QueueIndex is the proper L0 index. + // Index is the plugins concept of index, with main and link copy engines in + // one range. + uint32_t QueueIndex; + auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex); + + ze_command_queue_handle_t &ZeQueue = ZeQueues[Index]; + if (ZeQueue) + return ZeQueue; + + ZeStruct ZeCommandQueueDesc; + ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal; + ZeCommandQueueDesc.index = QueueIndex; + ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + const char *Priority = "Normal"; + if (Queue->isPriorityLow()) { + ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; + Priority = "Low"; + } else if (Queue->isPriorityHigh()) { + ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; + Priority = "High"; + } + + // Evaluate performance of explicit usage for "0" index. + if (QueueIndex != 0) { + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; + } + + urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " + "(round robin in [%d, %d]) priority = %s\n", + ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, + UpperIndex, Priority); + + auto ZeResult = ZE_CALL_NOCHECK( + zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandQueueDesc, &ZeQueue)); + if (ZeResult) { + die("[L0] getZeQueue: failed to create queue"); + } + + return ZeQueue; +} + +int32_t ur_queue_handle_t_::pi_queue_group_t::getCmdQueueOrdinal( + ze_command_queue_handle_t CmdQueue) { + // Find out the right queue group ordinal (first queue might be "main" or + // "link") + auto QueueType = Type; + if (QueueType != queue_type::Compute) + QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine()) + ? queue_type::MainCopy + : queue_type::LinkCopy; + return Queue->Device->QueueGroup[QueueType].ZeOrdinal; +} + +// Helper function to create a new command-list to this queue and associated +// fence tracking its completion. This command list & fence are added to the +// map of command lists in this queue with ZeFenceInUse = false. +// The caller must hold a lock of the queue already. 
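For context on createCommandList below: every regular (non-immediate) command list is paired with a fence on its target queue so the pair can be recycled once the fence signals. A rough sketch of that lifecycle in terms of the raw Level Zero API, with error handling and caching omitted (illustrative only):

#include <level_zero/ze_api.h>

void runAndRecycle(ze_context_handle_t Ctx, ze_device_handle_t Dev,
                   ze_command_queue_handle_t Queue, uint32_t Ordinal) {
  ze_command_list_desc_t ListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC};
  ListDesc.commandQueueGroupOrdinal = Ordinal;
  ze_command_list_handle_t List;
  zeCommandListCreate(Ctx, Dev, &ListDesc, &List);

  ze_fence_desc_t FenceDesc = {ZE_STRUCTURE_TYPE_FENCE_DESC};
  ze_fence_handle_t Fence;
  zeFenceCreate(Queue, &FenceDesc, &Fence);

  // ... append commands to List here ...
  zeCommandListClose(List);
  zeCommandQueueExecuteCommandLists(Queue, 1, &List, Fence);

  // Later, when the fence has signalled, both objects can be reset and reused.
  if (zeFenceQueryStatus(Fence) == ZE_RESULT_SUCCESS) {
    zeFenceReset(Fence);
    zeCommandListReset(List);
  }
}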
+ur_result_t ur_queue_handle_t_::createCommandList( + bool UseCopyEngine, ur_command_list_ptr_t &CommandList, + ze_command_queue_handle_t *ForcedCmdQueue) { + + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ze_command_list_handle_t ZeCommandList; + + uint32_t QueueGroupOrdinal; + auto &QGroup = getQueueGroup(UseCopyEngine); + auto &ZeCommandQueue = + ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); + if (ForcedCmdQueue) + QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); + + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + std::tie(CommandList, std::ignore) = CommandListMap.insert( + std::pair( + ZeCommandList, + {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal})); + + UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); + UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, + bool UseCopyEngine) { + // Early exit if there are no active barriers. + if (ActiveBarriers.empty()) + return UR_RESULT_SUCCESS; + + // Create a wait-list and retain events. + _ur_ze_event_list_t ActiveBarriersWaitList; + UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList( + ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), + reinterpret_cast(this), UseCopyEngine)); + + // We can now replace active barriers with the ones in the wait list. + UR_CALL(ActiveBarriers.clear()); + + if (ActiveBarriersWaitList.Length == 0) { + return UR_RESULT_SUCCESS; + } + + for (uint32_t I = 0; I < ActiveBarriersWaitList.Length; ++I) { + auto &Event = ActiveBarriersWaitList.UrEventList[I]; + ActiveBarriers.add(Event); + } + + ur_event_handle_t Event = nullptr; + if (auto Res = createEventAndAssociateQueue( + reinterpret_cast(this), &Event, + UR_EXT_COMMAND_TYPE_USER, CmdList, + /*IsInternal*/ true)) + return Res; + + Event->WaitList = ActiveBarriersWaitList; + Event->OwnNativeHandle = true; + + // If there are more active barriers, insert a barrier on the command-list. We + // do not need an event for finishing so we pass nullptr. + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, ActiveBarriersWaitList.Length, + ActiveBarriersWaitList.ZeEventList)); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::insertStartBarrierIfDiscardEventsMode( + ur_command_list_ptr_t &CmdList) { + // If current command list is different from the last command list then insert + // a barrier waiting for the last command event. + if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList && + LastCommandEvent) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + LastCommandEvent = nullptr; + } + return UR_RESULT_SUCCESS; +} + +// This is an experimental option that allows the use of copy engine, if +// available in the device, in Level Zero plugin for copy operations submitted +// to an in-order queue. The default is 1. 
+static const bool UseCopyEngineForInOrderQueue = [] {
+  const char *CopyEngineForInOrderQueue =
+      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
+  return (!CopyEngineForInOrderQueue ||
+          (std::stoi(CopyEngineForInOrderQueue) != 0));
+}();
+
+bool ur_queue_handle_t_::useCopyEngine(bool PreferCopyEngine) const {
+  auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second;
+  return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 &&
+         (!isInOrderQueue() || UseCopyEngineForInOrderQueue);
+}
+
+// This function will return one of possibly multiple available
+// immediate commandlists associated with this Queue.
+ur_command_list_ptr_t &ur_queue_handle_t_::pi_queue_group_t::getImmCmdList() {
+
+  uint32_t QueueIndex, QueueOrdinal;
+  auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
+
+  if (ImmCmdLists[Index] != Queue->CommandListMap.end())
+    return ImmCmdLists[Index];
+
+  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
+  ZeCommandQueueDesc.ordinal = QueueOrdinal;
+  ZeCommandQueueDesc.index = QueueIndex;
+  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+  const char *Priority = "Normal";
+  if (Queue->isPriorityLow()) {
+    ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
+    Priority = "Low";
+  } else if (Queue->isPriorityHigh()) {
+    ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
+    Priority = "High";
+  }
+
+  // Evaluate performance of explicit usage for "0" index.
+  if (QueueIndex != 0) {
+    ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
+  }
+
+  urPrint("[getZeQueue]: create queue ordinal = %d, index = %d "
+          "(round robin in [%d, %d]) priority = %s\n",
+          ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
+          UpperIndex, Priority);
+
+  ze_command_list_handle_t ZeCommandList;
+  ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
+                  (Queue->Context->ZeContext, Queue->Device->ZeDevice,
+                   &ZeCommandQueueDesc, &ZeCommandList));
+  ImmCmdLists[Index] =
+      Queue->CommandListMap
+          .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
+              ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}})
+          .first;
+  // Add this commandlist to the cache so it can be destroyed as part of
+  // urQueueReleaseInternal.
+  auto QueueType = Type;
+  std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
+  auto &ZeCommandListCache =
+      QueueType == queue_type::Compute
+          ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
+          : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
+  ZeCommandListCache.push_back(ZeCommandList);
+
+  return ImmCmdLists[Index];
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
index 8aa0e11a42d9a..c7b81dbf30af3 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
@@ -1,4 +1,4 @@
-//===--------- ur_level_zero_queue.hpp - Level Zero Adapter -----------===//
+//===--------- ur_level_zero.hpp - Level Zero Adapter -----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,504 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_device.hpp" + +extern "C" { +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue); +} // extern "C" + +ur_result_t resetCommandLists(ur_queue_handle_t Queue); +ur_result_t +CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, bool QueueLocked = false, + bool QueueSynced = false, + ur_event_handle_t CompletedEvent = nullptr); + +// Structure describing the specific use of a command-list in a queue. +// This is because command-lists are re-used across multiple queues +// in the same context. +struct pi_command_list_info_t { + // The Level-Zero fence that will be signalled at completion. + // Immediate commandlists do not have an associated fence. + // A nullptr for the fence indicates that this is an immediate commandlist. + ze_fence_handle_t ZeFence{nullptr}; + // Record if the fence is in use. + // This is needed to avoid leak of the tracked command-list if the fence + // was not yet signaled at the time all events in that list were already + // completed (we are polling the fence at events completion). The fence + // may be still "in-use" due to sporadic delay in HW. + bool ZeFenceInUse{false}; + + // Indicates if command list is in closed state. This is needed to avoid + // appending commands to the closed command list. + bool IsClosed{false}; + + // Record the queue to which the command list will be submitted. + ze_command_queue_handle_t ZeQueue{nullptr}; + // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr + uint32_t ZeQueueGroupOrdinal{0}; + // Helper functions to tell if this is a copy command-list. + bool isCopy(ur_queue_handle_t Queue) const; + + // Keeps events created by commands submitted into this command-list. + // TODO: use this for explicit wait/cleanup of events at command-list + // completion. + // TODO: use this for optimizing events in the same command-list, e.g. + // only have last one visible to the host. + std::vector EventList{}; + size_t size() const { return EventList.size(); } + void append(ur_event_handle_t Event) { EventList.push_back(Event); } +}; + +// The map type that would track all command-lists in a queue. +using ur_command_list_map_t = + std::unordered_map; +// The iterator pointing to a specific command-list in use. +using ur_command_list_ptr_t = ur_command_list_map_t::iterator; + +struct ur_queue_handle_t_ : _ur_object { + ur_queue_handle_t_(std::vector &ComputeQueues, + std::vector &CopyQueues, + ur_context_handle_t Context, ur_device_handle_t Device, + bool OwnZeCommandQueue, pi_queue_properties Properties = 0, + int ForceComputeIndex = -1); + + using queue_type = ur_device_handle_t_::queue_group_info_t::type; + // PI queue is in general a one to many mapping to L0 native queues. + struct pi_queue_group_t { + ur_queue_handle_t Queue; + pi_queue_group_t() = delete; + + // The Queue argument captures the enclosing PI queue. + // The Type argument specifies the type of this queue group. + // The actual ZeQueues are populated at PI queue construction. + pi_queue_group_t(ur_queue_handle_t Queue, queue_type Type) + : Queue(Queue), Type(Type) {} + + // The type of the queue group. + queue_type Type; + bool isCopy() const { return Type != queue_type::Compute; } + + // Level Zero command queue handles. 
+ std::vector ZeQueues; + + // Immediate commandlist handles, one per Level Zero command queue handle. + // These are created only once, along with the L0 queues (see above) + // and reused thereafter. + std::vector ImmCmdLists; + + // Return the index of the next queue to use based on a + // round robin strategy and the queue group ordinal. + // If QueryOnly is true then return index values but don't update internal + // index data members of the queue. + uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, + bool QueryOnly = false); + + // Get the ordinal for a command queue handle. + int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue); + + // This function will return one of possibly multiple available native + // queues and the value of the queue group ordinal. + ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal); + + // This function returns the next immediate commandlist to use. + ur_command_list_ptr_t &getImmCmdList(); + + // These indices are to filter specific range of the queues to use, + // and to organize round-robin across them. + uint32_t UpperIndex{0}; + uint32_t LowerIndex{0}; + uint32_t NextIndex{0}; + }; + + // Helper class to facilitate per-thread queue groups + // We maintain a hashtable of queue groups if requested to do them per-thread. + // Otherwise it is just single entry used for all threads. + struct pi_queue_group_by_tid_t + : public std::unordered_map { + bool PerThread = false; + + // Returns thread id if doing per-thread, or a generic id that represents + // all the threads. + std::thread::id tid() const { + return PerThread ? std::this_thread::get_id() : std::thread::id(); + } + + // Make the specified queue group be the master + void set(const pi_queue_group_t &QueueGroup) { + const auto &Device = QueueGroup.Queue->Device; + PerThread = + Device->ImmCommandListUsed == ur_device_handle_t_::PerThreadPerQueue; + assert(empty()); + insert({tid(), QueueGroup}); + } + + // Get a queue group to use for this thread + pi_queue_group_t &get() { + assert(!empty()); + auto It = find(tid()); + if (It != end()) { + return It->second; + } + // Add new queue group for this thread initialized from a master entry. + auto QueueGroup = begin()->second; + // Create space for queues and immediate commandlists, which are created + // on demand. + QueueGroup.ZeQueues = std::vector( + QueueGroup.ZeQueues.size(), nullptr); + QueueGroup.ImmCmdLists = std::vector( + QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end()); + + std::tie(It, std::ignore) = insert({tid(), QueueGroup}); + return It->second; + } + }; + + // A map of compute groups containing compute queue handles, one per thread. + // When a queue is accessed from multiple host threads, a separate queue group + // is created for each thread. The key used for mapping is the thread ID. + pi_queue_group_by_tid_t ComputeQueueGroupsByTID; + + // A group containing copy queue handles. The main copy engine, if available, + // comes first followed by link copy engines, if available. + // When a queue is accessed from multiple host threads, a separate queue group + // is created for each thread. The key used for mapping is the thread ID. + pi_queue_group_by_tid_t CopyQueueGroupsByTID; + + // Keeps the PI context to which this queue belongs. + // This field is only set at _pi_queue creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_queue. 
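The pi_queue_group_by_tid_t helper above keys its entries by std::this_thread::get_id() and lazily clones a master entry for each new thread. A generic sketch of that per-thread lookup pattern (hypothetical names, not the adapter's code; it assumes the master entry was inserted first):

#include <thread>
#include <unordered_map>

template <typename T>
struct PerThreadMap : std::unordered_map<std::thread::id, T> {
  bool PerThread = false;

  // One shared slot when PerThread is off, otherwise one slot per thread id.
  std::thread::id key() const {
    return PerThread ? std::this_thread::get_id() : std::thread::id();
  }

  T &get() {
    auto It = this->find(key());
    if (It == this->end())
      It = this->emplace(key(), this->begin()->second).first; // clone master
    return It->second;
  }
};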
+ const ur_context_handle_t Context; + + // Keeps the PI device to which this queue belongs. + // This field is only set at _pi_queue creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_queue. + const ur_device_handle_t Device; + + // Keeps track of the event associated with the last enqueued command into + // this queue. this is used to add dependency with the last command to add + // in-order semantics and updated with the latest event each time a new + // command is enqueued. + ur_event_handle_t LastCommandEvent = nullptr; + + // Indicates if we own the ZeCommandQueue or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnZeCommandQueue; + + // Keeps the properties of this queue. + pi_queue_properties Properties; + + // Map of all command lists used in this queue. + ur_command_list_map_t CommandListMap; + + // Helper data structure to hold all variables related to batching + struct command_batch { + // These two members are used to keep track of how often the + // batching closes and executes a command list before reaching the + // QueueComputeBatchSize limit, versus how often we reach the limit. + // This info might be used to vary the QueueComputeBatchSize value. + uint32_t NumTimesClosedEarly = {0}; + uint32_t NumTimesClosedFull = {0}; -struct _ur_queue_handle_t : _ur_object { - _ur_queue_handle_t() {} + // Open command list fields for batching commands into this queue. + ur_command_list_ptr_t OpenCommandList{}; + + // Approximate number of commands that are allowed to be batched for + // this queue. + // Added this member to the queue rather than using a global variable + // so that future implementation could use heuristics to change this on + // a queue specific basis. And by putting it in the queue itself, this + // is thread safe because of the locking of the queue that occurs. + uint32_t QueueBatchSize = {0}; + }; + + // ComputeCommandBatch holds data related to batching of non-copy commands. + // CopyCommandBatch holds data related to batching of copy commands. + command_batch ComputeCommandBatch, CopyCommandBatch; + + // A helper structure to keep active barriers of the queue. + // It additionally manages ref-count of events in this list. + struct active_barriers { + std::vector Events; + void add(ur_event_handle_t &Event); + ur_result_t clear(); + bool empty() { return Events.empty(); } + std::vector &vector() { return Events; } + }; + // A collection of currently active barriers. + // These should be inserted into a command list whenever an available command + // list is needed for a command. + active_barriers ActiveBarriers; + + // Besides each PI object keeping a total reference count in + // _ur_object::RefCount we keep special track of the queue *external* + // references. This way we are able to tell when the queue is being finished + // externally, and can wait for internal references to complete, and do proper + // cleanup of the queue. + // This counter doesn't track the lifetime of a queue object, it only tracks + // the number of external references. I.e. even if it reaches zero a queue + // object may not be destroyed and can be used internally in the plugin. + // That's why we intentionally don't use atomic type for this counter to + // enforce guarding with a mutex all the work involving this counter. + uint32_t RefCountExternal{1}; + + // Indicates that the queue is healthy and all operations on it are OK. 
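RefCountExternal above counts only user-visible references, separately from the object's total reference count, so the queue can be finished when the application releases it while internal references keep the object alive. A toy sketch of that split (illustrative only; the real code also synchronizes the queue and holds the appropriate locks):

#include <cstdint>

struct ToyQueue {
  uint32_t RefCountExternal = 1; // user handles only
  uint32_t RefCountTotal = 1;    // external plus internal references

  // Returns true when user-side teardown (e.g. waiting for in-flight work)
  // should run; the object itself may still be referenced internally.
  bool releaseExternal() { return --RefCountExternal == 0; }

  // Returns true when no references remain and the object can be destroyed.
  bool releaseTotal() { return --RefCountTotal == 0; }
};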
+  bool Healthy{true};
+
+  // The following data structures and methods are used only for handling
+  // in-order queues with the discard_events property. Some commands in such a
+  // queue may have a discarded event, which means the event is not visible
+  // outside of the plugin. It is possible to reset and reuse discarded events
+  // in the same in-order queue because of the dependency between commands. We
+  // don't have to wait for event completion to do this. We use the following
+  // 2-event model to reuse events inside each command list:
+  //
+  // Operation1 = zeCommandListAppendMemoryCopy (signal ze_event1)
+  // zeCommandListAppendBarrier(wait for ze_event1)
+  // zeCommandListAppendEventReset(ze_event1)
+  // # Create new pi_event using ze_event1 and append to the cache.
+  //
+  // Operation2 = zeCommandListAppendMemoryCopy (signal ze_event2)
+  // zeCommandListAppendBarrier(wait for ze_event2)
+  // zeCommandListAppendEventReset(ze_event2)
+  // # Create new pi_event using ze_event2 and append to the cache.
+  //
+  // # Get pi_event from the beginning of the cache because there are two events
+  // # there. So it is guaranteed that we do round-robin between two events -
+  // # event from the last command is appended to the cache.
+  // Operation3 = zeCommandListAppendMemoryCopy (signal ze_event1)
+  // # The same ze_event1 is used for Operation1 and Operation3.
+  //
+  // When we switch to a different command list we need to signal a new event
+  // and wait for it in the new command list using a barrier.
+  // [CmdList1]
+  // Operation1 = zeCommandListAppendMemoryCopy (signal event1)
+  // zeCommandListAppendBarrier(wait for event1)
+  // zeCommandListAppendEventReset(event1)
+  // zeCommandListAppendSignalEvent(NewEvent)
+  //
+  // [CmdList2]
+  // zeCommandListAppendBarrier(wait for NewEvent)
+  //
+  // This barrier guarantees that command list execution starts only after
+  // completion of the previous command list, which signals the aforementioned
+  // event. It allows resetting and reusing the same event handles inside all
+  // command lists in scope of the queue. It means that we need 2 reusable
+  // events of each type (host-visible and device-scope) per queue at maximum.
+
+  // This data member keeps track of the last used command list and allows
+  // handling the switch of immediate command lists, because immediate command
+  // lists are never closed unlike regular command lists.
+  ur_command_list_ptr_t LastUsedCommandList = CommandListMap.end();
+
+  // Vector of 2 lists of reusable events: host-visible and device-scope.
+  // They are separated to allow faster access to stored events depending on
+  // the requested type of event. Each list contains events which can be reused
+  // inside all command lists in the queue as described in the 2-event model.
+  // Leftover events in the cache are released at queue destruction.
+  std::vector<std::list<ur_event_handle_t>> EventCaches{2};
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with a full batch.
+  // For copy commands, IsCopy is set to 'true'.
+  // For non-copy commands, IsCopy is set to 'false'.
+  void adjustBatchSizeForFullBatch(bool IsCopy);
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with only a partial batch of commands.
+  // For copy commands, IsCopy is set to 'true'.
+  // For non-copy commands, IsCopy is set to 'false'.
+  void adjustBatchSizeForPartialBatch(bool IsCopy);
+
+  // Attach a command list to this queue.
+  // For non-immediate commandlist also close and execute it.
+ // Note that this command list cannot be appended to after this. + // The "IsBlocking" tells if the wait for completion is required. + // If OKToBatchCommand is true, then this command list may be executed + // immediately, or it may be left open for other future command to be + // batched into. + // If IsBlocking is true, then batching will not be allowed regardless + // of the value of OKToBatchCommand + // + // For immediate commandlists, no close and execute is necessary. + ur_result_t executeCommandList(ur_command_list_ptr_t CommandList, + bool IsBlocking = false, + bool OKToBatchCommand = false); + + // Helper method telling whether we need to reuse discarded event in this + // queue. + bool doReuseDiscardedEvents(); + + // Append command to provided command list to wait and reset the last event if + // it is discarded and create new pi_event wrapper using the same native event + // and put it to the cache. We call this method after each command submission + // to make native event available to use by next commands. + ur_result_t resetDiscardedEvent(ur_command_list_ptr_t); + + // Put pi_event to the cache. Provided pi_event object is not used by + // any command but its ZeEvent is used by many pi_event objects. + // Commands to wait and reset ZeEvent must be submitted to the queue before + // calling this method. + ur_result_t addEventToQueueCache(ur_event_handle_t Event); + + // Returns true if any commands for this queue are allowed to + // be batched together. + // For copy commands, IsCopy is set to 'true'. + // For non-copy commands, IsCopy is set to 'false'. + bool isBatchingAllowed(bool IsCopy) const; + + // Returns true if the queue is a in-order queue. + bool isInOrderQueue() const; + + // Returns true if the queue has discard events property. + bool isDiscardEvents() const; + + // Returns true if the queue has explicit priority set by user. + bool isPriorityLow() const; + bool isPriorityHigh() const; + + // Wait for all commandlists associated with this Queue to finish operations. + ur_result_t synchronize(); + + // Get event from the queue's cache. + // Returns nullptr if the cache doesn't contain any reusable events or if the + // cache contains only one event which corresponds to the previous command and + // can't be used for the current command because we can't use the same event + // two times in a row and have to do round-robin between two events. Otherwise + // it picks an event from the beginning of the cache and returns it. Event + // from the last command is always appended to the end of the list. + ur_event_handle_t getEventFromQueueCache(bool HostVisible); + + // Returns true if an OpenCommandList has commands that need to be submitted. + // If IsCopy is 'true', then the OpenCommandList containing copy commands is + // checked. Otherwise, the OpenCommandList containing compute commands is + // checked. + bool hasOpenCommandList(bool IsCopy) const { + auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch; + return CommandBatch.OpenCommandList != CommandListMap.end(); + } + + // Update map of memory references made by the kernels about to be submitted + void CaptureIndirectAccesses(); + + // Kernel is not necessarily submitted for execution during + // piEnqueueKernelLaunch, it may be batched. That's why we need to save the + // list of kernels which is going to be submitted but have not been submitted + // yet. 
This is needed to capture memory allocations for each kernel with
+  // indirect access in the list at the moment when the kernel is really
+  // submitted for execution.
+  std::vector<ur_kernel_handle_t> KernelsToBeSubmitted;
+
+  // Append command to the command list to signal a new event if the last event
+  // in the command list is discarded. While we submit commands in scope of the
+  // same command list we can reset and reuse events, but when we switch to a
+  // different command list we currently need to signal a new event and wait
+  // for it in the new command list using a barrier.
+  ur_result_t signalEventFromCmdListIfLastEventDiscarded(ur_command_list_ptr_t);
+
+  // If there is an open command list associated with this queue,
+  // close it, execute it, and reset the corresponding OpenCommandList.
+  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
+  // executed. Otherwise the OpenCommandList containing compute commands is
+  // executed.
+  ur_result_t executeOpenCommandList(bool IsCopy);
+
+  // Wrapper function to execute both OpenCommandLists (Copy and Compute).
+  // This wrapper is helpful when all 'open' commands need to be executed.
+  // Call-site instances: piQueueFinish, piQueueRelease, etc.
+  ur_result_t executeAllOpenCommandLists() {
+    using IsCopy = bool;
+    if (auto Res = executeOpenCommandList(IsCopy{false}))
+      return Res;
+    if (auto Res = executeOpenCommandList(IsCopy{true}))
+      return Res;
+    return UR_RESULT_SUCCESS;
+  }
+
+  /// @brief Resets the command list and associated fence in the map and removes
+  /// events from the command list.
+  /// @param CommandList The caller must verify that this command list and fence
+  /// have been signalled.
+  /// @param MakeAvailable If the reset command list should be made available,
+  /// then MakeAvailable needs to be set to true.
+  /// @param EventListToCleanup The EventListToCleanup contains a list of
+  /// events from the command list which need to be cleaned up.
+  /// @param CheckStatus Hint informing whether we need to check status of the
+  /// events before removing them from the immediate command list. This is
+  /// needed because immediate command lists are not associated with fences and
+  /// in general the status of the event needs to be checked.
+  /// @return PI_SUCCESS if successful, PI error code otherwise.
+  ur_result_t
+  resetCommandList(ur_command_list_ptr_t CommandList, bool MakeAvailable,
+                   std::vector<ur_event_handle_t> &EventListToCleanup,
+                   bool CheckStatus = true);
+
+  // Gets the open command list containing the event, or CommandListMap.end().
+  ur_command_list_ptr_t eventOpenCommandList(ur_event_handle_t Event);
+
+  // Return the queue group to use based on standard/immediate commandlist mode,
+  // and if immediate mode, the thread-specific group.
+  pi_queue_group_t &getQueueGroup(bool UseCopyEngine);
+
+  // Helper function to create a new command-list to this queue and associated
+  // fence tracking its completion. This command list & fence are added to the
+  // map of command lists in this queue with ZeFenceInUse = false.
+  // The caller must hold a lock of the queue already.
+  ur_result_t
+  createCommandList(bool UseCopyEngine, ur_command_list_ptr_t &CommandList,
+                    ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+
+  // Inserts a barrier waiting for all unfinished events in ActiveBarriers into
+  // CmdList. Any finished events will be removed from ActiveBarriers.
+  ur_result_t insertActiveBarriers(ur_command_list_ptr_t &CmdList,
+                                   bool UseCopyEngine);
+
+  // This function considers multiple factors including copy engine
+  // availability and user preference and returns a boolean that is used to
+  // specify if the copy engine will eventually be used for a particular command.
+  bool useCopyEngine(bool PreferCopyEngine = true) const;
+
+  // Insert a barrier waiting for the last command event into the beginning of
+  // the command list. This barrier guarantees that command list execution
+  // starts only after completion of the previous command list, which signals
+  // the aforementioned event. It allows resetting and reusing the same event
+  // handles inside all command lists in the queue.
+  ur_result_t
+  insertStartBarrierIfDiscardEventsMode(ur_command_list_ptr_t &CmdList);
 };
+
+// This helper function creates a pi_event and associates it with a pi_queue.
+// Note that the caller of this function must have acquired a lock on the Queue
+// that is passed in.
+// \param Queue pi_queue to associate with a new event.
+// \param Event a pointer to hold the newly created pi_event
+// \param CommandType various command type determined by the caller
+// \param CommandList is the command list where the event is added
+// \param IsInternal tells if the event is internal, i.e. visible in the L0
+// plugin only.
+// \param ForceHostVisible tells if the event must be created in
+// the host-visible pool
+ur_result_t createEventAndAssociateQueue(
+    ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType,
+    ur_command_list_ptr_t CommandList, bool IsInternal = false,
+    std::optional<bool> HostVisible = std::nullopt);
+
+// Helper function to perform the necessary cleanup of the events from a reset
+// cmd list.
+ur_result_t CleanupEventListFromResetCmdList(
+    std::vector<ur_event_handle_t> &EventListToCleanup,
+    bool QueueLocked = false);
\ No newline at end of file
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
index 7014f92ddfb90..1b5496f5f59ed 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
@@ -7,3 +7,206 @@
 //===-----------------------------------------------------------------===//
 
 #include "ur_level_zero_sampler.hpp"
+#include 
+
+UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate(
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    const ur_sampler_property_t
+        *Props, ///< [in] specifies a list of sampler property names and their
+                ///< corresponding values.
+    ur_sampler_handle_t
+        *Sampler ///< [out] pointer to handle of sampler object created
+) {
+  std::shared_lock Lock(Context->Mutex);
+
+  // Have the "0" device in the context own the sampler. Rely on Level-Zero
+  // drivers to perform migration as necessary for sharing it across multiple
+  // devices in the context.
+  //
+  // TODO: figure out if we instead need explicit copying for accessing
+  // the sampler from other devices in the context.
+  //
+  ur_device_handle_t Device = Context->Devices[0];
+
+  ze_sampler_handle_t ZeSampler;
+  ZeStruct<ze_sampler_desc_t> ZeSamplerDesc;
+
+  // Set the default values for the ZeSamplerDesc.
+  ZeSamplerDesc.isNormalized = PI_TRUE;
+  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP;
+  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
+
+  // Update the values of the ZeSamplerDesc from the pi_sampler_properties list.
+  // Default values will be used if any of the following is true:
+  // a) SamplerProperties list is NULL
+  // b) SamplerProperties list is missing any properties
+
+  if (Props) {
+    const ur_sampler_property_t *CurProperty = Props;
+
+    while (*CurProperty != 0) {
+      switch (*CurProperty) {
+      case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: {
+        bool CurValueBool = ur_cast(*(++CurProperty));
+
+        if (CurValueBool == PI_TRUE)
+          ZeSamplerDesc.isNormalized = PI_TRUE;
+        else if (CurValueBool == PI_FALSE)
+          ZeSamplerDesc.isNormalized = PI_FALSE;
+        else {
+          urPrint("urSamplerCreate: unsupported "
+                  "UR_SAMPLER_INFO_NORMALIZED_COORDS value\n");
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: {
+        ur_sampler_addressing_mode_t CurValueAddressingMode =
+            ur_cast(
+                ur_cast(*(++CurProperty)));
+
+        // Level Zero runtime with API version 1.2 and lower has a bug:
+        // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to
+        // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to
+        // border", i.e. the logic is flipped. Starting from API version 1.3
+        // this problem is going to be fixed. That's why we check the API
+        // version to set the address mode.
+        ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion;
+        // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE
+        switch (CurValueAddressingMode) {
+        case UR_SAMPLER_ADDRESSING_MODE_NONE:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_REPEAT:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_CLAMP:
+          ZeSamplerDesc.addressMode =
+              ZeApiVersion < ZE_MAKE_VERSION(1, 3)
+                  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP
+                  : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
+          ZeSamplerDesc.addressMode =
+              ZeApiVersion < ZE_MAKE_VERSION(1, 3)
+                  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
+                  : ZE_SAMPLER_ADDRESS_MODE_CLAMP;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR;
+          break;
+        default:
+          urPrint("urSamplerCreate: unsupported "
+                  "UR_SAMPLER_PROPERTIES_ADDRESSING_MODE "
+                  "value\n");
+          urPrint("UR_SAMPLER_PROPERTIES_ADDRESSING_MODE=%d\n",
+                  CurValueAddressingMode);
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      case UR_SAMPLER_PROPERTIES_FILTER_MODE: {
+        ur_ext_sampler_filter_mode_t CurValueFilterMode =
+            ur_cast(
+                ur_cast(*(++CurProperty)));
+
+        if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST)
+          ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
+        else if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_LINEAR)
+          ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR;
+        else {
+          urPrint("UR_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode);
+          urPrint(
+              "urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n");
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      default:
+        break;
+      }
+      CurProperty++;
+    }
+  }
+
+  ZE2UR_CALL(zeSamplerCreate, (Context->ZeContext, Device->ZeDevice,
+                               &ZeSamplerDesc, // TODO: translate properties
+                               &ZeSampler));
+
+  try {
+    ur_sampler_handle_t_ *UrSampler = new ur_sampler_handle_t_(ZeSampler);
+    *Sampler = reinterpret_cast<ur_sampler_handle_t>(UrSampler);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...)
{ + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerRetain( + ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to get access +) { + Sampler->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerRelease( + ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to release +) { + if (!Sampler->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + auto ZeResult = ZE_CALL_NOCHECK(zeSamplerDestroy, (Sampler->ZeSampler)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + delete Sampler; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetInfo( + ur_sampler_handle_t Sampler, ///< [in] handle of the sampler object + ur_sampler_info_t PropName, ///< [in] name of the sampler property to query + size_t PropValueSize, ///< [in] size in bytes of the sampler property value + ///< provided + void *PropValue, ///< [out] value of the sampler property + size_t + *PropSizeRet ///< [out] size in bytes returned in sampler property value +) { + std::ignore = Sampler; + std::ignore = PropName; + std::ignore = PropValueSize; + std::ignore = PropValue; + std::ignore = PropSizeRet; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetNativeHandle( + ur_sampler_handle_t Sampler, ///< [in] handle of the sampler. + ur_native_handle_t *NativeSampler ///< [out] a pointer to the native + ///< handle of the sampler. +) { + std::ignore = Sampler; + std::ignore = NativeSampler; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( + ur_native_handle_t + NativeSampler, ///< [in] the native handle of the sampler. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_sampler_handle_t *Sampler ///< [out] pointer to the handle of the + ///< sampler object created. +) { + std::ignore = NativeSampler; + std::ignore = Context; + std::ignore = Sampler; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp index abbfb76c8e126..22463f76906e4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp @@ -9,6 +9,9 @@ #include "ur_level_zero_common.hpp" -struct _ur_sampler_handle_t : _ur_object { - _ur_sampler_handle_t() {} +struct ur_sampler_handle_t_ : _ur_object { + ur_sampler_handle_t_(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {} + + // Level Zero sampler handle. 
+ ze_sampler_handle_t ZeSampler; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 0a58c57319b7b..a117de71b57e6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -32,7 +32,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( } pDdiTable->pfnInit = urInit; - pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnGetLastResult = urGetLastResult; pDdiTable->pfnTearDown = urTearDown; return retVal; @@ -48,13 +48,13 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetExtendedDeleter = nullptr; + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; return retVal; } @@ -69,28 +69,29 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return retVal; } - pDdiTable->pfnKernelLaunch = nullptr; - pDdiTable->pfnEventsWait = nullptr; - pDdiTable->pfnEventsWaitWithBarrier = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; - pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWriteRect = nullptr; - pDdiTable->pfnMemBufferCopy = nullptr; - pDdiTable->pfnMemBufferCopyRect = nullptr; - pDdiTable->pfnMemBufferFill = nullptr; - pDdiTable->pfnMemImageRead = nullptr; - pDdiTable->pfnMemImageWrite = nullptr; - pDdiTable->pfnMemImageCopy = nullptr; - pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemUnmap = nullptr; - pDdiTable->pfnUSMMemcpy = nullptr; - pDdiTable->pfnUSMPrefetch = nullptr; - pDdiTable->pfnUSMAdvise = nullptr; - pDdiTable->pfnUSMFill2D = nullptr; - pDdiTable->pfnUSMMemcpy2D = nullptr; - pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; - pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnUSMMemAdvise = urEnqueueUSMMemAdvise; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + 
pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; return retVal; } @@ -104,14 +105,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetProfilingInfo = nullptr; - pDdiTable->pfnWait = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnWait = urEventWait; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnSetCallback = urEventSetCallback; return retVal; } @@ -125,20 +126,21 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetGroupInfo = nullptr; - pDdiTable->pfnGetSubGroupInfo = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetArgValue = nullptr; - pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgPointer = nullptr; - pDdiTable->pfnSetExecInfo = nullptr; - pDdiTable->pfnSetArgSampler = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRetain = urKernelRetain; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetArgLocal = urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; + pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; return retVal; } @@ -151,15 +153,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnImageCreate = nullptr; - pDdiTable->pfnBufferCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnBufferPartition = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnRetain = urMemRetain; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urMemCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; return 
retVal; } @@ -175,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( } pDdiTable->pfnGet = urPlatformGet; pDdiTable->pfnGetInfo = urPlatformGetInfo; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetApiVersion = nullptr; + pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urPlatformCreateWithNativeHandle; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; return retVal; } @@ -192,14 +194,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreateWithBinary = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetFunctionPointer = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetBuildInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; return retVal; } @@ -214,14 +222,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( return retVal; } - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnFinish = nullptr; - pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnRetain = urQueueRetain; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; return retVal; } @@ -235,12 +243,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnCreate = urSamplerCreate; + pDdiTable->pfnRetain = urSamplerRetain; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnGetInfo = urSamplerGetInfo; + pDdiTable->pfnGetNativeHandle = urSamplerGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urSamplerCreateWithNativeHandle; return retVal; } @@ -254,11 +262,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnHostAlloc = nullptr; - pDdiTable->pfnDeviceAlloc = nullptr; - pDdiTable->pfnSharedAlloc = nullptr; - pDdiTable->pfnFree = nullptr; - pDdiTable->pfnGetMemAllocInfo = nullptr; + + pDdiTable->pfnHostAlloc = urUSMHostAlloc; + 
pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnPoolCreate = urUSMPoolCreate; + pDdiTable->pfnPoolDestroy = urUSMPoolDestroy; return retVal; } @@ -277,10 +288,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnPartition = urDevicePartition; - pDdiTable->pfnSelectBinary = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; return retVal; } diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 70a52aabe290c..d25e36db39bc5 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -48,6 +48,9 @@ const int UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT = UR_EXT_DEVICE_INFO_END - 15; const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; +const uint32_t UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION = + (UR_MAP_FLAG_WRITE << 1); + const int UR_EXT_RESULT_END = 0x1000; const ur_result_t UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR = ur_result_t(UR_EXT_RESULT_END - 1); @@ -57,6 +60,38 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; +const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 5; + +const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = + (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); + +const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = + (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); + +const ur_queue_info_t UR_EXT_ONEAPI_QUEUE_INFO_EMPTY = + (ur_queue_info_t)(UR_QUEUE_INFO_SIZE + 1); + +const ur_command_t UR_EXT_COMMAND_TYPE_USER = + (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); + +const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = + ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); + +typedef enum ur_ext_sampler_filter_mode_t { + UR_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + UR_EXT_SAMPLER_FILTER_MODE_LINEAR = 1, + UR_EXT_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff +} ur_ext_sampler_filter_mode_t; + +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 2); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 3); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 4); + // Terminates the process with a catastrophic error message. 
[[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; @@ -228,10 +263,14 @@ struct _ur_object { // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); ur_shared_mutex Mutex; + + // Indicates if we own the native handle or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnNativeHandle = false; }; // Helper for one-liner validation -#define PI_ASSERT(condition, error) \ +#define UR_ASSERT(condition, error) \ if (!(condition)) \ return error; @@ -279,7 +318,7 @@ ur_result_t getInfo(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value) { auto assignment = [](void *param_value, T value, size_t value_size) { - (void)value_size; + std::ignore = value_size; *static_cast(param_value) = value; }; diff --git a/sycl/plugins/unified_runtime/ur_bindings.hpp b/sycl/plugins/unified_runtime/ur_bindings.hpp old mode 100755 new mode 100644 index 8597547221a88..4b58d0f73ff87 --- a/sycl/plugins/unified_runtime/ur_bindings.hpp +++ b/sycl/plugins/unified_runtime/ur_bindings.hpp @@ -9,44 +9,3 @@ #include #include - -// Make the Unified Runtime handles definition complete. -// This is used in various "create" API where new handles are allocated. -struct ur_platform_handle_t_ : public _ur_platform_handle_t { - using _ur_platform_handle_t::_ur_platform_handle_t; -}; -struct ur_device_handle_t_ : public _ur_device_handle_t { - using _ur_device_handle_t::_ur_device_handle_t; -}; - -struct ur_context_handle_t_ : public _ur_context_handle_t { - using _ur_context_handle_t::_ur_context_handle_t; -}; - -struct ur_event_handle_t_ : public _ur_event_handle_t { - using _ur_event_handle_t::_ur_event_handle_t; -}; - -struct ur_program_handle_t_ : public _ur_program_handle_t { - using _ur_program_handle_t::_ur_program_handle_t; -}; - -struct ur_module_handle_t_ : public _ur_module_handle_t { - using _ur_module_handle_t::_ur_module_handle_t; -}; - -struct ur_kernel_handle_t_ : public _ur_kernel_handle_t { - using _ur_kernel_handle_t::_ur_kernel_handle_t; -}; - -struct ur_queue_handle_t_ : public _ur_queue_handle_t { - using _ur_queue_handle_t::_ur_queue_handle_t; -}; - -struct ur_sampler_handle_t_ : public _ur_sampler_handle_t { - using _ur_sampler_handle_t::_ur_sampler_handle_t; -}; - -struct ur_mem_handle_t_ : public _ur_mem_handle_t { - using _ur_mem_handle_t::_ur_mem_handle_t; -}; From 9644ae2a2b052f5384253a22c8595eaac20bfc39 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 00:16:57 -0700 Subject: [PATCH 02/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 31 ++++++++++++++----- .../level_zero/ur_level_zero_kernel.cpp | 17 +++++----- .../adapters/level_zero/ur_level_zero_mem.cpp | 31 +++++++++++++------ 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 5ca4b1b9ae4f6..3c81faab879b1 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2573,6 +2573,23 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + if (Properties) { + if (Properties[0] == PI_MEM_ALLOC_FLAGS) { + if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { + USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { + USMDesc.flags |= 
UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { + USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + } + if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { + USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + } + } + } + ur_usm_pool_handle_t Pool{}; HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, Alignment, ResultPtr)); @@ -2987,8 +3004,10 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, pi_event *OutEvent) { - PI_ASSERT(Ptr, PI_ERROR_INVALID_MEM_OBJECT); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + if (!Ptr) { + return PI_ERROR_INVALID_VALUE; + } ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); @@ -2997,12 +3016,10 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - uint32_t Pattern = Value; - size_t PatternSize = sizeof(Pattern); - HANDLE_ERRORS(urEnqueueMemBufferFill( - UrQueue, UrBuffer, - const_cast(reinterpret_cast(&Pattern)), PatternSize, - 0, Count, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + size_t PatternSize = 1; + HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, &Value, PatternSize, + 0, Count, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 2a69a905c8e84..74571b0ef8669 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -192,13 +192,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, CommandList, IsInternal)); - - ZeEvent = (*OutEvent)->ZeEvent; - (*OutEvent)->WaitList = TmpWaitList; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; // Save the kernel in the event, so that when the event is signalled // the code can do a piKernelRelease on this kernel. - (*OutEvent)->CommandData = (void *)Kernel; + (*Event)->CommandData = (void *)Kernel; // Increment the reference count of the Kernel and indicate that the Kernel is // in use. Once the event has been signalled, the code in @@ -227,8 +226,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Add the command to the command list, which implies submission. ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*OutEvent)->WaitList.Length, - (*OutEvent)->WaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, @@ -236,14 +235,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // submitted only when the comamndlist is closed. Then, a lock is held. 
ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*OutEvent)->WaitList.Length, - (*OutEvent)->WaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } urPrint("calling zeCommandListAppendLaunchKernel() with" " ZeEvent %#llx\n", ur_cast(ZeEvent)); - printZeEventList((*OutEvent)->WaitList); + printZeEventList((*Event)->WaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index e2b0b597eb2b1..76cce8b081c34 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -753,15 +753,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); + // std::scoped_lock Lock(Queue->Mutex, + // Buffer->Mutex); + std::scoped_lock Lock(Queue->Mutex); - char *ZeHandleDst = nullptr; - UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); - return enqueueMemFillHelper(UR_COMMAND_MEM_BUFFER_FILL, Queue, - ZeHandleDst + Offset, Pattern, PatternSize, Size, - NumEventsInWaitList, EventWaitList, OutEvent); + // if Offset is not zero, then look for Ze Handle to + // determine correct dst with offset + if (Offset != 0) { + char *ZeHandleDst = nullptr; + _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); + UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, Pattern, + PatternSize, Size, NumEventsInWaitList, EventWaitList, OutEvent); + } else { + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? + UR_COMMAND_MEM_BUFFER_FILL, Queue, Buffer, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, OutEvent); + } } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( @@ -2131,7 +2144,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_usm_mem_flags_t *Properties = &USMDesc->flags; // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; + bool DeviceReadOnly = *Properties & UR_USM_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
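Editor's note, an illustrative sketch rather than part of the patch series: the pi2ur.hpp hunk in the patch above folds the zero-terminated PI_MEM_ALLOC_FLAGS property list into the UR USM descriptor before calling urUSMSharedAlloc. Pulled out into a standalone helper, the translation looks roughly like the following. The helper name is hypothetical; the flag constants and the urUSMSharedAlloc signature (still taking an explicit Alignment argument at this point in the series) are taken from the surrounding diff, and later patches in the series rename the flags to UR_EXT_USM_MEM_FLAG_* and move the alignment into the descriptor.

// Hypothetical consolidation of the translation that this patch performs
// inline in piextUSMSharedAlloc; assumes the PI and UR headers already
// included by pi2ur.hpp.
inline pi_result piextUSMSharedAllocSketch(void **ResultPtr, pi_context Context,
                                           pi_device Device,
                                           pi_usm_mem_properties *Properties,
                                           size_t Size, pi_uint32 Alignment) {
  auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
  auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);

  ur_usm_desc_t USMDesc{};
  // The property list is a {PI_MEM_ALLOC_FLAGS, <value>, 0} sequence; map each
  // recognized PI value onto the corresponding UR flag.
  if (Properties && Properties[0] == PI_MEM_ALLOC_FLAGS) {
    if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) // PI spelling as-is
      USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED;
    if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE)
      USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE;
    if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST)
      USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST;
    if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY)
      USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY;
  }

  ur_usm_pool_handle_t Pool{};
  HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size,
                                 Alignment, ResultPtr));
  return PI_SUCCESS;
}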
From b447851670ca4009cef1cb21490a59de504d5ce0 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 15:43:16 -0700 Subject: [PATCH 03/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 7 ++- .../level_zero/ur_level_zero_context.cpp | 12 ++++- .../adapters/level_zero/ur_level_zero_mem.cpp | 52 +++++++------------ 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 3c81faab879b1..6a7a0898dca99 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3010,16 +3010,15 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, } ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); const ur_event_handle_t *UrEventsWaitList = reinterpret_cast(EventsWaitList); ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); size_t PatternSize = 1; - HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, &Value, PatternSize, - 0, Count, NumEventsInWaitList, - UrEventsWaitList, UrEvent)); + HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, &Value, Count, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 815a1a5db06cf..5f54f588febe4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -70,6 +70,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( return ContextReleaseHelper(Context); } +// Due to a bug with 2D memory copy to and from non-USM pointers, this option is +// disabled by default. +static const bool UseMemcpy2DOperations = [] { + const char *UseMemcpy2DOperationsFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); + if (!UseMemcpy2DOperationsFlag) + return false; + return std::stoi(UseMemcpy2DOperationsFlag) > 0; +}(); + UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( ur_context_handle_t Context, ///< [in] handle of the context ur_context_info_t ContextInfoType, ///< [in] type of the info to retrieve @@ -95,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(pi_bool{true}); + return ReturnValue(pi_bool{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. return ReturnValue(pi_bool{false}); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 76cce8b081c34..bb146c2728e1b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -753,28 +753,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - // std::scoped_lock Lock(Queue->Mutex, - // Buffer->Mutex); - std::scoped_lock Lock(Queue->Mutex); + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); - // if Offset is not zero, then look for Ze Handle to - // determine correct dst with offset - if (Offset != 0) { - char *ZeHandleDst = nullptr; - _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); - UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); - return enqueueMemFillHelper( - UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, Pattern, - PatternSize, Size, NumEventsInWaitList, EventWaitList, OutEvent); - } else { - return enqueueMemFillHelper( - // TODO: do we need a new command type for USM memset? - UR_COMMAND_MEM_BUFFER_FILL, Queue, Buffer, - Pattern, // It will be interpreted as an 8-bit value, - PatternSize, // which is indicated with this pattern_size==1 - Size, NumEventsInWaitList, EventWaitList, OutEvent); - } + char *ZeHandleDst = nullptr; + _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); + UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, OutEvent); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( @@ -3072,14 +3062,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_event_handle_t *Event ///< [out][optional] return an event object that ///< identifies this particular command instance. ) { - std::ignore = Queue; - std::ignore = Ptr; - std::ignore = PatternSize; - std::ignore = Pattern; - std::ignore = Size; - std::ignore = NumEventsInWaitList; - std::ignore = EventWaitList; - std::ignore = Event; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} + std::scoped_lock Lock(Queue->Mutex); + + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? 
+ UR_COMMAND_MEM_BUFFER_FILL, Queue, Ptr, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, Event); +} \ No newline at end of file From d7e55784dc198dd8b6d416e9a8f03c1d555e6b76 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 23:05:17 -0700 Subject: [PATCH 04/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 63 +++++++++++++------ .../level_zero/ur_level_zero_kernel.cpp | 5 +- .../level_zero/ur_level_zero_queue.hpp | 3 +- .../level_zero/ur_level_zero_sampler.cpp | 23 +++---- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 6a7a0898dca99..44aae44d4dec2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3448,26 +3448,49 @@ inline pi_result piSamplerCreate(pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_sampler_property_t UrProps[6]{}; - UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; - UrProps[1] = SamplerProperties[1]; - - UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; - if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_NONE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; - - UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; - if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_NEAREST) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_NEAREST; - else if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_LINEAR) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + const pi_sampler_properties *CurProperty = SamplerProperties; + while (*CurProperty != 0) { + switch (*CurProperty) { + case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { + UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; + UrProps[1] = ur_cast(*(++CurProperty)); + } break; + + case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { + UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; + pi_sampler_addressing_mode CurValueAddressingMode = + ur_cast( + ur_cast(*(++CurProperty))); + + if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + else if (CurValueAddressingMode == + PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_NONE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + } break; + + case PI_SAMPLER_PROPERTIES_FILTER_MODE: { + UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; + pi_sampler_filter_mode CurValueFilterMode = + ur_cast(ur_cast(*(++CurProperty))); + + if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) + UrProps[5] = 
UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR) + UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + } break; + + default: + break; + } + CurProperty++; + } ur_sampler_handle_t *UrSampler = reinterpret_cast(RetSampler); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 74571b0ef8669..92061bc0e91c4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -669,9 +669,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ur_sampler_handle_t ArgValue ///< [in] handle of Sampler object. ) { std::scoped_lock Guard(Kernel->Mutex); - ZE2UR_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), ArgIndex, - sizeof(void *), &ArgValue->ZeSampler)); + ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, ArgIndex, + sizeof(void *), &ArgValue->ZeSampler)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index c7b81dbf30af3..75b64638ac262 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -500,7 +501,7 @@ struct ur_queue_handle_t_ : _ur_object { // the host-visible pool ur_result_t createEventAndAssociateQueue( ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal = false, + ur_command_list_ptr_t CommandList, bool IsInternal, std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 1b5496f5f59ed..5fdeb4ca0a7af 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ZeStruct ZeSamplerDesc; // Set the default values for the ZeSamplerDesc. 
- ZeSamplerDesc.isNormalized = PI_TRUE; + ZeSamplerDesc.isNormalized = true; ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; @@ -42,16 +42,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( // b) SamplerProperties list is missing any properties if (Props) { - const ur_sampler_property_t *CurProperty = Props; - - while (*CurProperty != 0) { - switch (*CurProperty) { + uint32_t PropCount = 0; + while (PropCount < 6) { // We expect only 3 pairs of sampler properties + switch (Props[PropCount]) { case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - bool CurValueBool = ur_cast(*(++CurProperty)); + auto CurValueBool = Props[++PropCount]; - if (CurValueBool == PI_TRUE) + if (CurValueBool == 1UL) ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == PI_FALSE) + else if (CurValueBool == 0UL) ZeSamplerDesc.isNormalized = PI_FALSE; else { urPrint("urSamplerCreate: unsupported " @@ -62,8 +61,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: { ur_sampler_addressing_mode_t CurValueAddressingMode = - ur_cast( - ur_cast(*(++CurProperty))); + static_cast(Props[++PropCount]); // Level Zero runtime with API version 1.2 and lower has a bug: // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to @@ -107,8 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( case UR_SAMPLER_PROPERTIES_FILTER_MODE: { ur_ext_sampler_filter_mode_t CurValueFilterMode = - ur_cast( - ur_cast(*(++CurProperty))); + static_cast(Props[++PropCount]); if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST) ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; @@ -125,7 +122,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( default: break; } - CurProperty++; + PropCount++; } } From b9681bbf22d1c26f66ed7a8d9681b19fe3edaab9 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 23:55:06 -0700 Subject: [PATCH 05/50] Stubs for the make_queue interop APIs Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 22 ++++++++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 25 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 44d747c12b871..c8b823d47602e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -160,6 +160,28 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +pi_result piextQueueCreate2(pi_context Context, pi_device Device, + pi_queue_properties *Properties, pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +pi_result piextQueueGetNativeHandle2(pi_queue Queue, + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { + std::ignore = NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + std::ignore = NativeHandleDesc; + std::ignore = Properties; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t 
*ParamValueSizeRet) { diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index ba1cb72e8518f..b719273bf484e 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -100,6 +100,28 @@ __SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +__SYCL_EXPORT pi_result piextQueueCreate2(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +__SYCL_EXPORT pi_result piextQueueGetNativeHandle2( + pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { + std::ignore = NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + std::ignore = NativeHandleDesc; + std::ignore = Properties; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + __SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { return pi2ur::piQueueRelease(Queue); } @@ -1020,6 +1042,9 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piQueueFlush) _PI_API(piextQueueGetNativeHandle) _PI_API(piextQueueCreateWithNativeHandle) + _PI_API(piextQueueCreate2) + _PI_API(piextQueueGetNativeHandle2) + _PI_API(piextQueueCreateWithNativeHandle2) _PI_API(piProgramCreate) _PI_API(piProgramBuild) From 165b2c2b7720d3f618a212da2bd02c936b02e0e6 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 7 Apr 2023 16:56:07 -0700 Subject: [PATCH 06/50] Use custom urContextCreateWithNativeHandle This requires for now using a custom loader with the proper parameters Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 4 +- sycl/plugins/unified_runtime/pi2ur.hpp | 187 +++++++++++------- .../level_zero/ur_level_zero_context.cpp | 14 +- .../level_zero/ur_level_zero_context.hpp | 9 +- .../level_zero/ur_level_zero_device.cpp | 57 +++--- .../level_zero/ur_level_zero_kernel.cpp | 5 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 71 ++++--- .../adapters/level_zero/ur_level_zero_mem.hpp | 6 +- .../level_zero/ur_level_zero_queue.cpp | 54 +++-- .../level_zero/ur_level_zero_queue.hpp | 15 +- .../level_zero/ur_level_zero_sampler.cpp | 127 +++++------- .../level_zero/ur_loader_interface.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 26 +-- 13 files changed, 299 insertions(+), 278 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index a4eee6963601e..5b709ef7adacf 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -3,8 +3,8 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 74843ea0800e6fb7ce0f82e0ef991fc258f4b9bd) + set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG b5c2119ba147306a76067e86c25e0c6c383172c6) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") 
FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 44aae44d4dec2..509448db3d3a4 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1014,9 +1014,44 @@ piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? pi_uint32 *SelectedBinaryInd) { auto UrDevice = reinterpret_cast(Device); - const uint8_t **UrBinaries = - const_cast(reinterpret_cast(Binaries)); - HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries, NumBinaries, + std::vector UrBinaries(NumBinaries); + + for (uint32_t BinaryCount = 0; BinaryCount < NumBinaries; BinaryCount++) { + if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_UNKNOWN; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV32; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_X86_64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_FPGA; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_NVPTX64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_AMDGCN; + } + + HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries.data(), NumBinaries, SelectedBinaryInd)); return PI_SUCCESS; } @@ -1074,10 +1109,13 @@ inline pi_result piextContextCreateWithNativeHandle( ur_native_handle_t NativeContext = reinterpret_cast(NativeHandle); + const ur_device_handle_t *UrDevices = + reinterpret_cast(Devices); ur_context_handle_t *UrContext = reinterpret_cast(RetContext); - HANDLE_ERRORS(urContextCreateWithNativeHandle(NativeContext, UrContext)); - (*UrContext)->OwnZeContext = OwnNativeHandle; + + HANDLE_ERRORS(urContextCreateWithNativeHandle( + NativeContext, NumDevices, UrDevices, OwnNativeHandle, UrContext)); return PI_SUCCESS; } @@ -1096,21 +1134,16 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, ContextInfoType = UR_CONTEXT_INFO_DEVICES; break; } - case PI_CONTEXT_INFO_PLATFORM: { - die("urGetContextInfo: unsuppported ParamName."); - } case PI_CONTEXT_INFO_NUM_DEVICES: { ContextInfoType = UR_CONTEXT_INFO_NUM_DEVICES; break; } - case PI_CONTEXT_INFO_PROPERTIES: { - die("urGetContextInfo: unsuppported ParamName."); - } case PI_CONTEXT_INFO_REFERENCE_COUNT: { ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; break; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: 
ContextInfoType = UR_CONTEXT_INFO_USM_FILL2D_SUPPORT; break; } @@ -1127,7 +1160,7 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, die("These queries should have never come here"); } default: { - die("piGetContextInfo: unsuppported ParamName."); + die("piContextGetInfo: unsuppported ParamName."); } } @@ -1155,19 +1188,6 @@ inline pi_result piContextRelease(pi_context Context) { /////////////////////////////////////////////////////////////////////////////// // Queue -inline pi_result piQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties Flags, pi_queue *Queue) { - - ur_context_handle_t UrContext = - reinterpret_cast(Context); - auto UrDevice = reinterpret_cast(Device); - ur_queue_property_t Props{}; - ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &Props, UrQueue)); - - return PI_SUCCESS; -} - inline pi_result piextQueueCreate(pi_context Context, pi_device Device, pi_queue_properties *Properties, pi_queue *Queue) { @@ -1194,38 +1214,46 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - ur_queue_property_t props[5]{}; - props[0] = UR_QUEUE_PROPERTIES_FLAGS; + ur_queue_properties_t UrProperties{}; if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) - props[1] |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) - props[1] |= UR_QUEUE_FLAG_PROFILING_ENABLE; + UrProperties.flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) - props[1] |= UR_QUEUE_FLAG_ON_DEVICE; + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE; if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) - props[1] |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) - props[1] |= UR_QUEUE_FLAG_DISCARD_EVENTS; + UrProperties.flags |= UR_QUEUE_FLAG_DISCARD_EVENTS; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) - props[1] |= UR_QUEUE_FLAG_PRIORITY_LOW; + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_LOW; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) - props[1] |= UR_QUEUE_FLAG_PRIORITY_HIGH; + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + ur_queue_index_properties_t IndexProperties{}; + IndexProperties.stype = UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES; if (Properties[2] != 0) { - props[2] = UR_QUEUE_PROPERTIES_COMPUTE_INDEX; - props[3] = Properties[3]; + IndexProperties.computeIndex = Properties[3]; } + UrProperties.pNext = &IndexProperties; + ur_context_handle_t UrContext = reinterpret_cast(Context); auto UrDevice = reinterpret_cast(Device); ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, props, UrQueue)); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &UrProperties, UrQueue)); return PI_SUCCESS; } +inline pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, pi_queue *Queue) { + pi_queue_properties Properties[] = {PI_QUEUE_FLAGS, Flags, 0}; + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_device Device, @@ -1308,7 +1336,7 @@ inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, break; 
} case PI_QUEUE_INFO_PROPERTIES: { - UrParamName = UR_QUEUE_INFO_PROPERTIES; + UrParamName = UR_QUEUE_INFO_FLAGS; break; } case PI_QUEUE_INFO_REFERENCE_COUNT: { @@ -1766,25 +1794,40 @@ inline pi_result piKernelSetExecInfo(pi_kernel Kernel, PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE); ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); - ur_kernel_exec_info_t propName{}; + ur_kernel_exec_info_t PropName{}; + uint64_t PropValue{}; switch (ParamName) { case PI_USM_INDIRECT_ACCESS: { - propName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + PropName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + PropValue = *(static_cast(const_cast(ParamValue))); break; } case PI_USM_PTRS: { - propName = UR_KERNEL_EXEC_INFO_USM_PTRS; + PropName = UR_KERNEL_EXEC_INFO_USM_PTRS; break; } case PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG: { - propName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + PropName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + auto Param = (*(static_cast(ParamValue))); + if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) { + PropValue = + static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM); + } else if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) { + PropValue = + static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA); + break; + } else if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT) { + PropValue = static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT); + } else { + die("piKernelSetExecInfo: unsupported ParamValue\n"); + } break; } default: - return PI_ERROR_INVALID_PROPERTY; + die("piKernelSetExecInfo: unsupported ParamName\n"); } HANDLE_ERRORS( - urKernelSetExecInfo(UrKernel, propName, ParamValueSize, ParamValue)); + urKernelSetExecInfo(UrKernel, PropName, ParamValueSize, &PropValue)); return PI_SUCCESS; } @@ -2164,9 +2207,11 @@ inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, UrBufferFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; } + ur_buffer_properties_t UrProps{}; + UrProps.pHost = HostPtr; ur_mem_handle_t *UrBuffer = reinterpret_cast(RetMem); HANDLE_ERRORS( - urMemBufferCreate(UrContext, UrBufferFlags, Size, HostPtr, UrBuffer)); + urMemBufferCreate(UrContext, UrBufferFlags, Size, &UrProps, UrBuffer)); return PI_SUCCESS; } @@ -2178,9 +2223,9 @@ inline pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_usm_desc_t USMDesc{}; + USMDesc.align = Alignment; ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS( - urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, Alignment, ResultPtr)); + HANDLE_ERRORS(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2551,9 +2596,10 @@ inline pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + USMDesc.align = Alignment; ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS(urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, - Alignment, ResultPtr)); + HANDLE_ERRORS( + urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2576,23 +2622,25 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, if (Properties) { if (Properties[0] == PI_MEM_ALLOC_FLAGS) { if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { - USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_WRITE_COMBINED; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { - USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + USMDesc.flags |= 
UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { - USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; } if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { - USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; } } } + USMDesc.align = Alignment; + ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, - Alignment, ResultPtr)); + HANDLE_ERRORS( + urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2682,8 +2730,8 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: to map from pi_mem_advice to ur_mem_advice_t // once we have those defined - ur_mem_advice_t UrAdvice{}; - HANDLE_ERRORS(urEnqueueUSMMemAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); + ur_usm_advice_flags_t UrAdvice{}; + HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); return PI_SUCCESS; } @@ -3387,7 +3435,7 @@ inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, ur_context_handle_t UrContext = reinterpret_cast(Context); - ur_event_handle_t *UrEvent = reinterpret_cast(*Event); + ur_event_handle_t *UrEvent = reinterpret_cast(Event); HANDLE_ERRORS( urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); (*UrEvent)->OwnNativeHandle = OwnNativeHandle; @@ -3447,43 +3495,40 @@ inline pi_result piSamplerCreate(pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); - ur_sampler_property_t UrProps[6]{}; + ur_sampler_desc_t UrProps{}; const pi_sampler_properties *CurProperty = SamplerProperties; while (*CurProperty != 0) { switch (*CurProperty) { case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; - UrProps[1] = ur_cast(*(++CurProperty)); + UrProps.normalizedCoords = ur_cast(*(++CurProperty)); } break; case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; pi_sampler_addressing_mode CurValueAddressingMode = ur_cast( ur_cast(*(++CurProperty))); if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_REPEAT; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_CLAMP; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_NONE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_NONE; } break; case PI_SAMPLER_PROPERTIES_FILTER_MODE: { - UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; pi_sampler_filter_mode CurValueFilterMode = ur_cast(ur_cast(*(++CurProperty))); if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + UrProps.filterMode = UR_SAMPLER_FILTER_MODE_NEAREST; else if (CurValueFilterMode == 
PI_SAMPLER_FILTER_MODE_LINEAR) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + UrProps.filterMode = UR_SAMPLER_FILTER_MODE_LINEAR; } break; default: @@ -3495,7 +3540,7 @@ inline pi_result piSamplerCreate(pi_context Context, ur_sampler_handle_t *UrSampler = reinterpret_cast(RetSampler); - HANDLE_ERRORS(urSamplerCreate(UrContext, UrProps, UrSampler)); + HANDLE_ERRORS(urSamplerCreate(UrContext, &UrProps, UrSampler)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 5f54f588febe4..2f29904b04563 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -137,14 +137,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( ur_native_handle_t - NativeContext, ///< [in] the native handle of the context. - ur_context_handle_t *Context ///< [out] pointer to the handle of the - ///< context object created. + NativeContext, ///< [in] the native handle of the context. + uint32_t NumDevices, const ur_device_handle_t *Devices, + bool OwnNativeHandle, + ur_context_handle_t + *Context ///< [out] pointer to the handle of the context object created. ) { try { ze_context_handle_t ZeContext = reinterpret_cast(NativeContext); - ur_context_handle_t_ *UrContext = new ur_context_handle_t_(ZeContext); + ur_context_handle_t_ *UrContext = new ur_context_handle_t_( + ZeContext, NumDevices, Devices, OwnNativeHandle); UrContext->initialize(); *Context = reinterpret_cast(UrContext); } catch (const std::bad_alloc &) { @@ -152,7 +155,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return UR_RESULT_SUCCESS; } @@ -310,7 +312,7 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) { Contexts.erase(It); } ze_context_handle_t DestroyZeContext = - Context->OwnZeContext ? Context->ZeContext : nullptr; + Context->OwnNativeHandle ? Context->ZeContext : nullptr; // Clean up any live memory associated with Context ur_result_t Result = Context->finalize(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 8cb8a94124b6a..a980a80a855f3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -29,7 +29,9 @@ struct ur_context_handle_t_ : _ur_object { ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices, const ur_device_handle_t *Devs, bool OwnZeContext) : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices}, - OwnZeContext{OwnZeContext} {} + NumDevices{NumDevices} { + OwnNativeHandle = OwnZeContext; + } ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} @@ -44,10 +46,7 @@ struct ur_context_handle_t_ : _ur_object { // Therefore it can be accessed without holding a lock on this _pi_context. // const std::vector Devices; std::vector Devices; - - // Indicates if we own the ZeContext or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeContext = false; + uint32_t NumDevices{}; // Immediate Level Zero command list for the device in this context, to be // used for initializations. 
To be created as: diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 8983835ad0811..0a21858fc2842 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -406,25 +406,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeSingleFPCapabilities = Device->ZeDeviceModuleProperties->fp32flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + SingleFPValue |= + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{SingleFPValue}); } @@ -433,25 +434,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeHalfFPCapabilities = Device->ZeDeviceModuleProperties->fp16flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{HalfFPValue}); } @@ -460,25 +461,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeDoubleFPCapabilities = Device->ZeDeviceModuleProperties->fp64flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { - 
DoubleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + DoubleFPValue |= + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{DoubleFPValue}); } @@ -1138,7 +1140,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ur_device_handle_t Device, ///< [in] handle of the device to select binary for. - const uint8_t **BinaryArray, ///< [in] the array of binaries to select from. + const ur_device_binary_t + *Binaries, ///< [in] the array of binaries to select from. uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. ///< Must greater than or equal to zero otherwise ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. @@ -1162,10 +1165,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // plugin for platform/device the ctx was created for. // Look for GEN binary, which we known can only be handled by Level-Zero now. - const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; - - pi_device_binary *Binaries = - reinterpret_cast(const_cast(BinaryArray)); + const char *BinaryTarget = + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; @@ -1174,11 +1175,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( uint32_t Spirv = InvalidInd; for (uint32_t i = 0; i < NumBinaries; ++i) { - if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { + if (strcmp(Binaries[i].pDeviceTargetSpec, BinaryTarget) == 0) { *SelectedBinaryInd = i; return UR_RESULT_SUCCESS; } - if (strcmp(Binaries[i]->DeviceTargetSpec, + if (strcmp(Binaries[i].pDeviceTargetSpec, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) Spirv = i; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 92061bc0e91c4..336f8ea530cdb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -631,6 +631,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( const void *PropValue ///< [in][range(0, propSize)] pointer to memory ///< location holding the property value. 
) { + std::ignore = PropSize; + std::scoped_lock Guard(Kernel->Mutex); if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && *(static_cast(PropValue)) == PI_TRUE) { @@ -644,7 +646,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags)); } else if (PropName == UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) { ze_cache_config_flag_t ZeCacheConfig{}; - auto CacheConfig = *(static_cast(PropValue)); + auto CacheConfig = + *(static_cast(PropValue)); if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index bb146c2728e1b..d09f18fe76c48 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1281,11 +1281,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemAdvise( - ur_queue_handle_t Queue, ///< [in] handle of the queue object - const void *Mem, ///< [in] pointer to the USM memory object - size_t Size, ///< [in] size in bytes to be advised - ur_mem_advice_t Advice, ///< [in] USM memory advice +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be advised + ur_usm_advice_flags_t Advice, ///< [in] USM memory advice ur_event_handle_t *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. @@ -1636,7 +1636,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags size_t Size, ///< [in] size in bytes of the memory object to be allocated - void *Host, ///< [in][optional] pointer to the buffer data + const ur_buffer_properties_t *Properties, ur_mem_handle_t *RetBuffer ///< [out] pointer to handle of the memory buffer created ) { @@ -1649,6 +1649,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // } + void *Host = Properties->pHost; + // If USM Import feature is enabled and hostptr is supplied, // import the hostptr if not already imported into USM. 
// Data transfer rate is maximized when both source and destination @@ -1755,7 +1757,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( Buffer, ///< [in] handle of the buffer object to allocate from ur_mem_flags_t Flags, ///< [in] allocation and usage information flags ur_buffer_create_type_t BufferCreateType, ///< [in] buffer creation type - ur_buffer_region_t + const ur_buffer_region_t *BufferCreateInfo, ///< [in] pointer to buffer create region information ur_mem_handle_t *RetMem ///< [out] pointer to the handle of sub buffer created @@ -1957,22 +1959,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Align, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM host memory object + void **RetMem ///< [out] pointer to USM host memory object ) { std::ignore = Pool; + uint32_t Align = USMDesc->align; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - ur_usm_mem_flags_t *USMFlag = &USMDesc->flags; + const ur_usm_flags_t *USMFlag = &USMDesc->flags; std::ignore = USMFlag; ur_platform_handle_t Plt = Context->getPlatform(); @@ -2002,7 +2005,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Align & (Align - 1)) != 0)) { - ur_usm_mem_flags_t Properties{}; + ur_usm_flags_t Properties{}; ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); if (IndirectAccessTrackingEnabled) { @@ -2038,22 +2041,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Alignment, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM device memory object + void **RetMem ///< [out] pointer to USM device memory object ) { std::ignore = Pool; + uint32_t Alignment = USMDesc->align; + // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - ur_usm_mem_flags_t *USMProp = &USMDesc->flags; + const ur_usm_flags_t *USMProp = &USMDesc->flags; std::ignore = USMProp; ur_platform_handle_t Plt = Device->Platform; @@ -2121,20 +2126,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Alignment, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM shared memory object + void **RetMem ///< [out] pointer to USM shared memory object ) { std::ignore = Pool; - ur_usm_mem_flags_t *Properties = &USMDesc->flags; + const ur_usm_flags_t *Properties = &USMDesc->flags; + uint32_t Alignment = USMDesc->align; // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = *Properties & UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + bool DeviceReadOnly = *Properties & UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. @@ -2165,8 +2171,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, Properties, - Size, Alignment); + ur_result_t Res = USMSharedAllocImpl( + RetMem, Context, Device, const_cast(Properties), Size, + Alignment); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2323,7 +2330,7 @@ ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - ur_usm_mem_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, Alignment); } @@ -2429,7 +2436,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeDesc; @@ -2455,7 +2462,7 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_mem_flags_t *, + ur_device_handle_t Device, ur_usm_flags_t *, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags @@ -2486,7 +2493,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment) { // TODO: 
translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -2752,8 +2759,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { HostAllocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, getAlignment(), + UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, reinterpret_cast(&ZeHandle))); } else { HostAllocation.ReleaseAction = allocation_t::free_native; @@ -2807,9 +2815,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { Allocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; UR_CALL(urUSMDeviceAlloc(UrContext, Device, &USMDesc, Pool, Size, - getAlignment(), reinterpret_cast(&ZeHandle))); } else { Allocation.ReleaseAction = allocation_t::free_native; @@ -2871,9 +2879,10 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { HostAllocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, - getAlignment(), &ZeHandleHost)); + UR_CALL( + urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, &ZeHandleHost)); } else { HostAllocation.ReleaseAction = allocation_t::free_native; UR_CALL(ZeHostMemAllocHelper(&ZeHandleHost, UrContext, Size)); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 56b0c4a9dbaa6..575ab61959184 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -279,15 +279,15 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_mem_flags_t *, + ur_device_handle_t Device, ur_usm_flags_t *, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment); // If indirect access tracking is not enabled then this functions just performs diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index e3e21eb3e98e2..941804b535b3c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -158,8 +158,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( return ReturnValue(Queue->Device); case UR_QUEUE_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{Queue->RefCount.load()}); - case UR_QUEUE_INFO_PROPERTIES: - die("UR_QUEUE_INFO_PROPERTIES in urQueueGetInfo not implemented\n"); + case UR_QUEUE_INFO_FLAGS: + die("UR_QUEUE_INFO_FLAGS in urQueueGetInfo not implemented\n"); break; case 
UR_QUEUE_INFO_SIZE: die("UR_QUEUE_INFO_SIZE in urQueueGetInfo not implemented\n"); @@ -265,30 +265,29 @@ static bool doEagerInit = [] { }(); UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - const ur_queue_property_t - *pProps, ///< [in] specifies a list of queue properties and their - ///< corresponding values. Each property name is immediately - ///< followed by the corresponding desired value. The list is - ///< terminated with a 0. If a property value is not specified, - ///< then its default value will be used. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_queue_properties_t + *Props, ///< [in] specifies a list of queue properties and their + ///< corresponding values. Each property name is immediately + ///< followed by the corresponding desired value. The list is + ///< terminated with a 0. If a property value is not specified, + ///< then its default value will be used. ur_queue_handle_t - *phQueue ///< [out] pointer to handle of queue object created + *Queue ///< [out] pointer to handle of queue object created ) { - ur_context_handle_t Context = hContext; - ur_device_handle_t Device = hDevice; - ur_queue_handle_t_ **Queue = reinterpret_cast(phQueue); - Context->Devices[0] = Device; - const pi_queue_properties *Properties = - reinterpret_cast(pProps); - pi_queue_properties Flags = Properties[1]; - - auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX - ? static_cast(Properties[3]) - : -1; // Use default/round-robin. + int ForceComputeIndex = -1; // Use default/round-robin. + if (Props->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(Props->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *IndexProperties = + reinterpret_cast(extendedDesc); + ForceComputeIndex = IndexProperties->computeIndex; + } + } UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); @@ -317,9 +316,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( nullptr); try { - *Queue = - new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, - Context, Device, true, Flags, ForceComputeIndex); + *Queue = new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Props->flags, + ForceComputeIndex); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -328,7 +327,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( // Do eager initialization of Level Zero handles on request. if (doEagerInit) { - ur_queue_handle_t Q = *phQueue; + ur_queue_handle_t Q = *Queue; // Creates said number of command-lists. auto warmupQueueGroup = [Q](bool UseCopyEngine, uint32_t RepeatCount) -> ur_result_t { @@ -732,8 +731,7 @@ ur_queue_handle_t_::ur_queue_handle_t_( std::vector &ComputeQueues, std::vector &CopyQueues, ur_context_handle_t Context, ur_device_handle_t Device, - bool OwnZeCommandQueue, pi_queue_properties Properties, - int ForceComputeIndex) + bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex) : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, Properties(Properties) { // Compute group initialization. 
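For reference, a minimal caller-side sketch (illustration only, not part of the patch) of how the compute-index extension handled in urQueueCreate above is expected to be supplied: the index travels in a ur_queue_index_properties_t chained through ur_queue_properties_t::pNext. Field and enum names are taken from the hunks above; the UR_STRUCTURE_TYPE_QUEUE_PROPERTIES value for the outer struct, and the Context/Device handles, are assumptions.

    // Request a specific compute queue index at queue-creation time.
    ur_queue_index_properties_t IndexProps{};
    IndexProps.stype = UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES;
    IndexProps.pNext = nullptr;
    IndexProps.computeIndex = 1; // ask for the second compute index

    ur_queue_properties_t Props{};
    Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; // assumed stype value
    Props.pNext = &IndexProps;                        // extension chained via pNext
    Props.flags = 0;

    ur_queue_handle_t Queue = nullptr;
    urQueueCreate(Context, Device, &Props, &Queue);
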
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 75b64638ac262..76cfda295f2f8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -10,11 +10,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -81,7 +81,7 @@ struct ur_queue_handle_t_ : _ur_object { ur_queue_handle_t_(std::vector &ComputeQueues, std::vector &CopyQueues, ur_context_handle_t Context, ur_device_handle_t Device, - bool OwnZeCommandQueue, pi_queue_properties Properties = 0, + bool OwnZeCommandQueue, ur_queue_flags_t Properties = 0, int ForceComputeIndex = -1); using queue_type = ur_device_handle_t_::queue_group_info_t::type; @@ -207,7 +207,7 @@ struct ur_queue_handle_t_ : _ur_object { bool OwnZeCommandQueue; // Keeps the properties of this queue. - pi_queue_properties Properties; + ur_queue_flags_t Properties; // Map of all command lists used in this queue. ur_command_list_map_t CommandListMap; @@ -499,10 +499,11 @@ struct ur_queue_handle_t_ : _ur_object { // plugin only. // \param ForceHostVisible tells if the event must be created in // the host-visible pool -ur_result_t createEventAndAssociateQueue( - ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt); +ur_result_t +createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, + ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, + std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd // list. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 5fdeb4ca0a7af..42c431ec94632 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -11,7 +11,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object - const ur_sampler_property_t + const ur_sampler_desc_t *Props, ///< [in] specifies a list of sampler property names and their ///< corresponding values. 
ur_sampler_handle_t @@ -42,87 +42,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( // b) SamplerProperties list is missing any properties if (Props) { - uint32_t PropCount = 0; - while (PropCount < 6) { // We expect only 3 pairs of sampler properties - switch (Props[PropCount]) { - case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - auto CurValueBool = Props[++PropCount]; - - if (CurValueBool == 1UL) - ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == 0UL) - ZeSamplerDesc.isNormalized = PI_FALSE; - else { - urPrint("urSamplerCreate: unsupported " - "UR_SAMPLER_INFO_NORMALIZED_COORDS value\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - ur_sampler_addressing_mode_t CurValueAddressingMode = - static_cast(Props[++PropCount]); - - // Level Zero runtime with API version 1.2 and lower has a bug: - // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to - // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to - // border", i.e. logic is flipped. Starting from API version 1.3 this - // problem is going to be fixed. That's why check for API version to set - // an address mode. - ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; - // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - switch (CurValueAddressingMode) { - case UR_SAMPLER_ADDRESSING_MODE_NONE: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; - break; - case UR_SAMPLER_ADDRESSING_MODE_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; - break; - case UR_SAMPLER_ADDRESSING_MODE_CLAMP: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP - : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - break; - case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - : ZE_SAMPLER_ADDRESS_MODE_CLAMP; - break; - case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; - break; - default: - urPrint("urSamplerCreate: unsupported " - "UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE " - "value\n"); - urPrint("UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE=%d\n", - CurValueAddressingMode); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - case UR_SAMPLER_PROPERTIES_FILTER_MODE: { - ur_ext_sampler_filter_mode_t CurValueFilterMode = - static_cast(Props[++PropCount]); - - if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - else if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_LINEAR) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; - else { - urPrint("UR_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode); - urPrint( - "urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - default: - break; - } - PropCount++; + ZeSamplerDesc.isNormalized = Props->normalizedCoords; + + // Level Zero runtime with API version 1.2 and lower has a bug: + // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to + // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to + // border", i.e. logic is flipped. Starting from API version 1.3 this + // problem is going to be fixed. That's why check for API version to set + // an address mode. 
+ ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; + // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE + switch (Props->addressingMode) { + case UR_SAMPLER_ADDRESSING_MODE_NONE: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; + break; + case UR_SAMPLER_ADDRESSING_MODE_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; + break; + case UR_SAMPLER_ADDRESSING_MODE_CLAMP: + ZeSamplerDesc.addressMode = ZeApiVersion < ZE_MAKE_VERSION(1, 3) + ? ZE_SAMPLER_ADDRESS_MODE_CLAMP + : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + break; + case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + ZeSamplerDesc.addressMode = ZeApiVersion < ZE_MAKE_VERSION(1, 3) + ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER + : ZE_SAMPLER_ADDRESS_MODE_CLAMP; + break; + case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; + break; + default: + urPrint("urSamplerCreate: unsupported " + "UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE " + "value\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + if (Props->filterMode == UR_SAMPLER_FILTER_MODE_NEAREST) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; + else if (Props->filterMode == UR_SAMPLER_FILTER_MODE_LINEAR) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; + else { + urPrint("urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n"); + return UR_RESULT_ERROR_INVALID_VALUE; } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index a117de71b57e6..e6164fe6519af 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -87,7 +87,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnUSMFill = urEnqueueUSMFill; pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; - pDdiTable->pfnUSMMemAdvise = urEnqueueUSMMemAdvise; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d25e36db39bc5..c1d7464387922 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -60,7 +60,10 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; -const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 5; +const int UR_EXT_USM_MEM_FLAG_WRITE_COMBINED = 1 << 27; +const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; +const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; +const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); @@ -77,20 +80,17 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); -typedef enum ur_ext_sampler_filter_mode_t { - UR_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, - UR_EXT_SAMPLER_FILTER_MODE_LINEAR = 1, - UR_EXT_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff -} 
ur_ext_sampler_filter_mode_t; - const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 2); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 3); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 4); + +typedef enum { + // No preference for SLM or data cache. + UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = 0x0, + // Large SLM size. + UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = 0x1, + // Large General Data size. + UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = 0x2 +} ur_kernel_cache_config; // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { From 0535b22cd0ed3cd140c6159c7f4eb1b3d5f86abc Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 9 Apr 2023 21:30:49 -0700 Subject: [PATCH 07/50] Some fixes to interop and other tests Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 15 +++++++++------ .../level_zero/ur_level_zero_kernel.cpp | 3 +++ .../adapters/level_zero/ur_level_zero_mem.cpp | 17 ++++++----------- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 5b709ef7adacf..059990a7906e3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG b5c2119ba147306a76067e86c25e0c6c383172c6) + set(UNIFIED_RUNTIME_TAG 6bcd2a224d717cf904568d7311e84e2d057fcbef) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 509448db3d3a4..4cac17faf43d9 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1724,7 +1724,9 @@ inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - ur_mem_handle_t UrMemory = reinterpret_cast(*ArgValue); + ur_mem_handle_t UrMemory{}; + if (ArgValue) + UrMemory = reinterpret_cast(*ArgValue); // We don't yet know the device where this kernel will next be run on. // Thus we can't know the actual memory allocation that needs to be used. 
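The pi2ur.hpp hunks above all follow the same thin-shim pattern: PI handles are reinterpreted as the corresponding UR handles and the UR call's result is folded back into a pi_result through the HANDLE_ERRORS macro used throughout this file. A condensed sketch of that pattern follows; piextExampleCall and urExampleCall are hypothetical names standing in for any of the entry points touched above.

    // Hypothetical PI entry point showing the pi2ur shim pattern (sketch only).
    inline pi_result piextExampleCall(pi_context Context, pi_kernel Kernel) {
      auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
      auto UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
      // HANDLE_ERRORS maps a failing ur_result_t to the matching pi_result
      // and returns it from the enclosing PI function.
      HANDLE_ERRORS(urExampleCall(UrContext, UrKernel));
      return PI_SUCCESS;
    }
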
@@ -1765,10 +1767,11 @@ piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(NativeHandle); ur_context_handle_t UrContext = reinterpret_cast(Context); - std::ignore = Program; + ur_program_handle_t UrProgram = + reinterpret_cast(Program); ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); - HANDLE_ERRORS( - urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, UrKernel)); + HANDLE_ERRORS(urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, + UrProgram, UrKernel)); (*UrKernel)->OwnNativeHandle = OwnNativeHandle; return PI_SUCCESS; @@ -2580,8 +2583,8 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ur_mem_handle_t *UrMem = reinterpret_cast(Mem); // TODO: Pass OwnNativeHandle to the output parameter // while we get it in interface - (*UrMem)->OwnNativeHandle = OwnNativeHandle; - HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, UrMem)); + HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, + OwnNativeHandle, UrMem)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 336f8ea530cdb..d5cc3f3894abf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -710,6 +710,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. ur_context_handle_t Context, ///< [in] handle of the context object + ur_program_handle_t Program, ur_kernel_handle_t * RetKernel ///< [out] pointer to the handle of the kernel object created. ) { @@ -726,6 +727,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return UR_RESULT_ERROR_UNKNOWN; } + Kernel->Program = Program; + UR_CALL(Kernel->initialize()); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index d09f18fe76c48..2a96cf1c9a8c9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1803,15 +1803,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. ur_context_handle_t Context, ///< [in] handle of the context object + bool OwnNativeHandle, ur_mem_handle_t *Mem ///< [out] pointer to the handle of the mem object created. 
) { std::shared_lock Lock(Context->Mutex); - // TODO: Get OwnNativeHandle from the output parameter while we get it in - // interface - bool OwnNativeHandle = (*Mem)->OwnNativeHandle; - // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -1845,7 +1842,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( _ur_buffer *Buffer = nullptr; try { - Buffer = new _ur_buffer(Context, Device, Size); + Buffer = new _ur_buffer(Context, Size, Device, ur_cast(NativeMem), + OwnNativeHandle); *Mem = reinterpret_cast(Buffer); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -1867,12 +1865,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( // allocations in this context are released. UR_CALL(urContextRetain(Context)); - Context->MemAllocs.emplace( - std::piecewise_construct, std::forward_as_tuple(Ptr), - std::forward_as_tuple(Context, - true /*ownNativeHandle, how do we pass it here? or - do we move all this logic to pi2ur? */ - )); + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(Ptr), + std::forward_as_tuple(Context, OwnNativeHandle)); } if (Device) { From e079d6aa03de99a16e71a1dacc85353179d2c12c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 09:43:18 -0700 Subject: [PATCH 08/50] Implement urPlatformGetApiVersion Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 1f2430274e6f4..c247b4d854047 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -288,10 +288,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( ur_platform_handle_t Driver, ///< [in] handle of the platform ur_api_version_t *Version ///< [out] api version ) { - std::ignore = Driver; - std::ignore = Version; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + *Version = UR_API_VERSION_0_6; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( From baefa3a32691fd327707498e7d9ba0a3536bb5fc Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 10:47:37 -0700 Subject: [PATCH 09/50] Add UR_CONTEXT_INFO_REFERENCE_COUNT Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 4cac17faf43d9..5a0d83352b146 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1139,7 +1139,7 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, break; } case PI_CONTEXT_INFO_REFERENCE_COUNT: { - ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; + ContextInfoType = UR_CONTEXT_INFO_REFERENCE_COUNT; break; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 2f29904b04563..6dc5ad362d6ed 
100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -101,7 +101,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(&Context->Devices[0], Context->Devices.size()); case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(uint32_t(Context->Devices.size())); - case UR_EXT_CONTEXT_INFO_REFERENCE_COUNT: + case UR_CONTEXT_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c1d7464387922..f8578cafbb3b8 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -65,9 +65,6 @@ const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; -const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = - (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); - const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); From 7f8a6c527115289d04a443db1687957c8a7b737d Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 11:07:12 -0700 Subject: [PATCH 10/50] Port Optimize sync of an in-order queue https://github.com/intel/llvm/pull/8601 https://github.com/intel/llvm/pull/8993 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_queue.cpp | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 941804b535b3c..b4bdf3347096f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1248,6 +1248,7 @@ ur_result_t ur_queue_handle_t_::synchronize() { ZE2UR_CALL(zeHostSynchronize, (zeEvent)); Event->Completed = true; UR_CALL(urEventRelease(Event)); + // Cleanup all events from the synced command list. auto EventListToCleanup = std::move(ImmCmdList->second.EventList); ImmCmdList->second.EventList.clear(); @@ -1255,17 +1256,30 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; }; - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + // Do nothing if the queue is empty + if (!LastCommandEvent) + return UR_RESULT_SUCCESS; + + // For in-order queue just wait for the last command. + // If event is discarded then it can be in reset state or underlying level + // zero handle can have device scope, so we can't synchronize the last event. + if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + } else { + // Otherwise sync all L0 queues/immediate command-lists. 
+ for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } } } + } LastCommandEvent = nullptr; // With the entire queue synchronized, the active barriers must be done so we From 28d7280b73bda08a3509a8183efd79c66b8304e7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 12 Apr 2023 07:47:06 -0700 Subject: [PATCH 11/50] Port Do not use piGetDeviceAndHostTimer for only host time query https://github.com/intel/llvm/pull/8996 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 318a931d608f3..f1eba37f331ec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -426,9 +426,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: // Note: No users for this case - // TODO: Implement commmand submission time when needed, - // by recording device timestamp (using zeDeviceGetGlobalTimestamps) - // before submitting command to device + // The "command_submit" time is implemented by recording submission + // timestamp with a call to piGetDeviceAndHostTimer before command enqueue. + // return ReturnValue(uint64_t{0}); default: urPrint("urEventGetProfilingInfo: not supported ParamName\n"); From 77af5383da98519c52f06fbf0834cec3e920b3ff Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:31:40 -0700 Subject: [PATCH 12/50] Port Fix PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE queries https://github.com/intel/llvm/pull/8769 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_kernel.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index d5cc3f3894abf..7a523e561967a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -500,10 +500,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( // TODO: To revisit after level_zero/issues/262 is resolved struct { size_t Arr[3]; - } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; - return ReturnValue(WorkSize); + } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * + Device->ZeDeviceComputeProperties->maxGroupCountX), + (Device->ZeDeviceComputeProperties->maxGroupSizeY * + Device->ZeDeviceComputeProperties->maxGroupCountY), + (Device->ZeDeviceComputeProperties->maxGroupSizeZ * + Device->ZeDeviceComputeProperties->maxGroupCountZ)}}; + return ReturnValue(GlobalWorkSize); } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { // As of right now, L0 is missing API to query kernel and device specific From 
d39759aeb2f3daa32fab08f407d051acba7ab435 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:34:59 -0700 Subject: [PATCH 13/50] Port Retain build-log when program build failed https://github.com/intel/llvm/pull/8848 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_program.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index f9e32aa395084..0b4d07b0366a3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -654,6 +654,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( if (PropSizeRet) { *PropSizeRet = LogSize; } + if (PropValue) { + // When the program build fails in piProgramBuild(), we delayed cleaning + // up the build log because RT later calls this routine to get the + // failed build log. + // To avoid memory leaks, we should clean up the failed build log here + // because RT does not create sycl::program when piProgramBuild() fails, + // thus it won't call piProgramRelease() to clean up the build log. + if (Program->State == ur_program_handle_t_::Invalid) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); + Program->ZeBuildLog = nullptr; + } + } return UR_RESULT_SUCCESS; } From a02b1a5322ae297335949ed9602845c4064f6411 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:40:02 -0700 Subject: [PATCH 14/50] Port Heuristically reduce overhead from immediate command-list cleanup https://github.com/intel/llvm/pull/9052 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 2 +- .../level_zero/ur_level_zero_queue.cpp | 41 +++++++++++++++---- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 6dc5ad362d6ed..55354358124bd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -518,7 +518,7 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { static const size_t ImmCmdListsEventCleanupThreshold = [] { const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - static constexpr int Default = 20; + static constexpr int Default = 1000; if (!ImmCmdListsEventCleanupThresholdStr) return Default; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index b4bdf3347096f..298b9d65467fb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1467,26 +1467,49 @@ ur_result_t ur_queue_handle_t_::resetCommandList( std::back_inserter(EventListToCleanup)); EventList.clear(); } else if (!isDiscardEvents()) { - // For immediate commandlist reset only those events that have signalled. // If events in the queue are discarded then we can't check their status. 
- for (auto it = EventList.begin(); it != EventList.end();) { - std::scoped_lock EventLock((*it)->Mutex); + // Helper for checking of event completion + auto EventCompleted = [](ur_event_handle_t Event) -> bool { + std::scoped_lock EventLock(Event->Mutex); ze_result_t ZeResult = - (*it)->Completed + Event->Completed ? ZE_RESULT_SUCCESS - : ZE_CALL_NOCHECK(zeEventQueryStatus, ((*it)->ZeEvent)); + : ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent)); + return ZeResult == ZE_RESULT_SUCCESS; + }; + // Handle in-order specially as we can just in few checks (with binary + // search) a completed event and then all events before it are also + // done. + if (isInOrderQueue()) { + size_t Bisect = EventList.size(); + size_t Iter = 0; + for (auto it = EventList.rbegin(); it != EventList.rend(); ++Iter) { + if (!EventCompleted(*it)) { + if (Bisect > 1 && Iter < 3) { // Heuristically limit by 3 checks + Bisect >>= 1; + it += Bisect; + continue; + } + break; + } + // Bulk move of event up to "it" to the list ready for cleanup + std::move(it, EventList.rend(), std::back_inserter(EventListToCleanup)); + EventList.erase(EventList.begin(), it.base()); + break; + } + return UR_RESULT_SUCCESS; + } + // For immediate commandlist reset only those events that have signalled. + for (auto it = EventList.begin(); it != EventList.end();) { // Break early as soon as we found first incomplete event because next // events are submitted even later. We are not trying to find all // completed events here because it may be costly. I.e. we are checking // only elements which are most likely completed because they were // submitted earlier. It is guaranteed that all events will be eventually // cleaned up at queue sync/release. - if (ZeResult == ZE_RESULT_NOT_READY) + if (!EventCompleted(*it)) break; - if (ZeResult != ZE_RESULT_SUCCESS) - return ze2urResult(ZeResult); - EventListToCleanup.push_back(std::move((*it))); it = EventList.erase(it); } From 7250d4610102365d04443d9e8f92d30ec596682c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:49:52 -0700 Subject: [PATCH 15/50] Port Add support to propagate compile flags to device backend compiler https://github.com/intel/llvm/pull/8763 Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 19 ++++++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 16 ++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 8 +++++ .../level_zero/ur_level_zero_platform.cpp | 30 +++++++++++++++++++ .../level_zero/ur_loader_interface.cpp | 1 + 6 files changed, 75 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c8b823d47602e..ee8b124a19d86 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -53,6 +53,25 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, } pi_result piPluginGetLastError(char **message) { + return pi2ur::piPluginGetLastError(message); +} + +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return '-ze-opt-disable' for frontend_option = -O0. +// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. +// Return '-ze-opt-level=2' for frontend_option = -O3. 
+pi_result piPluginGetBackendOption(pi_platform platform, + const char *frontend_option, + const char **backend_option) { + return pi2ur::piPluginGetBackendOption(platform, frontend_option, + backend_option); +} + +pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, + pi_uint32 NumEntries, pi_device *Devices, + pi_uint32 *NumDevices) { + return pi2ur::piDevicesGet(Platform, DeviceType, NumEntries, Devices, NumDevices); } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 059990a7906e3..3af474c33af6c 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 6bcd2a224d717cf904568d7311e84e2d057fcbef) + set(UNIFIED_RUNTIME_TAG b674dc2b59997d5b6cff462f8c33ee05a2ce0450) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 5a0d83352b146..8aef9ef74f0b6 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -506,6 +506,22 @@ inline pi_result piextPluginGetOpaqueData(void *opaque_data_param, return PI_ERROR_UNKNOWN; } +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return '-ze-opt-disable' for frontend_option = -O0. +// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. +// Return '-ze-opt-level=2' for frontend_option = -O3. +inline pi_result piPluginGetBackendOption(pi_platform Platform, + const char *FrontendOption, + const char **PlatformOption) { + + auto UrPlatform = reinterpret_cast(Platform); + HANDLE_ERRORS( + urPlatformGetBackendOption(UrPlatform, FrontendOption, PlatformOption)); + + return PI_SUCCESS; +} + // Platform /////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index b719273bf484e..3aa24712478ea 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -981,6 +981,13 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } +__SYCL_EXPORT pi_result piPluginGetBackendOption(pi_platform platform, + const char *frontend_option, + const char **backend_option) { + return pi2ur::piPluginGetBackendOption(platform, frontend_option, + backend_option); +} + // This interface is not in Unified Runtime currently __SYCL_EXPORT pi_result piTearDown(void *PluginParameter) { return pi2ur::piTearDown(PluginParameter); @@ -1025,6 +1032,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piextPlatformCreateWithNativeHandle) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) + _PI_API(piPluginGetBackendOption) _PI_API(piContextCreate) _PI_API(piContextRelease) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index c247b4d854047..61c021472bf3f 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -536,3 +536,33 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { DeviceCachePopulated = true; return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance. + const char *FrontendOption, ///< [in] string containing the frontend option. + const char * + *PlatformOption ///< [out] returns the correct platform specific + ///< compiler option based on the frontend option. +) { + using namespace std::literals; + if (FrontendOption == nullptr) { + return UR_RESULT_SUCCESS; + } + if (FrontendOption == ""sv) { + *PlatformOption = ""; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O0"sv) { + *PlatformOption = "-ze-opt-disable"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O1"sv || FrontendOption == "-O2"sv) { + *PlatformOption = "-ze-opt-level=1"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O3"sv) { + *PlatformOption = "-ze-opt-level=2"; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index e6164fe6519af..01b174aa93774 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -180,6 +180,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; pDdiTable->pfnCreateWithNativeHandle = urPlatformCreateWithNativeHandle; pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; return retVal; } From f569e92029a1f50313e2a2ce5be216fa8cfa671c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 17 Apr 2023 09:55:03 -0700 Subject: [PATCH 16/50] Update loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 3af474c33af6c..770b3d360f1f3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG b674dc2b59997d5b6cff462f8c33ee05a2ce0450) + set(UNIFIED_RUNTIME_TAG 91d194234710f40c7d4dc3670cca7abc2020682f) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime From 3d18689d7af9021d87a064103d36e2c2a25c4028 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:28:40 -0700 Subject: [PATCH 17/50] Some fixes for test_queue, and rebase loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- .../level_zero/ur_level_zero_device.cpp | 5 ++++ .../adapters/level_zero/ur_level_zero_mem.cpp | 5 +++- .../level_zero/ur_level_zero_queue.cpp | 27 ++++++++++++------- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 770b3d360f1f3..d66bfaba17bf1 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 91d194234710f40c7d4dc3670cca7abc2020682f) + set(UNIFIED_RUNTIME_TAG 586cc2d9a9612ad6886704aba7b38f1cd8ae610e) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 0a21858fc2842..28b91a729e328 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -764,6 +764,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + return ReturnValue(0); + } + default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 2a96cf1c9a8c9..73ac57cf9ec93 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1649,7 +1649,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // } - void *Host = Properties->pHost; + void *Host = nullptr; + if (Properties) { + Host = Properties->pHost; + } // If USM Import feature is enabled and hostptr is supplied, // import the hostptr if not already imported into USM. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 298b9d65467fb..df7d39be50cb5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -278,14 +278,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( ) { Context->Devices[0] = Device; + ur_queue_flags_t Flags{}; + if (Props) { + Flags = Props->flags; + } + int ForceComputeIndex = -1; // Use default/round-robin. 
- if (Props->pNext) { - const ur_base_properties_t *extendedDesc = - reinterpret_cast(Props->pNext); - if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { - const ur_queue_index_properties_t *IndexProperties = - reinterpret_cast(extendedDesc); - ForceComputeIndex = IndexProperties->computeIndex; + if (Props) { + if (Props->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(Props->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *IndexProperties = + reinterpret_cast(extendedDesc); + ForceComputeIndex = IndexProperties->computeIndex; + } } } @@ -316,9 +323,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( nullptr); try { - *Queue = new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, - Context, Device, true, Props->flags, - ForceComputeIndex); + *Queue = + new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Flags, ForceComputeIndex); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { From 4c6b101202659e023574e4b595c88f4bc9021003 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:36:19 -0700 Subject: [PATCH 18/50] Partially port native image handle support for LevelZero https://github.com/intel/llvm/pull/8603/files Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 44 +++--------- sycl/plugins/unified_runtime/pi2ur.hpp | 23 ++++++ .../unified_runtime/pi_unified_runtime.cpp | 8 +++ .../adapters/level_zero/ur_level_zero_mem.cpp | 72 ++++++++++++++++--- .../adapters/level_zero/ur_level_zero_mem.hpp | 7 ++ 5 files changed, 108 insertions(+), 46 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ee8b124a19d86..bc55890ada108 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -271,42 +271,6 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ownNativeHandle, Mem); } -pi_result piextMemImageCreateWithNativeHandle( - pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, - const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, - pi_mem *RetImage) { - - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - - ze_image_handle_t ZeHImage = pi_cast(NativeHandle); - - try { - auto ZePIImage = new _pi_image(Context, ZeHImage, OwnNativeHandle); - *RetImage = ZePIImage; - -#ifndef NDEBUG - ZeStruct ZeImageDesc; - pi_result DescriptionResult = - pi2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); - if (DescriptionResult != PI_SUCCESS) - return DescriptionResult; - - ZePIImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; -} - pi_result piProgramCreate(pi_context Context, const void *ILBytes, size_t Length, pi_program *Program) { return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); @@ -323,6 +287,14 @@ pi_result piProgramCreateWithBinary( Metadata, BinaryStatus, Program); } +pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *Img) { + return pi2ur::piextMemImageCreateWithNativeHandle( + NativeHandle, Context, OwnNativeHandle, ImageFormat, ImageDesc, Img); +} + pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count, const char **Strings, const size_t *Lengths, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 8aef9ef74f0b6..55dd4258e6f33 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2492,6 +2492,29 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, return PI_SUCCESS; } +inline pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *RetImage) { + + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + std::ignore = NativeHandle; + std::ignore = Context; + std::ignore = OwnNativeHandle; + std::ignore = ImageFormat; + std::ignore = ImageDesc; + std::ignore = RetImage; + + // ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + // HANDLE_ERRORS(urMemImageCreateWithNativeHandle(UrContext, OwnNativeHandle, + // HostPtr, UrMem)); + + return PI_SUCCESS; +} + inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, void *BufferCreateInfo, pi_mem *RetMem) { diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 3aa24712478ea..3cf3e10a21676 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -975,6 +975,14 @@ pi_result piextEnqueueDeviceGlobalVariableRead( NumEventsInWaitList, EventsWaitList, Event); } +pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *Img) { + return pi2ur::piextMemImageCreateWithNativeHandle( + NativeHandle, Context, OwnNativeHandle, ImageFormat, ImageDesc, Img); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 73ac57cf9ec93..5cf5cedc43a00 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1449,15 +1449,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_mem_flags_t Flags, ///< [in] allocation and 
usage information flags - const ur_image_format_t - *ImageFormat, ///< [in] pointer to image format specification - const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description - void *Host, ///< [in] pointer to the buffer data - ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created -) { +static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, + const ur_image_desc_t *ImageDesc, + ZeStruct &ZeImageDesc) { + ze_image_format_type_t ZeImageFormatType; size_t ZeImageFormatTypeSize; switch (ImageFormat->channelType) { @@ -1581,7 +1576,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_ERROR_INVALID_VALUE; } - ZeStruct ZeImageDesc; ZeImageDesc.arraylevels = ZeImageDesc.flags = 0; ZeImageDesc.type = ZeImageType; ZeImageDesc.format = ZeFormatDesc; @@ -1591,8 +1585,66 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( ZeImageDesc.arraylevels = ur_cast(ImageDesc->arraySize); ZeImageDesc.miplevels = ImageDesc->numMipLevel; + return UR_RESULT_SUCCESS; +} + +#if 0 +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. + ur_context_handle_t Context, ///< [in] handle of the context object + bool OwnNativeHandle, +/* + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description +*/ + ur_mem_handle_t + *Mem ///< [out] pointer to the handle of the mem object created. +) { + + std::shared_lock Lock(Context->Mutex); + + ze_image_handle_t ZeImage = ur_cast(NativeMem); + +try { + auto UrImage = + new _ur_image(ur_cast(Context), ZeImage, OwnNativeHandle); + *Mem = reinterpret_cast(UrImage); + +/* +#ifndef NDEBUG + ZeStruct ZeImageDesc; + UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); + + UrImage->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG +*/ + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; + +} +#endif + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description + void *Host, ///< [in] pointer to the buffer data + ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created +) { std::shared_lock Lock(Context->Mutex); + ZeStruct ZeImageDesc; + UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); + // Currently we have the "0" device in context with mutliple root devices to // own the image. 
// TODO: Implement explicit copying for acessing the image from other devices diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 575ab61959184..ecae9a0c1b11b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -193,6 +193,11 @@ struct _ur_image final : ur_mem_handle_t_ { _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage) : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} + _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage, + bool OwnNativeHandle) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage}, + OwnZeMemHandle{OwnNativeHandle} {} + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, ur_device_handle_t = nullptr) override { ZeHandle = reinterpret_cast(ZeImage); @@ -213,6 +218,8 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; + + bool OwnZeMemHandle = true; }; // Implements memory allocation via L0 RT for USM allocator interface. From 65cd8b2b80ba4bdb1c5ed82c5253357796db43fa Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:40:07 -0700 Subject: [PATCH 19/50] Port Report events as submitted, not running, until they are completed https://github.com/intel/llvm/pull/9094 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index f1eba37f331ec..237d27cf130a9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -337,8 +337,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( // Level Zero has a much more explicit notion of command submission than // OpenCL. It doesn't happen unless the user submits a command list. We've - // done it just above so the status is at least PI_EVENT_RUNNING. - uint32_t Result = ur_cast(UR_EVENT_STATUS_RUNNING); + // done it just above so the status is at least PI_EVENT_SUBMITTED. + // + // NOTE: We currently cannot tell if command is currently running, so + // it will always show up "submitted" before it is finally "completed". + // + uint32_t Result = ur_cast(UR_EVENT_STATUS_SUBMITTED); // Make sure that we query a host-visible event only. 
// If one wasn't yet created then don't create it here as well, and From d9da97b54ab588b3a95cc0dde930f6af870e3d67 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:15:31 -0700 Subject: [PATCH 20/50] Remove not needed code Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_common.hpp | 122 ------------------ .../level_zero/ur_level_zero_kernel.cpp | 4 +- 2 files changed, 2 insertions(+), 124 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index a26e3412fadca..599527ae34a2d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -362,128 +362,6 @@ const bool IndirectAccessTrackingEnabled = [] { extern const bool UseUSMAllocator; -// The getInfo*/ReturnHelper facilities provide shortcut way of -// writing return bytes for the various getInfo APIs. -template -ur_result_t urL0getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, - size_t value_size, Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return UR_RESULT_SUCCESS; -} - -template -ur_result_t urL0getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - std::ignore = value_size; - *static_cast(param_value) = value; - }; - - return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, - value, sizeof(T), assignment); -} - -template -ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, - value, array_length * sizeof(T), memcpy); -} - -template -ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - if (param_value) { - memset(param_value, 0, param_value_size); - for (uint32_t I = 0; I < array_length; I++) - ((RetType *)param_value)[I] = (RetType)value[I]; - } - if (param_value_size_ret) - *param_value_size_ret = array_length * sizeof(RetType); - return UR_RESULT_SUCCESS; -} - -template <> -inline ur_result_t -urL0getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, const char *value) { - return urL0getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - -class UrL0ReturnHelperBase { -public: - UrL0ReturnHelperBase(size_t param_value_size, void *param_value, - size_t *param_value_size_ret) - : param_value_size(param_value_size), param_value(param_value), - param_value_size_ret(param_value_size_ret) {} - - // A version where in/out info size is represented by a single pointer - // to a value which is updated on return - UrL0ReturnHelperBase(size_t *param_value_size, void *param_value) - : param_value_size(*param_value_size), param_value(param_value), - param_value_size_ret(param_value_size) {} - - // Scalar return value - template ur_result_t operator()(const T &t) { - return getInfo(param_value_size, param_value, 
param_value_size_ret, t); - } - - // Array return value - template ur_result_t operator()(const T *t, size_t s) { - return urL0getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - - // Array return value where element type is differrent from T - template - ur_result_t operator()(const T *t, size_t s) { - return urL0getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - -protected: - size_t param_value_size; - void *param_value; - size_t *param_value_size_ret; -}; - -// A version of return helper that returns pi_result and not ur_result_t -class UrL0ReturnHelper : public UrL0ReturnHelperBase { -public: - using UrL0ReturnHelperBase::UrL0ReturnHelperBase; - - template ur_result_t operator()(const T &t) { - return UrL0ReturnHelperBase::operator()(t); - } - // Array return value - template ur_result_t operator()(const T *t, size_t s) { - return UrL0ReturnHelperBase::operator()(t, s); - } - // Array return value where element type is differrent from T - template - ur_result_t operator()(const T *t, size_t s) { - return UrL0ReturnHelperBase::operator()(t, s); - } -}; - const bool ExposeCSliceInAffinityPartitioning = [] { char *UrRet = std::getenv("UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); char *PiRet = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 7a523e561967a..fae4734b031b0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -434,7 +434,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( ///< bytes of data being queried by propName. ) { - UrL0ReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); + UrReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); std::shared_lock Guard(Kernel->Mutex); switch (ParamName) { @@ -492,7 +492,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( size_t *ParamValueSizeRet ///< [out][optional] pointer to the actual size in ///< bytes of data being queried by propName. ) { - UrL0ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + UrReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); std::shared_lock Guard(Kernel->Mutex); switch (ParamName) { From eb9b4b7a81c2f3b026c884daa6bbf23451a882b6 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:19:13 -0700 Subject: [PATCH 21/50] Port Change the default to SYCL_PI_LEVEL_ZERO_USM_RESIDENT=2 https://github.com/intel/llvm/pull/9109 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 5cf5cedc43a00..7bdbe320bd0d1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2397,12 +2397,12 @@ ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, } enum class USMAllocationForceResidencyType { - // [Default] Do not force memory residency at allocation time. + // Do not force memory residency at allocation time. None = 0, // Force memory resident on the device of allocation at allocation time. 
// For host allocation force residency on all devices in a context. Device = 1, - // Force memory resident on all devices in the context with P2P + // [Default] Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 @@ -2412,7 +2412,7 @@ enum class USMAllocationForceResidencyType { static USMAllocationForceResidencyType USMAllocationForceResidency = [] { const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); if (!Str) - return USMAllocationForceResidencyType::None; + return USMAllocationForceResidencyType::P2PDevices; switch (std::atoi(Str)) { case 1: return USMAllocationForceResidencyType::Device; From df8ca5880b4b079ed783f6d0beb2150506026d4b Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:25:33 -0700 Subject: [PATCH 22/50] Port Add infrastructure to know backend of a platform https://github.com/intel/llvm/pull/9067 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 55dd4258e6f33..fd58feca8be29 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -491,11 +491,10 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, size_t SizeInOut = ParamValueSize; auto UrPlatform = reinterpret_cast(Platform); - HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, + HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, SizeInOut, ParamValue, ParamValueSizeRet)); ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); - return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 61c021472bf3f..7d0bef4cb84f5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -276,6 +276,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( // information>. Follow the same notation here. 
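// Illustrative sketch of the idiom behind the SYCL_PI_LEVEL_ZERO_USM_RESIDENT
// default change above: parse the environment variable once, at namespace
// scope, into an enum and fall back to the new P2PDevices default. Only the
// getenv() string matches the patch; the other names are placeholders.
#include <cstdlib>

namespace example {

enum class ForceResidency { None = 0, Device = 1, P2PDevices = 2 };

inline const ForceResidency UsmResidencyDefault = [] {
  const char *Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
  if (!Str)
    return ForceResidency::P2PDevices; // new default introduced by the patch
  switch (std::atoi(Str)) {
  case 1:
    return ForceResidency::Device;
  case 2:
    return ForceResidency::P2PDevices;
  default:
    return ForceResidency::None;       // any other value opts out
  }
}();

} // namespace example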
// return ReturnValue(Platform->ZeDriverApiVersion.c_str()); + case UR_PLATFORM_INFO_BACKEND: + return ReturnValue(UR_PLATFORM_BACKEND_LEVEL_ZERO); default: urPrint("urPlatformGetInfo: unrecognized ParamName\n"); return UR_RESULT_ERROR_INVALID_VALUE; From f8884d2452dcc07b9f771d8fbf976e52b3003e89 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 23 Apr 2023 23:07:54 -0700 Subject: [PATCH 23/50] Make USMFreeImpl static Signed-off-by: Jaime Arteaga --- .../adapters/level_zero/ur_level_zero_context.cpp | 13 +++++++++++++ .../ur/adapters/level_zero/ur_level_zero_event.hpp | 13 ------------- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 2 -- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 55354358124bd..b9184cb2555cc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -403,6 +403,19 @@ ur_result_t ur_context_handle_t_::finalize() { return UR_RESULT_SUCCESS; } +// Maximum number of events that can be present in an event ZePool is captured +// here. Setting it to 256 gave best possible performance for several +// benchmarks. +static const pi_uint32 MaxNumEventsPerPool = [] { + const auto MaxNumEventsPerPoolEnv = + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + uint32_t Result = + MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + if (Result <= 0) + Result = 256; + return Result; +}(); + ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, bool ProfilingEnabled) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 6acbd7459ef83..fcb3b156af0db 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -52,19 +52,6 @@ const bool ReuseDiscardedEvents = [] { return std::stoi(ReuseDiscardedEventsFlag) > 0; }(); -// Maximum number of events that can be present in an event ZePool is captured -// here. Setting it to 256 gave best possible performance for several -// benchmarks. -const uint32_t MaxNumEventsPerPool = [] { - const auto MaxNumEventsPerPoolEnv = - std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - uint32_t Result = - MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; - if (Result <= 0) - Result = 256; - return Result; -}(); - const bool FilterEventWaitList = [] { const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); const bool RetVal = Ret ? 
std::stoi(Ret) : 1; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 7bdbe320bd0d1..369a0eef74d9e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2338,7 +2338,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { +static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index ecae9a0c1b11b..9661063f0e5f2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -55,8 +55,6 @@ ur_result_t enqueueMemCopyRectHelper( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr); - // Exception type to pass allocation errors class UsmAllocationException { const ur_result_t Error; From 7f20421f6aa4908cee615ad66b5c938e1518e7de Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 23 Apr 2023 23:17:59 -0700 Subject: [PATCH 24/50] Port Enable immediate command lists by default https://github.com/intel/llvm/pull/8982 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_device.cpp | 148 +++++++++--------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 28b91a729e328..d32eb5bd03308 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -794,7 +794,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { // used. if (!EnvVar) { if (Device->ImmCommandListUsed) - return std::pair(-1, -1); // No copy engines can be used. + return std::pair(0, 0); // Only main copy engine will be used. return std::pair(0, INT_MAX); // All copy engines will be used. } std::string CopyEngineRange = EnvVar; @@ -845,8 +845,12 @@ ur_device_handle_t_::useImmediateCommandLists() { }(); if (ImmediateCommandlistsSetting == -1) - // Change this to PerQueue as default after more testing. + // Change this to PerQueue as default after more testing. +#ifdef _WIN32 return NotUsed; +#else + return isPVC() ? 
PerQueue : NotUsed; +#endif switch (ImmediateCommandlistsSetting) { case 0: return NotUsed; @@ -861,76 +865,6 @@ ur_device_handle_t_::useImmediateCommandLists() { ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { - uint32_t numQueueGroups = 0; - ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); - std::vector> - QueueGroupProperties(numQueueGroups); - ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); - - // Initialize ordinal and compute queue group properties - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - i; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties = QueueGroupProperties[i]; - break; - } - } - - // Reinitialize a sub-sub-device with its own ordinal, index. - // Our sub-sub-device representation is currently [Level-Zero sub-device - // handle + Level-Zero compute group/engine index]. Only the specified - // index queue will be used to submit work to the sub-sub-device. - if (SubSubDeviceOrdinal >= 0) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - SubSubDeviceOrdinal; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = - SubSubDeviceIndex; - } else { // Proceed with initialization for root and sub-device - // How is it possible that there are no "compute" capabilities? - if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < - 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - - if (CopyEngineRequested((ur_device_handle_t)this)) { - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (((QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && - (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { - if (QueueGroupProperties[i].numQueues == 1) { - QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::MainCopy].ZeProperties = - QueueGroupProperties[i]; - } else { - QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = - QueueGroupProperties[i]; - break; - } - } - } - if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) - urPrint("NOTE: main blitter/copy engine is not available\n"); - else - urPrint("NOTE: main blitter/copy engine is available\n"); - - if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) - urPrint("NOTE: link blitter/copy engines are not available\n"); - else - urPrint("NOTE: link blitter/copy engines are available\n"); - } - } - // Maintain various device properties cache. // Note that we just describe here how to compute the data. // The real initialization is upon first access. 
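// Illustrative sketch of the lazy device-property caching the comment above
// refers to: initialize() only records how to compute each property, and the
// Level Zero query runs on first access. The adapter's own ZeCache type
// differs in detail; LazyCache below is just a placeholder for the idea.
#include <functional>
#include <mutex>

namespace example {

template <class T> struct LazyCache {
  // Filled in during initialization with "how to compute the data".
  std::function<void(T &)> Compute;

  // First call runs Compute exactly once; later calls return the cached value.
  T &get() {
    std::call_once(Flag, [this] { Compute(Value); });
    return Value;
  }

private:
  std::once_flag Flag;
  T Value{};
};

} // namespace example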
@@ -1002,6 +936,76 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, ZeEventsScope = DeviceEventsSetting; } + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); + std::vector> + QueueGroupProperties(numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); + + // Initialize ordinal and compute queue group properties + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + i; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties = QueueGroupProperties[i]; + break; + } + } + + // Reinitialize a sub-sub-device with its own ordinal, index. + // Our sub-sub-device representation is currently [Level-Zero sub-device + // handle + Level-Zero compute group/engine index]. Only the specified + // index queue will be used to submit work to the sub-sub-device. + if (SubSubDeviceOrdinal >= 0) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + SubSubDeviceOrdinal; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = + SubSubDeviceIndex; + } else { // Proceed with initialization for root and sub-device + // How is it possible that there are no "compute" capabilities? + if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < + 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (CopyEngineRequested((ur_device_handle_t)this)) { + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (((QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && + (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { + if (QueueGroupProperties[i].numQueues == 1) { + QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::MainCopy].ZeProperties = + QueueGroupProperties[i]; + } else { + QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = + QueueGroupProperties[i]; + break; + } + } + } + if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) + urPrint("NOTE: main blitter/copy engine is not available\n"); + else + urPrint("NOTE: main blitter/copy engine is available\n"); + + if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) + urPrint("NOTE: link blitter/copy engines are not available\n"); + else + urPrint("NOTE: link blitter/copy engines are available\n"); + } + } + return UR_RESULT_SUCCESS; } From caf288876f3aa2d89bdfd74321ae4be5ab4361a4 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 25 Apr 2023 20:49:31 +0000 Subject: [PATCH 25/50] Fix failing device CTS (#2) * Fix failing device CTS Signed-off-by: Brandon Yates --- sycl/plugins/unified_runtime/pi2ur.hpp | 6 +- .../level_zero/ur_level_zero_device.cpp | 59 +++++++++++-------- sycl/plugins/unified_runtime/ur/ur.hpp | 3 - 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index fd58feca8be29..033920f73321f 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -858,7 +858,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, 
pi_device_info ParamName, InfoType = (ur_device_info_t)UR_DEVICE_INFO_DEVICE_ID; break; case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_FREE_MEMORY; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_GLOBAL_MEM_FREE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: InfoType = (ur_device_info_t)UR_DEVICE_INFO_MEMORY_CLOCK_RATE; @@ -870,7 +870,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; break; case PI_DEVICE_INFO_GPU_SLICES: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_SLICES; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_GPU_EU_SLICES; break; case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; @@ -879,7 +879,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU; break; case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; break; case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: InfoType = (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index d32eb5bd03308..02c3232176177 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -89,7 +89,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ur_device_handle_t Device, ///< [in] handle of the device instance ur_device_info_t ParamName, ///< [in] type of the info to retrieve - size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. + size_t propSize, ///< [in] the number of bytes pointed to by ParamValue. void *ParamValue, ///< [out][optional] array of bytes holding the info. 
///< If propSize is not equal to or greater than the real ///< number of bytes needed to return the info then the @@ -130,8 +130,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(UUID, sizeof(UUID)); } case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); + return ReturnValue(static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -195,9 +195,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(uint32_t{1}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(uint32_t{1}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { uint32_t MaxComputeUnits = Device->ZeDeviceProperties->numEUsPerSubslice * @@ -255,17 +255,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); case UR_DEVICE_INFO_IMAGE_SUPPORTED: return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); + static_cast(Device->ZeDeviceImageProperties->maxImageDims1D > 0)); case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); + return ReturnValue(static_cast((Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); case UR_DEVICE_INFO_AVAILABLE: - return ReturnValue(uint32_t{ZeDevice ? true : false}); + return ReturnValue(static_cast(ZeDevice ? true : false)); case UR_DEVICE_INFO_VENDOR: // TODO: Level-Zero does not return vendor's name at the moment // only the ID. 
return ReturnValue("Intel(R) Corporation"); case UR_DEVICE_INFO_DRIVER_VERSION: + case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); case UR_DEVICE_INFO_VERSION: return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); @@ -346,7 +347,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: return ReturnValue(""); case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(uint32_t{true}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: return ReturnValue( size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); @@ -363,12 +364,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(ur_device_exec_capability_flag_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(uint32_t{true}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC}); + return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_ECC)); case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); + return ReturnValue(static_cast(Device->ZeDeviceProperties->timerResolution)); case UR_DEVICE_INFO_LOCAL_MEM_TYPE: return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: @@ -402,7 +403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue( uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - uint64_t SingleFPValue = 0; + ur_device_fp_capability_flags_t SingleFPValue = 0; ze_device_fp_flags_t ZeSingleFPCapabilities = Device->ZeDeviceModuleProperties->fp32flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { @@ -427,10 +428,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{SingleFPValue}); + return ReturnValue(SingleFPValue); } case UR_DEVICE_INFO_HALF_FP_CONFIG: { - uint64_t HalfFPValue = 0; + ur_device_fp_capability_flags_t HalfFPValue = 0; ze_device_fp_flags_t ZeHalfFPCapabilities = Device->ZeDeviceModuleProperties->fp16flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { @@ -454,10 +455,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{HalfFPValue}); + return ReturnValue(HalfFPValue); } case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - uint64_t DoubleFPValue = 0; + ur_device_fp_capability_flags_t DoubleFPValue = 0; ze_device_fp_flags_t ZeDoubleFPCapabilities = Device->ZeDeviceModuleProperties->fp64flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { @@ -482,7 +483,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{DoubleFPValue}); + return ReturnValue(DoubleFPValue); } case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); @@ -537,7 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported 
yet. Needs to be updated after support is added. - return ReturnValue(uint32_t{false}); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the @@ -617,7 +618,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(AddressBuffer); } - case UR_EXT_DEVICE_INFO_FREE_MEMORY: { + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", UR_RESULT_SUCCESS); @@ -698,11 +699,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceProperties->numSlices; return ReturnValue(uint32_t{count}); } + case UR_DEVICE_INFO_GPU_EU_SLICES: { + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); + } case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: return ReturnValue( uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - case UR_EXT_DEVICE_INFO_GPU_SLICES: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: return ReturnValue( uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); @@ -710,7 +712,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); - case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: // currently not supported in level zero runtime return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_BFLOAT16: { @@ -766,8 +768,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - return ReturnValue(0); + ur_queue_flags_t queue_flags = 0; + return ReturnValue(queue_flags); } + case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: { + return ReturnValue(static_cast(0)); //__read_write attribute currently undefinde in opencl + } + default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index f8578cafbb3b8..c03ba316860f7 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -29,17 +29,14 @@ const int UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D = UR_EXT_DEVICE_INFO_END - 2; // UR_EXT_DEVICE_INFO_END - 3; // const int ZER_EXT_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = // UR_EXT_DEVICE_INFO_END - 4; -const int UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH = UR_EXT_DEVICE_INFO_END - 6; const int UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU = UR_EXT_DEVICE_INFO_END - 7; const int UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = UR_EXT_DEVICE_INFO_END - 8; -const int UR_EXT_DEVICE_INFO_GPU_SLICES = UR_EXT_DEVICE_INFO_END - 9; // const int UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = // UR_EXT_DEVICE_INFO_END - 10; const int UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH = UR_EXT_DEVICE_INFO_END - 11; // const int ZER_EXT_DEVICE_INFO_MEMORY_CLOCK_RATE = UR_EXT_DEVICE_INFO_END - // 12; -const int UR_EXT_DEVICE_INFO_FREE_MEMORY = UR_EXT_DEVICE_INFO_END - 13; // const int ZER_EXT_DEVICE_INFO_DEVICE_ID = UR_EXT_DEVICE_INFO_END - 14; // const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = // UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; From 4ae54329c2ac86217b6e53c29526e2b25a044342 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 25 Apr 2023 19:00:48 -0700 Subject: [PATCH 
26/50] Fix formatting Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_device.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 02c3232176177..4c15b61e19a6f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -130,8 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(UUID, sizeof(UUID)); } case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(static_cast(Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); + return ReturnValue( + static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -254,11 +255,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue( uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); case UR_DEVICE_INFO_IMAGE_SUPPORTED: - return ReturnValue( - static_cast(Device->ZeDeviceImageProperties->maxImageDims1D > 0)); + return ReturnValue(static_cast( + Device->ZeDeviceImageProperties->maxImageDims1D > 0)); case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(static_cast((Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); + return ReturnValue( + static_cast((Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); case UR_DEVICE_INFO_AVAILABLE: return ReturnValue(static_cast(ZeDevice ? 
true : false)); case UR_DEVICE_INFO_VENDOR: @@ -366,10 +368,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_ENDIAN_LITTLE: return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC)); + return ReturnValue(static_cast( + Device->ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_ECC)); case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(static_cast(Device->ZeDeviceProperties->timerResolution)); + return ReturnValue( + static_cast(Device->ZeDeviceProperties->timerResolution)); case UR_DEVICE_INFO_LOCAL_MEM_TYPE: return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: @@ -772,10 +775,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(queue_flags); } case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: { - return ReturnValue(static_cast(0)); //__read_write attribute currently undefinde in opencl + return ReturnValue(static_cast( + 0)); //__read_write attribute currently undefinde in opencl } - default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); From 4aedd0fa072aa4ca6514962bf9864227a16d15ed Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 26 Apr 2023 16:38:00 -0700 Subject: [PATCH 27/50] Update loader with CreateWithNativeHandle updates Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 37 ++++++++++++------- .../level_zero/ur_level_zero_context.cpp | 3 +- .../level_zero/ur_level_zero_event.cpp | 4 +- .../level_zero/ur_level_zero_kernel.cpp | 4 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 4 +- .../level_zero/ur_level_zero_queue.cpp | 15 +++++--- 7 files changed, 44 insertions(+), 25 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index d66bfaba17bf1..2cda6e083f6c4 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 586cc2d9a9612ad6886704aba7b38f1cd8ae610e) + set(UNIFIED_RUNTIME_TAG af603dbef47adb62aafbf067931f0c9358a4cac6) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 033920f73321f..ebca37978a696 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1129,8 +1129,10 @@ inline pi_result piextContextCreateWithNativeHandle( ur_context_handle_t *UrContext = reinterpret_cast(RetContext); + ur_context_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; HANDLE_ERRORS(urContextCreateWithNativeHandle( - NativeContext, NumDevices, UrDevices, OwnNativeHandle, UrContext)); + NativeContext, NumDevices, UrDevices, &Properties, UrContext)); return PI_SUCCESS; } @@ -1281,13 +1283,14 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, ur_context_handle_t UrContext = reinterpret_cast(Context); - + ur_device_handle_t UrDevice = reinterpret_cast(Device); ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); 
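// Illustrative sketch of the ownership plumbing used by the
// CreateWithNativeHandle thunks in this patch: the OwnNativeHandle boolean now
// travels into UR inside a small properties struct (isNativeHandleOwned) rather
// than being written onto the returned handle afterwards. The types below are
// placeholders, not the real ur_*_native_properties_t definitions.
#include <cstdint>

namespace example {

struct NativeProperties {
  bool isNativeHandleOwned = false; // should the adapter destroy the native handle?
};

struct InteropHandle {
  std::uintptr_t Native = 0;
  bool OwnsNative = false;
};

// Create-with-native-handle entry point: ownership travels with the request,
// so the callee can record it at construction time.
inline InteropHandle createWithNative(std::uintptr_t Native,
                                      const NativeProperties *Props) {
  InteropHandle H;
  H.Native = Native;
  H.OwnsNative = Props ? Props->isNativeHandleOwned : false;
  return H;
}

} // namespace example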
ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS( - urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, UrQueue)); - (*UrQueue)->OwnNativeHandle = OwnNativeHandle; + ur_queue_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, + UrDevice, &Properties, UrQueue)); return PI_SUCCESS; } @@ -1785,9 +1788,10 @@ piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, ur_program_handle_t UrProgram = reinterpret_cast(Program); ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); - HANDLE_ERRORS(urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, - UrProgram, UrKernel)); - (*UrKernel)->OwnNativeHandle = OwnNativeHandle; + ur_kernel_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urKernelCreateWithNativeHandle( + UrNativeKernel, UrContext, UrProgram, &Properties, UrKernel)); return PI_SUCCESS; } @@ -2621,8 +2625,10 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ur_mem_handle_t *UrMem = reinterpret_cast(Mem); // TODO: Pass OwnNativeHandle to the output parameter // while we get it in interface - HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, - OwnNativeHandle, UrMem)); + ur_mem_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS( + urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); return PI_SUCCESS; } @@ -3456,7 +3462,9 @@ inline pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { ur_event_handle_t *UrEvent = reinterpret_cast(RetEvent); // pass null for the hNativeHandle to use urEventCreateWithNativeHandle // as urEventCreate - HANDLE_ERRORS(urEventCreateWithNativeHandle(nullptr, UrContext, UrEvent)); + ur_event_native_properties_t Properties{}; + HANDLE_ERRORS( + urEventCreateWithNativeHandle(nullptr, UrContext, &Properties, UrEvent)); return PI_SUCCESS; } @@ -3477,9 +3485,10 @@ inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(Context); ur_event_handle_t *UrEvent = reinterpret_cast(Event); - HANDLE_ERRORS( - urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); - (*UrEvent)->OwnNativeHandle = OwnNativeHandle; + ur_event_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urEventCreateWithNativeHandle(UrNativeKernel, UrContext, + &Properties, UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index b9184cb2555cc..c0c4f9958aaf0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -139,10 +139,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( ur_native_handle_t NativeContext, ///< [in] the native handle of the context. uint32_t NumDevices, const ur_device_handle_t *Devices, - bool OwnNativeHandle, + const ur_context_native_properties_t *Properties, ur_context_handle_t *Context ///< [out] pointer to the handle of the context object created. 
) { + bool OwnNativeHandle = Properties->isNativeHandleOwned; try { ze_context_handle_t ZeContext = reinterpret_cast(NativeContext); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 237d27cf130a9..197ec1e8f70aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -625,6 +625,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_native_handle_t NativeEvent, ///< [in] the native handle of the event. ur_context_handle_t Context, ///< [in] handle of the context object + const ur_event_native_properties_t *Properties, ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. ) { @@ -643,7 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_event_handle_t_ *UrEvent{}; try { UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */, - Context, UR_EXT_COMMAND_TYPE_USER, true); + Context, UR_EXT_COMMAND_TYPE_USER, + Properties->isNativeHandleOwned); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index fae4734b031b0..be7e88ddb6923 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -714,14 +714,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. ur_context_handle_t Context, ///< [in] handle of the context object ur_program_handle_t Program, + const ur_kernel_native_properties_t *Properties, ur_kernel_handle_t * RetKernel ///< [out] pointer to the handle of the kernel object created. ) { ze_kernel_handle_t ZeKernel = ur_cast(NativeKernel); ur_kernel_handle_t_ *Kernel = nullptr; try { - Kernel = new ur_kernel_handle_t_(ZeKernel, - false, // OwnZeKernel + Kernel = new ur_kernel_handle_t_(ZeKernel, Properties->isNativeHandleOwned, Context); *RetKernel = reinterpret_cast(Kernel); } catch (const std::bad_alloc &) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 369a0eef74d9e..62f1bf19bf24e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1858,10 +1858,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. ur_context_handle_t Context, ///< [in] handle of the context object - bool OwnNativeHandle, + const ur_mem_native_properties_t *Properties, ur_mem_handle_t *Mem ///< [out] pointer to the handle of the mem object created. 
) { + bool OwnNativeHandle = Properties->isNativeHandleOwned; + std::shared_lock Lock(Context->Mutex); // Get base of the allocation diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index df7d39be50cb5..dc4801f6628b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -474,6 +474,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, /// + const ur_queue_native_properties_t *Properties, /// ur_queue_handle_t *RetQueue ///< [out] pointer to the handle of the queue object created. ) { @@ -492,13 +494,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_platform_handle_t Platform{}; UR_CALL(urPlatformGet(NumEntries, &Platform, nullptr)); - ur_device_handle_t Device; - UR_CALL( - urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &Device, nullptr)); + ur_device_handle_t UrDevice = Device; + if (UrDevice == nullptr) { + UR_CALL(urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &UrDevice, + nullptr)); + } try { - ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, - Context, Device, false); + ur_queue_handle_t_ *Queue = + new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, Context, UrDevice, + Properties->isNativeHandleOwned); *RetQueue = reinterpret_cast(Queue); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; From ff370d2cd2190ad2752629dc7f442afde27c2b04 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 26 Apr 2023 17:36:48 -0700 Subject: [PATCH 28/50] Move some code to L0 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 19 ---- .../level_zero/ur_level_zero_common.cpp | 17 ++++ .../level_zero/ur_level_zero_common.hpp | 92 +++++++++++++++--- sycl/plugins/unified_runtime/ur/ur.cpp | 17 ---- sycl/plugins/unified_runtime/ur/ur.hpp | 95 +++---------------- 5 files changed, 111 insertions(+), 129 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index ebca37978a696..138ab10e06ff2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -7,7 +7,6 @@ //===------------------------------------------------------------------===// #pragma once -#include "ur/adapters/level_zero/ur_level_zero.hpp" #include "ur_api.h" #include #include @@ -382,24 +381,6 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } -struct _pi_context : ur_context_handle_t_ {}; - -struct _pi_queue : ur_context_handle_t_ {}; - -struct _pi_program : ur_program_handle_t_ {}; - -struct _pi_kernel : ur_kernel_handle_t_ {}; - -struct _pi_mem : ur_mem_handle_t_ {}; - -struct _pi_buffer : ur_mem_handle_t_ {}; - -struct _pi_image : ur_mem_handle_t_ {}; - -struct _pi_sampler : ur_sampler_handle_t_ {}; - -struct _pi_event : ur_event_handle_t_ {}; - namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp index ca5259a80abcd..4603fbe741354 100644 
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp @@ -264,3 +264,20 @@ template <> zes_structure_type_t getZesStructureType() { template <> zes_structure_type_t getZesStructureType() { return ZES_STRUCTURE_TYPE_MEM_PROPERTIES; } + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code) { + assert(strlen(message) <= MaxMessageSize); + strcpy(ErrorMessage, message); + ErrorMessageCode = error_code; +} + +ur_result_t zerPluginGetLastError(char **message) { + *message = &ErrorMessage[0]; + return ErrorMessageCode; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 599527ae34a2d..491c58e668763 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -26,19 +26,6 @@ struct _ur_platform_handle_t; -template To ur_cast(From Value) { - // TODO: see if more sanity checks are possible. - assert(sizeof(From) == sizeof(To)); - return (To)(Value); -} - -template <> uint32_t inline ur_cast(uint64_t Value) { - // Cast value and check that we don't lose any information. - uint32_t CastedValue = (uint32_t)(Value); - assert((uint64_t)CastedValue == Value); - return CastedValue; -} - static auto getUrResultString = [](ur_result_t Result) { switch (Result) { case UR_RESULT_SUCCESS: @@ -332,6 +319,76 @@ ur_result_t ze2urResult(ze_result_t ZeResult); #define ZE_CALL_NOCHECK(ZeName, ZeArgs) \ ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false) +// This wrapper around std::atomic is created to limit operations with reference +// counter and to make allowed operations more transparent in terms of +// thread-safety in the plugin. increment() and load() operations do not need a +// mutex guard around them since the underlying data is already atomic. +// decrementAndTest() method is used to guard a code which needs to be +// executed when object's ref count becomes zero after release. This method also +// doesn't need a mutex guard because decrement operation is atomic and only one +// thread can reach ref count equal to zero, i.e. only a single thread can pass +// through this check. +struct ReferenceCounter { + ReferenceCounter() : RefCount{1} {} + + // Reset the counter to the initial value. + void reset() { RefCount = 1; } + + // Used when retaining an object. + void increment() { RefCount++; } + + // Supposed to be used in pi*GetInfo* methods where ref count value is + // requested. + uint32_t load() { return RefCount.load(); } + + // This method allows to guard a code which needs to be executed when object's + // ref count becomes zero after release. It is important to notice that only a + // single thread can pass through this check. This is true because of several + // reasons: + // 1. Decrement operation is executed atomically. + // 2. It is not allowed to retain an object after its refcount reaches zero. + // 3. It is not allowed to release an object more times than the value of + // the ref count. + // 2. and 3. 
basically means that we can't use an object at all as soon as its + // refcount reaches zero. Using this check guarantees that code for deleting + // an object and releasing its resources is executed once by a single thread + // and we don't need to use any mutexes to guard access to this object in the + // scope after this check. Of course if we access another objects in this code + // (not the one which is being deleted) then access to these objects must be + // guarded, for example with a mutex. + bool decrementAndTest() { return --RefCount == 0; } + +private: + std::atomic RefCount; +}; + +// Base class to store common data +struct _ur_object { + _ur_object() : RefCount{} {} + + // Must be atomic to prevent data race when incrementing/decrementing. + ReferenceCounter RefCount; + + // This mutex protects accesses to all the non-const member variables. + // Exclusive access is required to modify any of these members. + // + // To get shared access to the object in a scope use std::shared_lock: + // std::shared_lock Lock(Obj->Mutex); + // To get exclusive access to the object in a scope use std::scoped_lock: + // std::scoped_lock Lock(Obj->Mutex); + // + // If several pi objects are accessed in a scope then each object's mutex must + // be locked. For example, to get write access to Obj1 and Obj2 and read + // access to Obj3 in a scope use the following approach: + // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); + // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); + ur_shared_mutex Mutex; + + // Indicates if we own the native handle or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnNativeHandle = false; +}; + // Record for a memory allocation. This structure is used to keep information // for each memory allocation. 
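Editor's aside (not part of the patch): the decrementAndTest() contract described in the comment block above is easiest to see in a minimal release-path sketch. releaseHelper below is a hypothetical name used only for illustration; real adapters release concrete handle types, and error handling is elided.

// Illustrative only, assuming the ReferenceCounter/_ur_object definitions above.
static ur_result_t releaseHelper(_ur_object *Obj) { // hypothetical helper
  if (!Obj->RefCount.decrementAndTest())
    return UR_RESULT_SUCCESS; // other references are still alive
  // Exactly one thread can reach this point, so no mutex is needed to guard
  // the teardown of Obj itself (other objects touched here still need locks).
  delete Obj;
  return UR_RESULT_SUCCESS;
}

(End of aside; the memory-allocation record introduced by the comment above follows.)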
struct MemAllocRecord : _ur_object { @@ -403,3 +460,12 @@ extern std::map *ZeCallCount; constexpr char ZE_SUPPORTED_EXTENSIONS[] = "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups " "cl_intel_subgroups_short cl_intel_required_subgroup_size "; + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code); diff --git a/sycl/plugins/unified_runtime/ur/ur.cpp b/sycl/plugins/unified_runtime/ur/ur.cpp index 67a6ac4bb391d..0db860fbd0daa 100644 --- a/sycl/plugins/unified_runtime/ur/ur.cpp +++ b/sycl/plugins/unified_runtime/ur/ur.cpp @@ -28,20 +28,3 @@ std::vector *PiPlatformsCache = new std::vector; SpinLock *PiPlatformsCacheMutex = new SpinLock; bool PiPlatformCachePopulated = false; - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -ur_result_t zerPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c03ba316860f7..790c2fd39bd00 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -20,6 +21,19 @@ #include +template To ur_cast(From Value) { + // TODO: see if more sanity checks are possible. + assert(sizeof(From) == sizeof(To)); + return (To)(Value); +} + +template <> uint32_t inline ur_cast(uint64_t Value) { + // Cast value and check that we don't lose any information. + uint32_t CastedValue = (uint32_t)(Value); + assert((uint64_t)CastedValue == Value); + return CastedValue; +} + // TODO: promote all of the below extensions to the Unified Runtime // and get rid of these ZER_EXT constants. const int UR_EXT_DEVICE_INFO_END = UR_DEVICE_INFO_FORCE_UINT32; @@ -193,76 +207,6 @@ template struct ZeCache : private T { } }; -// This wrapper around std::atomic is created to limit operations with reference -// counter and to make allowed operations more transparent in terms of -// thread-safety in the plugin. increment() and load() operations do not need a -// mutex guard around them since the underlying data is already atomic. -// decrementAndTest() method is used to guard a code which needs to be -// executed when object's ref count becomes zero after release. This method also -// doesn't need a mutex guard because decrement operation is atomic and only one -// thread can reach ref count equal to zero, i.e. only a single thread can pass -// through this check. -struct ReferenceCounter { - ReferenceCounter() : RefCount{1} {} - - // Reset the counter to the initial value. - void reset() { RefCount = 1; } - - // Used when retaining an object. - void increment() { RefCount++; } - - // Supposed to be used in pi*GetInfo* methods where ref count value is - // requested. 
- uint32_t load() { return RefCount.load(); } - - // This method allows to guard a code which needs to be executed when object's - // ref count becomes zero after release. It is important to notice that only a - // single thread can pass through this check. This is true because of several - // reasons: - // 1. Decrement operation is executed atomically. - // 2. It is not allowed to retain an object after its refcount reaches zero. - // 3. It is not allowed to release an object more times than the value of - // the ref count. - // 2. and 3. basically means that we can't use an object at all as soon as its - // refcount reaches zero. Using this check guarantees that code for deleting - // an object and releasing its resources is executed once by a single thread - // and we don't need to use any mutexes to guard access to this object in the - // scope after this check. Of course if we access another objects in this code - // (not the one which is being deleted) then access to these objects must be - // guarded, for example with a mutex. - bool decrementAndTest() { return --RefCount == 0; } - -private: - std::atomic RefCount; -}; - -// Base class to store common data -struct _ur_object { - _ur_object() : RefCount{} {} - - // Must be atomic to prevent data race when incrementing/decrementing. - ReferenceCounter RefCount; - - // This mutex protects accesses to all the non-const member variables. - // Exclusive access is required to modify any of these members. - // - // To get shared access to the object in a scope use std::shared_lock: - // std::shared_lock Lock(Obj->Mutex); - // To get exclusive access to the object in a scope use std::scoped_lock: - // std::scoped_lock Lock(Obj->Mutex); - // - // If several pi objects are accessed in a scope then each object's mutex must - // be locked. For example, to get write access to Obj1 and Obj2 and read - // access to Obj3 in a scope use the following approach: - // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); - // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); - ur_shared_mutex Mutex; - - // Indicates if we own the native handle or it came from interop that - // asked to not transfer the ownership to SYCL RT. 
- bool OwnNativeHandle = false; -}; - // Helper for one-liner validation #define UR_ASSERT(condition, error) \ if (!(condition)) \ @@ -385,13 +329,4 @@ class UrReturnHelper { size_t param_value_size; void *param_value; size_t *param_value_size_ret; -}; - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -extern thread_local ur_result_t ErrorMessageCode; -extern thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); +}; \ No newline at end of file From 151d5096802446dfe46987526c58139a8661b9b3 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 27 Apr 2023 01:08:41 -0700 Subject: [PATCH 29/50] Add more code for piextMemImageCreateWithNativeHandle Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 206 ++++++++++-------- .../adapters/level_zero/ur_level_zero_mem.cpp | 152 ++++++++----- 2 files changed, 214 insertions(+), 144 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 138ab10e06ff2..602b23329ec4d 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2258,100 +2258,70 @@ inline pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, return PI_SUCCESS; } -inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, - const pi_image_format *ImageFormat, - const pi_image_desc *ImageDesc, void *HostPtr, - pi_mem *RetImage) { - - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - - ur_context_handle_t UrContext = - reinterpret_cast(Context); - - ur_mem_flags_t UrFlags{}; - if (Flags & PI_MEM_FLAGS_ACCESS_RW) { - UrFlags |= UR_MEM_FLAG_READ_WRITE; - } - if (Flags & PI_MEM_ACCESS_READ_ONLY) { - UrFlags |= UR_MEM_FLAG_READ_ONLY; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { - UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; - } +static void pi2urImageDesc(const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, + ur_image_format_t *UrFormat, + ur_image_desc_t *UrDesc) { - ur_image_format_t UrFormat{}; switch (ImageFormat->image_channel_data_type) { case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; + UrFormat->channelType = 
UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT_101010: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; break; } case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; break; } case PI_IMAGE_CHANNEL_TYPE_FLOAT: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; break; } default: { @@ -2360,113 +2330,153 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, } switch (ImageFormat->image_channel_order) { case PI_IMAGE_CHANNEL_ORDER_A: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_A; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_A; break; } case PI_IMAGE_CHANNEL_ORDER_R: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_R; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_R; break; } case PI_IMAGE_CHANNEL_ORDER_RG: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; break; } case PI_IMAGE_CHANNEL_ORDER_RA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; break; } case PI_IMAGE_CHANNEL_ORDER_RGB: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; break; } case PI_IMAGE_CHANNEL_ORDER_RGBA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; break; } case PI_IMAGE_CHANNEL_ORDER_BGRA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; break; } case PI_IMAGE_CHANNEL_ORDER_ARGB: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; break; } case PI_IMAGE_CHANNEL_ORDER_ABGR: { - UrFormat.channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + UrFormat->channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; break; } case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; break; } case PI_IMAGE_CHANNEL_ORDER_LUMINANCE: { - 
UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; break; } case PI_IMAGE_CHANNEL_ORDER_Rx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; break; } case PI_IMAGE_CHANNEL_ORDER_RGx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; break; } case PI_IMAGE_CHANNEL_ORDER_RGBx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; break; } case PI_IMAGE_CHANNEL_ORDER_sRGBA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; break; } default: { die("piMemImageCreate: unsuppported image_channel_data_type."); } } - ur_image_desc_t UrDesc{}; - UrDesc.arraySize = ImageDesc->image_array_size; - UrDesc.depth = ImageDesc->image_depth; - UrDesc.height = ImageDesc->image_height; - UrDesc.numMipLevel = ImageDesc->num_mip_levels; - UrDesc.numSamples = ImageDesc->num_samples; - UrDesc.rowPitch = ImageDesc->image_row_pitch; - UrDesc.slicePitch = ImageDesc->image_slice_pitch; + + UrDesc->arraySize = ImageDesc->image_array_size; + UrDesc->depth = ImageDesc->image_depth; + UrDesc->height = ImageDesc->image_height; + UrDesc->numMipLevel = ImageDesc->num_mip_levels; + UrDesc->numSamples = ImageDesc->num_samples; + UrDesc->rowPitch = ImageDesc->image_row_pitch; + UrDesc->slicePitch = ImageDesc->image_slice_pitch; switch (ImageDesc->image_type) { case PI_MEM_TYPE_BUFFER: { - UrDesc.type = UR_MEM_TYPE_BUFFER; + UrDesc->type = UR_MEM_TYPE_BUFFER; break; } case PI_MEM_TYPE_IMAGE2D: { - UrDesc.type = UR_MEM_TYPE_IMAGE2D; + UrDesc->type = UR_MEM_TYPE_IMAGE2D; break; } case PI_MEM_TYPE_IMAGE3D: { - UrDesc.type = UR_MEM_TYPE_IMAGE3D; + UrDesc->type = UR_MEM_TYPE_IMAGE3D; break; } case PI_MEM_TYPE_IMAGE2D_ARRAY: { - UrDesc.type = UR_MEM_TYPE_IMAGE2D_ARRAY; + UrDesc->type = UR_MEM_TYPE_IMAGE2D_ARRAY; break; } case PI_MEM_TYPE_IMAGE1D: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D; + UrDesc->type = UR_MEM_TYPE_IMAGE1D; break; } case PI_MEM_TYPE_IMAGE1D_ARRAY: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D_ARRAY; + UrDesc->type = UR_MEM_TYPE_IMAGE1D_ARRAY; break; } case PI_MEM_TYPE_IMAGE1D_BUFFER: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D_BUFFER; + UrDesc->type = UR_MEM_TYPE_IMAGE1D_BUFFER; break; } default: { die("piMemImageCreate: unsuppported image_type."); } } - UrDesc.width = ImageDesc->image_width; - UrDesc.arraySize = ImageDesc->image_array_size; - UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc->width = ImageDesc->image_width; + UrDesc->arraySize = ImageDesc->image_array_size; + UrDesc->arraySize = ImageDesc->image_array_size; +} + +inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, void *HostPtr, + pi_mem *RetImage) { + + // TODO: implement read-only, write-only + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + die("piMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; 
+ } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_image_format_t UrFormat{}; + ur_image_desc_t UrDesc{}; + pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); + // TODO: UrDesc doesn't have something for ImageDesc->buffer ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); @@ -2485,16 +2495,28 @@ inline pi_result piextMemImageCreateWithNativeHandle( PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - std::ignore = NativeHandle; - std::ignore = Context; - std::ignore = OwnNativeHandle; - std::ignore = ImageFormat; - std::ignore = ImageDesc; - std::ignore = RetImage; - - // ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); - // HANDLE_ERRORS(urMemImageCreateWithNativeHandle(UrContext, OwnNativeHandle, - // HostPtr, UrMem)); + ur_native_handle_t UrNativeMem = + reinterpret_cast(NativeHandle); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + ur_mem_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + + ur_image_format_t UrFormat{}; + ur_image_desc_t UrDesc{}; + pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); + + ur_mem_image_native_properties_t ImageProperties{}; + ImageProperties.stype = UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES; + ImageProperties.pImageFormat = &UrFormat; + ImageProperties.pImageDesc = &UrDesc; + Properties.pNext = &ImageProperties; + + HANDLE_ERRORS( + urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 62f1bf19bf24e..6dc21eab41d4f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1588,49 +1588,6 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, return UR_RESULT_SUCCESS; } -#if 0 -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( - ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. - ur_context_handle_t Context, ///< [in] handle of the context object - bool OwnNativeHandle, -/* - const ur_image_format_t - *ImageFormat, ///< [in] pointer to image format specification - const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description -*/ - ur_mem_handle_t - *Mem ///< [out] pointer to the handle of the mem object created. -) { - - std::shared_lock Lock(Context->Mutex); - - ze_image_handle_t ZeImage = ur_cast(NativeMem); - -try { - auto UrImage = - new _ur_image(ur_cast(Context), ZeImage, OwnNativeHandle); - *Mem = reinterpret_cast(UrImage); - -/* -#ifndef NDEBUG - ZeStruct ZeImageDesc; - UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); - - UrImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG -*/ - - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; - -} -#endif - UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags @@ -1657,8 +1614,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage)); try { - auto UrImage = - new _ur_image(ur_cast(Context), ZeImage); + auto UrImage = new _ur_image(Context, ZeImage); *Mem = reinterpret_cast(UrImage); #ifndef NDEBUG @@ -1684,6 +1640,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_SUCCESS; } +#if 0 +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. + ur_context_handle_t Context, ///< [in] handle of the context object. + const ur_mem_native_properties_t * + Properties, ///< [in][optional] pointer to native memory creation properties. + ur_mem_handle_t + *Mem ///< [out] pointer to handle of memory object created. +) { + std::shared_lock Lock(Context->Mutex); + + ze_image_handle_t ZeHImage = ur_cast(NativeMem); + + _ur_image *Image = nullptr; + try { + Image = new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); + *Mem = reinterpret_cast(Image); + +#ifndef NDEBUG + ZeStruct ZeImageDesc; + if (Properties->pNext != nullptr) { + ur_base_desc_t *BaseDesc = reinterpret_cast(Properties->pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { + ur_mem_image_native_properties_t *ImageProperties = reinterpret_cast(Properties->pNext); + ur_result_t Res = ur2zeImageDesc(ImageProperties->pImageFormat, + ImageProperties->pImageDesc, + ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; + } + } + } + Image->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} +#endif + UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags @@ -1792,12 +1795,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( if (Mem->isImage()) { char *ZeHandleImage; - UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); - auto ZeResult = ZE_CALL_NOCHECK( - zeImageDestroy, (ur_cast(ZeHandleImage))); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return ze2urResult(ZeResult); + auto Image = static_cast<_ur_image *>(Mem); + if (Image->OwnNativeHandle) { + UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); + auto ZeResult = ZE_CALL_NOCHECK( + zeImageDestroy, (ur_cast(ZeHandleImage))); + // Gracefully handle the case that L0 was already unloaded. 
+ if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } } else { auto Buffer = reinterpret_cast<_ur_buffer *>(Mem); Buffer->free(); @@ -1866,6 +1872,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( std::shared_lock Lock(Context->Mutex); + // Check if this is an image + { + if (Properties->pNext != nullptr) { + ur_base_desc_t *BaseDesc = + reinterpret_cast(Properties->pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { + ur_mem_image_native_properties_t *ImageProperties = + reinterpret_cast( + Properties->pNext); + + ze_image_handle_t ZeHImage = ur_cast(NativeMem); + + _ur_image *Image = nullptr; + try { + Image = + new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); + *Mem = reinterpret_cast(Image); + +#ifndef NDEBUG + ZeStruct ZeImageDesc; + ur_result_t Res = + ur2zeImageDesc(ImageProperties->pImageFormat, + ImageProperties->pImageDesc, ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; + } + Image->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; + } + } + } + // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -1965,7 +2012,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( size_t *PropSizeRet ///< [out][optional] pointer to the actual size in ///< bytes of data queried by pMemInfo. ) { - UR_ASSERT(!Memory->isImage(), UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(MemInfoType == UR_MEM_INFO_CONTEXT || !Memory->isImage(), + UR_RESULT_ERROR_INVALID_VALUE); auto Buffer = reinterpret_cast<_ur_buffer *>(Memory); std::shared_lock Lock(Buffer->Mutex); From 68413643bea41bfeb081d23a79f6884340bf68e1 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 27 Apr 2023 11:58:29 -0700 Subject: [PATCH 30/50] Port Fix handling of mem_channel buffer property https://github.com/intel/llvm/pull/9203 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index c0c4f9958aaf0..cd4513011565d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -118,6 +118,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(Capabilities); } + case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(pi_bool{false}); default: // TODO: implement other parameters die("urGetContextInfo: unsuppported ParamName."); From a928c7508161cbf37dc598d7ba7092597b451984 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 1 May 2023 18:25:56 -0700 Subject: [PATCH 31/50] Port Implement proper queries for aspect::ext_oneapi_srgb https://github.com/intel/llvm/pull/9243 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_device.cpp | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 
cd4513011565d..01f36c1814d66 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -118,8 +118,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(Capabilities); } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); + default: // TODO: implement other parameters die("urGetContextInfo: unsuppported ParamName."); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 4c15b61e19a6f..8e522602146d8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -768,6 +768,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(capabilities); } + case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(pi_bool{false}); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(pi_bool{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { From 5d6d006eafb008eaf6eeac8d7dddb19773b96d34 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 1 May 2023 20:59:34 -0700 Subject: [PATCH 32/50] Port Avoid leak of active barriers' events https://github.com/intel/llvm/pull/9275 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_queue.cpp | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index dc4801f6628b8..d91c1fd414181 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1268,31 +1268,30 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; }; - // Do nothing if the queue is empty - if (!LastCommandEvent) - return UR_RESULT_SUCCESS; - - // For in-order queue just wait for the last command. - // If event is discarded then it can be in reset state or underlying level - // zero handle can have device scope, so we can't synchronize the last event. - if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); - } else { - // Otherwise sync all L0 queues/immediate command-lists. - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { - for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + if (LastCommandEvent) { + // For in-order queue just wait for the last command. + // If event is discarded then it can be in reset state or underlying level + // zero handle can have device scope, so we can't synchronize the last + // event. + if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + } else { + // Otherwise sync all L0 queues/immediate command-lists. 
+ for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } } } } + LastCommandEvent = nullptr; } - LastCommandEvent = nullptr; // With the entire queue synchronized, the active barriers must be done so we // can remove them. From 943afe7e7f3a97a1e9d719b034f737cf5e9b0bef Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 3 May 2023 13:54:53 -0700 Subject: [PATCH 33/50] Rebase loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 28 ++-- .../adapters/level_zero/ur_level_zero_mem.cpp | 149 ++++++++---------- .../adapters/level_zero/ur_level_zero_mem.hpp | 10 +- .../level_zero/ur_loader_interface.cpp | 4 +- sycl/plugins/unified_runtime/ur/ur.hpp | 5 - 6 files changed, 86 insertions(+), 112 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2cda6e083f6c4..e9dfeaa6e6fc9 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG af603dbef47adb62aafbf067931f0c9358a4cac6) + set(UNIFIED_RUNTIME_TAG 8cb3cb2891148a14ef84e840398a1ae8cd84cd6f) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 602b23329ec4d..7444981e836d0 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2509,14 +2509,8 @@ inline pi_result piextMemImageCreateWithNativeHandle( ur_image_desc_t UrDesc{}; pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); - ur_mem_image_native_properties_t ImageProperties{}; - ImageProperties.stype = UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES; - ImageProperties.pImageFormat = &UrFormat; - ImageProperties.pImageDesc = &UrDesc; - Properties.pNext = &ImageProperties; - - HANDLE_ERRORS( - urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); + HANDLE_ERRORS(urMemImageCreateWithNativeHandle( + UrNativeMem, UrContext, &UrFormat, &UrDesc, &Properties, UrMem)); return PI_SUCCESS; } @@ -2630,8 +2624,8 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, // while we get it in interface ur_mem_native_properties_t Properties{}; Properties.isNativeHandleOwned = OwnNativeHandle; - HANDLE_ERRORS( - urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); + HANDLE_ERRORS(urMemBufferCreateWithNativeHandle(UrNativeMem, UrContext, + &Properties, UrMem)); return PI_SUCCESS; } @@ -2669,22 +2663,28 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; + ur_usm_host_desc_t UsmHostDesc{}; + UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; if (Properties) { if (Properties[0] == PI_MEM_ALLOC_FLAGS) { if (Properties[1] == 
PI_MEM_ALLOC_WRTITE_COMBINED) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_WRITE_COMBINED; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_WRITE_COMBINED; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_INITIAL_PLACEMENT; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + UsmHostDesc.flags |= UR_USM_HOST_MEM_FLAG_INITIAL_PLACEMENT; } if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; } } } + UsmDeviceDesc.pNext = &UsmHostDesc; + USMDesc.pNext = &UsmDeviceDesc; USMDesc.align = Alignment; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 6dc21eab41d4f..ed30bf3c9e69b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1640,15 +1640,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_SUCCESS; } -#if 0 UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. ur_context_handle_t Context, ///< [in] handle of the context object. - const ur_mem_native_properties_t * - Properties, ///< [in][optional] pointer to native memory creation properties. - ur_mem_handle_t - *Mem ///< [out] pointer to handle of memory object created. -) { + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification. + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description. + const ur_mem_native_properties_t + *Properties, ///< [in][optional] pointer to native memory creation + ///< properties. + ur_mem_handle_t *Mem) { std::shared_lock Lock(Context->Mutex); ze_image_handle_t ZeHImage = ur_cast(NativeMem); @@ -1660,19 +1661,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( #ifndef NDEBUG ZeStruct ZeImageDesc; - if (Properties->pNext != nullptr) { - ur_base_desc_t *BaseDesc = reinterpret_cast(Properties->pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { - ur_mem_image_native_properties_t *ImageProperties = reinterpret_cast(Properties->pNext); - ur_result_t Res = ur2zeImageDesc(ImageProperties->pImageFormat, - ImageProperties->pImageDesc, - ZeImageDesc); - if (Res != UR_RESULT_SUCCESS) { - delete Image; - *Mem = nullptr; - return Res; - } - } + ur_result_t Res = ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; } Image->ZeImageDesc = ZeImageDesc; #endif // !NDEBUG @@ -1682,10 +1675,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - + return UR_RESULT_SUCCESS; } -#endif UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object @@ -1861,58 +1853,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( - ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. 
- ur_context_handle_t Context, ///< [in] handle of the context object - const ur_mem_native_properties_t *Properties, +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. + ur_context_handle_t Context, ///< [in] handle of the context object. + const ur_mem_native_properties_t + *Properties, ///< [in][optional] pointer to native memory creation + ///< properties. ur_mem_handle_t - *Mem ///< [out] pointer to the handle of the mem object created. + *Mem ///< [out] pointer to handle of buffer memory object created. ) { bool OwnNativeHandle = Properties->isNativeHandleOwned; std::shared_lock Lock(Context->Mutex); - // Check if this is an image - { - if (Properties->pNext != nullptr) { - ur_base_desc_t *BaseDesc = - reinterpret_cast(Properties->pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { - ur_mem_image_native_properties_t *ImageProperties = - reinterpret_cast( - Properties->pNext); - - ze_image_handle_t ZeHImage = ur_cast(NativeMem); - - _ur_image *Image = nullptr; - try { - Image = - new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); - *Mem = reinterpret_cast(Image); - -#ifndef NDEBUG - ZeStruct ZeImageDesc; - ur_result_t Res = - ur2zeImageDesc(ImageProperties->pImageFormat, - ImageProperties->pImageDesc, ZeImageDesc); - if (Res != UR_RESULT_SUCCESS) { - delete Image; - *Mem = nullptr; - return Res; - } - Image->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; - } - } - } - // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -2075,8 +2028,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_flags_t *USMFlag = &USMDesc->flags; - std::ignore = USMFlag; + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is @@ -2105,9 +2058,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Align & (Align - 1)) != 0)) { - ur_usm_flags_t Properties{}; - ur_result_t Res = - USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); + ur_usm_host_mem_flags_t Flags{}; + ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2158,8 +2110,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_flags_t *USMProp = &USMDesc->flags; - std::ignore = USMProp; + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; ur_platform_handle_t Plt = Device->Platform; @@ -2236,11 +2188,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ) { std::ignore = Pool; - const ur_usm_flags_t *Properties = &USMDesc->flags; uint32_t Alignment = USMDesc->align; + ur_usm_host_mem_flags_t UsmHostFlags{}; + // See if the memory is going to be read-only on the device. 
- bool DeviceReadOnly = *Properties & UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + bool DeviceReadOnly = false; + ur_usm_device_mem_flags_t UsmDeviceFlags{}; + + void *pNext = const_cast(USMDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { + const ur_usm_device_desc_t *UsmDeviceDesc = + reinterpret_cast(pNext); + UsmDeviceFlags = UsmDeviceDesc->flags; + } + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { + const ur_usm_host_desc_t *UsmHostDesc = + reinterpret_cast(pNext); + UsmHostFlags = UsmHostDesc->flags; + } + pNext = const_cast(BaseDesc->pNext); + } + DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. @@ -2271,9 +2243,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl( - RetMem, Context, Device, const_cast(Properties), Size, - Alignment); + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, + &UsmDeviceFlags, Size, Alignment); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2423,16 +2394,18 @@ void USMMemoryAllocBase::deallocate(void *Ptr) { ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, Alignment); } ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - ur_usm_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; - return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, - Alignment); + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_host_desc_t UsmHostDesc{}; + return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, + &UsmHostDesc.flags, Size, Alignment); } ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, @@ -2536,7 +2509,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_device_mem_flags_t *Flags, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeDesc; @@ -2562,8 +2535,10 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_flags_t *, - size_t Size, uint32_t Alignment) { + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -2593,7 +2568,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_host_mem_flags_t *Flags, size_t Size, uint32_t Alignment) { // TODO: translate PI properties 
to Level Zero flags ZeStruct ZeHostDesc; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 9661063f0e5f2..fa0aa966688d5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -284,15 +284,17 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_device_mem_flags_t *Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_flags_t *, - size_t Size, uint32_t Alignment); + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_host_mem_flags_t *Flags, size_t Size, uint32_t Alignment); // If indirect access tracking is not enabled then this functions just performs diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 01b174aa93774..0d37c805bfb2b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -159,7 +159,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( pDdiTable->pfnRelease = urMemRelease; pDdiTable->pfnBufferPartition = urMemBufferPartition; pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urMemCreateWithNativeHandle; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnImageGetInfo = urMemImageGetInfo; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 790c2fd39bd00..e5bd87108e824 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -71,11 +71,6 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; -const int UR_EXT_USM_MEM_FLAG_WRITE_COMBINED = 1 << 27; -const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; -const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; -const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; - const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); From e1e5f631d3f08db43ead7a6c2463730e9f0e52d2 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 3 May 2023 20:04:45 -0700 Subject: [PATCH 34/50] fix interop image Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_common.hpp | 8 +++----- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 9 ++++----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 491c58e668763..f3a8ba48b2eba 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -393,17 +393,15 @@ struct _ur_object { // for each memory allocation. struct MemAllocRecord : _ur_object { MemAllocRecord(ur_context_handle_t Context, bool OwnZeMemHandle = true) - : Context(Context), OwnZeMemHandle(OwnZeMemHandle) {} + : Context(Context) { + OwnNativeHandle = OwnZeMemHandle; + } // Currently kernel can reference memory allocations from different contexts // and we need to know the context of a memory allocation when we release it // in piKernelRelease. // TODO: this should go away when memory isolation issue is fixed in the Level // Zero runtime. ur_context_handle_t Context; - - // Indicates if we own the native memory handle or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeMemHandle; }; extern usm_settings::USMAllocatorConfig USMAllocatorConfigInstance; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 197ec1e8f70aa..446caee4a7e51 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -813,7 +813,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked) { for (auto &MemAlloc : Kernel->MemAllocs) { // std::pair *, Hash USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, - MemAlloc->second.OwnZeMemHandle); + MemAlloc->second.OwnNativeHandle); } Kernel->MemAllocs.clear(); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index fa0aa966688d5..12f782d862ac7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -192,9 +192,10 @@ struct _ur_image final : ur_mem_handle_t_ { : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage, - bool OwnNativeHandle) - : ur_mem_handle_t_(UrContext), ZeImage{ZeImage}, - OwnZeMemHandle{OwnNativeHandle} {} + bool OwnZeMemHandle) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} { + OwnNativeHandle = OwnZeMemHandle; + } virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, ur_device_handle_t = nullptr) override { @@ -216,8 +217,6 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; - - bool OwnZeMemHandle = true; }; // Implements memory allocation via L0 RT for USM allocator interface. 
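Editor's note between patches 34 and 35: the interop-ownership convention that the preceding hunks converge on (record Properties->isNativeHandleOwned at creation, destroy the Level Zero handle on release only when owned) can be summarized in a short sketch. adoptNativeImage and releaseImage are hypothetical helpers used only for illustration; the real code paths are urMemImageCreateWithNativeHandle and urMemRelease shown above, and error handling is elided here.

// Sketch only, assuming the _ur_image constructor and the OwnNativeHandle
// member introduced by the patches above.
static _ur_image *adoptNativeImage(ur_context_handle_t Context,
                                   ze_image_handle_t ZeImage,
                                   const ur_mem_native_properties_t *Properties) {
  bool Owned = Properties ? Properties->isNativeHandleOwned : false;
  return new _ur_image(Context, ZeImage, Owned);
}

static void releaseImage(_ur_image *Image) {
  if (Image->OwnNativeHandle)
    zeImageDestroy(Image->ZeImage); // skip destroy for non-owned interop handles
  delete Image;
}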
From 5eaf7f37112244f2b86bd38dbfcb378f99eb9fb7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 8 May 2023 18:47:20 -0700 Subject: [PATCH 35/50] Port Port PI L0 environment variables to UR L0 https://github.com/intel/llvm/pull/9300 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 21 ++++++-- .../level_zero/ur_level_zero_device.cpp | 18 ++++--- .../level_zero/ur_level_zero_device.hpp | 2 +- .../level_zero/ur_level_zero_event.cpp | 7 ++- .../level_zero/ur_level_zero_event.hpp | 12 +++-- .../adapters/level_zero/ur_level_zero_mem.cpp | 15 ++++-- .../adapters/level_zero/ur_level_zero_mem.hpp | 4 +- .../level_zero/ur_level_zero_queue.cpp | 50 ++++++++++++------- sycl/plugins/unified_runtime/ur/ur.cpp | 1 - .../ur/usm_allocator_config.cpp | 2 - 10 files changed, 89 insertions(+), 43 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 01f36c1814d66..4c998fb6294ea 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -73,8 +73,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( // Due to a bug with 2D memory copy to and from non-USM pointers, this option is // disabled by default. static const bool UseMemcpy2DOperations = [] { + const char *UrRet = std::getenv("UR_L0_USE_NATIVE_USM_MEMCPY2D"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); const char *UseMemcpy2DOperationsFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!UseMemcpy2DOperationsFlag) return false; return std::stoi(UseMemcpy2DOperationsFlag) > 0; @@ -409,8 +411,10 @@ ur_result_t ur_context_handle_t_::finalize() { // here. Setting it to 256 gave best possible performance for several // benchmarks. static const pi_uint32 MaxNumEventsPerPool = [] { - const auto MaxNumEventsPerPoolEnv = - std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *MaxNumEventsPerPoolEnv = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); uint32_t Result = MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; if (Result <= 0) @@ -531,8 +535,12 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { // If number of events in the immediate command list exceeds this threshold then // cleanup process for those events is executed. static const size_t ImmCmdListsEventCleanupThreshold = [] { - const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( + const char *UrRet = + std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + const char *PiRet = std::getenv( "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + const char *ImmCmdListsEventCleanupThresholdStr = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); static constexpr int Default = 1000; if (!ImmCmdListsEventCleanupThresholdStr) return Default; @@ -549,8 +557,11 @@ static const size_t ImmCmdListsEventCleanupThreshold = [] { // Get value of the threshold for number of active command lists allowed before // we start heuristically cleaning them up. 
static const size_t CmdListsCleanupThreshold = [] { - const char *CmdListsCleanupThresholdStr = + const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); + const char *CmdListsCleanupThresholdStr = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); static constexpr int Default = 20; if (!CmdListsCleanupThresholdStr) return Default; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 8e522602146d8..5fac6f1e4d77a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -792,7 +792,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_SUCCESS; } -// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or +// UR_L0_USE_COPY_ENGINE can be set to an integer value, or // a pair of integer values of the form "lower_index:upper_index". // Here, the indices point to copy engines in a list of all available copy // engines. @@ -802,7 +802,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // available copy engines can be used. const std::pair getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { - static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); + static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); // If the environment variable is not set, no copy engines are used when // immediate commandlists are being used. For standard commandlists all are // used. @@ -825,7 +827,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { - urPrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " + urPrint("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " "default set.\n"); LowerCopyEngineIndex = 0; UpperCopyEngineIndex = INT_MAX; @@ -843,16 +845,20 @@ bool CopyEngineRequested(const ur_device_handle_t &Device) { // The default is standard commandlists. Setting 1 or 2 specifies use of // immediate commandlists. Note: when immediate commandlists are used then // device-only events must be either AllHostVisible or OnDemandHostVisibleProxy. -// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS). +// (See env var UR_L0_DEVICE_SCOPE_EVENTS). // Get value of immediate commandlists env var setting or -1 if unset ur_device_handle_t_::ImmCmdlistMode ur_device_handle_t_::useImmediateCommandLists() { // If immediate commandlist setting is not explicitly set, then use the device // default. + // TODO: confirm this is good once make_queue revert is added static const int ImmediateCommandlistsSetting = [] { - const char *ImmediateCommandlistsSettingStr = + const char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); + const char *ImmediateCommandlistsSettingStr = + UrRet ? UrRet : (PiRet ? 
PiRet : nullptr); if (!ImmediateCommandlistsSettingStr) return -1; return std::stoi(ImmediateCommandlistsSettingStr); @@ -1122,7 +1128,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. // However, if - // SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that // still expose CSlices in partitioning by affinity domain for compatibility // reasons. if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 09e942a6441b8..8aff6f170127f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -137,7 +137,7 @@ struct ur_device_handle_t_ : _ur_object { ImmCmdlistMode ImmCommandListUsed{}; // Scope of events used for events on the device - // Can be adjusted with SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS + // Can be adjusted with UR_L0_DEVICE_SCOPE_EVENTS // for non-immediate command lists EventsScope ZeEventsScope = AllHostVisible; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 446caee4a7e51..d39c40982bd6f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -28,8 +28,11 @@ void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { // This is an experimental option that allows the use of multiple command lists // when submitting barriers. The default is 0. static const bool UseMultipleCmdlistBarriers = [] { - const char *UseMultipleCmdlistBarriersFlag = + const char *UrRet = std::getenv("UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + const char *UseMultipleCmdlistBarriersFlag = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!UseMultipleCmdlistBarriersFlag) return true; return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; @@ -162,7 +165,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // If we have a list of events to make the barrier from, then we can create a // barrier on these and use the resulting event as our future barrier. // We use the same approach if - // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a + // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a // positive value. // We use the same approach if we have in-order queue because every command // depends on previous one, so we don't need to insert barrier to multiple diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index fcb3b156af0db..9e129adb0fb7e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -35,8 +35,10 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, // This is an experimental option that allows to disable caching of events in // the context. 
const bool DisableEventsCaching = [] { + const char *UrRet = std::getenv("UR_L0_DISABLE_EVENTS_CACHING"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); const char *DisableEventsCachingFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!DisableEventsCachingFlag) return false; return std::stoi(DisableEventsCachingFlag) != 0; @@ -45,8 +47,10 @@ const bool DisableEventsCaching = [] { // This is an experimental option that allows reset and reuse of uncompleted // events in the in-order queue with discard_events property. const bool ReuseDiscardedEvents = [] { + const char *UrRet = std::getenv("UR_L0_REUSE_DISCARDED_EVENTS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); const char *ReuseDiscardedEventsFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!ReuseDiscardedEventsFlag) return true; return std::stoi(ReuseDiscardedEventsFlag) > 0; @@ -236,8 +240,10 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, // Get value of device scope events env var setting or default setting static const EventsScope DeviceEventsSetting = [] { + char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); + char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); const char *DeviceEventsSettingStr = - std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (DeviceEventsSettingStr) { // Override the default if user has explicitly chosen the events scope. switch (std::stoi(DeviceEventsSettingStr)) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index ed30bf3c9e69b..43c0d691f5ad0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -18,8 +18,10 @@ // Default to using compute engine for fill operation, but allow to // override this with an environment variable. static bool PreferCopyEngine = [] { - const char *Env = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); - return Env ? std::stoi(Env) != 0 : false; + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); }(); // Helper function to check if a pointer is a device pointer. @@ -2433,7 +2435,9 @@ enum class USMAllocationForceResidencyType { // Returns the desired USM residency setting static USMAllocationForceResidencyType USMAllocationForceResidency = [] { - const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!Str) return USMAllocationForceResidencyType::P2PDevices; switch (std::atoi(Str)) { @@ -2861,8 +2865,11 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, // cross-tile traffic. // static const bool SingleRootDeviceBufferMigration = [] { - const char *EnvStr = + const char *UrRet = + std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); + const char *EnvStr = UrRet ? 
UrRet : (PiRet ? PiRet : nullptr); if (EnvStr) return (std::stoi(EnvStr) != 0); // The default is to migrate normally, which may not always be the diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 12f782d862ac7..e9ad0d49bbdbb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -31,8 +31,10 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) const bool UseCopyEngineForD2DCopy = [] { - const char *CopyEngineForD2DCopy = + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + const char *CopyEngineForD2DCopy = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); }(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index d91c1fd414181..fe81cd1e2a3a0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -260,7 +260,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( // paths be less likely affected. // static bool doEagerInit = [] { - const char *EagerInit = std::getenv("SYCL_EAGER_INIT"); + const char *UrRet = std::getenv("UR_L0_EAGER_INIT"); + const char *PiRet = std::getenv("SYCL_EAGER_INIT"); + const char *EagerInit = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return EagerInit ? std::atoi(EagerInit) != 0 : false; }(); @@ -549,8 +551,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( // TODO: this currently exhibits some issues in the driver, so // we control this with an env var. Remove this control when // we settle one way or the other. + const char *UrRet = std::getenv("UR_L0_QUEUE_FINISH_HOLD_LOCK"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK"); static bool HoldLock = - std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr; + UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0); if (!HoldLock) { Lock.unlock(); } @@ -623,9 +628,16 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { zeCommandListBatchConfig Config{}; // default initialize // Default value of 0. This specifies to use dynamic batch size adjustment. - const auto BatchSizeStr = - (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE") - : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + const char *UrRet = nullptr; + const char *PiRet = nullptr; + if (IsCopy) { + UrRet = std::getenv("UR_L0_COPY_BATCH_SIZE"); + PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE"); + } else { + UrRet = std::getenv("UR_L0_BATCH_SIZE"); + PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + } + const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (BatchSizeStr) { pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); // Level Zero may only support a limted number of commands per command @@ -658,10 +670,9 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { Val = std::stoi(BatchConfig.substr(Pos)); } catch (...) 
{ if (IsCopy) - urPrint( - "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n"); + urPrint("UR_L0_COPY_BATCH_SIZE: failed to parse value\n"); else - urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n"); + urPrint("UR_L0_BATCH_SIZE: failed to parse value\n"); break; } switch (Ord) { @@ -684,27 +695,26 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { die("Unexpected batch config"); } if (IsCopy) - urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param " + urPrint("UR_L0_COPY_BATCH_SIZE: dynamic batch param " "#%d: %d\n", (int)Ord, (int)Val); else - urPrint( - "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n", - (int)Ord, (int)Val); + urPrint("UR_L0_BATCH_SIZE: dynamic batch param #%d: %d\n", (int)Ord, + (int)Val); }; } else { // Negative batch sizes are silently ignored. if (IsCopy) - urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n"); + urPrint("UR_L0_COPY_BATCH_SIZE: ignored negative value\n"); else - urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n"); + urPrint("UR_L0_BATCH_SIZE: ignored negative value\n"); } } return Config; } -// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in +// UR_L0_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in // which case all compute commands will be submitted to the command-queue // with the given index in the compute command group. If it is instead set // to negative then all available compute engines may be used. @@ -712,8 +722,9 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { // The default value is "0". // static const std::pair getRangeOfAllowedComputeEngines() { - static const char *EnvVar = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + const char *UrRet = std::getenv("UR_L0_USE_COMPUTE_ENGINE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); // If the environment variable is not set only use "0" CCS for now. // TODO: allow all CCSs when HW support is complete. if (!EnvVar) @@ -1769,8 +1780,11 @@ ur_result_t ur_queue_handle_t_::insertStartBarrierIfDiscardEventsMode( // available in the device, in Level Zero plugin for copy operations submitted // to an in-order queue. The default is 1. static const bool UseCopyEngineForInOrderQueue = [] { - const char *CopyEngineForInOrderQueue = + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); + const char *CopyEngineForInOrderQueue = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); return (!CopyEngineForInOrderQueue || (std::stoi(CopyEngineForInOrderQueue) != 0)); }(); diff --git a/sycl/plugins/unified_runtime/ur/ur.cpp b/sycl/plugins/unified_runtime/ur/ur.cpp index 0db860fbd0daa..319e95bde6e72 100644 --- a/sycl/plugins/unified_runtime/ur/ur.cpp +++ b/sycl/plugins/unified_runtime/ur/ur.cpp @@ -15,7 +15,6 @@ bool PrintTrace = [] { const char *UrRet = std::getenv("UR_L0_TRACE"); const char *PiRet = std::getenv("SYCL_PI_TRACE"); const char *Trace = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - const int TraceValue = Trace ? 
std::stoi(Trace) : 0; if (TraceValue == -1 || TraceValue == 2) { // Means print all traces return true; diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp b/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp index 30b67945ad28a..8d77a67b1a6e4 100644 --- a/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp +++ b/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp @@ -184,7 +184,6 @@ USMAllocatorConfig::USMAllocatorConfig() { const char *UrRet = std::getenv("UR_L0_USM_ALLOCATOR"); const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR"); const char *PoolParams = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (PoolParams != nullptr) { std::string Params(PoolParams); size_t Pos = Params.find(';'); @@ -229,7 +228,6 @@ USMAllocatorConfig::USMAllocatorConfig() { const char *PoolTraceVal = UrRetUsmAllocator ? UrRetUsmAllocator : (PiRetUsmAllocator ? PiRetUsmAllocator : nullptr); - int PoolTrace = 0; if (PoolTraceVal != nullptr) { PoolTrace = std::atoi(PoolTraceVal); From a47af47361a879c9dd58f94910a6913070f446d7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 11 May 2023 15:25:15 -0700 Subject: [PATCH 36/50] fix implementation of urKernelSetArgPointer Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_kernel.cpp | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index be7e88ddb6923..da98f12f2580a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -608,22 +608,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ///< holding the argument value. If null then argument ///< value is considered null. ) { - std::ignore = Kernel; - std::ignore = ArgIndex; - std::ignore = ArgValue; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( - ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object - uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] - size_t ArgSize, ///< [in] size of argument type - const void *ArgValue ///< [in][optional] SVM pointer to memory location - ///< holding the argument value. If null then argument - ///< value is considered null. 
-) { - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, ArgSize, ArgValue)); + UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); return UR_RESULT_SUCCESS; } From ce3aeb176734593d551eea905359080b8a407733 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 12 May 2023 15:38:19 +0100 Subject: [PATCH 37/50] Bump UR + various pi2ur fixes to allow rebasing cuda adapter (#8) --- sycl/plugins/unified_runtime/CMakeLists.txt | 6 +- sycl/plugins/unified_runtime/pi2ur.hpp | 362 ++++++++++++++---- .../level_zero/ur_level_zero_context.cpp | 2 +- .../level_zero/ur_level_zero_device.cpp | 23 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 41 +- .../level_zero/ur_level_zero_queue.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 57 +-- 7 files changed, 354 insertions(+), 139 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index e9dfeaa6e6fc9..0b4bcef273b73 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -3,8 +3,8 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 8cb3cb2891148a14ef84e840398a1ae8cd84cd6f) + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG 0125b2b42aea73c350f7961cd68e0f1f94cc1238) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime @@ -37,7 +37,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D # Restore original flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") - add_library(UnifiedRuntimeLoader ALIAS loader) + add_library(UnifiedRuntimeLoader ALIAS ur_loader) set(UNIFIED_RUNTIME_SOURCE_DIR ${unified-runtime_SOURCE_DIR} CACHE PATH "Path to Unified Runtime Headers") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 7444981e836d0..2f3b6211cb46b 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -381,6 +381,76 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } +// Translate UR device info values to PI info values +inline pi_result ur2piUSMAllocInfoValue(ur_usm_alloc_info_t ParamName, + size_t ParamValueSizePI, + size_t *ParamValueSizeUR, + void *ParamValue) { + ConvertHelper Value(ParamValueSizePI, ParamValue, ParamValueSizeUR); + + if (ParamName == UR_USM_ALLOC_INFO_TYPE) { + auto ConvertFunc = [](ur_usm_type_t UrValue) { + switch (UrValue) { + case UR_USM_TYPE_UNKNOWN: + return PI_MEM_TYPE_UNKNOWN; + case UR_USM_TYPE_HOST: + return PI_MEM_TYPE_HOST; + case UR_USM_TYPE_DEVICE: + return PI_MEM_TYPE_DEVICE; + case UR_USM_TYPE_SHARED: + return PI_MEM_TYPE_SHARED; + default: + die("UR_USM_ALLOC_INFO_TYPE: unhandled value"); + } + }; + return Value.convert(ConvertFunc); + } + + return PI_SUCCESS; +} + +// Handle mismatched PI and UR type return sizes for info queries +inline pi_result fixupInfoValueTypes(size_t ParamValueSizeUR, + size_t *ParamValueSizeRetPI, + void *ParamValue) { + if (ParamValueSizeUR == 1) { + // extend bool to pi_bool (uint32_t) + auto *ValIn = static_cast(ParamValue); + auto *ValOut = static_cast(ParamValue); + *ValOut = static_cast(*ValIn); + if (ParamValueSizeRetPI) { + *ParamValueSizeRetPI = sizeof(pi_bool); + } + } + + return PI_SUCCESS; +} + + +inline 
ur_result_t +mapPIMetadataToUR(const pi_device_binary_property *pi_metadata, + ur_program_metadata_t *ur_metadata) { + ur_metadata->pName = (*pi_metadata)->Name; + ur_metadata->size = (*pi_metadata)->ValSize; + switch ((*pi_metadata)->Type) { + case PI_PROPERTY_TYPE_UINT32: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_UINT32; + ur_metadata->value.data32 = (*pi_metadata)->ValSize; + return UR_RESULT_SUCCESS; + case PI_PROPERTY_TYPE_BYTE_ARRAY: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_BYTE_ARRAY; + ur_metadata->value.pData = (*pi_metadata)->ValAddr; + return UR_RESULT_SUCCESS; + case PI_PROPERTY_TYPE_STRING: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_STRING; + ur_metadata->value.pString = + reinterpret_cast((*pi_metadata)->ValAddr); + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_VALUE; + } +} + namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { @@ -476,6 +546,8 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, ParamValue, ParamValueSizeRet)); ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); + return PI_SUCCESS; } @@ -827,68 +899,65 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE; break; case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE; + InfoType = UR_DEVICE_INFO_BUILD_ON_SUBDEVICE; break; case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D; + InfoType = UR_DEVICE_INFO_MAX_WORK_GROUPS_3D; break; case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; + InfoType = UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; break; case PI_DEVICE_INFO_DEVICE_ID: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_DEVICE_ID; + InfoType = UR_DEVICE_INFO_DEVICE_ID; break; case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_GLOBAL_MEM_FREE; + InfoType = UR_DEVICE_INFO_GLOBAL_MEM_FREE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MEMORY_CLOCK_RATE; + InfoType = UR_DEVICE_INFO_MEMORY_CLOCK_RATE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH; + InfoType = UR_DEVICE_INFO_MEMORY_BUS_WIDTH; break; case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; + InfoType = UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; break; case PI_DEVICE_INFO_GPU_SLICES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_GPU_EU_SLICES; + InfoType = UR_DEVICE_INFO_GPU_EU_SLICES; break; case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; + InfoType = UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; break; case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU; + InfoType = UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU; break; case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; + InfoType = UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; break; case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16; + InfoType = UR_DEVICE_INFO_BFLOAT16; break; case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: 
- InfoType = - (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - InfoType = - (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES; break; case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT; + InfoType = UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT; break; case PI_DEVICE_INFO_IMAGE_SRGB: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_IMAGE_SRGB; + InfoType = UR_DEVICE_INFO_IMAGE_SRGB; break; case PI_DEVICE_INFO_BACKEND_VERSION: { - // TODO: return some meaningful for backend_version below - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - return ReturnValue(""); + InfoType = UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION; + break; } default: return PI_ERROR_UNKNOWN; @@ -903,6 +972,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, ParamValueSizeRet)); ur2piDeviceInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); return PI_SUCCESS; } @@ -1074,13 +1144,12 @@ inline pi_result piContextCreate(const pi_context_properties *Properties, return PI_SUCCESS; } -// FIXME: Dummy implementation to prevent link fail inline pi_result piextContextSetExtendedDeleter( pi_context Context, pi_context_extended_deleter Function, void *UserData) { - std::ignore = Context; - std::ignore = Function; - std::ignore = UserData; - die("piextContextSetExtendedDeleter: not supported"); + auto hContext = reinterpret_cast(Context); + + HANDLE_ERRORS(urContextSetExtendedDeleter(hContext, Function, UserData)); + return PI_SUCCESS; } @@ -1164,6 +1233,8 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, HANDLE_ERRORS(urContextGetInfo(hContext, ContextInfoType, ParamValueSize, ParamValue, ParamValueSizeRet)); + fixupInfoValueTypes(ParamValueSize, ParamValueSizeRet, ParamValue); + return PI_SUCCESS; } @@ -1213,6 +1284,7 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); ur_queue_properties_t UrProperties{}; + UrProperties.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) @@ -1252,6 +1324,12 @@ inline pi_result piQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +inline pi_result piextQueueCreate2(pi_context context, pi_device device, + pi_queue_properties *properties, + pi_queue *queue) { + return pi2ur::piextQueueCreate(context, device, properties, queue); +} + inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_device Device, @@ -1275,6 +1353,16 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, return 
PI_SUCCESS; } +inline pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle nativeHandle, int32_t nativeHandleDesc, pi_context context, + pi_device device, bool pluginOwnsNativeHandle, + pi_queue_properties *Properties, pi_queue *queue) { + (void)nativeHandleDesc; + (void)Properties; + return pi2ur::piextQueueCreateWithNativeHandle(nativeHandle, context, device, + pluginOwnsNativeHandle, queue); +} + inline pi_result piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { @@ -1291,6 +1379,16 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } + +inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { + + (void)NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + + inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -1347,7 +1445,7 @@ inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, break; } case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - UrParamName = UR_EXT_ONEAPI_QUEUE_INFO_EMPTY; + UrParamName = UR_QUEUE_INFO_EMPTY; break; } default: { @@ -1414,9 +1512,6 @@ inline pi_result piProgramCreateWithBinary( const size_t *Lengths, const unsigned char **Binaries, size_t NumMetadataEntries, const pi_device_binary_property *Metadata, pi_int32 *BinaryStatus, pi_program *Program) { - std::ignore = Metadata; - std::ignore = NumMetadataEntries; - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); @@ -1437,8 +1532,18 @@ inline pi_result piProgramCreateWithBinary( reinterpret_cast(Context); auto UrDevice = reinterpret_cast(DeviceList[0]); - // TODO: Translate Metadata into Properties? 
- ur_program_properties_t Properties{}; + std::unique_ptr pMetadatas( + new ur_program_metadata_t[NumMetadataEntries]); + for (unsigned i = 0; i < NumMetadataEntries; i++) { + HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); + } + + ur_program_properties_t Properties; + Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; + Properties.pNext = nullptr; + Properties.count = NumMetadataEntries; + Properties.pMetadatas = pMetadatas.get(); + ur_program_handle_t *UrProgram = reinterpret_cast(Program); HANDLE_ERRORS(urProgramCreateWithBinary(UrContext, UrDevice, Lengths[0], @@ -1753,6 +1858,15 @@ inline pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, return PI_SUCCESS; } +inline pi_result piKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, + size_t arg_size, const void *arg_value) { + (void)arg_size; + auto hKernel = reinterpret_cast(kernel); + HANDLE_ERRORS(urKernelSetArgPointer(hKernel, arg_index, arg_value)); + + return PI_SUCCESS; +} + inline pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, @@ -2178,14 +2292,6 @@ inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); - // TODO: implement support for more access modes - if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || - (Flags & PI_MEM_ACCESS_READ_ONLY))) { - die("piMemBufferCreate: Level-Zero supports read-write and read-only " - "buffer," - "but not other accesses (such as write-only) yet."); - } - if (properties != nullptr) { die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); } @@ -2362,7 +2468,7 @@ static void pi2urImageDesc(const pi_image_format *ImageFormat, break; } case PI_IMAGE_CHANNEL_ORDER_ABGR: { - UrFormat->channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_ABGR; break; } case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { @@ -2444,11 +2550,6 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); @@ -2778,9 +2879,23 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - // TODO: to map from pi_mem_advice to ur_mem_advice_t - // once we have those defined ur_usm_advice_flags_t UrAdvice{}; + if (Advice & PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_SET_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION) { + UrAdvice |= UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_RESET) { + UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT; + } + HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); return PI_SUCCESS; @@ -2805,18 +2920,18 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, const pi_event 
*EventsWaitList, pi_event *Event) { - std::ignore = Queue; - std::ignore = Ptr; - std::ignore = Pitch; - std::ignore = PatternSize; - std::ignore = Pattern; - std::ignore = Width; - std::ignore = Height; - std::ignore = NumEventsWaitList; - std::ignore = EventsWaitList; - std::ignore = Event; - die("piextUSMEnqueueFill2D: not implemented"); - return {}; + + auto hQueue = reinterpret_cast(Queue); + auto phEventWaitList = + reinterpret_cast(EventsWaitList); + auto phEvent = reinterpret_cast(Event); + + HANDLE_ERRORS(urEnqueueUSMFill2D(hQueue, Ptr, Pitch, PatternSize, Pattern, + Width, Height, NumEventsWaitList, + phEventWaitList, phEvent)); + + return PI_SUCCESS; + } inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, @@ -2872,25 +2987,57 @@ inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, } } + size_t SizeInOut = ParamValueSize; HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, ParamValueSize, ParamValue, ParamValueSizeRet)) + ur2piUSMAllocInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); return PI_SUCCESS; } inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet) { // missing - std::ignore = Image; - std::ignore = ParamName; - std::ignore = ParamValueSize; - std::ignore = ParamValue; - std::ignore = ParamValueSizeRet; + size_t *ParamValueSizeRet) { + + auto hMem = reinterpret_cast(Image); - // TODO: use urMemImageGetInfo + ur_image_info_t UrParamName{}; + switch (ParamName) { + case PI_IMAGE_INFO_FORMAT: { + UrParamName = UR_IMAGE_INFO_FORMAT; + break; + } + case PI_IMAGE_INFO_ELEMENT_SIZE: { + UrParamName = UR_IMAGE_INFO_ELEMENT_SIZE; + break; + } + case PI_IMAGE_INFO_ROW_PITCH: { + UrParamName = UR_IMAGE_INFO_ROW_PITCH; + break; + } + case PI_IMAGE_INFO_SLICE_PITCH: { + UrParamName = UR_IMAGE_INFO_SLICE_PITCH; + break; + } + case PI_IMAGE_INFO_WIDTH: { + UrParamName = UR_IMAGE_INFO_WIDTH; + break; + } + case PI_IMAGE_INFO_HEIGHT: { + UrParamName = UR_IMAGE_INFO_HEIGHT; + break; + } + case PI_IMAGE_INFO_DEPTH: { + UrParamName = UR_IMAGE_INFO_DEPTH; + break; + } + default: + return PI_ERROR_UNKNOWN; + } - die("piMemImageGetInfo: not implemented"); - return {}; + HANDLE_ERRORS(urMemImageGetInfo(hMem, UrParamName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + return PI_SUCCESS; } /// USM 2D Memcpy API @@ -3039,7 +3186,7 @@ inline pi_result piEnqueueMemBufferMap( if (MapFlags & PI_MAP_WRITE) UrMapFlags |= UR_MAP_FLAG_WRITE; if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) - UrMapFlags |= UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION; + UrMapFlags |= UR_MAP_FLAG_WRITE_INVALIDATE_REGION; const ur_event_handle_t *UrEventsWaitList = reinterpret_cast(EventsWaitList); @@ -3356,6 +3503,43 @@ inline pi_result piEnqueueEventsWait(pi_queue Queue, return PI_SUCCESS; } + + +inline pi_result +piextEnqueueReadHostPipe(pi_queue queue, pi_program program, + const char *pipe_symbol, pi_bool blocking, void *ptr, + size_t size, pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, pi_event *event) { + auto hQueue = reinterpret_cast(queue); + auto hProgram = reinterpret_cast(program); + auto phEventWaitList = + reinterpret_cast(events_waitlist); + auto phEvent = reinterpret_cast(event); + + HANDLE_ERRORS(urEnqueueReadHostPipe(hQueue, hProgram, pipe_symbol, blocking, + ptr, size, num_events_in_waitlist, + phEventWaitList, phEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piextEnqueueWriteHostPipe(pi_queue queue, pi_program program, + 
const char *pipe_symbol, pi_bool blocking, void *ptr, + size_t size, pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, pi_event *event) { + auto hQueue = reinterpret_cast(queue); + auto hProgram = reinterpret_cast(program); + auto phEventWaitList = + reinterpret_cast(events_waitlist); + auto phEvent = reinterpret_cast(event); + + HANDLE_ERRORS(urEnqueueWriteHostPipe(hQueue, hProgram, pipe_symbol, blocking, + ptr, size, num_events_in_waitlist, + phEventWaitList, phEvent)); + + return PI_SUCCESS; +} // Enqueue /////////////////////////////////////////////////////////////////////////////// @@ -3601,13 +3785,33 @@ inline pi_result piSamplerCreate(pi_context Context, inline pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - std::ignore = Sampler; - std::ignore = ParamName; - std::ignore = ParamValueSize; - std::ignore = ParamValue; - std::ignore = ParamValueSizeRet; + ur_sampler_info_t InfoType{}; + switch (ParamName) { + case PI_SAMPLER_INFO_REFERENCE_COUNT: + InfoType = UR_SAMPLER_INFO_REFERENCE_COUNT; + break; + case PI_SAMPLER_INFO_CONTEXT: + InfoType = UR_SAMPLER_INFO_CONTEXT; + break; + case PI_SAMPLER_INFO_NORMALIZED_COORDS: + InfoType = UR_SAMPLER_INFO_NORMALIZED_COORDS; + break; + case PI_SAMPLER_INFO_ADDRESSING_MODE: + InfoType = UR_SAMPLER_INFO_ADDRESSING_MODE; + break; + case PI_SAMPLER_INFO_FILTER_MODE: + InfoType = UR_SAMPLER_INFO_FILTER_MODE; + break; + default: + return PI_ERROR_UNKNOWN; + } + + size_t SizeInOut = ParamValueSize; + auto hSampler = reinterpret_cast(Sampler); + HANDLE_ERRORS(urSamplerGetInfo(hSampler, InfoType, SizeInOut, ParamValue, + ParamValueSizeRet)); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); - die("piSamplerGetInfo: not implemented"); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 4c998fb6294ea..9b61460205087 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -111,7 +111,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. return ReturnValue(pi_bool{false}); - case UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 5fac6f1e4d77a..f1e9ee46ea76b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -193,7 +193,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // zeModuleCreate allows using root device module for sub-devices: // > The application must only use the module for the device, or its // > sub-devices, which was provided during creation. 
- case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(static_cast(true)); @@ -227,7 +227,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; return ReturnValue(MaxGroupSize); } - case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { struct { size_t Arr[3]; } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, @@ -575,13 +575,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { uint64_t Capabilities = 0; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) - Capabilities |= UR_EXT_USM_CAPS_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) - Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT) - Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC) - Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS; + Capabilities |= + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; return Capabilities; }; auto &Props = Device->ZeDeviceMemoryAccessProperties; @@ -625,7 +626,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", UR_RESULT_SUCCESS); - return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR; + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } // Only report device memory which zeMemAllocDevice can allocate from. // Currently this is only the one enumerated with ordinal 0. @@ -669,7 +670,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceMemoryProperties->first.end(), Comp); return ReturnValue(uint32_t{MinIt->maxClockRate}); } - case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: { + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { // If there are not any memory modules then return 0. 
if (Device->ZeDeviceMemoryProperties->first.empty()) return ReturnValue(uint32_t{0}); @@ -711,9 +712,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: return ReturnValue( uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); - case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); - case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: // currently not supported in level zero runtime @@ -768,7 +769,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(capabilities); } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: return ReturnValue(pi_bool{false}); case UR_DEVICE_INFO_IMAGE_SRGB: return ReturnValue(pi_bool{false}); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 43c0d691f5ad0..133306c910ce4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -921,7 +921,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( // Translate the host access mode info. ur_mem_handle_t_::access_mode_t AccessMode = ur_mem_handle_t_::unknown; - if (MapFlags & UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION) + if (MapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) AccessMode = ur_mem_handle_t_::write_only; else { if (MapFlags & UR_MAP_FLAG_READ) { @@ -3161,4 +3161,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 Size, NumEventsInWaitList, EventWaitList, Event); -} \ No newline at end of file +} + +/// Host Pipes +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pDst; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pSrc; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index fe81cd1e2a3a0..3ca6ecad4c994 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -167,7 +167,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( case UR_QUEUE_INFO_DEVICE_DEFAULT: die("UR_QUEUE_INFO_DEVICE_DEFAULT in urQueueGetInfo not implemented\n"); break; - case UR_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + case UR_QUEUE_INFO_EMPTY: { // We can exit early if we have in-order queue. if (Queue->isInOrderQueue()) { if (!Queue->LastCommandEvent) diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index e5bd87108e824..24a38ab318751 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -36,53 +36,12 @@ template <> uint32_t inline ur_cast(uint64_t Value) { // TODO: promote all of the below extensions to the Unified Runtime // and get rid of these ZER_EXT constants. -const int UR_EXT_DEVICE_INFO_END = UR_DEVICE_INFO_FORCE_UINT32; -const int UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE = UR_EXT_DEVICE_INFO_END - 1; -const int UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D = UR_EXT_DEVICE_INFO_END - 2; -// const int UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = -// UR_EXT_DEVICE_INFO_END - 3; -// const int ZER_EXT_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = -// UR_EXT_DEVICE_INFO_END - 4; -const int UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU = UR_EXT_DEVICE_INFO_END - 7; -const int UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = - UR_EXT_DEVICE_INFO_END - 8; -// const int UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = -// UR_EXT_DEVICE_INFO_END - 10; -const int UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH = UR_EXT_DEVICE_INFO_END - 11; -// const int ZER_EXT_DEVICE_INFO_MEMORY_CLOCK_RATE = UR_EXT_DEVICE_INFO_END - -// 12; -// const int ZER_EXT_DEVICE_INFO_DEVICE_ID = UR_EXT_DEVICE_INFO_END - 14; -// const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = -// UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; -const int UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT = UR_EXT_DEVICE_INFO_END - 15; - const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; -const uint32_t UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION = - (UR_MAP_FLAG_WRITE << 1); - -const int UR_EXT_RESULT_END = 0x1000; -const ur_result_t UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR = - ur_result_t(UR_EXT_RESULT_END - 1); - -const int UR_EXT_USM_CAPS_ACCESS = 1 << 0; -const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; -const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; -const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; - -const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = - (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); - -const ur_queue_info_t UR_EXT_ONEAPI_QUEUE_INFO_EMPTY = - (ur_queue_info_t)(UR_QUEUE_INFO_SIZE + 1); - const ur_command_t UR_EXT_COMMAND_TYPE_USER = (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); -const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = - ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); - const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); @@ -95,6 +54,20 @@ typedef enum { UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = 0x2 } ur_kernel_cache_config; +// TODO(ur): These CUDA specific queue properties should live in the UR spec. In +// the mean time just use the PI values. 
+// PI Command Queue using Default stream +#define __SYCL_UR_CUDA_USE_DEFAULT_STREAM (0xFF03) +// PI Command queue will sync with default stream +#define __SYCL_UR_CUDA_SYNC_WITH_DEFAULT (0xFF04) + +/// Program metadata tags recognized by the UR adapters. For kernels the tag +/// must appear after the kernel name. +#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ + "@reqd_work_group_size" +#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" + + // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; @@ -324,4 +297,4 @@ class UrReturnHelper { size_t param_value_size; void *param_value; size_t *param_value_size_ret; -}; \ No newline at end of file +}; From a73bc2030cc7bd23bd910dce113c7dba1a5a5f3a Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Fri, 12 May 2023 20:35:50 +0000 Subject: [PATCH 38/50] Fixes after reg (#9) * Fixes for porting to UR repo (#4) * Fixes for porting to UR repo Signed-off-by: Brandon Yates --- .../ur/adapters/level_zero/ur_level_zero.cpp | 2 +- .../level_zero/ur_level_zero_common.hpp | 4 +-- .../level_zero/ur_level_zero_context.cpp | 10 +++++--- .../level_zero/ur_level_zero_context.hpp | 2 +- .../level_zero/ur_level_zero_device.cpp | 11 ++++---- .../level_zero/ur_level_zero_device.hpp | 2 +- .../level_zero/ur_level_zero_event.cpp | 10 +++++--- .../level_zero/ur_level_zero_event.hpp | 2 +- .../level_zero/ur_level_zero_kernel.cpp | 7 +++--- .../adapters/level_zero/ur_level_zero_mem.cpp | 25 +++++++++++-------- .../adapters/level_zero/ur_level_zero_mem.hpp | 6 ++--- .../level_zero/ur_level_zero_platform.cpp | 3 ++- .../level_zero/ur_level_zero_program.cpp | 3 ++- .../level_zero/ur_level_zero_queue.cpp | 13 +++++----- .../level_zero/ur_level_zero_queue.hpp | 2 +- .../level_zero/ur_level_zero_sampler.cpp | 3 ++- 16 files changed, 60 insertions(+), 45 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 51fe4cf9c475b..92ada96340bd9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -11,7 +11,7 @@ #include #include "ur_level_zero.hpp" -#include + // Define the static class field std::mutex ZeCall::GlobalLock; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index f3a8ba48b2eba..9d375bb8e2fab 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -16,13 +16,13 @@ #include #include -#include + #include #include #include #include -#include "ur/usm_allocator_config.hpp" +#include struct _ur_platform_handle_t; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 9b61460205087..e6cd4ff02f981 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( uint32_t 
DeviceCount, ///< [in] the number of devices given in phDevices @@ -107,11 +108,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(pi_bool{UseMemcpy2DOperations}); + return ReturnValue(ur_bool_t{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | @@ -410,7 +412,7 @@ ur_result_t ur_context_handle_t_::finalize() { // Maximum number of events that can be present in an event ZePool is captured // here. Setting it to 256 gave best possible performance for several // benchmarks. -static const pi_uint32 MaxNumEventsPerPool = [] { +static const uint32_t MaxNumEventsPerPool = [] { const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); const char *MaxNumEventsPerPoolEnv = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index a980a80a855f3..2a9d2f97e84f9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -15,7 +15,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index f1e9ee46ea76b..83a5a33abda51 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -7,9 +7,10 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_device.hpp" +#include "ur_level_zero.hpp" #include #include -#include + UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( ur_platform_handle_t Platform, ///< [in] handle of the platform instance @@ -770,9 +771,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -1196,7 +1197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Look for GEN binary, which we known can only be handled by Level-Zero now. 
const char *BinaryTarget = - UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; @@ -1210,7 +1211,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( return UR_RESULT_SUCCESS; } if (strcmp(Binaries[i].pDeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + UR_DEVICE_BINARY_TARGET_SPIRV64) == 0) Spirv = i; } // Points to a spirv image, if such indeed was found diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 8aff6f170127f..e8514ce569f45 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -15,7 +15,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index d39c40982bd6f..72cfcbed5bbbc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" -#include +#include "ur_level_zero.hpp" + void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); @@ -389,7 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); if (Event->UrQueue && - (Event->UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -649,6 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */, Context, UR_EXT_COMMAND_TYPE_USER, Properties->isNativeHandleOwned); + } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -902,7 +904,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool HostVisible, ur_event_handle_t *RetEvent) { bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + !Queue || (Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; if (auto CachedEvent = Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { @@ -1181,5 +1183,5 @@ ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( // Tells if this event is with profiling capabilities. 
bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled - (UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 9e129adb0fb7e..42c9468ec2ef0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -17,7 +17,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index da98f12f2580a..38df90c31ee72 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_kernel.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object @@ -512,7 +513,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( // As of right now, L0 is missing API to query kernel and device specific // max work group size. return ReturnValue( - pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { struct { @@ -623,7 +624,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( std::scoped_lock Guard(Kernel->Mutex); if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && - *(static_cast(PropValue)) == PI_TRUE) { + *(static_cast(PropValue)) == true) { // The whole point for users really was to not need to know anything // about the types of allocations kernel uses. So in DPC++ we always // just set all 3 modes for each kernel. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 133306c910ce4..d61b93d581f5e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" #include "ur_level_zero_event.hpp" -#include +#include "ur_level_zero.hpp" + // Default to using compute engine for fill operation, but allow to // override this with an environment variable. @@ -42,7 +43,7 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { // exclusive use and source buffer's mutex locked for shared use on entry. 
ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_queue_handle_t Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, + ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -94,7 +95,7 @@ ur_result_t enqueueMemCopyRectHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, - size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine) { bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); @@ -298,10 +299,10 @@ static ur_result_t getImageRegionHelper(_ur_image *Mem, UR_ASSERT(Mem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(Origin, UR_RESULT_ERROR_INVALID_VALUE); +#ifndef NDEBUG auto UrImage = static_cast<_ur_image *>(Mem); ze_image_desc_t &ZeImageDesc = UrImage->ZeImageDesc; -#ifndef NDEBUG UR_ASSERT(Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && Origin->z == 0) || @@ -341,7 +342,7 @@ static ur_result_t enqueueMemImageCommandHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *Src, // image or ptr void *Dst, // image or ptr - pi_bool IsBlocking, ur_rect_offset_t *SrcOrigin, + ur_bool_t IsBlocking, ur_rect_offset_t *SrcOrigin, ur_rect_offset_t *DstOrigin, ur_rect_region_t *Region, size_t RowPitch, size_t SlicePitch, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -384,6 +385,7 @@ static ur_result_t enqueueMemImageCommandHelper( std::ignore = SlicePitch; UR_ASSERT(SrcMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifndef NDEBUG auto SrcImage = SrcMem; const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; UR_ASSERT( @@ -396,6 +398,7 @@ static ur_result_t enqueueMemImageCommandHelper( (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && RowPitch == 4 * ZeSrcRegion.width), UR_RESULT_ERROR_INVALID_IMAGE_SIZE); +#endif UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, UR_RESULT_ERROR_INVALID_IMAGE_SIZE); @@ -414,6 +417,7 @@ static ur_result_t enqueueMemImageCommandHelper( // Check that SYCL RT did not want pitch larger than default. 
UR_ASSERT(DstMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifndef NDEBUG auto DstImage = static_cast<_ur_image *>(DstMem); const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; UR_ASSERT( @@ -426,6 +430,7 @@ static ur_result_t enqueueMemImageCommandHelper( (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && RowPitch == 4 * ZeDstRegion.width), UR_RESULT_ERROR_INVALID_IMAGE_SIZE); +#endif UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, UR_RESULT_ERROR_INVALID_IMAGE_SIZE); @@ -2316,19 +2321,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); switch (PropName) { case UR_USM_ALLOC_INFO_TYPE: { - pi_usm_type MemAllocaType; + ur_usm_type_t MemAllocaType; switch (ZeMemoryAllocationProperties.type) { case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = PI_MEM_TYPE_UNKNOWN; + MemAllocaType = UR_USM_TYPE_UNKNOWN; break; case ZE_MEMORY_TYPE_HOST: - MemAllocaType = PI_MEM_TYPE_HOST; + MemAllocaType = UR_USM_TYPE_HOST; break; case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = PI_MEM_TYPE_DEVICE; + MemAllocaType = UR_USM_TYPE_DEVICE; break; case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = PI_MEM_TYPE_SHARED; + MemAllocaType = UR_USM_TYPE_SHARED; break; default: urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index e9ad0d49bbdbb..74850eb78f08e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -16,7 +16,7 @@ #include #include -#include + #include #include #include @@ -43,7 +43,7 @@ const bool UseCopyEngineForD2DCopy = [] { // exclusive use and source buffer's mutex locked for shared use on entry. 
ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_queue_handle_t Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, + ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -53,7 +53,7 @@ ur_result_t enqueueMemCopyRectHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, - size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 7d0bef4cb84f5..71469d4e68020 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_platform.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urInit( ur_device_init_flags_t diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 0b4d07b0366a3..281eacdd11509 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_program.hpp" -#include +#include "ur_level_zero.hpp" + extern "C" { // Check to see if a Level Zero module has any unresolved symbols. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 3ca6ecad4c994..15b6ab451ac3d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" -#include +#include "ur_level_zero.hpp" + /// @brief Cleanup events in the immediate lists of the queue. /// @param Queue Queue where events need to be cleaned up. @@ -639,7 +640,7 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { } const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (BatchSizeStr) { - pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); + int32_t BatchSizeStrVal = std::atoi(BatchSizeStr); // Level Zero may only support a limted number of commands per command // list. The actual upper limit is not specified by the Level Zero // Specification. For now we allow an arbitrary upper limit. 
@@ -1205,20 +1206,20 @@ bool ur_queue_handle_t_::isBatchingAllowed(bool IsCopy) const { } bool ur_queue_handle_t_::isDiscardEvents() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); + return ((this->Properties & UR_QUEUE_FLAG_DISCARD_EVENTS) != 0); } bool ur_queue_handle_t_::isPriorityLow() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); + return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_LOW) != 0); } bool ur_queue_handle_t_::isPriorityHigh() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); + return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0); } bool ur_queue_handle_t_::isInOrderQueue() const { // If out-of-order queue property is not set, then this is a in-order queue. - return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == + return ((this->Properties & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 76cfda295f2f8..707463ecf55cc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -16,7 +16,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 42c431ec94632..c8cedf4b74f88 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_sampler.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object From 04fc86b37a43a273bcfd2c8498d2e9141adbbc24 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 12 May 2023 18:33:15 -0700 Subject: [PATCH 39/50] Fix casting for srgba-read.cpp test pi_bool is uint32_t and ur_bool_t is uint8_t, so to make sure correct functionality is maintain, use uint32_t as replacement for pi_bool, instead of ur_bool_t. Also, add back check for urMemImageCreate that was before in piMemImageCreate. 
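[Editor's note: a minimal, self-contained C++ sketch illustrating the size mismatch this commit describes. It is not the adapter's actual UrReturnHelper; the type aliases and the returnInfo helper below are hypothetical stand-ins. The point is that an info query copies sizeof(T) bytes into the caller's buffer, so writing a 1-byte ur_bool_t into a slot the caller sized for a 4-byte pi_bool leaves the remaining bytes stale, while returning a uint32_t overwrites the whole value.]

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the real typedefs: pi_bool is 32-bit, ur_bool_t is 8-bit.
using pi_bool = std::uint32_t;
using ur_bool_t = std::uint8_t;

// Minimal stand-in for an info-query return helper: copies exactly sizeof(T)
// bytes of the result into the caller-provided buffer.
template <typename T> void returnInfo(void *ParamValue, T Value) {
  std::memcpy(ParamValue, &Value, sizeof(T));
}

int main() {
  pi_bool Result = 0xDEADBEEF;            // caller's 4-byte slot, holding stale data
  returnInfo(&Result, ur_bool_t{0});      // writes only 1 byte; 3 stale bytes remain
  assert(Result != 0);                    // caller still reads "true"
  returnInfo(&Result, std::uint32_t{0});  // writes all 4 bytes
  assert(Result == 0);                    // caller now reads "false" as intended
  return 0;
}
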
Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 6 ++---- .../ur/adapters/level_zero/ur_level_zero_device.cpp | 7 +++---- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 8 ++++++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index e6cd4ff02f981..c177926c24c30 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -13,8 +13,6 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" -#include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( uint32_t DeviceCount, ///< [in] the number of devices given in phDevices @@ -108,10 +106,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(ur_bool_t{UseMemcpy2DOperations}); + return ReturnValue(uint32_t{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 83a5a33abda51..f3d242f7f4e5d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -10,7 +10,6 @@ #include "ur_level_zero.hpp" #include #include - UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( ur_platform_handle_t Platform, ///< [in] handle of the platform instance @@ -771,9 +770,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -1197,7 +1196,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Look for GEN binary, which we known can only be handled by Level-Zero now. const char *BinaryTarget = - UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; // UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index d61b93d581f5e..ba4e36aaeb21d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -13,8 +13,6 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" #include "ur_level_zero_event.hpp" -#include "ur_level_zero.hpp" - // Default to using compute engine for fill operation, but allow to // override this with an environment variable. 
@@ -1604,6 +1602,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( void *Host, ///< [in] pointer to the buffer data ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created ) { + // TODO: implement read-only, write-only + if ((Flags & UR_MEM_FLAG_READ_WRITE) == 0) { + die("urMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + std::shared_lock Lock(Context->Mutex); ZeStruct ZeImageDesc; From f35fef4d84dadad0a292cf2e2c46053f98a3637d Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 15 May 2023 19:45:08 -0700 Subject: [PATCH 40/50] Rebase and fix format - Add changes to fix tests after [SYCL] Add Unified Runtime plugin and route to it with SYCL_PREFER_UR https://github.com/intel/llvm/pull/9232 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 11 ++++------- sycl/plugins/unified_runtime/pi_unified_runtime.cpp | 10 ++++++++++ .../ur/adapters/level_zero/ur_level_zero.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_common.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_context.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_device.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_event.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_event.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_kernel.cpp | 4 ++-- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_program.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_queue.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_queue.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_sampler.cpp | 1 - sycl/plugins/unified_runtime/ur/ur.hpp | 1 - 16 files changed, 18 insertions(+), 24 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2f3b6211cb46b..f36ce228d48a2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -426,7 +426,6 @@ inline pi_result fixupInfoValueTypes(size_t ParamValueSizeUR, return PI_SUCCESS; } - inline ur_result_t mapPIMetadataToUR(const pi_device_binary_property *pi_metadata, ur_program_metadata_t *ur_metadata) { @@ -455,7 +454,10 @@ namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { std::ignore = PluginParameter; - HANDLE_ERRORS(urTearDown(nullptr)); + // TODO: Dont check for errors in urTearDown, since + // when using Level Zero plugin, the second urTearDown + // will fail as ur_loader.so has already been unloaded, + urTearDown(nullptr); return PI_SUCCESS; } @@ -1379,7 +1381,6 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } - inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { @@ -1388,7 +1389,6 @@ inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); } - inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -2920,7 +2920,6 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, const pi_event *EventsWaitList, pi_event *Event) { - auto hQueue = reinterpret_cast(Queue); auto phEventWaitList = reinterpret_cast(EventsWaitList); @@ -2931,7 +2930,6 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, phEventWaitList, phEvent)); return PI_SUCCESS; - } inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void 
*Ptr, @@ -3504,7 +3502,6 @@ inline pi_result piEnqueueEventsWait(pi_queue Queue, return PI_SUCCESS; } - inline pi_result piextEnqueueReadHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 3cf3e10a21676..acff4810f9dc9 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -84,6 +84,15 @@ piContextCreate(const pi_context_properties *Properties, pi_uint32 NumDevices, UserData, RetContext); } +__SYCL_EXPORT pi_result piContextGetInfo(pi_context Context, + pi_context_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piContextGetInfo(Context, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + __SYCL_EXPORT pi_result piContextRelease(pi_context Context) { return pi2ur::piContextRelease(Context); } @@ -1045,6 +1054,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piContextCreate) _PI_API(piContextRelease) _PI_API(piContextRetain) + _PI_API(piContextGetInfo) _PI_API(piextContextSetExtendedDeleter) _PI_API(piextContextGetNativeHandle) _PI_API(piextContextCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 92ada96340bd9..c0a873025e8b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -11,7 +11,6 @@ #include #include "ur_level_zero.hpp" - // Define the static class field std::mutex ZeCall::GlobalLock; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 9d375bb8e2fab..ed269665cd99b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 2a9d2f97e84f9..cc1775d87f3c9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -15,7 +15,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index e8514ce569f45..4bc56c6fc5108 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -15,7 +15,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 72cfcbed5bbbc..6d14ae2176681 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -11,10 +11,9 @@ #include #include +#include 
"ur_level_zero.hpp" #include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" -#include "ur_level_zero.hpp" - void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 42c9468ec2ef0..9922742c7776d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -17,7 +17,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 38df90c31ee72..73111abeb475a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_kernel.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object @@ -609,7 +608,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ///< holding the argument value. If null then argument ///< value is considered null. ) { - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); + UR_CALL( + urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 74850eb78f08e..0d658342fb0b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 71469d4e68020..469c39d3e668c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_platform.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urInit( ur_device_init_flags_t diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 281eacdd11509..5519f7e2254bd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_program.hpp" #include "ur_level_zero.hpp" - extern "C" { // Check to see if a Level Zero module has any unresolved symbols. 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 15b6ab451ac3d..efd3538887f93 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -11,10 +11,9 @@ #include #include +#include "ur_level_zero.hpp" #include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" -#include "ur_level_zero.hpp" - /// @brief Cleanup events in the immediate lists of the queue. /// @param Queue Queue where events need to be cleaned up. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 707463ecf55cc..4a5a6fe8b731d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index c8cedf4b74f88..bf32fdd9367d0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_sampler.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 24a38ab318751..d0d1fb8f46912 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -67,7 +67,6 @@ typedef enum { "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" - // Terminates the process with a catastrophic error message. 
[[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; From 5935f8b25d50c8bd3abc01fa7c5837e73b9bdbf3 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 16 May 2023 19:46:04 -0400 Subject: [PATCH 41/50] Move urUSM into new file (#10) Signed-off-by: Brandon Yates --- sycl/plugins/level_zero/CMakeLists.txt | 2 + sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/level_zero/ur_level_zero.hpp | 1 + .../adapters/level_zero/ur_level_zero_mem.cpp | 749 ----------------- .../adapters/level_zero/ur_level_zero_mem.hpp | 98 --- .../adapters/level_zero/ur_level_zero_usm.cpp | 764 ++++++++++++++++++ .../adapters/level_zero/ur_level_zero_usm.hpp | 108 +++ 7 files changed, 877 insertions(+), 847 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 3cd25f2dc6826..916680cdf3959 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -111,6 +111,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -122,6 +123,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp" # Following are the PI Level-Zero Plugin only codes. 
"pi_level_zero.cpp" "pi_level_zero.hpp" diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 0b4bcef273b73..177537363380e 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -96,6 +96,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_program.hpp" "ur/adapters/level_zero/ur_level_zero_queue.hpp" "ur/adapters/level_zero/ur_level_zero_sampler.hpp" + "ur/adapters/level_zero/ur_level_zero_usm.hpp" "ur/adapters/level_zero/ur_level_zero.cpp" "ur/adapters/level_zero/ur_level_zero_common.cpp" "ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -107,6 +108,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_program.cpp" "ur/adapters/level_zero/ur_level_zero_queue.cpp" "ur/adapters/level_zero/ur_level_zero_sampler.cpp" + "ur/adapters/level_zero/ur_level_zero_usm.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp index 5095e168a4a3e..0da70b073ab1e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp @@ -30,3 +30,4 @@ #include "ur_level_zero_program.hpp" #include "ur_level_zero_queue.hpp" #include "ur_level_zero_sampler.hpp" +#include "ur_level_zero_usm.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index ba4e36aaeb21d..1974f6052ff04 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2021,755 +2021,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM host memory object -) { - std::ignore = Pool; - - uint32_t Align = USMDesc->align; - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Align > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - - ur_platform_handle_t Plt = Context->getPlatform(); - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Align & (Align - 1)) != 0)) { - ur_usm_host_mem_flags_t Flags{}; - ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - // There is a single allocator for Host USM allocations, so we don't need to - // find the allocator depending on context as we do for Shared and Device - // allocations. - try { - *RetMem = Context->HostMemAllocContext->allocate(Size, Align); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_device_handle_t Device, ///< [in] handle of the device object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM device memory object -) { - std::ignore = Pool; - - uint32_t Alignment = USMDesc->align; - - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - - ur_platform_handle_t Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = - USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_device_handle_t Device, ///< [in] handle of the device object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM shared memory object -) { - std::ignore = Pool; - - uint32_t Alignment = USMDesc->align; - - ur_usm_host_mem_flags_t UsmHostFlags{}; - - // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; - ur_usm_device_mem_flags_t UsmDeviceFlags{}; - - void *pNext = const_cast(USMDesc->pNext); - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = - reinterpret_cast(pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { - const ur_usm_device_desc_t *UsmDeviceDesc = - reinterpret_cast(pNext); - UsmDeviceFlags = UsmDeviceDesc->flags; - } - if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { - const ur_usm_host_desc_t *UsmHostDesc = - reinterpret_cast(pNext); - UsmHostFlags = UsmHostDesc->flags; - } - pNext = const_cast(BaseDesc->pNext); - } - DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - ur_platform_handle_t Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. 
This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - if (IndirectAccessTrackingEnabled) { - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, - &UsmDeviceFlags, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); - if (DeviceReadOnly) { - Context->SharedReadOnlyAllocs.insert(*RetMem); - } - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( - ur_context_handle_t Context, ///< [in] handle of the context object - void *Mem ///< [in] pointer to USM memory object -) { - ur_platform_handle_t Plt = Context->getPlatform(); - - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? 
Plt->ContextsMutex : Context->Mutex); - - return USMFreeHelper(Context, Mem); -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( - ur_context_handle_t Context, ///< [in] handle of the context object - const void *Ptr, ///< [in] pointer to USM memory object - ur_usm_alloc_info_t - PropName, ///< [in] the name of the USM allocation property to query - size_t PropValueSize, ///< [in] size in bytes of the USM allocation property - ///< value - void *PropValue, ///< [out][optional] value of the USM allocation property - size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM - ///< allocation property -) { - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - ZE2UR_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); - switch (PropName) { - case UR_USM_ALLOC_INFO_TYPE: { - ur_usm_type_t MemAllocaType; - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = UR_USM_TYPE_UNKNOWN; - break; - case ZE_MEMORY_TYPE_HOST: - MemAllocaType = UR_USM_TYPE_HOST; - break; - case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = UR_USM_TYPE_DEVICE; - break; - case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = UR_USM_TYPE_SHARED; - break; - default: - urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - return ReturnValue(MemAllocaType); - } - case UR_USM_ALLOC_INFO_DEVICE: - if (ZeDeviceHandle) { - auto Platform = Context->getPlatform(); - auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - return Device ? ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - case UR_USM_ALLOC_INFO_BASE_PTR: { - void *Base; - ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); - return ReturnValue(Base); - } - case UR_USM_ALLOC_INFO_SIZE: { - size_t Size; - ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); - return ReturnValue(Size); - } - default: - urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - return UR_RESULT_SUCCESS; -} - -static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { - ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); - return UR_RESULT_SUCCESS; -} - -void *USMMemoryAllocBase::allocate(size_t Size) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } - - return Ptr; -} - -void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, Alignment); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } - return Ptr; -} - -void USMMemoryAllocBase::deallocate(void *Ptr) { - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } -} - -ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); -} - -ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, - size_t Size, - uint32_t Alignment) { - ur_usm_device_desc_t UsmDeviceDesc{}; - UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - ur_usm_host_desc_t UsmHostDesc{}; - return 
USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, - &UsmHostDesc.flags, Size, Alignment); -} - -ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); -} - -enum class USMAllocationForceResidencyType { - // Do not force memory residency at allocation time. - None = 0, - // Force memory resident on the device of allocation at allocation time. - // For host allocation force residency on all devices in a context. - Device = 1, - // [Default] Force memory resident on all devices in the context with P2P - // access to the device of allocation. - // For host allocation force residency on all devices in a context. - P2PDevices = 2 -}; - -// Returns the desired USM residency setting -static USMAllocationForceResidencyType USMAllocationForceResidency = [] { - const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); - const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (!Str) - return USMAllocationForceResidencyType::P2PDevices; - switch (std::atoi(Str)) { - case 1: - return USMAllocationForceResidencyType::Device; - case 2: - return USMAllocationForceResidencyType::P2PDevices; - default: - return USMAllocationForceResidencyType::None; - }; -}(); - -// Make USM allocation resident as requested -static ur_result_t USMAllocationMakeResident( - ur_context_handle_t Context, - ur_device_handle_t Device, // nullptr for host allocation - void *Ptr, size_t Size) { - - std::list Devices; - - if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) - return UR_RESULT_SUCCESS; - else if (!Device) { - // Host allocation, make it resident on all devices in the context - Devices.insert(Devices.end(), Context->Devices.begin(), - Context->Devices.end()); - } else { - Devices.push_back(Device); - if (USMAllocationForceResidency == - USMAllocationForceResidencyType::P2PDevices) { - ze_bool_t P2P; - for (const auto &D : Context->Devices) { - if (D == Device) - continue; - // TODO: Cache P2P devices for a context - ZE2UR_CALL(zeDeviceCanAccessPeer, - (D->ZeDevice, Device->ZeDevice, &P2P)); - if (P2P) - Devices.push_back(D); - } - } - } - for (const auto &D : Devices) { - ZE2UR_CALL(zeContextMakeMemoryResident, - (Context->ZeContext, D->ZeDevice, Ptr, Size)); - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_desc_t - *PoolDesc, ///< [in] pointer to USM pool descriptor. 
Can be chained with - ///< ::ur_usm_pool_limits_desc_t - ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool -) { - std::ignore = Context; - std::ignore = PoolDesc; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - std::ignore = Context; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, - uint32_t Alignment) { - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDesc.pNext = &RelaxedDesc; - } - - ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, - Device->ZeDevice, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); - return UR_RESULT_SUCCESS; -} - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZeStruct ZeDevDesc; - ZeDevDesc.flags = 0; - ZeDevDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDevDesc.pNext = &RelaxedDesc; - } - - ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, - Size, Alignment, Device->ZeDevice, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); - - // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. - return UR_RESULT_SUCCESS; -} - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, - uint32_t Alignment) { - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZE2UR_CALL(zeMemAllocHost, - (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); - - return UR_RESULT_SUCCESS; -} - -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. 
-ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { - ur_platform_handle_t Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - ContextsLock.lock(); - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return UR_RESULT_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); - - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - - return UR_RESULT_SUCCESS; -} - -bool ShouldUseUSMAllocator() { - // Enable allocator by default if it's not explicitly disabled - const char *UrRet = std::getenv("UR_L0_DISABLE_USM_ALLOCATOR"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR"); - const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - return Ret == nullptr; -} - -const bool UseUSMAllocator = ShouldUseUSMAllocator(); - -// Helper function to deallocate USM memory, if indirect access support is -// enabled then a caller must lock the platform-level mutex guarding the -// container with contexts because deallocating the memory can turn RefCount of -// a context to 0 and as a result the context being removed from the list of -// tracked contexts. -// If indirect access tracking is not enabled then caller must lock Context -// mutex. -ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, - bool OwnZeMemHandle) { - if (!OwnZeMemHandle) { - // Memory should not be freed - return UR_RESULT_SUCCESS; - } - - if (IndirectAccessTrackingEnabled) { - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return UR_RESULT_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - if (!UseUSMAllocator) { - ur_result_t Res = USMFreeImpl(Context, Ptr); - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return Res; - } - - // Query the device of the allocation to determine the right allocator context - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer we're freeing to determine the correct - // way to do it(directly or via an allocator) - auto ZeResult = - ZE_CALL_NOCHECK(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // Handle the case that L0 RT was already unloaded - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - } else if (ZeResult) { - return ze2urResult(ZeResult); - } - - // If memory type is host release from host pool - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { - try { - Context->HostMemAllocContext->deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } catch (...) 
{ - return UR_RESULT_ERROR_UNKNOWN; - } - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - } - - // Points out an allocation in SharedReadOnlyMemAllocContexts - auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); - - if (!ZeDeviceHandle) { - // The only case where it is OK not have device identified is - // if the memory is not known to the driver. We should not ever get - // this either, probably. - UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, - UR_RESULT_ERROR_INVALID_DEVICE); - } else { - ur_device_handle_t Device; - // All context member devices or their descendants are of the same platform. - auto Platform = Context->getPlatform(); - Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); - - auto DeallocationHelper = - [Context, Device, - Ptr](std::unordered_map - &AllocContextMap) { - try { - auto It = AllocContextMap.find(Device->ZeDevice); - if (It == AllocContextMap.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - // The right context is found, deallocate the pointer - It->second.deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - }; - - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_SHARED: - // Distinguish device_read_only allocations since they have own pool. - SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); - return DeallocationHelper(SharedReadOnlyAllocsIterator != - Context->SharedReadOnlyAllocs.end() - ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - case ZE_MEMORY_TYPE_DEVICE: - return DeallocationHelper(Context->DeviceMemAllocContexts); - default: - // Handled below - break; - } - } - - ur_result_t Res = USMFreeImpl(Context, Ptr); - if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { - Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); - } - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return Res; -} - // If indirect access tracking is enabled then performs reference counting, // otherwise just calls zeMemAllocDevice. static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 0d658342fb0b1..d07d929a59867 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -56,15 +56,6 @@ ur_result_t enqueueMemCopyRectHelper( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); -// Exception type to pass allocation errors -class UsmAllocationException { - const ur_result_t Error; - -public: - UsmAllocationException(ur_result_t Err) : Error{Err} {} - ur_result_t getError() const { return Error; } -}; - struct ur_mem_handle_t_ : _ur_object { // Keeps the PI context of this memory handle. ur_context_handle_t UrContext; @@ -219,92 +210,3 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; }; - -// Implements memory allocation via L0 RT for USM allocator interface. 
-class USMMemoryAllocBase : public SystemMemory { -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : Context{Ctx}, Device{Dev} {} - void *allocate(size_t Size) override final; - void *allocate(size_t Size, size_t Alignment) override final; - void deallocate(void *Ptr) override final; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for shared memory type that is only modified from host. -class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for device memory type -class USMDeviceMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for host memory type -class USMHostMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMHostMemoryAlloc(ur_context_handle_t Ctx) - : USMMemoryAllocBase(Ctx, nullptr) {} -}; - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment); - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); - -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. -ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); - -ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, - bool OwnZeMemHandle = true); - -bool ShouldUseUSMAllocator(); - -extern const bool UseUSMAllocator; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp new file mode 100644 index 0000000000000..b6236e388a913 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -0,0 +1,764 @@ +//===--------- ur_level_zero_usm.cpp - Level Zero Adapter -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include +#include + +#include "ur_level_zero.hpp" +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_event.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM host memory object +) { + std::ignore = Pool; + + uint32_t Align = USMDesc->align; + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Align > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; + + ur_platform_handle_t Plt = Context->getPlatform(); + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Align & (Align - 1)) != 0)) { + ur_usm_host_mem_flags_t Flags{}; + ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + // There is a single allocator for Host USM allocations, so we don't need to + // find the allocator depending on context as we do for Shared and Device + // allocations. + try { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM device memory object +) { + std::ignore = Pool; + + uint32_t Alignment = USMDesc->align; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = + USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM shared memory object +) { + std::ignore = Pool; + + uint32_t Alignment = USMDesc->align; + + ur_usm_host_mem_flags_t UsmHostFlags{}; + + // See if the memory is going to be read-only on the device. + bool DeviceReadOnly = false; + ur_usm_device_mem_flags_t UsmDeviceFlags{}; + + void *pNext = const_cast(USMDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { + const ur_usm_device_desc_t *UsmDeviceDesc = + reinterpret_cast(pNext); + UsmDeviceFlags = UsmDeviceDesc->flags; + } + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { + const ur_usm_host_desc_t *UsmHostDesc = + reinterpret_cast(pNext); + UsmHostFlags = UsmHostDesc->flags; + } + pNext = const_cast(BaseDesc->pNext); + } + DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + if (IndirectAccessTrackingEnabled) { + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, + &UsmDeviceFlags, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto &Allocator = (DeviceReadOnly ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (DeviceReadOnly) { + Context->SharedReadOnlyAllocs.insert(*RetMem); + } + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t Context, ///< [in] handle of the context object + void *Mem ///< [in] pointer to USM memory object +) { + ur_platform_handle_t Plt = Context->getPlatform(); + + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + return USMFreeHelper(Context, Mem); +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( + ur_context_handle_t Context, ///< [in] handle of the context object + const void *Ptr, ///< [in] pointer to USM memory object + ur_usm_alloc_info_t + PropName, ///< [in] the name of the USM allocation property to query + size_t PropValueSize, ///< [in] size in bytes of the USM allocation property + ///< value + void *PropValue, ///< [out][optional] value of the USM allocation property + size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM + ///< allocation property +) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + switch (PropName) { + case UR_USM_ALLOC_INFO_TYPE: { + ur_usm_type_t MemAllocaType; + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + MemAllocaType = UR_USM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + MemAllocaType = UR_USM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + MemAllocaType = UR_USM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + MemAllocaType = UR_USM_TYPE_SHARED; + break; + default: + urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(MemAllocaType); + } + case UR_USM_ALLOC_INFO_DEVICE: + if (ZeDeviceHandle) { + auto Platform = Context->getPlatform(); + auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + return Device ? 
ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + case UR_USM_ALLOC_INFO_BASE_PTR: { + void *Base; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); + return ReturnValue(Base); + } + case UR_USM_ALLOC_INFO_SIZE: { + size_t Size; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); + return ReturnValue(Size); + } + default: + urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + return UR_RESULT_SUCCESS; +} + +void *USMMemoryAllocBase::allocate(size_t Size) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + + return Ptr; +} + +void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, Alignment); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + return Ptr; +} + +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } +} + +ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, + Alignment); +} + +ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, + size_t Size, + uint32_t Alignment) { + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_host_desc_t UsmHostDesc{}; + return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, + &UsmHostDesc.flags, Size, Alignment); +} + +ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); +} + +enum class USMAllocationForceResidencyType { + // Do not force memory residency at allocation time. + None = 0, + // Force memory resident on the device of allocation at allocation time. + // For host allocation force residency on all devices in a context. + Device = 1, + // [Default] Force memory resident on all devices in the context with P2P + // access to the device of allocation. + // For host allocation force residency on all devices in a context. + P2PDevices = 2 +}; + +// Returns the desired USM residency setting +static USMAllocationForceResidencyType USMAllocationForceResidency = [] { + const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *Str = UrRet ? UrRet : (PiRet ? 
PiRet : nullptr); + if (!Str) + return USMAllocationForceResidencyType::P2PDevices; + switch (std::atoi(Str)) { + case 1: + return USMAllocationForceResidencyType::Device; + case 2: + return USMAllocationForceResidencyType::P2PDevices; + default: + return USMAllocationForceResidencyType::None; + }; +}(); + +// Make USM allocation resident as requested +static ur_result_t USMAllocationMakeResident( + ur_context_handle_t Context, + ur_device_handle_t Device, // nullptr for host allocation + void *Ptr, size_t Size) { + + std::list Devices; + + if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) + return UR_RESULT_SUCCESS; + else if (!Device) { + // Host allocation, make it resident on all devices in the context + Devices.insert(Devices.end(), Context->Devices.begin(), + Context->Devices.end()); + } else { + Devices.push_back(Device); + if (USMAllocationForceResidency == + USMAllocationForceResidencyType::P2PDevices) { + ze_bool_t P2P; + for (const auto &D : Context->Devices) { + if (D == Device) + continue; + // TODO: Cache P2P devices for a context + ZE2UR_CALL(zeDeviceCanAccessPeer, + (D->ZeDevice, Device->ZeDevice, &P2P)); + if (P2P) + Devices.push_back(D); + } + } + } + for (const auto &D : Devices) { + ZE2UR_CALL(zeContextMakeMemoryResident, + (Context->ZeContext, D->ZeDevice, Ptr, Size)); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = PoolDesc; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t *Flags, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, + Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + return UR_RESULT_SUCCESS; +} + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment) { + + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZeStruct ZeDevDesc; + ZeDevDesc.flags = 0; + ZeDevDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > 
Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDevDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, + Size, Alignment, Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + + // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. + return UR_RESULT_SUCCESS; +} + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_host_mem_flags_t *Flags, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + ContextsLock.lock(); + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + + return UR_RESULT_SUCCESS; +} + +bool ShouldUseUSMAllocator() { + // Enable allocator by default if it's not explicitly disabled + const char *UrRet = std::getenv("UR_L0_DISABLE_USM_ALLOCATOR"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR"); + const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); + return Ret == nullptr; +} + +const bool UseUSMAllocator = ShouldUseUSMAllocator(); + +// Helper function to deallocate USM memory, if indirect access support is +// enabled then a caller must lock the platform-level mutex guarding the +// container with contexts because deallocating the memory can turn RefCount of +// a context to 0 and as a result the context being removed from the list of +// tracked contexts. +// If indirect access tracking is not enabled then caller must lock Context +// mutex. +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle) { + if (!OwnZeMemHandle) { + // Memory should not be freed + return UR_RESULT_SUCCESS; + } + + if (IndirectAccessTrackingEnabled) { + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. 
+ // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + if (!UseUSMAllocator) { + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; + } + + // Query the device of the allocation to determine the right allocator context + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer we're freeing to determine the correct + // way to do it(directly or via an allocator) + auto ZeResult = + ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // Handle the case that L0 RT was already unloaded + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } else if (ZeResult) { + return ze2urResult(ZeResult); + } + + // If memory type is host release from host pool + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { + try { + Context->HostMemAllocContext->deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } + + // Points out an allocation in SharedReadOnlyMemAllocContexts + auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); + + if (!ZeDeviceHandle) { + // The only case where it is OK not have device identified is + // if the memory is not known to the driver. We should not ever get + // this either, probably. + UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_DEVICE); + } else { + ur_device_handle_t Device; + // All context member devices or their descendants are of the same platform. + auto Platform = Context->getPlatform(); + Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); + + auto DeallocationHelper = + [Context, Device, + Ptr](std::unordered_map + &AllocContextMap) { + try { + auto It = AllocContextMap.find(Device->ZeDevice); + if (It == AllocContextMap.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + // The right context is found, deallocate the pointer + It->second.deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + }; + + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_SHARED: + // Distinguish device_read_only allocations since they have own pool. + SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); + return DeallocationHelper(SharedReadOnlyAllocsIterator != + Context->SharedReadOnlyAllocs.end() + ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + case ZE_MEMORY_TYPE_DEVICE: + return DeallocationHelper(Context->DeviceMemAllocContexts); + default: + // Handled below + break; + } + } + + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { + Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp new file mode 100644 index 0000000000000..ba0130089906e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp @@ -0,0 +1,108 @@ +//===--------- ur_level_zero_usm.hpp - Level Zero Adapter -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "ur_level_zero_common.hpp" + +// Exception type to pass allocation errors +class UsmAllocationException { + const ur_result_t Error; + +public: + UsmAllocationException(ur_result_t Err) : Error{Err} {} + ur_result_t getError() const { return Error; } +}; + +// Implements memory allocation via L0 RT for USM allocator interface. +class USMMemoryAllocBase : public SystemMemory { +protected: + ur_context_handle_t Context; + ur_device_handle_t Device; + // Internal allocation routine which must be implemented for each allocation + // type + virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) = 0; + +public: + USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : Context{Ctx}, Device{Dev} {} + void *allocate(size_t Size) override final; + void *allocate(size_t Size, size_t Alignment) override final; + void deallocate(void *Ptr) override final; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for shared memory type that is only modified from host. 
+class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for device memory type +class USMDeviceMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for host memory type +class USMHostMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMHostMemoryAlloc(ur_context_handle_t Ctx) + : USMMemoryAllocBase(Ctx, nullptr) {} +}; + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment); + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_host_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); + +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle = true); + +bool ShouldUseUSMAllocator(); + +extern const bool UseUSMAllocator; From cd11f354be2ac4b3b2e36c8578270053b275eb56 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 16 May 2023 17:13:55 -0700 Subject: [PATCH 42/50] Port [SYCL[L0] Change the SYCL_PI_LEVEL_ZERO_USM_RESIDENT default https://github.com/intel/llvm/pull/9442 Signed-off-by: Jaime Arteaga --- .../adapters/level_zero/ur_level_zero_usm.cpp | 64 ++++++++++++++----- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index b6236e388a913..2a5effb541ad7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -429,20 +429,37 @@ enum class USMAllocationForceResidencyType { // Force memory resident on the device of allocation at allocation time. // For host allocation force residency on all devices in a context. Device = 1, - // [Default] Force memory resident on all devices in the context with P2P + // Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 }; -// Returns the desired USM residency setting -static USMAllocationForceResidencyType USMAllocationForceResidency = [] { +// Input value is of the form 0xHSD, where: +// 4-bits of D control device allocations +// 4-bits of S control shared allocations +// 4-bits of H control host allocations +// Each 4-bit value is holding a USMAllocationForceResidencyType enum value. 
+// The default is 0x2, i.e. force full residency for device allocations only.
+//
+static uint32_t USMAllocationForceResidency = [] {
   const char *UrRet = std::getenv("UR_L0_USM_RESIDENT");
   const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
   const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
-  if (!Str)
-    return USMAllocationForceResidencyType::P2PDevices;
-  switch (std::atoi(Str)) {
+  try {
+    if (Str) {
+      // Auto-detect radix to allow more convenient hex base
+      return std::stoi(Str, nullptr, 0);
+    }
+  } catch (...) {
+  }
+  return 0x2;
+}();
+
+// Convert from an integer value to USMAllocationForceResidencyType enum value
+static USMAllocationForceResidencyType
+USMAllocationForceResidencyConvert(uint32_t Val) {
+  switch (Val) {
   case 1:
     return USMAllocationForceResidencyType::Device;
   case 2:
@@ -450,26 +467,38 @@ static USMAllocationForceResidencyType USMAllocationForceResidency = [] {
   default:
     return USMAllocationForceResidencyType::None;
   };
+}
+
+static USMAllocationForceResidencyType USMHostAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0xf00) >> 8);
+}();
+static USMAllocationForceResidencyType USMSharedAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0x0f0) >> 4);
+}();
+static USMAllocationForceResidencyType USMDeviceAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0x00f));
 }();
 
 // Make USM allocation resident as requested
 static ur_result_t USMAllocationMakeResident(
-    ur_context_handle_t Context,
+    USMAllocationForceResidencyType ForceResidency, ur_context_handle_t Context,
     ur_device_handle_t Device, // nullptr for host allocation
     void *Ptr, size_t Size) {
 
-  std::list<ur_device_handle_t> Devices;
-
-  if (USMAllocationForceResidency == USMAllocationForceResidencyType::None)
+  if (ForceResidency == USMAllocationForceResidencyType::None)
     return UR_RESULT_SUCCESS;
-  else if (!Device) {
+
+  std::list<ur_device_handle_t> Devices;
+  if (!Device) {
     // Host allocation, make it resident on all devices in the context
     Devices.insert(Devices.end(), Context->Devices.begin(),
                    Context->Devices.end());
   } else {
     Devices.push_back(Device);
-    if (USMAllocationForceResidency ==
-        USMAllocationForceResidencyType::P2PDevices) {
+    if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) {
       ze_bool_t P2P;
       for (const auto &D : Context->Devices) {
         if (D == Device)
@@ -536,7 +565,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
                 reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  USMAllocationMakeResident(Context, Device, *ResultPtr, Size);
+  USMAllocationMakeResident(USMDeviceAllocationForceResidency, Context, Device,
+                            *ResultPtr, Size);
   return UR_RESULT_SUCCESS;
 }
 
@@ -567,7 +597,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
                 reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  USMAllocationMakeResident(Context, Device, *ResultPtr, Size);
+  USMAllocationMakeResident(USMSharedAllocationForceResidency, Context, Device,
+                            *ResultPtr, Size);
 
   // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY.
return UR_RESULT_SUCCESS; @@ -586,7 +617,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, reinterpret_cast(*ResultPtr) % Alignment == 0, UR_RESULT_ERROR_INVALID_VALUE); - USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + USMAllocationMakeResident(USMHostAllocationForceResidency, Context, nullptr, + *ResultPtr, Size); return UR_RESULT_SUCCESS; } From 2922ae7f37de79cb250a6a6c077660a8404e7622 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 18 May 2023 09:20:59 -0700 Subject: [PATCH 43/50] Port [SYCL] [L0] Remove unneeded backwards compatibility of 2023.2 make_queue and get_native https://github.com/intel/llvm/pull/8871 Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 35 ++---- sycl/plugins/unified_runtime/pi2ur.hpp | 69 +++++------ .../unified_runtime/pi_unified_runtime.cpp | 42 ++----- .../level_zero/ur_level_zero_queue.cpp | 111 ++++++++++++++---- .../level_zero/ur_level_zero_queue.hpp | 9 ++ 5 files changed, 150 insertions(+), 116 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index bc55890ada108..cd5ed69889253 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -179,28 +179,6 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -pi_result piextQueueCreate2(pi_context Context, pi_device Device, - pi_queue_properties *Properties, pi_queue *Queue) { - return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); -} - -pi_result piextQueueGetNativeHandle2(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - std::ignore = NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - -pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, - pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, - pi_queue *Queue) { - std::ignore = NativeHandleDesc; - std::ignore = Properties; - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); -} - pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { @@ -220,18 +198,23 @@ pi_result piQueueFinish(pi_queue Queue) { return pi2ur::piQueueFinish(Queue); } pi_result piQueueFlush(pi_queue Queue) { return pi2ur::piQueueFlush(Queue); } pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle) { + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle, + NativeHandleDesc); } pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, + int32_t NativeHandleDesc, pi_context Context, pi_device Device, bool OwnNativeHandle, + pi_queue_properties *Properties, pi_queue *Queue) { - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); + return pi2ur::piextQueueCreateWithNativeHandle( + NativeHandle, NativeHandleDesc, Context, Device, OwnNativeHandle, + Properties, Queue); } pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index f36ce228d48a2..f21484d657595 100644 --- 
a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1326,17 +1326,10 @@ inline pi_result piQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -inline pi_result piextQueueCreate2(pi_context context, pi_device device, - pi_queue_properties *properties, - pi_queue *queue) { - return pi2ur::piextQueueCreate(context, device, properties, queue); -} - -inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, - pi_context Context, - pi_device Device, - bool OwnNativeHandle, - pi_queue *Queue) { +inline pi_result piextQueueCreateWithNativeHandle( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -1348,29 +1341,45 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - ur_queue_native_properties_t Properties{}; - Properties.isNativeHandleOwned = OwnNativeHandle; - HANDLE_ERRORS(urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, - UrDevice, &Properties, UrQueue)); - return PI_SUCCESS; -} + ur_queue_native_properties_t UrNativeProperties{}; + UrNativeProperties.isNativeHandleOwned = OwnNativeHandle; -inline pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle nativeHandle, int32_t nativeHandleDesc, pi_context context, - pi_device device, bool pluginOwnsNativeHandle, - pi_queue_properties *Properties, pi_queue *queue) { - (void)nativeHandleDesc; - (void)Properties; - return pi2ur::piextQueueCreateWithNativeHandle(nativeHandle, context, device, - pluginOwnsNativeHandle, queue); + ur_queue_properties_t UrProperties{}; + UrProperties.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; + if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) + UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) + UrProperties.flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) + UrProperties.flags |= UR_QUEUE_FLAG_DISCARD_EVENTS; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_LOW; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + + UrNativeProperties.pNext = &UrProperties; + + // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? + std::ignore = NativeHandleDesc; + HANDLE_ERRORS(urQueueCreateWithNativeHandle( + UrNativeHandle, UrContext, UrDevice, &UrNativeProperties, UrQueue)); + return PI_SUCCESS; } inline pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle) { + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + // TODO: How to pass this up in the urQueueGetNativeHandle interface? 
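+  // Per the Level Zero adapter side (urQueueGetNativeHandle), this descriptor
+  // is meant to report whether the returned native handle is an immediate
+  // command list (1) or a regular ze_command_queue_handle_t (0); until the UR
+  // entry point can carry that information, it is simply ignored here.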
+ std::ignore = NativeHandleDesc; + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_native_handle_t UrNativeQueue{}; @@ -1381,14 +1390,6 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } -inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - - (void)NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index acff4810f9dc9..20fe7384a9c63 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -109,28 +109,6 @@ __SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -__SYCL_EXPORT pi_result piextQueueCreate2(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); -} - -__SYCL_EXPORT pi_result piextQueueGetNativeHandle2( - pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { - std::ignore = NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - -__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, - pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, - pi_queue *Queue) { - std::ignore = NativeHandleDesc; - std::ignore = Properties; - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); -} - __SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { return pi2ur::piQueueRelease(Queue); } @@ -724,16 +702,19 @@ __SYCL_EXPORT pi_result piextContextCreateWithNativeHandle( NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); } -__SYCL_EXPORT pi_result -piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +__SYCL_EXPORT pi_result piextQueueGetNativeHandle( + pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle, + NativeHandleDesc); } __SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( - pi_native_handle NativeHandle, pi_context Context, pi_device Device, - bool OwnNativeHandle, pi_queue *Queue) { - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreateWithNativeHandle( + NativeHandle, NativeHandleDesc, Context, Device, OwnNativeHandle, + Properties, Queue); } __SYCL_EXPORT pi_result piMemRelease(pi_mem Mem) { @@ -1068,9 +1049,6 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piQueueFlush) _PI_API(piextQueueGetNativeHandle) _PI_API(piextQueueCreateWithNativeHandle) - _PI_API(piextQueueCreate2) - _PI_API(piextQueueGetNativeHandle2) - _PI_API(piextQueueCreateWithNativeHandle2) _PI_API(piProgramCreate) _PI_API(piProgramBuild) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index efd3538887f93..7137bf9e3c11b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -463,32 +463,66 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( // Lock automatically releases when this goes out of scope. std::shared_lock lock(Queue->Mutex); - auto ZeQueue = ur_cast(NativeQueue); - - // Extract a Level Zero compute queue handle from the given PI queue + // Get handle to this thread's queue group. auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); - uint32_t QueueGroupOrdinalUnused; - *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + + if (Queue->UsingImmCmdLists) { + auto ZeCmdList = ur_cast(NativeQueue); + // Extract the Level Zero command list handle from the given PI queue + *ZeCmdList = QueueGroup.getImmCmdList()->first; + // TODO: How to pass this up in the urQueueGetNativeHandle interface? + // *NativeHandleDesc = true; + } else { + auto ZeQueue = ur_cast(NativeQueue); + + // Extract a Level Zero compute queue handle from the given PI queue + auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); + uint32_t QueueGroupOrdinalUnused; + *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + // TODO: How to pass this up in the urQueueGetNativeHandle interface? + // *NativeHandleDesc = false; + } return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::pi_queue_group_t::setImmCmdList( + ze_command_list_handle_t ZeCommandList) { + ImmCmdLists = std::vector( + 1, + Queue->CommandListMap + .insert(std::pair{ + ZeCommandList, {nullptr, true, false, nullptr, 0}}) + .first); +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, /// - const ur_queue_native_properties_t *Properties, /// + const ur_queue_native_properties_t *NativeProperties, /// ur_queue_handle_t *RetQueue ///< [out] pointer to the handle of the queue object created. ) { - auto ZeQueue = ur_cast(NativeQueue); - // Assume this is the "0" index queue in the compute command-group. - std::vector ZeQueues{ZeQueue}; + bool OwnNativeHandle = false; + ur_queue_flags_t Flags{}; - // TODO: see what we can do to correctly initialize PI queue for - // compute vs. copy Level-Zero queue. Currently we will send - // all commands to the "ZeQueue". - std::vector ZeroCopyQueues; + if (NativeProperties) { + OwnNativeHandle = NativeProperties->isNativeHandleOwned; + if (NativeProperties->pNext) { + const ur_base_properties_t *extendedProperties = + reinterpret_cast( + NativeProperties->pNext); + if (extendedProperties->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + const ur_queue_properties_t *UrProperties = + reinterpret_cast(extendedProperties); + Flags = UrProperties->flags; + } + } + } + + // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? + int32_t NativeHandleDesc = 0; // Get the device handle from first device in the platform // Maybe this is not completely correct. 
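For reference, the native-handle interop path that these urQueueCreateWithNativeHandle hunks implement can be exercised from client code roughly as in the following sketch, which wraps an existing Level Zero command queue into a UR queue. This is an illustrative example only, not code from this patch: the context, device, and ZeQueue handles are assumed to be created elsewhere, and the ur_api.h / ze_api.h header names are assumptions.

    #include <level_zero/ze_api.h>
    #include <ur_api.h>

    // Wrap an already-created Level Zero queue into a UR queue. The caller
    // keeps ownership of the native handle (isNativeHandleOwned = false), so
    // the adapter will not destroy ZeQueue when the UR queue is released.
    ur_result_t WrapZeQueue(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                            ze_command_queue_handle_t ZeQueue,
                            ur_queue_handle_t *OutQueue) {
      // Optional queue flags, chained through pNext in the same way the
      // pi2ur layer above does when translating PI queue properties.
      ur_queue_properties_t Props{};
      Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
      Props.flags = UR_QUEUE_FLAG_PROFILING_ENABLE;

      ur_queue_native_properties_t NativeProps{};
      NativeProps.isNativeHandleOwned = false;
      NativeProps.pNext = &Props;

      return urQueueCreateWithNativeHandle(
          reinterpret_cast<ur_native_handle_t>(ZeQueue), Ctx, Dev, &NativeProps,
          OutQueue);
    }

Passing isNativeHandleOwned = true instead should hand ownership to the adapter, which then destroys the Level Zero queue when the UR queue is released.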
@@ -502,15 +536,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
                                     nullptr));
   }
 
-  try {
-    ur_queue_handle_t_ *Queue =
-        new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, Context, UrDevice,
-                               Properties->isNativeHandleOwned);
-    *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
-  } catch (const std::bad_alloc &) {
-    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
-  } catch (...) {
-    return UR_RESULT_ERROR_UNKNOWN;
+  // The NativeHandleDesc has value 1 if the native handle is an immediate
+  // command list.
+  if (NativeHandleDesc == 1) {
+    std::vector<ze_command_queue_handle_t> ComputeQueues{nullptr};
+    std::vector<ze_command_queue_handle_t> CopyQueues;
+
+    try {
+      ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
+          ComputeQueues, CopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
+      *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+    } catch (const std::bad_alloc &) {
+      return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    } catch (...) {
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
+    auto &InitialGroup = (*RetQueue)->ComputeQueueGroupsByTID.begin()->second;
+    InitialGroup.setImmCmdList(ur_cast<ze_command_list_handle_t>(NativeQueue));
+  } else {
+    auto ZeQueue = ur_cast<ze_command_queue_handle_t>(NativeQueue);
+    // Assume this is the "0" index queue in the compute command-group.
+    std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
+
+    // TODO: see what we can do to correctly initialize PI queue for
+    // compute vs. copy Level-Zero queue. Currently we will send
+    // all commands to the "ZeQueue".
+    std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
+
+    try {
+      ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
+          ZeQueues, ZeroCopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
+      *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+    } catch (const std::bad_alloc &) {
+      return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    } catch (...) {
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
   }
 
   return UR_RESULT_SUCCESS;
@@ -757,6 +818,8 @@ ur_queue_handle_t_::ur_queue_handle_t_(
     bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex)
     : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
       Properties(Properties) {
+  // Set the type of commandlists the queue will use.
+  UsingImmCmdLists = Device->useImmediateCommandLists();
   // Compute group initialization.
   // First, see if the queue's device allows for round-robin or it is
   // fixed to one particular compute CCS (it is so for sub-sub-devices).
@@ -766,7 +829,7 @@ ur_queue_handle_t_::ur_queue_handle_t_(
     ComputeQueueGroup.ZeQueues = ComputeQueues;
     // Create space to hold immediate commandlists corresponding to the
     // ZeQueues
-    if (Device->ImmCommandListUsed) {
+    if (UsingImmCmdLists) {
       ComputeQueueGroup.ImmCmdLists = std::vector<ur_command_list_ptr_t>(
           ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
     }
@@ -798,7 +861,7 @@ ur_queue_handle_t_::ur_queue_handle_t_(
       die("No compute queue available/allowed.");
     }
   }
-  if (Device->ImmCommandListUsed) {
+  if (UsingImmCmdLists) {
     // Create space to hold immediate commandlists corresponding to the
     // ZeQueues
     ComputeQueueGroup.ImmCmdLists = std::vector<ur_command_list_ptr_t>(
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
index 4a5a6fe8b731d..81b02825ecff9 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
@@ -121,6 +121,9 @@ struct ur_queue_handle_t_ : _ur_object {
   // queues and the value of the queue group ordinal.
   ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal);
 
+  // This function sets an immediate commandlist from the interop interface.
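+  // The given handle is recorded in the queue's CommandListMap and becomes
+  // the compute group's single immediate commandlist entry.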
+ void setImmCmdList(ze_command_list_handle_t); + // This function returns the next immediate commandlist to use. ur_command_list_ptr_t &getImmCmdList(); @@ -195,6 +198,12 @@ struct ur_queue_handle_t_ : _ur_object { // Therefore it can be accessed without holding a lock on this _pi_queue. const ur_device_handle_t Device; + // A queue may use either standard or immediate commandlists. At queue + // construction time this is set based on the device and any env var settings + // that change the default for the device type. When an interop queue is + // constructed, the caller chooses the type of commandlists to use. + bool UsingImmCmdLists = false; + // Keeps track of the event associated with the last enqueued command into // this queue. this is used to add dependency with the last command to add // in-order semantics and updated with the latest event each time a new From 3cd033c726e0389cfcfc77a837444f20b007af21 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 18 May 2023 09:25:00 -0700 Subject: [PATCH 44/50] Port [SYCL] [L0] Correct the device id check for PVC https://github.com/intel/llvm/pull/9503 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_device.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 4bc56c6fc5108..ca010ef3e0b06 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -142,8 +142,14 @@ struct ur_device_handle_t_ : _ur_object { bool isSubDevice() { return RootDevice != nullptr; } - // Is this a Data Center GPU Max series (aka PVC). - bool isPVC() { return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; } + // Is this a Data Center GPU Max series (aka PVC)? + // TODO: change to use + // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t + // when that is stable. + bool isPVC() { + return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0 || + (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; + } // Does this device represent a single compute slice? bool isCCS() const { From b17b2d44bc72cae393a1f88b5332ec73b91bf684 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 22 May 2023 16:59:37 -0700 Subject: [PATCH 45/50] Port [SYCL][L0] Optimize barrier for in-order queue https://github.com/intel/llvm/pull/9446 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_event.cpp | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 6d14ae2176681..0710ef349a519 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -122,6 +122,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( return UR_RESULT_SUCCESS; } +// Control if wait with barrier is implemented by signal of an event +// as opposed by true barrier command for in-order queue. +static const bool InOrderBarrierBySignal = [] { + const char *UrRet = std::getenv("UR_L0_IN_ORDER_BARRIER_BY_SIGNAL"); + return (UrRet ? 
std::atoi(UrRet) : true); +}(); + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list @@ -144,16 +151,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( [&Queue](ur_command_list_ptr_t CmdList, const _ur_ze_event_list_t &EventWaitList, ur_event_handle_t &Event, bool IsInternal) { + // For in-order queue and empty wait-list just use the last command + // event as the barrier event. + if (Queue->isInOrderQueue() && !EventWaitList.Length && + Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { + UR_CALL(urEventRetain(Queue->LastCommandEvent)); + Event = Queue->LastCommandEvent; + return UR_RESULT_SUCCESS; + } + UR_CALL(createEventAndAssociateQueue( Queue, &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, IsInternal)); Event->WaitList = EventWaitList; - ZE2UR_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); + + // For in-order queue we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. + // + // TODO: this and other special handling of in-order queues to be + // updated when/if Level Zero adds native support for in-order queues. + // + if (Queue->isInOrderQueue() && InOrderBarrierBySignal) { + if (EventWaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CmdList->first, EventWaitList.Length, + EventWaitList.ZeEventList)); + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CmdList->first, Event->ZeEvent)); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + } return UR_RESULT_SUCCESS; }; + // If the queue is in-order then each command in it effectively acts as a + // barrier, so we don't need to do anything except if we were requested + // a "barrier" event to be created. Or if we need to wait for events in + // potentially different queues. + // + if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { + return UR_RESULT_SUCCESS; + } + ur_event_handle_t InternalEvent; bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; From 438221084e63d2c05ec1a09a0135340a90072f13 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 22 May 2023 17:34:47 -0700 Subject: [PATCH 46/50] Port [SYCL] [L0] Recycle immediate command lists for queues in a context https://github.com/intel/llvm/pull/9409 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 22 +++-- .../level_zero/ur_level_zero_context.hpp | 8 +- .../level_zero/ur_level_zero_queue.cpp | 96 ++++++++++++++----- .../level_zero/ur_level_zero_queue.hpp | 12 ++- 4 files changed, 102 insertions(+), 36 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index c177926c24c30..34b4e5ceb7229 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -383,7 +383,8 @@ ur_result_t ur_context_handle_t_::finalize() { std::scoped_lock Lock(ZeCommandListCacheMutex); for (auto &List : ZeComputeCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) if (ZeCommandList) { auto ZeResult = @@ -395,7 +396,8 @@ ur_result_t ur_context_handle_t_::finalize() { } } for (auto &List : ZeCopyCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) { auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); // Gracefully handle the case that L0 was already unloaded. @@ -647,7 +649,7 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( for (auto ZeCommandListIt = ZeCommandListCache.begin(); ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - auto &ZeCommandList = *ZeCommandListIt; + auto &ZeCommandList = ZeCommandListIt->first; auto it = Queue->CommandListMap.find(ZeCommandList); if (it != Queue->CommandListMap.end()) { if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) @@ -671,12 +673,14 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( ze_fence_handle_t ZeFence; ZeStruct ZeFenceDesc; ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandList = Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, false, - ZeCommandQueue, - QueueGroupOrdinal}) - .first; + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; + CommandList = + Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, false, + ZeCommandQueue, ZeQueueDesc}) + .first; } ZeCommandListCache.erase(ZeCommandListIt); if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index cc1775d87f3c9..a945826d8fb8c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -80,9 +80,13 @@ struct ur_context_handle_t_ : _ur_object { // application must only use the command list for the device, or its // sub-devices, which was provided during creation." 
// - std::unordered_map> + std::unordered_map>>> ZeComputeCommandListCache; - std::unordered_map> + std::unordered_map>>> ZeCopyCommandListCache; // Store USM allocator context(internal allocator structures) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 7137bf9e3c11b..5ee786a1fb3b7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -435,6 +435,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return ze2urResult(ZeResult); } + if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) { + std::scoped_lock Lock( + Queue->Context->ZeCommandListCacheMutex); + const pi_command_list_info_t &MapEntry = it->second; + if (MapEntry.CanReuse) { + // Add commandlist to the cache for future use. + // It will be deleted when the context is destroyed. + auto &ZeCommandListCache = + MapEntry.isCopy(Queue) + ? Queue->Context + ->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc}); + } else { + // A non-reusable comamnd list that came from a make_queue call is + // destroyed since it cannot be recycled. + ze_command_list_handle_t ZeCommandList = it->first; + if (ZeCommandList) { + ZE2UR_CALL(zeCommandListDestroy, (ZeCommandList)); + } + } + } } Queue->CommandListMap.clear(); } @@ -488,11 +511,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( void ur_queue_handle_t_::pi_queue_group_t::setImmCmdList( ze_command_list_handle_t ZeCommandList) { + // An immediate command list was given to us but we don't have the queue + // descriptor information. Create a dummy and note that it is not recycleable. + ZeStruct ZeQueueDesc; ImmCmdLists = std::vector( 1, Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, 0}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeQueueDesc, false}}) .first); } @@ -1608,14 +1635,15 @@ ur_result_t ur_queue_handle_t_::resetCommandList( UseCopyEngine ? 
this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; - ZeCommandListCache.push_back(CommandList->first); + ZeCommandListCache.push_back( + {CommandList->first, CommandList->second.ZeQueueDesc}); } return UR_RESULT_SUCCESS; } bool pi_command_list_info_t::isCopy(ur_queue_handle_t Queue) const { - return ZeQueueGroupOrdinal != + return ZeQueueDesc.ordinal != (uint32_t)Queue->Device ->QueueGroup [ur_device_handle_t_::queue_group_info_t::type::Compute] @@ -1773,10 +1801,11 @@ ur_result_t ur_queue_handle_t_::createCommandList( &ZeCommandListDesc, &ZeCommandList)); ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( - ZeCommandList, - {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal})); + ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); @@ -1886,29 +1915,50 @@ ur_command_list_ptr_t &ur_queue_handle_t_::pi_queue_group_t::getImmCmdList() { ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; } - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); + // Check if context's command list cache has an immediate command list with + // matching index. + ze_command_list_handle_t ZeCommandList = nullptr; + { + // Acquire lock to avoid race conditions. + std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for every + // unique ZeDevice. + auto &ZeCommandListCache = + isCopy() + ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + for (auto ZeCommandListIt = ZeCommandListCache.begin(); + ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { + const auto &Desc = (*ZeCommandListIt).second; + if (Desc.index == ZeCommandQueueDesc.index && + Desc.flags == ZeCommandQueueDesc.flags && + Desc.mode == ZeCommandQueueDesc.mode && + Desc.priority == ZeCommandQueueDesc.priority) { + ZeCommandList = (*ZeCommandListIt).first; + ZeCommandListCache.erase(ZeCommandListIt); + break; + } + } + } - ze_command_list_handle_t ZeCommandList; - ZE_CALL_NOCHECK(zeCommandListCreateImmediate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeCommandList)); + // If cache didn't contain a command list, create one. 
+ if (!ZeCommandList) { + urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " + "(round robin in [%d, %d]) priority = %s\n", + ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, + UpperIndex, Priority); + + ZE_CALL_NOCHECK(zeCommandListCreateImmediate, + (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandQueueDesc, &ZeCommandList)); + } ImmCmdLists[Index] = Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeCommandQueueDesc}}) .first; - // Add this commandlist to the cache so it can be destroyed as part of - // urQueueReleaseInternal - auto QueueType = Type; - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - auto &ZeCommandListCache = - QueueType == queue_type::Compute - ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice] - : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]; - ZeCommandListCache.push_back(ZeCommandList); return ImmCmdLists[Index]; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 81b02825ecff9..4f8e47f0ab5df 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -55,8 +55,16 @@ struct pi_command_list_info_t { // Record the queue to which the command list will be submitted. ze_command_queue_handle_t ZeQueue{nullptr}; - // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr - uint32_t ZeQueueGroupOrdinal{0}; + + // Record the queue descriptor fields used when creating the command list + // because we cannot recover these fields from the command list. Immediate + // command lists are recycled across queues and then all fields are used. For + // standard command lists only the ordinal is used. For queues created through + // the make_queue API the descriptor is unavailable so a dummy descriptor is + // used and then this entry is marked as not eligible for recycling. + ZeStruct ZeQueueDesc; + bool CanReuse{true}; + // Helper functions to tell if this is a copy command-list. 
bool isCopy(ur_queue_handle_t Queue) const; From 3e9fb4eef5cc469f387eea5633a42666d7a1f3b2 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 23 May 2023 10:14:43 -0700 Subject: [PATCH 47/50] Rebase ur_loader and headers this to absorb latest changes in queue native handle APIs Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 29 ++++++++++------- .../level_zero/ur_level_zero_device.cpp | 4 +++ .../level_zero/ur_level_zero_platform.cpp | 6 +++- .../level_zero/ur_level_zero_program.cpp | 4 +++ .../level_zero/ur_level_zero_queue.cpp | 29 ++++++++++++----- .../level_zero/ur_level_zero_sampler.cpp | 4 +++ .../adapters/level_zero/ur_level_zero_usm.cpp | 31 ++++++++++++++++--- .../level_zero/ur_loader_interface.cpp | 4 ++- 9 files changed, 87 insertions(+), 26 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 177537363380e..c912025f2991b 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 0125b2b42aea73c350f7961cd68e0f1f94cc1238) + set(UNIFIED_RUNTIME_TAG 620ddb1e8bb1f5ef6cc775edf79ba4674057fe2e) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index f21484d657595..4a1ca333e0977 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -498,7 +498,8 @@ piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, ur_platform_handle_t UrPlatform{}; ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); - urPlatformCreateWithNativeHandle(UrNativeHandle, &UrPlatform); + ur_platform_native_properties_t UrProperties{}; + urPlatformCreateWithNativeHandle(UrNativeHandle, &UrProperties, &UrPlatform); *Platform = reinterpret_cast(UrPlatform); @@ -1004,8 +1005,9 @@ piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, ur_platform_handle_t UrPlatform = reinterpret_cast(Platform); auto UrDevice = reinterpret_cast(Device); - HANDLE_ERRORS( - urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, UrDevice)); + ur_device_native_properties_t UrProperties{}; + HANDLE_ERRORS(urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, + &UrProperties, UrDevice)); return PI_SUCCESS; } @@ -1361,10 +1363,13 @@ inline pi_result piextQueueCreateWithNativeHandle( if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + ur_queue_native_desc_t UrNativeDesc{}; + UrNativeDesc.stype = UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC; + UrNativeDesc.pNativeData = &NativeHandleDesc; + + UrProperties.pNext = &UrNativeDesc; UrNativeProperties.pNext = &UrProperties; - // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? 
- std::ignore = NativeHandleDesc; HANDLE_ERRORS(urQueueCreateWithNativeHandle( UrNativeHandle, UrContext, UrDevice, &UrNativeProperties, UrQueue)); return PI_SUCCESS; @@ -1377,13 +1382,13 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - // TODO: How to pass this up in the urQueueGetNativeHandle interface? - std::ignore = NativeHandleDesc; + ur_queue_native_desc_t UrNativeDesc{}; + UrNativeDesc.pNativeData = NativeHandleDesc; ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_native_handle_t UrNativeQueue{}; - HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeQueue)); + HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeDesc, &UrNativeQueue)); *NativeHandle = reinterpret_cast(UrNativeQueue); @@ -1967,7 +1972,7 @@ inline pi_result piextProgramGetNativeHandle(pi_program Program, inline pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, - pi_context Context, bool ownNativeHandle, + pi_context Context, bool OwnNativeHandle, pi_program *Program) { PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); @@ -1979,8 +1984,10 @@ piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(Context); ur_program_handle_t *UrProgram = reinterpret_cast(Program); - HANDLE_ERRORS( - urProgramCreateWithNativeHandle(NativeProgram, UrContext, UrProgram)); + ur_program_native_properties_t UrProperties{}; + UrProperties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urProgramCreateWithNativeHandle(NativeProgram, UrContext, + &UrProperties, UrProgram)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index f3d242f7f4e5d..dc9f6a9f7069d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -1233,9 +1233,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. ur_platform_handle_t Platform, ///< [in] handle of the platform instance + const ur_device_native_properties_t + *Properties, ///< [in][optional] pointer to native device properties + ///< struct. ur_device_handle_t *Device ///< [out] pointer to the handle of the device object created. ) { + std::ignore = Properties; auto ZeDevice = ur_cast(NativeDevice); // The SYCL spec requires that the set of devices must remain fixed for the diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 469c39d3e668c..61ef0f98b5683 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -306,10 +306,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t - NativePlatform, ///< [in] the native handle of the platform. + NativePlatform, ///< [in] the native handle of the platform. + const ur_platform_native_properties_t + *Properties, ///< [in][optional] pointer to native platform properties + ///< struct. 
ur_platform_handle_t *Platform ///< [out] pointer to the handle of the ///< platform object created. ) { + std::ignore = Properties; auto ZeDriver = ur_cast(NativePlatform); uint32_t NumPlatforms = 0; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 5519f7e2254bd..6604ca073bc6a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -719,9 +719,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t NativeProgram, ///< [in] the native handle of the program. ur_context_handle_t Context, ///< [in] handle of the context instance + const ur_program_native_properties_t + *Properties, ///< [in][optional] pointer to native program properties + ///< struct. ur_program_handle_t *Program ///< [out] pointer to the handle of the ///< program object created. ) { + std::ignore = Properties; auto ZeModule = ur_cast(NativeProgram); // We assume here that programs created from a native handle always diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 5ee786a1fb3b7..730bb6542e7b7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -480,12 +480,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t Queue, ///< [in] handle of the queue. + ur_queue_native_desc_t *Desc, ur_native_handle_t *NativeQueue ///< [out] a pointer to the native handle of the queue. ) { // Lock automatically releases when this goes out of scope. std::shared_lock lock(Queue->Mutex); + int32_t NativeHandleDesc{}; + // Get handle to this thread's queue group. auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); @@ -494,7 +497,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( // Extract the Level Zero command list handle from the given PI queue *ZeCmdList = QueueGroup.getImmCmdList()->first; // TODO: How to pass this up in the urQueueGetNativeHandle interface? - // *NativeHandleDesc = true; + NativeHandleDesc = true; } else { auto ZeQueue = ur_cast(NativeQueue); @@ -503,9 +506,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( uint32_t QueueGroupOrdinalUnused; *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); // TODO: How to pass this up in the urQueueGetNativeHandle interface? 
- // *NativeHandleDesc = false; + NativeHandleDesc = false; } + if (Desc && Desc->pNativeData) + *(reinterpret_cast((Desc->pNativeData))) = NativeHandleDesc; + return UR_RESULT_SUCCESS; } @@ -533,24 +539,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ) { bool OwnNativeHandle = false; ur_queue_flags_t Flags{}; + int32_t NativeHandleDesc{}; if (NativeProperties) { OwnNativeHandle = NativeProperties->isNativeHandleOwned; - if (NativeProperties->pNext) { + void *pNext = NativeProperties->pNext; + while (pNext) { const ur_base_properties_t *extendedProperties = - reinterpret_cast( - NativeProperties->pNext); + reinterpret_cast(pNext); if (extendedProperties->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { const ur_queue_properties_t *UrProperties = reinterpret_cast(extendedProperties); Flags = UrProperties->flags; + } else if (extendedProperties->stype == + UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC) { + const ur_queue_native_desc_t *UrNativeDesc = + reinterpret_cast( + extendedProperties); + if (UrNativeDesc->pNativeData) + NativeHandleDesc = + *(reinterpret_cast((UrNativeDesc->pNativeData))); } + pNext = extendedProperties->pNext; } } - // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? - int32_t NativeHandleDesc = 0; - // Get the device handle from first device in the platform // Maybe this is not completely correct. uint32_t NumEntries = 1; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index bf32fdd9367d0..e7330bd5078b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -161,11 +161,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( ur_native_handle_t NativeSampler, ///< [in] the native handle of the sampler. ur_context_handle_t Context, ///< [in] handle of the context object + const ur_sampler_native_properties_t + *Properties, ///< [in][optional] pointer to native sampler properties + ///< struct. ur_sampler_handle_t *Sampler ///< [out] pointer to the handle of the ///< sampler object created. 
) { std::ignore = NativeSampler; std::ignore = Context; + std::ignore = Properties; std::ignore = Sampler; urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index 2a5effb541ad7..0b0cc51c845d9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -532,12 +532,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +ur_result_t +urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUSMPoolGetInfo( + ur_usm_pool_handle_t Pool, ///< [in] handle of the USM memory pool + ur_usm_pool_info_t PropName, ///< [in] name of the pool property to query + size_t PropSize, ///< [in] size in bytes of the pool property value provided + void *PropValue, ///< [out][typename(propName, propSize)] value of the pool + ///< property + size_t *PropSizeRet ///< [out] size in bytes returned in pool property value ) { - std::ignore = Context; std::ignore = Pool; + std::ignore = PropName; + std::ignore = PropSize; + std::ignore = PropValue; + std::ignore = PropSizeRet; urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 0d37c805bfb2b..0e2c5bc85bf71 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -272,7 +272,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable( pDdiTable->pfnFree = urUSMFree; pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; pDdiTable->pfnPoolCreate = urUSMPoolCreate; - pDdiTable->pfnPoolDestroy = urUSMPoolDestroy; + pDdiTable->pfnPoolRetain = urUSMPoolRetain; + pDdiTable->pfnPoolRelease = urUSMPoolRelease; + pDdiTable->pfnPoolGetInfo = urUSMPoolGetInfo; return retVal; } From 7ce01a783dc05bff1bbf96112e4ce8b33cac8189 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 23 May 2023 10:42:58 -0700 Subject: [PATCH 48/50] Port [SYCL] Properly install UR libraries https://github.com/intel/llvm/pull/9555 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c912025f2991b..a7b8a1e8e8b31 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -130,7 +130,7 @@ if (TARGET UnifiedRuntimeLoader) # TODO: this is piggy-backing 
on the existing target component level-zero-sycl-dev # When UR is moved to its separate repo perhaps we should introduce new component, # e.g. unified-runtime-sycl-dev. - install(TARGETS loader + install(TARGETS ur_loader LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev RUNTIME DESTINATION "bin" COMPONENT level-zero-sycl-dev From 7a3deca2db1cf96538ba67db53ebc5a1221b5828 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 23 May 2023 18:07:45 -0400 Subject: [PATCH 49/50] Add implementation of USM pools (#11) Signed-off-by: Brandon Yates --- .../level_zero/ur_level_zero_device.cpp | 2 +- .../adapters/level_zero/ur_level_zero_usm.cpp | 135 +++++++++++++----- .../adapters/level_zero/ur_level_zero_usm.hpp | 17 +++ 3 files changed, 117 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index dc9f6a9f7069d..7b95bb9bf5b1a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -573,7 +573,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { - uint64_t Capabilities = 0; + ur_device_usm_access_capability_flags_t Capabilities = 0; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index 0b0cc51c845d9..9f215d06d85a8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -24,17 +24,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM host memory object ) { - std::ignore = Pool; - uint32_t Align = USMDesc->align; + uint32_t Align = USMDesc ? USMDesc->align : 0; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is // guarding contexts container in the platform. This prevents new kernels from @@ -77,7 +73,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // find the allocator depending on context as we do for Shared and Device // allocations. 
try { - *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (Pool) { + *RetMem = Pool->HostMemPool->allocate(Size, Align); + } else { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + } if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -105,18 +105,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM device memory object ) { - std::ignore = Pool; - uint32_t Alignment = USMDesc->align; + uint32_t Alignment = USMDesc ? USMDesc->align : 0; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - ur_platform_handle_t Plt = Device->Platform; // If indirect access tracking is enabled then lock the mutex which is @@ -157,11 +153,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( } try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - *RetMem = It->second.allocate(Size, Alignment); + if (Pool) { + *RetMem = Pool->DeviceMemPools[Device]->allocate(Size, Alignment); + } else { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + } if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -190,9 +191,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM shared memory object ) { - std::ignore = Pool; - uint32_t Alignment = USMDesc->align; + uint32_t Alignment = USMDesc ? USMDesc->align : 0; ur_usm_host_mem_flags_t UsmHostFlags{}; @@ -200,7 +200,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( bool DeviceReadOnly = false; ur_usm_device_mem_flags_t UsmDeviceFlags{}; - void *pNext = const_cast(USMDesc->pNext); + void *pNext = USMDesc ? const_cast(USMDesc->pNext) : nullptr; while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = reinterpret_cast(pNext); @@ -259,13 +259,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( } try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); + if (Pool) { + if (DeviceReadOnly) { + *RetMem = + Pool->SharedMemReadOnlyPools[Device]->allocate(Size, Alignment); + } else { + *RetMem = Pool->SharedMemPools[Device]->allocate(Size, Alignment); + } + } else { + auto &Allocator = + (DeviceReadOnly ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + } if (DeviceReadOnly) { Context->SharedReadOnlyAllocs.insert(*RetMem); } @@ -518,6 +528,56 @@ static ur_result_t USMAllocationMakeResident( return UR_RESULT_SUCCESS; } +ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, + ur_usm_pool_desc_t *PoolDesc) { + + zeroInit = static_cast(PoolDesc->flags & + UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK); + + void *pNext = const_cast(PoolDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + switch (BaseDesc->stype) { + case UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC: { + const ur_usm_pool_limits_desc_t *Limits = + reinterpret_cast(BaseDesc); + for (auto &config : USMAllocatorConfigs.Configs) { + config.MaxPoolableSize = Limits->maxPoolableSize; + config.SlabMinSize = Limits->minDriverAllocSize; + } + break; + } + default: { + urPrint("urUSMPoolCreate: unexpected chained stype\n"); + throw UsmAllocationException(UR_RESULT_ERROR_INVALID_ARGUMENT); + } + } + pNext = const_cast(BaseDesc->pNext); + } + + HostMemPool = std::make_unique( + std::unique_ptr(new USMHostMemoryAlloc(Context)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Host]); + + for (auto device : Context->Devices) { + DeviceMemPools[device] = std::make_unique( + std::unique_ptr( + new USMDeviceMemoryAlloc(Context, device)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Device]); + + SharedMemPools[device] = std::make_unique( + std::unique_ptr( + new USMSharedMemoryAlloc(Context, device)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Shared]); + SharedMemReadOnlyPools[device] = std::make_unique( + std::unique_ptr( + new USMSharedMemoryAlloc(Context, device)), + this->USMAllocatorConfigs + .Configs[usm_settings::MemType::SharedReadOnly]); + } +} + UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_usm_pool_desc_t @@ -525,27 +585,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( ///< ::ur_usm_pool_limits_desc_t ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool ) { - std::ignore = Context; - std::ignore = PoolDesc; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + try { + *Pool = reinterpret_cast( + new ur_usm_pool_handle_t_(Context, PoolDesc)); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool ) { - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + Pool->RefCount.increment(); + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool ) { - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + if (Pool->RefCount.decrementAndTest()) { + delete Pool; + } + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolGetInfo( diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp index 
ba0130089906e..a53b6d35712f9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp @@ -9,6 +9,23 @@ #include "ur_level_zero_common.hpp" +struct ur_usm_pool_handle_t_ : _ur_object { + bool zeroInit; + + usm_settings::USMAllocatorConfig USMAllocatorConfigs; + + std::unique_ptr HostMemPool; + std::unordered_map> + SharedMemPools; + std::unordered_map> + SharedMemReadOnlyPools; + std::unordered_map> + DeviceMemPools; + + ur_usm_pool_handle_t_(ur_context_handle_t Context, + ur_usm_pool_desc_t *PoolDesc); +}; + // Exception type to pass allocation errors class UsmAllocationException { const ur_result_t Error; From 8b2170d5847a6ab2c958216832abb8869e2184d8 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 26 May 2023 07:03:01 -0700 Subject: [PATCH 50/50] Port [SYCL][L0] Check if ZE call count had started https://github.com/intel/llvm/pull/9610 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 61ef0f98b5683..db7570d795b3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -34,7 +34,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urTearDown( // Print the balance of various create/destroy native calls. // The idea is to verify if the number of create(+) and destroy(-) calls are // matched. - if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { + if (ZeCallCount && (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) != 0) { // clang-format off // // The format of this table is such that each row accounts for a
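To show how the pool support implemented in the last two patches is meant to be driven from the UR API, here is a minimal usage sketch, not part of the patch. It assumes the entry points defined above (urUSMPoolCreate, urUSMDeviceAlloc with a pool argument, urUSMPoolRelease, urUSMFree) plus existing Context and Device handles; the limit values and 64-byte alignment are arbitrary illustrative numbers.

#include <ur_api.h>

// Sketch only: create a USM pool with limits and allocate/free device memory
// through it.
ur_result_t PoolExample(ur_context_handle_t Context, ur_device_handle_t Device) {
  // Optional limits, consumed by the ur_usm_pool_handle_t_ constructor above.
  ur_usm_pool_limits_desc_t Limits{};
  Limits.stype = UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC;
  Limits.maxPoolableSize = 2 * 1024 * 1024; // pool requests up to 2 MiB
  Limits.minDriverAllocSize = 64 * 1024;    // back the pool with >= 64 KiB slabs

  ur_usm_pool_desc_t PoolDesc{};
  PoolDesc.pNext = &Limits;
  PoolDesc.flags = UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK;

  ur_usm_pool_handle_t Pool{};
  ur_result_t Res = urUSMPoolCreate(Context, &PoolDesc, &Pool);
  if (Res != UR_RESULT_SUCCESS)
    return Res;

  // 1 KiB device allocation served from the pool, 64-byte aligned.
  ur_usm_desc_t USMDesc{};
  USMDesc.align = 64;

  void *Mem = nullptr;
  Res = urUSMDeviceAlloc(Context, Device, &USMDesc, Pool, 1024, &Mem);
  if (Res == UR_RESULT_SUCCESS)
    Res = urUSMFree(Context, Mem);

  urUSMPoolRelease(Pool); // pool is deleted once its reference count reaches zero
  return Res;
}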