From 642f267e8ab159e0025efd199fc0286fcf7d5955 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 10 Feb 2023 19:56:41 -0800 Subject: [PATCH 01/50] [SYCL][UR][L0] First version of UR L0 adapter Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/CMakeLists.txt | 4 +- sycl/plugins/level_zero/pi_level_zero.cpp | 8232 +---------------- sycl/plugins/level_zero/pi_level_zero.hpp | 1322 +-- sycl/plugins/level_zero/ur_bindings.hpp | 38 - sycl/plugins/unified_runtime/CMakeLists.txt | 5 +- sycl/plugins/unified_runtime/pi2ur.hpp | 2723 +++++- .../unified_runtime/pi_unified_runtime.cpp | 1017 +- .../ur/adapters/level_zero/ur_level_zero.cpp | 1569 ---- .../ur/adapters/level_zero/ur_level_zero.hpp | 196 +- .../level_zero/ur_level_zero_common.hpp | 169 +- .../level_zero/ur_level_zero_context.cpp | 684 ++ .../level_zero/ur_level_zero_context.hpp | 231 +- .../level_zero/ur_level_zero_device.cpp | 1256 +++ .../level_zero/ur_level_zero_device.hpp | 156 + .../level_zero/ur_level_zero_event.cpp | 1167 +++ .../level_zero/ur_level_zero_event.hpp | 261 +- .../level_zero/ur_level_zero_kernel.cpp | 771 ++ .../level_zero/ur_level_zero_kernel.hpp | 97 + .../adapters/level_zero/ur_level_zero_mem.cpp | 3058 +++++- .../adapters/level_zero/ur_level_zero_mem.hpp | 293 +- .../level_zero/ur_level_zero_module.cpp | 9 - .../level_zero/ur_level_zero_module.hpp | 18 - .../level_zero/ur_level_zero_platform.cpp | 531 ++ .../level_zero/ur_level_zero_platform.hpp | 44 + .../level_zero/ur_level_zero_program.cpp | 758 ++ .../level_zero/ur_level_zero_program.hpp | 123 +- .../level_zero/ur_level_zero_queue.cpp | 1782 ++++ .../level_zero/ur_level_zero_queue.hpp | 502 +- .../level_zero/ur_level_zero_sampler.cpp | 203 + .../level_zero/ur_level_zero_sampler.hpp | 7 +- .../level_zero/ur_loader_interface.cpp | 201 +- sycl/plugins/unified_runtime/ur/ur.hpp | 43 +- sycl/plugins/unified_runtime/ur_bindings.hpp | 41 - 33 files changed, 16142 insertions(+), 11369 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp delete mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp mode change 100755 => 100644 sycl/plugins/unified_runtime/ur_bindings.hpp diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 2b9dff977029d..3cd25f2dc6826 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -106,7 +106,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp" - "../unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp" @@ -117,7 +117,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp" - 
"../unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp" diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 5e53aac3c6a2d..44d747c12b871 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -12,2174 +12,17 @@ /// \ingroup sycl_pi_level_zero #include "pi_level_zero.hpp" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ur/usm_allocator_config.hpp" #include "ur_bindings.hpp" -extern "C" { -// Forward declarartions. -static pi_result piQueueReleaseInternal(pi_queue Queue); -static pi_result piEventReleaseInternal(pi_event Event); -static pi_result EventCreate(pi_context Context, pi_queue Queue, - bool HostVisible, pi_event *RetEvent); -} - // Defined in tracing.cpp void enableZeTracing(); void disableZeTracing(); -namespace { - -// This is an experimental option to test performance of device to device copy -// operations on copy engines (versus compute engine) -static const bool UseCopyEngineForD2DCopy = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); - const char *CopyEngineForD2DCopy = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); -}(); - -// This is an experimental option that allows the use of copy engine, if -// available in the device, in Level Zero plugin for copy operations submitted -// to an in-order queue. The default is 1. -static const bool UseCopyEngineForInOrderQueue = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); - const char *CopyEngineForInOrderQueue = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - return (!CopyEngineForInOrderQueue || - (std::stoi(CopyEngineForInOrderQueue) != 0)); -}(); - -// This is an experimental option that allows the use of multiple command lists -// when submitting barriers. The default is 0. -static const bool UseMultipleCmdlistBarriers = [] { - const char *UrRet = std::getenv("UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); - const char *UseMultipleCmdlistBarriersFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!UseMultipleCmdlistBarriersFlag) - return true; - return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; -}(); - -// This is an experimental option that allows to disable caching of events in -// the context. -static const bool DisableEventsCaching = [] { - const char *UrRet = std::getenv("UR_L0_DISABLE_EVENTS_CACHING"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); - const char *DisableEventsCachingFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!DisableEventsCachingFlag) - return false; - return std::stoi(DisableEventsCachingFlag) != 0; -}(); - -// This is an experimental option that allows reset and reuse of uncompleted -// events in the in-order queue with discard_events property. 
-static const bool ReuseDiscardedEvents = [] { - const char *UrRet = std::getenv("UR_L0_REUSE_DISCARDED_EVENTS"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); - const char *ReuseDiscardedEventsFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!ReuseDiscardedEventsFlag) - return true; - return std::stoi(ReuseDiscardedEventsFlag) > 0; -}(); - -// Due to a bug with 2D memory copy to and from non-USM pointers, this option is -// disabled by default. -static const bool UseMemcpy2DOperations = [] { - const char *UrRet = std::getenv("UR_L0_USE_NATIVE_USM_MEMCPY2D"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); - const char *UseMemcpy2DOperationsFlag = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (!UseMemcpy2DOperationsFlag) - return false; - return std::stoi(UseMemcpy2DOperationsFlag) > 0; -}(); - -// Map from L0 to PI result. -static inline pi_result mapError(ze_result_t Result) { - return ur2piResult(ze2urResult(Result)); -} - -// Trace a call to Level-Zero RT -#define ZE_CALL(ZeName, ZeArgs) \ - { \ - ze_result_t ZeResult = ZeName ZeArgs; \ - if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \ - return mapError(Result); \ - } - -// Trace an internal PI call; returns in case of an error. -#define PI_CALL(Call) \ - { \ - if (PrintTrace) \ - fprintf(stderr, "PI ---> %s\n", #Call); \ - pi_result Result = (Call); \ - if (Result != PI_SUCCESS) \ - return Result; \ - } - -// Controls if we should choose doing eager initialization -// to make it happen on warmup paths and have the reportable -// paths be less likely affected. -// -static bool doEagerInit = [] { - const char *UrRet = std::getenv("UR_L0_EAGER_INIT"); - const char *PiRet = std::getenv("SYCL_EAGER_INIT"); - const char *EagerInit = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - return EagerInit ? std::atoi(EagerInit) != 0 : false; -}(); - -// Maximum number of events that can be present in an event ZePool is captured -// here. Setting it to 256 gave best possible performance for several -// benchmarks. -static const pi_uint32 MaxNumEventsPerPool = [] { - const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - const char *MaxNumEventsPerPoolEnv = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - pi_uint32 Result = - MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; - if (Result <= 0) - Result = 256; - return Result; -}(); - -// Helper function to implement zeHostSynchronize. -// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. -// This allows for a much more responsive debugging of hangs. -// -template -ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) { - if (!UrL0Debug) { - return Api(Handle, UINT64_MAX); - } - - ze_result_t R; - while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY) - ; - return R; -} - -// Template function to do various types of host synchronizations. -// This is intended to be used instead of direct calls to specific -// Level-Zero synchronization APIs. 
-// -template ze_result_t zeHostSynchronize(T Handle); -template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) { - return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle); -} -template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) { - return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle); -} - -} // anonymous namespace - -// UR_L0_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in -// which case all compute commands will be submitted to the command-queue -// with the given index in the compute command group. If it is instead set -// to negative then all available compute engines may be used. -// -// The default value is "0". -// -static const std::pair getRangeOfAllowedComputeEngines() { - const char *UrRet = std::getenv("UR_L0_USE_COMPUTE_ENGINE"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); - const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - // If the environment variable is not set only use "0" CCS for now. - // TODO: allow all CCSs when HW support is complete. - if (!EnvVar) - return std::pair(0, 0); - - auto EnvVarValue = std::atoi(EnvVar); - if (EnvVarValue >= 0) { - return std::pair(EnvVarValue, EnvVarValue); - } - - return std::pair(0, INT_MAX); -} - -pi_platform _pi_context::getPlatform() const { return Devices[0]->Platform; } - -bool _pi_context::isValidDevice(pi_device Device) const { - while (Device) { - if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end()) - return true; - Device = Device->RootDevice; - } - return false; -} - -pi_result -_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool, - size_t &Index, bool HostVisible, - bool ProfilingEnabled) { - // Lock while updating event pool machinery. - std::scoped_lock Lock(ZeEventPoolCacheMutex); - - std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled); - - if (!ZePoolCache->empty()) { - if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { - if (DisableEventsCaching) { - // Remove full pool from the cache if events caching is disabled. - ZePoolCache->erase(ZePoolCache->begin()); - } else { - // If event caching is enabled then we don't destroy events so there is - // no need to remove pool from the cache and add it back when it has - // available slots. Just keep it in the tail of the cache so that all - // pools can be destroyed during context destruction. - ZePoolCache->push_front(nullptr); - } - } - } - if (ZePoolCache->empty()) { - ZePoolCache->push_back(nullptr); - } - - // We shall be adding an event to the front pool. 
- ze_event_pool_handle_t *ZePool = &ZePoolCache->front(); - Index = 0; - // Create one event ZePool per MaxNumEventsPerPool events - if (*ZePool == nullptr) { - ZeStruct ZeEventPoolDesc; - ZeEventPoolDesc.count = MaxNumEventsPerPool; - ZeEventPoolDesc.flags = 0; - if (HostVisible) - ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; - if (ProfilingEnabled) - ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; - urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); - - std::vector ZeDevices; - std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) { - ZeDevices.push_back(D->ZeDevice); - }); - - ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(), - &ZeDevices[0], ZePool)); - NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1; - NumEventsUnreleasedInEventPool[*ZePool] = 1; - } else { - Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool]; - --NumEventsAvailableInEventPool[*ZePool]; - ++NumEventsUnreleasedInEventPool[*ZePool]; - } - Pool = *ZePool; - return PI_SUCCESS; -} - -pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) { - std::shared_lock EventLock(Event->Mutex, std::defer_lock); - std::scoped_lock> LockAll( - ZeEventPoolCacheMutex, EventLock); - if (!Event->ZeEventPool) { - // This must be an interop event created on a users's pool. - // Do nothing. - return PI_SUCCESS; - } - - std::list *ZePoolCache = - getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); - - // Put the empty pool to the cache of the pools. - if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) - die("Invalid event release: event pool doesn't have unreleased events"); - if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { - if (ZePoolCache->front() != Event->ZeEventPool) { - ZePoolCache->push_back(Event->ZeEventPool); - } - NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; - } - - return PI_SUCCESS; -} - -// Forward declarations -static pi_result enqueueMemCopyHelper(pi_command_type CommandType, - pi_queue Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, - const void *Src, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, - pi_event *Event, bool PreferCopyEngine); - -static pi_result enqueueMemCopyRectHelper( - pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer, - void *DstBuffer, pi_buff_rect_offset SrcOrigin, - pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, - size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, - size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event, - bool PreferCopyEngine = false); - -bool _pi_queue::doReuseDiscardedEvents() { - return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); -} - -pi_result _pi_queue::resetDiscardedEvent(pi_command_list_ptr_t CommandList) { - if (LastCommandEvent && LastCommandEvent->IsDiscarded) { - ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); - ZE_CALL(zeCommandListAppendEventReset, - (CommandList->first, LastCommandEvent->ZeEvent)); - - // Create new pi_event but with the same ze_event_handle_t. We are going - // to use this pi_event for the next command with discarded event. 
- pi_event PiEvent; - try { - PiEvent = new _pi_event(LastCommandEvent->ZeEvent, - LastCommandEvent->ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (LastCommandEvent->isHostVisible()) - PiEvent->HostVisibleEvent = PiEvent; - - PI_CALL(addEventToQueueCache(PiEvent)); - } - - return PI_SUCCESS; -} - -// This helper function creates a pi_event and associate a pi_queue. -// Note that the caller of this function must have acquired lock on the Queue -// that is passed in. -// \param Queue pi_queue to associate with a new event. -// \param Event a pointer to hold the newly created pi_event -// \param CommandType various command type determined by the caller -// \param CommandList is the command list where the event is added -// \param IsInternal tells if the event is internal, i.e. visible in the L0 -// plugin only. -// \param HostVisible tells if the event must be created in the -// host-visible pool. If not set then this function will decide. -inline static pi_result -createEventAndAssociateQueue(pi_queue Queue, pi_event *Event, - pi_command_type CommandType, - pi_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt) { - - if (!HostVisible.has_value()) { - // Internal/discarded events do not need host-scope visibility. - HostVisible = - IsInternal ? false : Queue->Device->ZeEventsScope == AllHostVisible; - } - - // If event is discarded then try to get event from the queue cache. - *Event = - IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; - - if (*Event == nullptr) - PI_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); - - (*Event)->Queue = Queue; - (*Event)->CommandType = CommandType; - (*Event)->IsDiscarded = IsInternal; - (*Event)->CommandList = CommandList; - // Discarded event doesn't own ze_event, it is used by multiple pi_event - // objects. We destroy corresponding ze_event by releasing events from the - // events cache at queue destruction. Event in the cache owns the Level Zero - // event. - if (IsInternal) - (*Event)->OwnZeEvent = false; - - // Append this Event to the CommandList, if any - if (CommandList != Queue->CommandListMap.end()) { - CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - } - - // We need to increment the reference counter here to avoid pi_queue - // being released before the associated pi_event is released because - // piEventRelease requires access to the associated pi_queue. - // In piEventRelease, the reference counter of the Queue is decremented - // to release it. - Queue->RefCount.increment(); - - // SYCL RT does not track completion of the events, so it could - // release a PI event as soon as that's not being waited in the app. - // But we have to ensure that the event is not destroyed before - // it is really signalled, so retain it explicitly here and - // release in CleanupCompletedEvent(Event). - // If the event is internal then don't increment the reference count as this - // event will not be waited/released by SYCL RT, so it must be destroyed by - // EventRelease in resetCommandList. 
- if (!IsInternal) - PI_CALL(piEventRetain(*Event)); - - return PI_SUCCESS; -} - -pi_result _pi_queue::signalEventFromCmdListIfLastEventDiscarded( - pi_command_list_ptr_t CommandList) { - // We signal new event at the end of command list only if we have queue with - // discard_events property and the last command event is discarded. - if (!(doReuseDiscardedEvents() && LastCommandEvent && - LastCommandEvent->IsDiscarded)) - return PI_SUCCESS; - - // NOTE: We create this "glue" event not as internal so it is not - // participating in the discarded events reset/reuse logic, but - // with no host-visibility since it is not going to be waited - // from the host. - pi_event Event; - PI_CALL(createEventAndAssociateQueue( - this, &Event, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ false)); - PI_CALL(piEventReleaseInternal(Event)); - LastCommandEvent = Event; - - ZE_CALL(zeCommandListAppendSignalEvent, (CommandList->first, Event->ZeEvent)); - return PI_SUCCESS; -} - -pi_event _pi_queue::getEventFromQueueCache(bool HostVisible) { - auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; - - // If we don't have any events, return nullptr. - // If we have only a single event then it was used by the last command and we - // can't use it now because we have to enforce round robin between two events. - if (Cache->size() < 2) - return nullptr; - - // If there are two events then return an event from the beginning of the list - // since event of the last command is added to the end of the list. - auto It = Cache->begin(); - pi_event RetEvent = *It; - Cache->erase(It); - return RetEvent; -} - -pi_result _pi_queue::addEventToQueueCache(pi_event Event) { - auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; - Cache->emplace_back(Event); - return PI_SUCCESS; -} - -// Get value of the threshold for number of events in immediate command lists. -// If number of events in the immediate command list exceeds this threshold then -// cleanup process for those events is executed. -static const size_t ImmCmdListsEventCleanupThreshold = [] { - const char *UrRet = - std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - const char *PiRet = std::getenv( - "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - const char *ImmCmdListsEventCleanupThresholdStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - static constexpr int Default = 1000; - if (!ImmCmdListsEventCleanupThresholdStr) - return Default; - - int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr); - - // Basically disable threshold if negative value is provided. - if (Threshold < 0) - return INT_MAX; - - return Threshold; -}(); - -// Get value of the threshold for number of active command lists allowed before -// we start heuristically cleaning them up. -static const size_t CmdListsCleanupThreshold = [] { - const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); - const char *CmdListsCleanupThresholdStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - static constexpr int Default = 20; - if (!CmdListsCleanupThresholdStr) - return Default; - - int Threshold = std::atoi(CmdListsCleanupThresholdStr); - - // Basically disable threshold if negative value is provided. 
- if (Threshold < 0) - return INT_MAX; - - return Threshold; -}(); - -pi_device _pi_context::getRootDevice() const { - assert(Devices.size() > 0); - - if (Devices.size() == 1) - return Devices[0]; - - // Check if we have context with subdevices of the same device (context - // may include root device itself as well) - pi_device ContextRootDevice = - Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0]; - - // For context with sub subdevices, the ContextRootDevice might still - // not be the root device. - // Check whether the ContextRootDevice is the subdevice or root device. - if (ContextRootDevice->isSubDevice()) { - ContextRootDevice = ContextRootDevice->RootDevice; - } - - for (auto &Device : Devices) { - if ((!Device->RootDevice && Device != ContextRootDevice) || - (Device->RootDevice && Device->RootDevice != ContextRootDevice)) { - ContextRootDevice = nullptr; - break; - } - } - return ContextRootDevice; -} - -pi_result _pi_context::initialize() { - - // Helper lambda to create various USM allocators for a device. - // Note that the CCS devices and their respective subdevices share a - // common ze_device_handle and therefore, also share USM allocators. - auto createUSMAllocators = [this](pi_device Device) { - SharedMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple( - std::unique_ptr( - new USMSharedMemoryAlloc(this, Device)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Shared])); - - SharedReadOnlyMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple(std::unique_ptr( - new USMSharedReadOnlyMemoryAlloc(this, Device)), - USMAllocatorConfigInstance - .Configs[usm_settings::MemType::SharedReadOnly])); - - DeviceMemAllocContexts.emplace( - std::piecewise_construct, std::make_tuple(Device->ZeDevice), - std::make_tuple( - std::unique_ptr( - new USMDeviceMemoryAlloc(this, Device)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Device])); - }; - - // Recursive helper to call createUSMAllocators for all sub-devices - std::function createUSMAllocatorsRecursive; - createUSMAllocatorsRecursive = - [createUSMAllocators, - &createUSMAllocatorsRecursive](pi_device Device) -> void { - createUSMAllocators(Device); - for (auto &SubDevice : Device->SubDevices) - createUSMAllocatorsRecursive(SubDevice); - }; - - // Create USM allocator context for each pair (device, context). - // - for (auto &Device : Devices) { - createUSMAllocatorsRecursive(Device); - } - // Create USM allocator context for host. Device and Shared USM allocations - // are device-specific. Host allocations are not device-dependent therefore - // we don't need a map with device as key. - HostMemAllocContext = std::make_unique( - std::unique_ptr(new USMHostMemoryAlloc(this)), - USMAllocatorConfigInstance.Configs[usm_settings::MemType::Host]); - - // We may allocate memory to this root device so create allocators. - if (SingleRootDevice && - DeviceMemAllocContexts.find(SingleRootDevice->ZeDevice) == - DeviceMemAllocContexts.end()) { - createUSMAllocators(SingleRootDevice); - } - - // Create the immediate command list to be used for initializations. - // Created as synchronous so level-zero performs implicit synchronization and - // there is no need to query for completion in the plugin - // - // TODO: we use Device[0] here as the single immediate command-list - // for buffer creation and migration. 
Initialization is in - // in sync and is always performed to Devices[0] as well but - // D2D migartion, if no P2P, is broken since it should use - // immediate command-list for the specfic devices, and this single one. - // - pi_device Device = SingleRootDevice ? SingleRootDevice : Devices[0]; - - // Prefer to use copy engine for initialization copies, - // if available and allowed (main copy engine with index 0). - ZeStruct ZeCommandQueueDesc; - const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); - ZeCommandQueueDesc.ordinal = - Device->QueueGroup[_pi_device::queue_group_info_t::Compute].ZeOrdinal; - if (Range.first >= 0 && - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal != - -1) - ZeCommandQueueDesc.ordinal = - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy].ZeOrdinal; - - ZeCommandQueueDesc.index = 0; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; - ZE_CALL( - zeCommandListCreateImmediate, - (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit)); - return PI_SUCCESS; -} - -pi_result _pi_context::finalize() { - // This function is called when pi_context is deallocated, piContextRelease. - // There could be some memory that may have not been deallocated. - // For example, event and event pool caches would be still alive. - - if (!DisableEventsCaching) { - std::scoped_lock Lock(EventCacheMutex); - for (auto &EventCache : EventCaches) { - for (auto &Event : EventCache) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - delete Event; - } - EventCache.clear(); - } - } - { - std::scoped_lock Lock(ZeEventPoolCacheMutex); - for (auto &ZePoolCache : ZeEventPoolCache) { - for (auto &ZePool : ZePoolCache) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - ZePoolCache.clear(); - } - } - - // Destroy the command list used for initializations - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - std::scoped_lock Lock(ZeCommandListCacheMutex); - for (auto &List : ZeComputeCommandListCache) { - for (auto &Item : List.second) { - ze_command_list_handle_t ZeCommandList = Item.first; - if (ZeCommandList) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - } - for (auto &List : ZeCopyCommandListCache) { - for (auto &Item : List.second) { - ze_command_list_handle_t ZeCommandList = Item.first; - if (ZeCommandList) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); - // Gracefully handle the case that L0 was already unloaded. 
- if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - } - return PI_SUCCESS; -} - -bool pi_command_list_info_t::isCopy(pi_queue Queue) const { - return ZeQueueDesc.ordinal != - (uint32_t)Queue->Device - ->QueueGroup[_pi_device::queue_group_info_t::type::Compute] - .ZeOrdinal; -} - -bool _pi_queue::isInOrderQueue() const { - // If out-of-order queue property is not set, then this is a in-order queue. - return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == - 0); -} - -bool _pi_queue::isDiscardEvents() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); -} - -bool _pi_queue::isPriorityLow() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); -} - -bool _pi_queue::isPriorityHigh() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); -} - -pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, - bool MakeAvailable, - std::vector &EventListToCleanup, - bool CheckStatus) { - bool UseCopyEngine = CommandList->second.isCopy(this); - - // Immediate commandlists do not have an associated fence. - if (CommandList->second.ZeFence != nullptr) { - // Fence had been signalled meaning the associated command-list completed. - // Reset the fence and put the command list into a cache for reuse in PI - // calls. - ZE_CALL(zeFenceReset, (CommandList->second.ZeFence)); - ZE_CALL(zeCommandListReset, (CommandList->first)); - CommandList->second.ZeFenceInUse = false; - CommandList->second.IsClosed = false; - } - - auto &EventList = CommandList->second.EventList; - // Check if standard commandlist or fully synced in-order queue. - // If one of those conditions is met then we are sure that all events are - // completed so we don't need to check event status. - if (!CheckStatus || CommandList->second.ZeFence != nullptr || - (isInOrderQueue() && !LastCommandEvent)) { - // Remember all the events in this command list which needs to be - // released/cleaned up and clear event list associated with command list. - std::move(std::begin(EventList), std::end(EventList), - std::back_inserter(EventListToCleanup)); - EventList.clear(); - } else if (!isDiscardEvents()) { - // If events in the queue are discarded then we can't check their status. - // Helper for checking of event completion - auto EventCompleted = [](pi_event Event) -> bool { - std::scoped_lock EventLock(Event->Mutex); - ze_result_t ZeResult = - Event->Completed - ? ZE_RESULT_SUCCESS - : ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent)); - return ZeResult == ZE_RESULT_SUCCESS; - }; - // Handle in-order specially as we can just in few checks (with binary - // search) a completed event and then all events before it are also - // done. - if (isInOrderQueue()) { - size_t Bisect = EventList.size(); - size_t Iter = 0; - for (auto it = EventList.rbegin(); it != EventList.rend(); ++Iter) { - if (!EventCompleted(*it)) { - if (Bisect > 1 && Iter < 3) { // Heuristically limit by 3 checks - Bisect >>= 1; - it += Bisect; - continue; - } - break; - } - // Bulk move of event up to "it" to the list ready for cleanup - std::move(it, EventList.rend(), std::back_inserter(EventListToCleanup)); - EventList.erase(EventList.begin(), it.base()); - break; - } - return PI_SUCCESS; - } - // For immediate commandlist reset only those events that have signalled. 
- for (auto it = EventList.begin(); it != EventList.end();) { - // Break early as soon as we found first incomplete event because next - // events are submitted even later. We are not trying to find all - // completed events here because it may be costly. I.e. we are checking - // only elements which are most likely completed because they were - // submitted earlier. It is guaranteed that all events will be eventually - // cleaned up at queue sync/release. - if (!EventCompleted(*it)) - break; - - EventListToCleanup.push_back(std::move((*it))); - it = EventList.erase(it); - } - } - - // Standard commandlists move in and out of the cache as they are recycled. - // Immediate commandlists are always available. - if (CommandList->second.ZeFence != nullptr && MakeAvailable) { - std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex); - auto &ZeCommandListCache = - UseCopyEngine - ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] - : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; - ZeCommandListCache.push_back( - {CommandList->first, CommandList->second.ZeQueueDesc}); - } - - return PI_SUCCESS; -} - -// Configuration of the command-list batching. -struct zeCommandListBatchConfig { - // Default value of 0. This specifies to use dynamic batch size adjustment. - // Other values will try to collect specified amount of commands. - pi_uint32 Size{0}; - - // If doing dynamic batching, specifies start batch size. - pi_uint32 DynamicSizeStart{4}; - - // The maximum size for dynamic batch. - pi_uint32 DynamicSizeMax{64}; - - // The step size for dynamic batch increases. - pi_uint32 DynamicSizeStep{1}; - - // Thresholds for when increase batch size (number of closed early is small - // and number of closed full is high). - pi_uint32 NumTimesClosedEarlyThreshold{3}; - pi_uint32 NumTimesClosedFullThreshold{8}; - - // Tells the starting size of a batch. - pi_uint32 startSize() const { return Size > 0 ? Size : DynamicSizeStart; } - // Tells is we are doing dynamic batch size adjustment. - bool dynamic() const { return Size == 0; } -}; - -// Helper function to initialize static variables that holds batch config info -// for compute and copy command batching. -static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { - zeCommandListBatchConfig Config{}; // default initialize - - // Default value of 0. This specifies to use dynamic batch size adjustment. - const char *UrRet = nullptr; - const char *PiRet = nullptr; - if (IsCopy) { - UrRet = std::getenv("UR_L0_COPY_BATCH_SIZE"); - PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE"); - } else { - UrRet = std::getenv("UR_L0_BATCH_SIZE"); - PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); - } - const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (BatchSizeStr) { - pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); - // Level Zero may only support a limted number of commands per command - // list. The actual upper limit is not specified by the Level Zero - // Specification. For now we allow an arbitrary upper limit. - if (BatchSizeStrVal > 0) { - Config.Size = BatchSizeStrVal; - } else if (BatchSizeStrVal == 0) { - Config.Size = 0; - // We are requested to do dynamic batching. Collect specifics, if any. - // The extended format supported is ":" separated values. - // - // NOTE: these extra settings are experimental and are intended to - // be used only for finding a better default heuristic. 
- // - std::string BatchConfig(BatchSizeStr); - size_t Ord = 0; - size_t Pos = 0; - while (true) { - if (++Ord > 5) - break; - - Pos = BatchConfig.find(":", Pos); - if (Pos == std::string::npos) - break; - ++Pos; // past the ":" - - pi_uint32 Val; - try { - Val = std::stoi(BatchConfig.substr(Pos)); - } catch (...) { - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: failed to parse value\n"); - else - urPrint("UR_L0_BATCH_SIZE: failed to parse value\n"); - break; - } - switch (Ord) { - case 1: - Config.DynamicSizeStart = Val; - break; - case 2: - Config.DynamicSizeMax = Val; - break; - case 3: - Config.DynamicSizeStep = Val; - break; - case 4: - Config.NumTimesClosedEarlyThreshold = Val; - break; - case 5: - Config.NumTimesClosedFullThreshold = Val; - break; - default: - die("Unexpected batch config"); - } - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: dynamic batch param " - "#%d: %d\n", - (int)Ord, (int)Val); - else - urPrint("UR_L0_BATCH_SIZE: dynamic batch param #%d: %d\n", (int)Ord, - (int)Val); - }; - - } else { - // Negative batch sizes are silently ignored. - if (IsCopy) - urPrint("UR_L0_COPY_BATCH_SIZE: ignored negative value\n"); - else - urPrint("UR_L0_BATCH_SIZE: ignored negative value\n"); - } - } - return Config; -} - -// Static variable that holds batch config info for compute command batching. -static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] { - using IsCopy = bool; - return ZeCommandListBatchConfig(IsCopy{false}); -}(); - -// Static variable that holds batch config info for copy command batching. -static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { - using IsCopy = bool; - return ZeCommandListBatchConfig(IsCopy{true}); -}(); - -// Control if wait with barrier is implemented by signal of an event -// as opposed by true barrier command for in-order queue. -static const bool InOrderBarrierBySignal = [] { - const char *UrRet = std::getenv("UR_L0_IN_ORDER_BARRIER_BY_SIGNAL"); - return (UrRet ? std::atoi(UrRet) : true); -}(); - -_pi_queue::_pi_queue(std::vector &ComputeQueues, - std::vector &CopyQueues, - pi_context Context, pi_device Device, - bool OwnZeCommandQueue, - pi_queue_properties PiQueueProperties, - int ForceComputeIndex) - : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, - Properties(PiQueueProperties) { - UsingImmCmdLists = Device->useImmediateCommandLists(); - urPrint("ImmCmdList setting (%s)\n", (UsingImmCmdLists ? "YES" : "NO")); - - // Compute group initialization. - // First, see if the queue's device allows for round-robin or it is - // fixed to one particular compute CCS (it is so for sub-sub-devices). - auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute]; - pi_queue_group_t ComputeQueueGroup{this, queue_type::Compute}; - ComputeQueueGroup.ZeQueues = ComputeQueues; - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - if (UsingImmCmdLists) { - ComputeQueueGroup.ImmCmdLists = std::vector( - ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - if (ComputeQueueGroupInfo.ZeIndex >= 0) { - // Sub-sub-device - - // sycl::ext::intel::property::queue::compute_index works with any - // backend/device by allowing single zero index if multiple compute CCSes - // are not supported. Sub-sub-device falls into the same bucket. 
- assert(ForceComputeIndex <= 0); - ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex; - ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex; - ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex; - } else if (ForceComputeIndex >= 0) { - ComputeQueueGroup.LowerIndex = ForceComputeIndex; - ComputeQueueGroup.UpperIndex = ForceComputeIndex; - ComputeQueueGroup.NextIndex = ForceComputeIndex; - } else { - // Set-up to round-robin across allowed range of engines. - uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first; - uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second; - FilterUpperIndex = std::min((size_t)FilterUpperIndex, - FilterLowerIndex + ComputeQueues.size() - 1); - if (FilterLowerIndex <= FilterUpperIndex) { - ComputeQueueGroup.LowerIndex = FilterLowerIndex; - ComputeQueueGroup.UpperIndex = FilterUpperIndex; - ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex; - } else { - die("No compute queue available/allowed."); - } - } - if (UsingImmCmdLists) { - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - ComputeQueueGroup.ImmCmdLists = std::vector( - ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - ComputeQueueGroupsByTID.set(ComputeQueueGroup); - - // Copy group initialization. - pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy}; - const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); - if (Range.first < 0 || Range.second < 0) { - // We are asked not to use copy engines, just do nothing. - // Leave CopyQueueGroup.ZeQueues empty, and it won't be used. - } else { - uint32_t FilterLowerIndex = Range.first; - uint32_t FilterUpperIndex = Range.second; - FilterUpperIndex = std::min((size_t)FilterUpperIndex, - FilterLowerIndex + CopyQueues.size() - 1); - if (FilterLowerIndex <= FilterUpperIndex) { - CopyQueueGroup.ZeQueues = CopyQueues; - CopyQueueGroup.LowerIndex = FilterLowerIndex; - CopyQueueGroup.UpperIndex = FilterUpperIndex; - CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex; - // Create space to hold immediate commandlists corresponding to the - // ZeQueues - if (UsingImmCmdLists) { - CopyQueueGroup.ImmCmdLists = std::vector( - CopyQueueGroup.ZeQueues.size(), CommandListMap.end()); - } - } - } - CopyQueueGroupsByTID.set(CopyQueueGroup); - - // Initialize compute/copy command batches. - ComputeCommandBatch.OpenCommandList = CommandListMap.end(); - CopyCommandBatch.OpenCommandList = CommandListMap.end(); - ComputeCommandBatch.QueueBatchSize = - ZeCommandListBatchComputeConfig.startSize(); - CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); -} - -static pi_result CleanupCompletedEvent(pi_event Event, - bool QueueLocked = false); - -// Helper function to perform the necessary cleanup of the events from reset cmd -// list. -static pi_result -CleanupEventListFromResetCmdList(std::vector &EventListToCleanup, - bool QueueLocked = false) { - for (auto &Event : EventListToCleanup) { - // We don't need to synchronize the events since the fence associated with - // the command list was synchronized. - { - std::scoped_lock EventLock(Event->Mutex); - Event->Completed = true; - } - PI_CALL(CleanupCompletedEvent(Event, QueueLocked)); - // This event was removed from the command list, so decrement ref count - // (it was incremented when they were added to the command list). - PI_CALL(piEventReleaseInternal(Event)); - } - return PI_SUCCESS; -} - -/// @brief Cleanup events in the immediate lists of the queue. 
-/// @param Queue Queue where events need to be cleaned up. -/// @param QueueLocked Indicates if the queue mutex is locked by caller. -/// @param QueueSynced 'true' if queue was synchronized before the -/// call and no other commands were submitted after synchronization, 'false' -/// otherwise. -/// @param CompletedEvent Hint providing an event which was synchronized before -/// the call, in case of in-order queue it allows to cleanup all preceding -/// events. -/// @return PI_SUCCESS if successful, PI error code otherwise. -static pi_result CleanupEventsInImmCmdLists(pi_queue Queue, - bool QueueLocked = false, - bool QueueSynced = false, - pi_event CompletedEvent = nullptr) { - // Handle only immediate command lists here. - if (!Queue || !Queue->UsingImmCmdLists) - return PI_SUCCESS; - - std::vector EventListToCleanup; - { - std::unique_lock QueueLock(Queue->Mutex, std::defer_lock); - if (!QueueLocked) - QueueLock.lock(); - // If queue is locked and fully synchronized then cleanup all events. - // If queue is not locked then by this time there may be new submitted - // commands so we can't do full cleanup. - if (QueueLocked && - (QueueSynced || (Queue->isInOrderQueue() && - (CompletedEvent == Queue->LastCommandEvent || - !Queue->LastCommandEvent)))) { - Queue->LastCommandEvent = nullptr; - for (auto &&It = Queue->CommandListMap.begin(); - It != Queue->CommandListMap.end(); ++It) { - PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup, - /* CheckStatus */ false)); - } - } else if (Queue->isInOrderQueue() && CompletedEvent) { - // If the queue is in-order and we have information about completed event - // then cleanup all events in the command list preceding to CompletedEvent - // including itself. - - // Check that the comleted event has associated command list. - if (!(CompletedEvent->CommandList && - CompletedEvent->CommandList.value() != Queue->CommandListMap.end())) - return PI_SUCCESS; - - auto &CmdListEvents = - CompletedEvent->CommandList.value()->second.EventList; - auto CompletedEventIt = - std::find(CmdListEvents.begin(), CmdListEvents.end(), CompletedEvent); - if (CompletedEventIt != CmdListEvents.end()) { - // We can cleanup all events prior to the completed event in this - // command list and completed event itself. - // TODO: we can potentially cleanup more events here by finding - // completed events on another command lists, but it is currently not - // implemented. - std::move(std::begin(CmdListEvents), CompletedEventIt + 1, - std::back_inserter(EventListToCleanup)); - CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1); - } - } else { - // Fallback to resetCommandList over all command lists. - for (auto &&It = Queue->CommandListMap.begin(); - It != Queue->CommandListMap.end(); ++It) { - PI_CALL(Queue->resetCommandList(It, true, EventListToCleanup, - /* CheckStatus */ true)); - } - } - } - PI_CALL(CleanupEventListFromResetCmdList(EventListToCleanup, QueueLocked)); - return PI_SUCCESS; -} - -/// @brief Reset signalled command lists in the queue and put them to the cache -/// of command lists. Also cleanup events associated with signalled command -/// lists. Queue must be locked by the caller for modification. -/// @param Queue Queue where we look for signalled command lists and cleanup -/// events. -/// @return PI_SUCCESS if successful, PI error code otherwise. -static pi_result resetCommandLists(pi_queue Queue) { - // Handle immediate command lists here, they don't need to be reset and we - // only need to cleanup events. 
- if (Queue->UsingImmCmdLists) { - PI_CALL(CleanupEventsInImmCmdLists(Queue, true /*locked*/)); - return PI_SUCCESS; - } - - // We need events to be cleaned up out of scope where queue is locked to avoid - // nested locks, because event cleanup requires event to be locked. Nested - // locks are hard to control and can cause deadlocks if mutexes are locked in - // different order. - std::vector EventListToCleanup; - - // We check for command lists that have been already signalled, but have not - // been added to the available list yet. Each command list has a fence - // associated which tracks if a command list has completed dispatch of its - // commands and is ready for reuse. If a command list is found to have been - // signalled, then the command list & fence are reset and command list is - // returned to the command list cache. All events associated with command - // list are cleaned up if command list was reset. - for (auto &&it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // Immediate commandlists don't use a fence and are handled separately - // above. - assert(it->second.ZeFence != nullptr); - // It is possible that the fence was already noted as signalled and - // reset. In that case the ZeFenceInUse flag will be false. - if (it->second.ZeFenceInUse) { - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); - if (ZeResult == ZE_RESULT_SUCCESS) - PI_CALL(Queue->resetCommandList(it, true, EventListToCleanup)); - } - } - CleanupEventListFromResetCmdList(EventListToCleanup, true /*locked*/); - return PI_SUCCESS; -} - -// Retrieve an available command list to be used in a PI call. -pi_result _pi_context::getAvailableCommandList( - pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine, - bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { - // Immediate commandlists have been pre-allocated and are always available. - if (Queue->UsingImmCmdLists) { - CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); - if (CommandList->second.EventList.size() > - ImmCmdListsEventCleanupThreshold) { - std::vector EventListToCleanup; - Queue->resetCommandList(CommandList, false, EventListToCleanup); - CleanupEventListFromResetCmdList(EventListToCleanup, true); - } - PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) - return Res; - return PI_SUCCESS; - } else { - // Cleanup regular command-lists if there are too many. - // It handles the case that the queue is not synced to the host - // for a long time and we want to reclaim the command-lists for - // use by other queues. - if (Queue->CommandListMap.size() > CmdListsCleanupThreshold) { - resetCommandLists(Queue); - } - } - - auto &CommandBatch = - UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; - // Handle batching of commands - // First see if there is an command-list open for batching commands - // for this queue. - if (Queue->hasOpenCommandList(UseCopyEngine)) { - if (AllowBatching) { - CommandList = CommandBatch.OpenCommandList; - PI_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - return PI_SUCCESS; - } - // If this command isn't allowed to be batched or doesn't match the forced - // command queue, then we need to go ahead and execute what is already in - // the batched list, and then go on to process this. On exit from - // executeOpenCommandList OpenCommandList will be invalidated. 
- if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) - return Res; - // Note that active barriers do not need to be inserted here as they will - // have been enqueued into the command-list when they were created. - } - - // Create/Reuse the command list, because in Level Zero commands are added to - // the command lists, and later are then added to the command queue. - // Each command list is paired with an associated fence to track when the - // command list is available for reuse. - _pi_result pi_result = PI_ERROR_OUT_OF_RESOURCES; - - // Initally, we need to check if a command list has already been created - // on this device that is available for use. If so, then reuse that - // Level-Zero Command List and Fence for this PI call. - { - // Make sure to acquire the lock before checking the size, or there - // will be a race condition. - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - // Under mutex since operator[] does insertion on the first usage for every - // unique ZeDevice. - auto &ZeCommandListCache = - UseCopyEngine - ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - - for (auto ZeCommandListIt = ZeCommandListCache.begin(); - ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - auto &ZeCommandList = ZeCommandListIt->first; - auto it = Queue->CommandListMap.find(ZeCommandList); - if (it != Queue->CommandListMap.end()) { - if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) - continue; - CommandList = it; - if (CommandList->second.ZeFence != nullptr) - CommandList->second.ZeFenceInUse = true; - } else { - // If there is a command list available on this context, but it - // wasn't yet used in this queue then create a new entry in this - // queue's map to hold the fence and other associated command - // list information. - auto &QGroup = Queue->getQueueGroup(UseCopyEngine); - uint32_t QueueGroupOrdinal; - auto &ZeCommandQueue = ForcedCmdQueue - ? *ForcedCmdQueue - : QGroup.getZeQueue(&QueueGroupOrdinal); - if (ForcedCmdQueue) - QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - ZeStruct ZeQueueDesc; - ZeQueueDesc.ordinal = QueueGroupOrdinal; - CommandList = - Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, false, - ZeCommandQueue, ZeQueueDesc}) - .first; - } - ZeCommandListCache.erase(ZeCommandListIt); - if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) - return Res; - if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) - return Res; - return PI_SUCCESS; - } - } - - // If there are no available command lists in the cache, then we check for - // command lists that have already signalled, but have not been added to the - // available list yet. Each command list has a fence associated which tracks - // if a command list has completed dispatch of its commands and is ready for - // reuse. If a command list is found to have been signalled, then the - // command list & fence are reset and we return. - for (auto it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // Make sure this is the command list type needed. 
- if (UseCopyEngine != it->second.isCopy(Queue)) - continue; - - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); - if (ZeResult == ZE_RESULT_SUCCESS) { - std::vector EventListToCleanup; - Queue->resetCommandList(it, false, EventListToCleanup); - CleanupEventListFromResetCmdList(EventListToCleanup, - true /* QueueLocked */); - CommandList = it; - CommandList->second.ZeFenceInUse = true; - if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) - return Res; - return PI_SUCCESS; - } - } - - // If there are no available command lists nor signalled command lists, - // then we must create another command list. - pi_result = Queue->createCommandList(UseCopyEngine, CommandList); - CommandList->second.ZeFenceInUse = true; - return pi_result; -} - -_pi_queue::pi_queue_group_t &_pi_queue::getQueueGroup(bool UseCopyEngine) { - auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID); - return Map.get(); -} - -// Helper function to create a new command-list to this queue and associated -// fence tracking its completion. This command list & fence are added to the -// map of command lists in this queue with ZeFenceInUse = false. -// The caller must hold a lock of the queue already. -pi_result -_pi_queue::createCommandList(bool UseCopyEngine, - pi_command_list_ptr_t &CommandList, - ze_command_queue_handle_t *ForcedCmdQueue) { - - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - ze_command_list_handle_t ZeCommandList; - - uint32_t QueueGroupOrdinal; - auto &QGroup = getQueueGroup(UseCopyEngine); - auto &ZeCommandQueue = - ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); - if (ForcedCmdQueue) - QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - - ZeStruct ZeCommandListDesc; - ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - - ZE_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); - - ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - ZeStruct ZeQueueDesc; - ZeQueueDesc.ordinal = QueueGroupOrdinal; - std::tie(CommandList, std::ignore) = CommandListMap.insert( - std::pair( - ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); - - PI_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); - PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); - return PI_SUCCESS; -} - -void _pi_queue::adjustBatchSizeForFullBatch(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - auto &ZeCommandListBatchConfig = - IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; - pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize; - // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) - return; - CommandBatch.NumTimesClosedFull += 1; - - // If the number of times the list has been closed early is low, and - // the number of times it has been closed full is high, then raise - // the batching size slowly. Don't raise it if it is already pretty - // high. 
- if (CommandBatch.NumTimesClosedEarly <= - ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold && - CommandBatch.NumTimesClosedFull > - ZeCommandListBatchConfig.NumTimesClosedFullThreshold) { - if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) { - QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep; - urPrint("Raising QueueBatchSize to %d\n", QueueBatchSize); - } - CommandBatch.NumTimesClosedEarly = 0; - CommandBatch.NumTimesClosedFull = 0; - } -} - -void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - auto &ZeCommandListBatchConfig = - IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; - pi_uint32 &QueueBatchSize = CommandBatch.QueueBatchSize; - // QueueBatchSize of 0 means never allow batching. - if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) - return; - CommandBatch.NumTimesClosedEarly += 1; - - // If we are closing early more than about 3x the number of times - // it is closing full, lower the batch size to the value of the - // current open command list. This is trying to quickly get to a - // batch size that will be able to be closed full at least once - // in a while. - if (CommandBatch.NumTimesClosedEarly > - (CommandBatch.NumTimesClosedFull + 1) * 3) { - QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1; - if (QueueBatchSize < 1) - QueueBatchSize = 1; - urPrint("Lowering QueueBatchSize to %d\n", QueueBatchSize); - CommandBatch.NumTimesClosedEarly = 0; - CommandBatch.NumTimesClosedFull = 0; - } -} - -void _pi_queue::CaptureIndirectAccesses() { - for (auto &Kernel : KernelsToBeSubmitted) { - if (!Kernel->hasIndirectAccess()) - continue; - - auto &Contexts = Device->Platform->Contexts; - for (auto &Ctx : Contexts) { - for (auto &Elem : Ctx->MemAllocs) { - const auto &Pair = Kernel->MemAllocs.insert(&Elem); - // Kernel is referencing this memory allocation from now. - // If this memory allocation was already captured for this kernel, it - // means that kernel is submitted several times. Increase reference - // count only once because we release all allocations only when - // SubmissionsCount turns to 0. We don't want to know how many times - // allocation was retained by each submission. - if (Pair.second) - Elem.second.RefCount.increment(); - } - } - Kernel->SubmissionsCount++; - } - KernelsToBeSubmitted.clear(); -} - -pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, - bool IsBlocking, - bool OKToBatchCommand) { - // Do nothing if command list is already closed. - if (CommandList->second.IsClosed) - return PI_SUCCESS; - - bool UseCopyEngine = CommandList->second.isCopy(this); - - // If the current LastCommandEvent is the nullptr, then it means - // either that no command has ever been issued to the queue - // or it means that the LastCommandEvent has been signalled and - // therefore that this Queue is idle. - // - // NOTE: this behavior adds some flakyness to the batching - // since last command's event may or may not be completed by the - // time we get here depending on timings and system/gpu load. - // So, disable it for modes where we print PI traces. Printing - // traces incurs much different timings than real execution - // ansyway, and many regression tests use it. - // - bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr; - - // The list can be empty if command-list only contains signals of proxy - // events. 
It is possible that executeCommandList is called twice for the same
-  // command list without a new appended command. We don't want to process
-  // the same last command event twice; that's why we additionally check that
-  // a new command was appended to the command list.
-  if (!CommandList->second.EventList.empty() &&
-      this->LastCommandEvent != CommandList->second.EventList.back()) {
-    this->LastCommandEvent = CommandList->second.EventList.back();
-    if (doReuseDiscardedEvents()) {
-      PI_CALL(resetDiscardedEvent(CommandList));
-    }
-  }
-
-  this->LastUsedCommandList = CommandList;
-
-  if (!UsingImmCmdLists) {
-    // Batch if allowed to, but don't batch if we know there are no kernels
-    // from this queue that are currently executing. This is intended to get
-    // kernels started as soon as possible when there are no kernels from this
-    // queue awaiting execution, while allowing batching to occur when there
-    // are kernels already executing. Also, if we are using fixed size batching,
-    // as indicated by !ZeCommandListBatch.dynamic(), then just ignore
-    // CurrentlyEmpty as we want to strictly follow the batching the user
-    // specified.
-    auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
-    auto &ZeCommandListBatchConfig = UseCopyEngine
-                                         ? ZeCommandListBatchCopyConfig
-                                         : ZeCommandListBatchComputeConfig;
-    if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) &&
-        (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) {
-
-      if (hasOpenCommandList(UseCopyEngine) &&
-          CommandBatch.OpenCommandList != CommandList)
-        die("executeCommandList: OpenCommandList should be equal to "
-            "null or CommandList");
-
-      if (CommandList->second.size() < CommandBatch.QueueBatchSize) {
-        CommandBatch.OpenCommandList = CommandList;
-        return PI_SUCCESS;
-      }
-
-      adjustBatchSizeForFullBatch(UseCopyEngine);
-      CommandBatch.OpenCommandList = CommandListMap.end();
-    }
-  }
-
-  auto &ZeCommandQueue = CommandList->second.ZeQueue;
-  // Scope of the lock must be till the end of the function, otherwise new mem
-  // allocs can be created between the moment when we made a snapshot and the
-  // moment when the command list is closed and executed. But the mutex is only
-  // locked if indirect access tracking is enabled, because std::defer_lock is
-  // used. The unique_lock destructor at the end of the function will unlock
-  // the mutex if it was locked (which happens only if
-  // IndirectAccessTrackingEnabled is true).
-  std::unique_lock ContextsLock(
-      Device->Platform->ContextsMutex, std::defer_lock);
-
-  if (IndirectAccessTrackingEnabled) {
-    // We are going to submit kernels for execution. If the indirect access
-    // flag is set for a kernel then we need to make a snapshot of existing
-    // memory allocations in all contexts in the platform. We need to lock the
-    // mutex guarding the list of contexts in the platform to prevent creation
-    // of new memory allocations in any context before we submit the kernel
-    // for execution.
-    ContextsLock.lock();
-    CaptureIndirectAccesses();
-  }
-
-  if (!UsingImmCmdLists) {
-    // In this mode all inner-batch events have device visibility only,
-    // and we want the last command in the batch to signal a host-visible
-    // event that anybody waiting for any event in the batch will
-    // really be using.
-    // We need to create a proxy host-visible event only if the list of events
-    // in the command list is not empty, otherwise we are going to just create
-    // and remove the proxy event right away and dereference a deleted object
-    // afterwards.
- if (Device->ZeEventsScope == LastCommandInBatchHostVisible && - !CommandList->second.EventList.empty()) { - // If there are only internal events in the command list then we don't - // need to create host proxy event. - auto Result = - std::find_if(CommandList->second.EventList.begin(), - CommandList->second.EventList.end(), - [](pi_event E) { return E->hasExternalRefs(); }); - if (Result != CommandList->second.EventList.end()) { - // Create a "proxy" host-visible event. - // - pi_event HostVisibleEvent; - auto Res = createEventAndAssociateQueue( - this, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); - if (Res) - return Res; - - // Update each command's event in the command-list to "see" this - // proxy event as a host-visible counterpart. - for (auto &Event : CommandList->second.EventList) { - std::scoped_lock EventLock(Event->Mutex); - // Internal event doesn't need host-visible proxy. - if (!Event->hasExternalRefs()) - continue; - - if (!Event->HostVisibleEvent) { - Event->HostVisibleEvent = HostVisibleEvent; - HostVisibleEvent->RefCount.increment(); - } - } - - // Decrement the reference count of the event such that all the - // remaining references are from the other commands in this batch and - // from the command-list itself. This host-visible event will not be - // waited/released by SYCL RT, so it must be destroyed after all events - // in the batch are gone. We know that refcount is more than 2 because - // we check that EventList of the command list is not empty above, i.e. - // after createEventAndAssociateQueue ref count is 2 and then +1 for - // each event in the EventList. - PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - - if (doReuseDiscardedEvents()) { - // If we have in-order queue with discarded events then we want to - // treat this event as regular event. We insert a barrier in the next - // command list to wait for this event. - LastCommandEvent = HostVisibleEvent; - } else { - // For all other queues treat this as a special event and indicate no - // cleanup is needed. - // TODO: always treat this host event as a regular event. - PI_CALL(piEventReleaseInternal(HostVisibleEvent)); - HostVisibleEvent->CleanedUp = true; - } - - // Finally set to signal the host-visible event at the end of the - // command-list after a barrier that waits for all commands - // completion. - if (doReuseDiscardedEvents() && LastCommandEvent && - LastCommandEvent->IsDiscarded) { - // If we the last event is discarded then we already have a barrier - // inserted, so just signal the event. - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); - } else { - ZE_CALL(zeCommandListAppendBarrier, - (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); - } - } else { - // If we don't have host visible proxy then signal event if needed. - this->signalEventFromCmdListIfLastEventDiscarded(CommandList); - } - } else { - // If we don't have host visible proxy then signal event if needed. - this->signalEventFromCmdListIfLastEventDiscarded(CommandList); - } - - // Close the command list and have it ready for dispatch. - ZE_CALL(zeCommandListClose, (CommandList->first)); - // Mark this command list as closed. 
- CommandList->second.IsClosed = true; - this->LastUsedCommandList = CommandListMap.end(); - // Offload command list to the GPU for asynchronous execution - auto ZeCommandList = CommandList->first; - auto ZeResult = ZE_CALL_NOCHECK( - zeCommandQueueExecuteCommandLists, - (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); - if (ZeResult != ZE_RESULT_SUCCESS) { - this->Healthy = false; - if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) { - // Turn into a more informative end-user error. - return PI_ERROR_COMMAND_EXECUTION_FAILURE; - } - return mapError(ZeResult); - } - } - - // Check global control to make every command blocking for debugging. - if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { - if (UsingImmCmdLists) { - synchronize(); - } else { - // Wait until command lists attached to the command queue are executed. - ZE_CALL(zeHostSynchronize, (ZeCommandQueue)); - } - } - return PI_SUCCESS; -} - -bool _pi_queue::isBatchingAllowed(bool IsCopy) const { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - return (CommandBatch.QueueBatchSize > 0 && - ((UrL0Serialize & UrL0SerializeBlock) == 0)); -} - -// Return the index of the next queue to use based on a -// round robin strategy and the queue group ordinal. -uint32_t _pi_queue::pi_queue_group_t::getQueueIndex(uint32_t *QueueGroupOrdinal, - uint32_t *QueueIndex, - bool QueryOnly) { - auto CurrentIndex = NextIndex; - - if (!QueryOnly) { - ++NextIndex; - if (NextIndex > UpperIndex) - NextIndex = LowerIndex; - } - - // Find out the right queue group ordinal (first queue might be "main" or - // "link") - auto QueueType = Type; - if (QueueType != queue_type::Compute) - QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine()) - ? queue_type::MainCopy - : queue_type::LinkCopy; - - *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal; - // Adjust the index to the L0 queue group since we represent "main" and - // "link" - // L0 groups with a single copy group ("main" would take "0" index). - auto ZeCommandQueueIndex = CurrentIndex; - if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) { - ZeCommandQueueIndex -= 1; - } - *QueueIndex = ZeCommandQueueIndex; - - return CurrentIndex; -} - -int32_t _pi_queue::pi_queue_group_t::getCmdQueueOrdinal( - ze_command_queue_handle_t CmdQueue) { - // Find out the right queue group ordinal (first queue might be "main" or - // "link") - auto QueueType = Type; - if (QueueType != queue_type::Compute) - QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine()) - ? queue_type::MainCopy - : queue_type::LinkCopy; - return Queue->Device->QueueGroup[QueueType].ZeOrdinal; -} - -// This function will return one of possibly multiple available native -// queues and the value of the queue group ordinal. -ze_command_queue_handle_t & -_pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { - - // QueueIndex is the proper L0 index. - // Index is the plugins concept of index, with main and link copy engines in - // one range. 
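The index adjustment described in the two comment lines above is easier to follow with concrete numbers. The sketch below models getQueueIndex's copy-engine mapping for a hypothetical device with one main copy engine and three link copy engines; the group ordinals (1 and 2) are invented for the example and do not correspond to any real device.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

// Hypothetical layout: main copy group has ordinal 1 (one queue), link copy
// group has ordinal 2 (three queues). The plugin exposes them as one flat
// range of copy indices [0..3].
constexpr bool HasMainCopyEngine = true;
constexpr uint32_t MainCopyOrdinal = 1;
constexpr uint32_t LinkCopyOrdinal = 2;

// Returns {ZeOrdinal, ZeIndex} for a flat plugin-side copy index: index 0 goes
// to the main copy engine and link-copy indices are shifted down by one, as in
// the adjustment performed by getQueueIndex.
std::pair<uint32_t, uint32_t> mapCopyIndex(uint32_t FlatIndex) {
  if (HasMainCopyEngine && FlatIndex == 0)
    return {MainCopyOrdinal, 0};
  uint32_t ZeIndex = HasMainCopyEngine ? FlatIndex - 1 : FlatIndex;
  return {LinkCopyOrdinal, ZeIndex};
}

int main() {
  for (uint32_t I = 0; I < 4; ++I) {
    auto [Ordinal, Index] = mapCopyIndex(I);
    std::cout << "plugin index " << I << " -> ordinal " << Ordinal
              << ", index " << Index << "\n";
  }
}
```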
- uint32_t QueueIndex; - auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex); - - ze_command_queue_handle_t &ZeQueue = ZeQueues[Index]; - if (ZeQueue) - return ZeQueue; - - ZeStruct ZeCommandQueueDesc; - ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal; - ZeCommandQueueDesc.index = QueueIndex; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; - const char *Priority = "Normal"; - if (Queue->isPriorityLow()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - Priority = "Low"; - } else if (Queue->isPriorityHigh()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - Priority = "High"; - } - - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { - ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; - } - - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); - - auto ZeResult = ZE_CALL_NOCHECK( - zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeQueue)); - if (ZeResult) { - die("[L0] getZeQueue: failed to create queue"); - } - - return ZeQueue; -} - -// This function will return one of possibly multiple available -// immediate commandlists associated with this Queue. -pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() { - uint32_t QueueIndex, QueueOrdinal; - auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex); - - if (ImmCmdLists[Index] != Queue->CommandListMap.end()) - return ImmCmdLists[Index]; - - ZeStruct ZeCommandQueueDesc; - ZeCommandQueueDesc.ordinal = QueueOrdinal; - ZeCommandQueueDesc.index = QueueIndex; - ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; - const char *Priority = "Normal"; - if (Queue->isPriorityLow()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - Priority = "Low"; - } else if (Queue->isPriorityHigh()) { - ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - Priority = "High"; - } - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { - ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; - } - - // Check if context's command list cache has an immediate command list with - // matching index. - ze_command_list_handle_t ZeCommandList = nullptr; - { - // Acquire lock to avoid race conditions. - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - // Under mutex since operator[] does insertion on the first usage for every - // unique ZeDevice. - auto &ZeCommandListCache = - isCopy() - ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - for (auto ZeCommandListIt = ZeCommandListCache.begin(); - ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - const auto &Desc = (*ZeCommandListIt).second; - if (Desc.index == ZeCommandQueueDesc.index && - Desc.flags == ZeCommandQueueDesc.flags && - Desc.mode == ZeCommandQueueDesc.mode && - Desc.priority == ZeCommandQueueDesc.priority) { - ZeCommandList = (*ZeCommandListIt).first; - ZeCommandListCache.erase(ZeCommandListIt); - break; - } - } - } - - // If cache didn't contain a command list, create one. 
- if (!ZeCommandList) { - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); - - ZE_CALL_NOCHECK(zeCommandListCreateImmediate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeCommandList)); - } - - ImmCmdLists[Index] = - Queue->CommandListMap - .insert(std::pair{ - ZeCommandList, - {nullptr, true, false, nullptr, ZeCommandQueueDesc}}) - .first; - - return ImmCmdLists[Index]; -} - -pi_command_list_ptr_t _pi_queue::eventOpenCommandList(pi_event Event) { - using IsCopy = bool; - - if (UsingImmCmdLists) { - // When using immediate commandlists there are no open command lists. - return CommandListMap.end(); - } - - if (hasOpenCommandList(IsCopy{false})) { - const auto &ComputeEventList = - ComputeCommandBatch.OpenCommandList->second.EventList; - if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) != - ComputeEventList.end()) - return ComputeCommandBatch.OpenCommandList; - } - if (hasOpenCommandList(IsCopy{true})) { - const auto &CopyEventList = - CopyCommandBatch.OpenCommandList->second.EventList; - if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) != - CopyEventList.end()) - return CopyCommandBatch.OpenCommandList; - } - return CommandListMap.end(); -} - -pi_result _pi_queue::insertStartBarrierIfDiscardEventsMode( - pi_command_list_ptr_t &CmdList) { - // If current command list is different from the last command list then insert - // a barrier waiting for the last command event. - if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList && - LastCommandEvent) { - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); - LastCommandEvent = nullptr; - } - return PI_SUCCESS; -} - -pi_result _pi_queue::insertActiveBarriers(pi_command_list_ptr_t &CmdList, - bool UseCopyEngine) { - // Early exit if there are no active barriers. - if (ActiveBarriers.empty()) - return PI_SUCCESS; - - // Create a wait-list and retain events. - _pi_ze_event_list_t ActiveBarriersWaitList; - if (auto Res = ActiveBarriersWaitList.createAndRetainPiZeEventList( - ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), this, - UseCopyEngine)) - return Res; - - // We can now replace active barriers with the ones in the wait list. - if (auto Res = ActiveBarriers.clear()) - return Res; - - if (ActiveBarriersWaitList.Length == 0) { - return PI_SUCCESS; - } - - for (pi_uint32 I = 0; I < ActiveBarriersWaitList.Length; ++I) { - auto &Event = ActiveBarriersWaitList.PiEventList[I]; - ActiveBarriers.add(Event); - } - - pi_event Event = nullptr; - if (auto Res = createEventAndAssociateQueue( - this, &Event, PI_COMMAND_TYPE_USER, CmdList, /*IsInternal*/ true)) - return Res; - - Event->WaitList = ActiveBarriersWaitList; - Event->OwnZeEvent = true; - - // If there are more active barriers, insert a barrier on the command-list. We - // do not need an event for finishing so we pass nullptr. - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, nullptr, ActiveBarriersWaitList.Length, - ActiveBarriersWaitList.ZeEventList)); - return PI_SUCCESS; -} - -pi_result _pi_queue::executeOpenCommandList(bool IsCopy) { - auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; - // If there are any commands still in the open command list for this - // queue, then close and execute that command list now. 
- if (hasOpenCommandList(IsCopy)) { - adjustBatchSizeForPartialBatch(IsCopy); - auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false); - CommandBatch.OpenCommandList = CommandListMap.end(); - return Res; - } - - return PI_SUCCESS; -} - -static const bool FilterEventWaitList = [] { - const char *UrRet = std::getenv("UR_L0_FILTER_EVENT_WAIT_LIST"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); - return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -}(); - -pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList( - pi_uint32 EventListLength, const pi_event *EventList, pi_queue CurQueue, - bool UseCopyEngine) { - this->Length = 0; - this->ZeEventList = nullptr; - this->PiEventList = nullptr; - - if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { - if (CurQueue->UsingImmCmdLists) { - if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { - // If queue is in-order with discarded events and if - // new command list is different from the last used command list then - // signal new event from the last immediate command list. We are going - // to insert a barrier in the new command list waiting for that event. - auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); - uint32_t QueueGroupOrdinal, QueueIndex; - auto NextIndex = - QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, - /*QueryOnly */ true); - auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; - if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && - CurQueue->LastUsedCommandList != NextImmCmdList) { - CurQueue->signalEventFromCmdListIfLastEventDiscarded( - CurQueue->LastUsedCommandList); - } - } - } else { - // Ensure LastCommandEvent's batch is submitted if it is differrent - // from the one this command is going to. If we reuse discarded events - // then signalEventFromCmdListIfLastEventDiscarded will be called at batch - // close if needed. - const auto &OpenCommandList = - CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); - if (OpenCommandList != CurQueue->CommandListMap.end() && - OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { - - if (auto Res = CurQueue->executeOpenCommandList( - OpenCommandList->second.isCopy(CurQueue))) - return Res; - } - } - } - - // For in-order queues, every command should be executed only after the - // previous command has finished. The event associated with the last - // enqueued command is added into the waitlist to ensure in-order semantics. - bool IncludeLastCommandEvent = - CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; - - // If the last event is discarded then we already have a barrier waiting for - // that event, so must not include the last command event into the wait - // list because it will cause waiting for event which was reset. 
- if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && - CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) - IncludeLastCommandEvent = false; - - try { - pi_uint32 TmpListLength = 0; - - if (IncludeLastCommandEvent) { - this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; - this->PiEventList = new pi_event[EventListLength + 1]; - std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); - this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; - this->PiEventList[0] = CurQueue->LastCommandEvent; - TmpListLength = 1; - } else if (EventListLength > 0) { - this->ZeEventList = new ze_event_handle_t[EventListLength]; - this->PiEventList = new pi_event[EventListLength]; - } - - if (EventListLength > 0) { - for (pi_uint32 I = 0; I < EventListLength; I++) { - PI_ASSERT(EventList[I] != nullptr, PI_ERROR_INVALID_VALUE); - { - std::shared_lock Lock(EventList[I]->Mutex); - if (EventList[I]->Completed) - continue; - - // Poll of the host-visible events. - auto HostVisibleEvent = EventList[I]->HostVisibleEvent; - if (FilterEventWaitList && HostVisibleEvent) { - auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, - (HostVisibleEvent->ZeEvent)); - if (Res == ZE_RESULT_SUCCESS) { - // Event has already completed, don't put it into the list - continue; - } - } - } - - auto Queue = EventList[I]->Queue; - if (Queue) { - // The caller of createAndRetainPiZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? std::unique_lock() - : std::unique_lock(Queue->Mutex)); - - // If the event that is going to be waited is in an open batch - // different from where this next command is going to be added, - // then we have to force execute of that open command-list - // to avoid deadlocks. - // - const auto &OpenCommandList = - Queue->eventOpenCommandList(EventList[I]); - if (OpenCommandList != Queue->CommandListMap.end()) { - - if (Queue == CurQueue && - OpenCommandList->second.isCopy(Queue) == UseCopyEngine) { - // Don't force execute the batch yet since the new command - // is going to the same open batch as the dependent event. - } else { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - } else { - // There is a dependency on an interop-event. - // Similarily to the above to avoid dead locks ensure that - // execution of all prior commands in the current command- - // batch is visible to the host. This may not be the case - // when we intended to have only last command in the batch - // produce host-visible event, e.g. - // - // event0 = interop event - // event1 = command1 (already in batch, no deps) - // event2 = command2 (is being added, dep on event0) - // event3 = signal host-visible event for the batch - // event1.wait() - // event0.signal() - // - // Make sure that event1.wait() will wait for a host-visible - // event that is signalled before the command2 is enqueued. - if (CurQueue->Device->ZeEventsScope != AllHostVisible) { - CurQueue->executeAllOpenCommandLists(); - } - } - - std::shared_lock Lock(EventList[I]->Mutex); - this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; - this->PiEventList[TmpListLength] = EventList[I]; - TmpListLength += 1; - } - } - - this->Length = TmpListLength; - - } catch (...) 
{ - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - for (pi_uint32 I = 0; I < this->Length; I++) { - this->PiEventList[I]->RefCount.increment(); - } - - return PI_SUCCESS; -} - -static void printZeEventList(const _pi_ze_event_list_t &PiZeEventList) { - urPrint(" NumEventsInWaitList %d:", PiZeEventList.Length); - - for (pi_uint32 I = 0; I < PiZeEventList.Length; I++) { - urPrint(" %#llx", ur_cast(PiZeEventList.ZeEventList[I])); - } - - urPrint("\n"); -} - -pi_result _pi_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( - std::list &EventsToBeReleased) { - // acquire a lock before reading the length and list fields. - // Acquire the lock, copy the needed data locally, and reset - // the fields, then release the lock. - // Only then do we do the actual actions to release and destroy, - // holding the lock for the minimum time necessary. - pi_uint32 LocLength = 0; - ze_event_handle_t *LocZeEventList = nullptr; - pi_event *LocPiEventList = nullptr; - - { - // acquire the lock and copy fields locally - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(this->PiZeEventListMutex); - - LocLength = Length; - LocZeEventList = ZeEventList; - LocPiEventList = PiEventList; - - Length = 0; - ZeEventList = nullptr; - PiEventList = nullptr; - - // release lock by ending scope. - } - - for (pi_uint32 I = 0; I < LocLength; I++) { - // Add the event to be released to the list - EventsToBeReleased.push_back(LocPiEventList[I]); - } - - if (LocZeEventList != nullptr) { - delete[] LocZeEventList; - } - if (LocPiEventList != nullptr) { - delete[] LocPiEventList; - } - - return PI_SUCCESS; -} - extern "C" { // Forward declarations decltype(piEventCreate) piEventCreate; -static ze_result_t -checkUnresolvedSymbols(ze_module_handle_t ZeModule, - ze_module_build_log_handle_t *ZeBuildLog); - pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_uint32 *NumPlatforms) { return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); @@ -2188,10 +31,6 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - urPrint("==========================\n"); - urPrint("SYCL over Level-Zero %s\n", Platform->ZeDriverVersion.c_str()); - urPrint("==========================\n"); - // To distinguish this L0 platform from Unified Runtime one. 
if (ParamName == PI_PLATFORM_INFO_NAME) { ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); @@ -2203,86 +42,17 @@ pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, pi_result piextPlatformGetNativeHandle(pi_platform Platform, pi_native_handle *NativeHandle) { - PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDriver = ur_cast(NativeHandle); - // Extract the Level Zero driver handle from the given PI platform - *ZeDriver = Platform->ZeDriver; - return PI_SUCCESS; + return pi2ur::piextPlatformGetNativeHandle(Platform, NativeHandle); } pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, pi_platform *Platform) { - PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDriver = ur_cast(NativeHandle); - - pi_uint32 NumPlatforms = 0; - pi_result Res = piPlatformsGet(0, nullptr, &NumPlatforms); - if (Res != PI_SUCCESS) { - return Res; - } - - if (NumPlatforms) { - std::vector Platforms(NumPlatforms); - PI_CALL(piPlatformsGet(NumPlatforms, Platforms.data(), nullptr)); - - // The SYCL spec requires that the set of platforms must remain fixed for - // the duration of the application's execution. We assume that we found all - // of the Level Zero drivers when we initialized the platform cache, so the - // "NativeHandle" must already be in the cache. If it is not, this must not - // be a valid Level Zero driver. - for (const pi_platform &CachedPlatform : Platforms) { - if (CachedPlatform->ZeDriver == ZeDriver) { - *Platform = CachedPlatform; - return PI_SUCCESS; - } - } - } - - return PI_ERROR_INVALID_VALUE; + return pi2ur::piextPlatformCreateWithNativeHandle(NativeHandle, Platform); } pi_result piPluginGetLastError(char **message) { - return pi2ur::piPluginGetLastError(message); -} - -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return '-ze-opt-disable' for frontend_option = -O0. -// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. -// Return '-ze-opt-level=2' for frontend_option = -O3. -pi_result piPluginGetBackendOption(pi_platform, const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - if (frontend_option == "-O0"sv) { - *backend_option = "-ze-opt-disable"; - return PI_SUCCESS; - } - if (frontend_option == "-O1"sv || frontend_option == "-O2"sv) { - *backend_option = "-ze-opt-level=1"; - return PI_SUCCESS; - } - if (frontend_option == "-O3"sv) { - *backend_option = "-ze-opt-level=2"; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - -pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, - pi_uint32 NumEntries, pi_device *Devices, - pi_uint32 *NumDevices) { - return pi2ur::piDevicesGet(Platform, DeviceType, NumEntries, Devices, NumDevices); } @@ -2313,95 +83,22 @@ pi_result piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? 
pi_device_binary *Binaries, pi_uint32 NumBinaries, pi_uint32 *SelectedBinaryInd) { - - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(SelectedBinaryInd, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NumBinaries == 0 || Binaries, PI_ERROR_INVALID_VALUE); - - // TODO: this is a bare-bones implementation for choosing a device image - // that would be compatible with the targeted device. An AOT-compiled - // image is preferred over SPIR-V for known devices (i.e. Intel devices) - // The implementation makes no effort to differentiate between multiple images - // for the given device, and simply picks the first one compatible. - // - // Real implementation will use the same mechanism OpenCL ICD dispatcher - // uses. Something like: - // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); - // return context->dispatch->piextDeviceSelectIR( - // ctx, images, num_images, selected_image); - // where context->dispatch is set to the dispatch table provided by PI - // plugin for platform/device the ctx was created for. - - // Look for GEN binary, which we known can only be handled by Level-Zero now. - const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; - - // Find the appropriate device image, fallback to spirv if not found - constexpr pi_uint32 InvalidInd = std::numeric_limits::max(); - pi_uint32 Spirv = InvalidInd; - - for (pi_uint32 i = 0; i < NumBinaries; ++i) { - if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { - *SelectedBinaryInd = i; - return PI_SUCCESS; - } - if (strcmp(Binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) - Spirv = i; - } - // Points to a spirv image, if such indeed was found - if ((*SelectedBinaryInd = Spirv) != InvalidInd) - return PI_SUCCESS; - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; + return pi2ur::piextDeviceSelectBinary(Device, Binaries, NumBinaries, + SelectedBinaryInd); } pi_result piextDeviceGetNativeHandle(pi_device Device, pi_native_handle *NativeHandle) { - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - auto ZeDevice = ur_cast(NativeHandle); - // Extract the Level Zero module handle from the given PI device - *ZeDevice = Device->ZeDevice; - return PI_SUCCESS; + return pi2ur::piextDeviceGetNativeHandle(Device, NativeHandle); } pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, pi_platform Platform, pi_device *Device) { - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeDevice = ur_cast(NativeHandle); - - // The SYCL spec requires that the set of devices must remain fixed for the - // duration of the application's execution. We assume that we found all of the - // Level Zero devices when we initialized the platforms/devices cache, so the - // "NativeHandle" must already be in the cache. If it is not, this must not be - // a valid Level Zero device. - // - // TODO: maybe we should populate cache of platforms if it wasn't already. - // For now assert that is was populated. - PI_ASSERT(PiPlatformCachePopulated, PI_ERROR_INVALID_VALUE); - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - - pi_device Dev = nullptr; - for (pi_platform ThePlatform : *PiPlatformsCache) { - Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. 
- PI_ASSERT(!Platform || Platform == ThePlatform, - PI_ERROR_INVALID_PLATFORM); - break; - } - } - if (Dev == nullptr) - return PI_ERROR_INVALID_VALUE; - - *Device = Dev; - return PI_SUCCESS; + return pi2ur::piextDeviceCreateWithNativeHandle(NativeHandle, Platform, + Device); } pi_result piContextCreate(const pi_context_properties *Properties, @@ -2410,96 +107,28 @@ pi_result piContextCreate(const pi_context_properties *Properties, const void *PrivateInfo, size_t CB, void *UserData), void *UserData, pi_context *RetContext) { - (void)Properties; - (void)PFnNotify; - (void)UserData; - PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); - - pi_platform Platform = (*Devices)->Platform; - ZeStruct ContextDesc; - ContextDesc.flags = 0; - - ze_context_handle_t ZeContext; - ZE_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext)); - try { - *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true); - (*RetContext)->initialize(); - if (IndirectAccessTrackingEnabled) { - std::scoped_lock Lock(Platform->ContextsMutex); - Platform->Contexts.push_back(*RetContext); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piContextCreate(Properties, NumDevices, Devices, PFnNotify, + UserData, RetContext); } pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_CONTEXT_INFO_DEVICES: - return ReturnValue(&Context->Devices[0], Context->Devices.size()); - case PI_CONTEXT_INFO_NUM_DEVICES: - return ReturnValue(pi_uint32(Context->Devices.size())); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Context->RefCount.load()}); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - // 2D USM memcpy is supported unless disabled through - // UR_L0_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D. - return ReturnValue(pi_bool{UseMemcpy2DOperations}); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM fill and memset is not supported. - return ReturnValue(pi_bool{false}); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. 
- setErrorMessage("These queries should have never come here.", - UR_RESULT_ERROR_INVALID_VALUE); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - default: - // TODO: implement other parameters - die("piGetContextInfo: unsuppported ParamName."); - } - - return PI_SUCCESS; + return pi2ur::piContextGetInfo(Context, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } // FIXME: Dummy implementation to prevent link fail pi_result piextContextSetExtendedDeleter(pi_context Context, pi_context_extended_deleter Function, void *UserData) { - (void)Context; - (void)Function; - (void)UserData; - die("piextContextSetExtendedDeleter: not supported"); - return PI_SUCCESS; + return pi2ur::piextContextSetExtendedDeleter(Context, Function, UserData); } pi_result piextContextGetNativeHandle(pi_context Context, pi_native_handle *NativeHandle) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeContext = ur_cast(NativeHandle); - // Extract the Level Zero queue handle from the given PI queue - *ZeContext = Context->ZeContext; - return PI_SUCCESS; + return pi2ur::piextContextGetNativeHandle(Context, NativeHandle); } pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -2507,81 +136,17 @@ pi_result piextContextCreateWithNativeHandle(pi_native_handle NativeHandle, const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); - - try { - *RetContext = new _pi_context(ur_cast(NativeHandle), - NumDevices, Devices, OwnNativeHandle); - (*RetContext)->initialize(); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piextContextCreateWithNativeHandle( + NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); } pi_result piContextRetain(pi_context Context) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - Context->RefCount.increment(); - return PI_SUCCESS; -} - -// Helper function to release the context, a caller must lock the platform-level -// mutex guarding the container with contexts because the context can be removed -// from the list of tracked contexts. -pi_result ContextReleaseHelper(pi_context Context) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - if (!Context->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (IndirectAccessTrackingEnabled) { - pi_platform Plt = Context->getPlatform(); - auto &Contexts = Plt->Contexts; - auto It = std::find(Contexts.begin(), Contexts.end(), Context); - if (It != Contexts.end()) - Contexts.erase(It); - } - ze_context_handle_t DestoryZeContext = - Context->OwnZeContext ? Context->ZeContext : nullptr; - - // Clean up any live memory associated with Context - pi_result Result = Context->finalize(); - - // We must delete Context first and then destroy zeContext because - // Context deallocation requires ZeContext in some member deallocation of - // pi_context. - delete Context; - - // Destruction of some members of pi_context uses L0 context - // and therefore it must be valid at that point. - // Technically it should be placed to the destructor of pi_context - // but this makes API error handling more complex. 
- if (DestoryZeContext) { - auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestoryZeContext)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - return Result; + return pi2ur::piContextRetain(Context); } pi_result piContextRelease(pi_context Context) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) - ContextsLock.lock(); - - return ContextReleaseHelper(Context); + return pi2ur::piContextRelease(Context); } pi_result piQueueCreate(pi_context Context, pi_device Device, @@ -2592,1063 +157,83 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, pi_result piextQueueCreate(pi_context Context, pi_device Device, pi_queue_properties *Properties, pi_queue *Queue) { - PI_ASSERT(Properties, PI_ERROR_INVALID_VALUE); - // Expect flags mask to be passed first. - PI_ASSERT(Properties[0] == PI_QUEUE_FLAGS, PI_ERROR_INVALID_VALUE); - pi_queue_properties Flags = Properties[1]; - - PI_ASSERT(Properties[2] == 0 || - (Properties[2] == PI_QUEUE_COMPUTE_INDEX && Properties[4] == 0), - PI_ERROR_INVALID_VALUE); - auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX - ? static_cast(Properties[3]) - : -1; // Use default/round-robin. - - // Check that unexpected bits are not set. - PI_ASSERT( - !(Flags & ~(PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | - PI_QUEUE_FLAG_PROFILING_ENABLE | PI_QUEUE_FLAG_ON_DEVICE | - PI_QUEUE_FLAG_ON_DEVICE_DEFAULT | - PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS | - PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW | - PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH)), - PI_ERROR_INVALID_VALUE); - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_DEVICE); - - // Create placeholder queues in the compute queue group. - // Actual L0 queues will be created at first use. - std::vector ZeComputeCommandQueues( - Device->QueueGroup[_pi_queue::queue_type::Compute].ZeProperties.numQueues, - nullptr); - - // Create placeholder queues in the copy queue group (main and link - // native groups are combined into one group). - // Actual L0 queues will be created at first use. - size_t NumCopyGroups = 0; - if (Device->hasMainCopyEngine()) { - NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::MainCopy] - .ZeProperties.numQueues; - } - if (Device->hasLinkCopyEngine()) { - NumCopyGroups += Device->QueueGroup[_pi_queue::queue_type::LinkCopy] - .ZeProperties.numQueues; - } - std::vector ZeCopyCommandQueues(NumCopyGroups, - nullptr); - - try { - *Queue = new _pi_queue(ZeComputeCommandQueues, ZeCopyCommandQueues, Context, - Device, true, Flags, ForceComputeIndex); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Do eager initialization of Level Zero handles on request. - if (doEagerInit) { - pi_queue Q = *Queue; - // Creates said number of command-lists. - auto warmupQueueGroup = [Q](bool UseCopyEngine, - uint32_t RepeatCount) -> pi_result { - pi_command_list_ptr_t CommandList; - while (RepeatCount--) { - if (Q->UsingImmCmdLists) { - CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); - } else { - // Heuristically create some number of regular command-list to reuse. 
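The Properties parsing at the top of piextQueueCreate above implies a specific zero-terminated key/value layout at the call site. A hedged usage fragment follows; it assumes the PI declarations come from <sycl/detail/pi.h> and that a context and device already exist, so it is a sketch rather than a drop-in test. The profiling flag and compute index 0 are arbitrary choices for the example.

```cpp
#include <sycl/detail/pi.h>

// Layout expected by the parsing above:
//   {PI_QUEUE_FLAGS, <flags>, [PI_QUEUE_COMPUTE_INDEX, <index>,] 0}
pi_result createProfilingQueue(pi_context Context, pi_device Device,
                               pi_queue *OutQueue) {
  pi_queue_properties Props[] = {PI_QUEUE_FLAGS,
                                 PI_QUEUE_FLAG_PROFILING_ENABLE,
                                 PI_QUEUE_COMPUTE_INDEX, 0,
                                 0}; // terminator
  return piextQueueCreate(Context, Device, Props, OutQueue);
}
```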
- for (int I = 0; I < 10; ++I) { - PI_CALL(Q->createCommandList(UseCopyEngine, CommandList)); - // Immediately return them to the cache of available command-lists. - std::vector EventsUnused; - PI_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */, - EventsUnused)); - } - } - } - return PI_SUCCESS; - }; - // Create as many command-lists as there are queues in the group. - // With this the underlying round-robin logic would initialize all - // native queues, and create command-lists and their fences. - // At this point only the thread creating the queue will have associated - // command-lists. Other threads have not accessed the queue yet. So we can - // only warmup the initial thread's command-lists. - auto QueueGroup = Q->ComputeQueueGroupsByTID.get(); - PI_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex - - QueueGroup.LowerIndex + 1)); - if (Q->useCopyEngine()) { - auto QueueGroup = Q->CopyQueueGroupsByTID.get(); - PI_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex - - QueueGroup.LowerIndex + 1)); - } - // TODO: warmup event pools. Both host-visible and device-only. - } - return PI_SUCCESS; + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock Lock(Queue->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - // TODO: consider support for queue properties and size - switch (ParamName) { - case PI_QUEUE_INFO_CONTEXT: - return ReturnValue(Queue->Context); - case PI_QUEUE_INFO_DEVICE: - return ReturnValue(Queue->Device); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Queue->RefCount.load()}); - case PI_QUEUE_INFO_PROPERTIES: - die("PI_QUEUE_INFO_PROPERTIES in piQueueGetInfo not implemented\n"); - break; - case PI_QUEUE_INFO_SIZE: - die("PI_QUEUE_INFO_SIZE in piQueueGetInfo not implemented\n"); - break; - case PI_QUEUE_INFO_DEVICE_DEFAULT: - die("PI_QUEUE_INFO_DEVICE_DEFAULT in piQueueGetInfo not implemented\n"); - break; - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - // We can exit early if we have in-order queue. - if (Queue->isInOrderQueue()) { - if (!Queue->LastCommandEvent) - return ReturnValue(pi_bool{true}); - - // We can check status of the event only if it isn't discarded otherwise - // it may be reset (because we are free to reuse such events) and - // zeEventQueryStatus will hang. - // TODO: use more robust way to check that ZeEvent is not owned by - // LastCommandEvent. - if (!Queue->LastCommandEvent->IsDiscarded) { - ze_result_t ZeResult = ZE_CALL_NOCHECK( - zeEventQueryStatus, (Queue->LastCommandEvent->ZeEvent)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - return ReturnValue(pi_bool{true}); - } - // For immediate command lists we have to check status of the event - // because immediate command lists are not associated with level zero - // queue. Conservatively return false in this case because last event is - // discarded and we can't check its status. - if (Queue->UsingImmCmdLists) - return ReturnValue(pi_bool{false}); - } - - // If we have any open command list which is not empty then return false - // because it means that there are commands which are not even submitted for - // execution yet. 
- using IsCopy = bool; - if (Queue->hasOpenCommandList(IsCopy{true}) || - Queue->hasOpenCommandList(IsCopy{false})) - return ReturnValue(pi_bool{false}); - - for (const auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) { - for (const auto &QueueGroup : QueueMap) { - if (Queue->UsingImmCmdLists) { - // Immediate command lists are not associated with any Level Zero - // queue, that's why we have to check status of events in each - // immediate command list. Start checking from the end and exit early - // if some event is not completed. - for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) { - if (ImmCmdList == Queue->CommandListMap.end()) - continue; - - auto EventList = ImmCmdList->second.EventList; - for (auto It = EventList.crbegin(); It != EventList.crend(); It++) { - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - } - } else { - for (const auto &ZeQueue : QueueGroup.second.ZeQueues) { - if (!ZeQueue) - continue; - // Provide 0 as the timeout parameter to immediately get the status - // of the Level Zero queue. - ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize, - (ZeQueue, /* timeout */ 0)); - if (ZeResult == ZE_RESULT_NOT_READY) { - return ReturnValue(pi_bool{false}); - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - } - } - } - return ReturnValue(pi_bool{true}); - } - default: - urPrint("Unsupported ParamName in piQueueGetInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; + return pi2ur::piQueueGetInfo(Queue, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -pi_result piQueueRetain(pi_queue Queue) { - { - std::scoped_lock Lock(Queue->Mutex); - Queue->RefCountExternal++; - } - Queue->RefCount.increment(); - return PI_SUCCESS; -} +pi_result piQueueRetain(pi_queue Queue) { return pi2ur::piQueueRetain(Queue); } pi_result piQueueRelease(pi_queue Queue) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - std::vector EventListToCleanup; - - { - std::scoped_lock Lock(Queue->Mutex); - - if ((--Queue->RefCountExternal) != 0) - return PI_SUCCESS; - - // When external reference count goes to zero it is still possible - // that internal references still exists, e.g. command-lists that - // are not yet completed. So do full queue synchronization here - // and perform proper cleanup. - // - // It is possible to get to here and still have an open command list - // if no wait or finish ever occurred for this queue. - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - - // Make sure all commands get executed. - Queue->synchronize(); - - // Destroy all the fences created associated with this queue. - for (auto it = Queue->CommandListMap.begin(); - it != Queue->CommandListMap.end(); ++it) { - // This fence wasn't yet signalled when we polled it for recycling - // the command-list, so need to release the command-list too. - // For immediate commandlists we don't need to do an L0 reset of the - // commandlist but do need to do event cleanup which is also in the - // resetCommandList function. - // If the fence is a nullptr we are using immediate commandlists, - // otherwise regular commandlists which use a fence. 
- if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) { - Queue->resetCommandList(it, true, EventListToCleanup); - } - // TODO: remove "if" when the problem is fixed in the level zero - // runtime. Destroy only if a queue is healthy. Destroying a fence may - // cause a hang otherwise. - // If the fence is a nullptr we are using immediate commandlists. - if (Queue->Healthy && it->second.ZeFence != nullptr) { - auto ZeResult = ZE_CALL_NOCHECK(zeFenceDestroy, (it->second.ZeFence)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) { - std::scoped_lock Lock( - Queue->Context->ZeCommandListCacheMutex); - const pi_command_list_info_t &MapEntry = it->second; - if (MapEntry.CanReuse) { - // Add commandlist to the cache for future use. - // It will be deleted when the context is destroyed. - auto &ZeCommandListCache = - MapEntry.isCopy(Queue) - ? Queue->Context - ->ZeCopyCommandListCache[Queue->Device->ZeDevice] - : Queue->Context - ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; - ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc}); - } else { - // A non-reusable comamnd list that came from a make_queue call is - // destroyed since it cannot be recycled. - ze_command_list_handle_t ZeCommandList = it->first; - if (ZeCommandList) { - ZE_CALL(zeCommandListDestroy, (ZeCommandList)); - } - } - } - } - Queue->CommandListMap.clear(); - } - - for (auto &Event : EventListToCleanup) { - // We don't need to synchronize the events since the queue - // synchronized above already does that. - { - std::scoped_lock EventLock(Event->Mutex); - Event->Completed = true; - } - PI_CALL(CleanupCompletedEvent(Event)); - // This event was removed from the command list, so decrement ref count - // (it was incremented when they were added to the command list). - PI_CALL(piEventReleaseInternal(Event)); - } - PI_CALL(piQueueReleaseInternal(Queue)); - return PI_SUCCESS; + return pi2ur::piQueueRelease(Queue); } -static pi_result piQueueReleaseInternal(pi_queue Queue) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (!Queue->RefCount.decrementAndTest()) - return PI_SUCCESS; - - for (auto &Cache : Queue->EventCaches) - for (auto &Event : Cache) - PI_CALL(piEventReleaseInternal(Event)); - - if (Queue->OwnZeCommandQueue) { - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) { - auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } - - urPrint("piQueueRelease(compute) NumTimesClosedFull %d, " - "NumTimesClosedEarly %d\n", - Queue->ComputeCommandBatch.NumTimesClosedFull, - Queue->ComputeCommandBatch.NumTimesClosedEarly); - urPrint("piQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly " - "%d\n", - Queue->CopyCommandBatch.NumTimesClosedFull, - Queue->CopyCommandBatch.NumTimesClosedEarly); - - delete Queue; - - return PI_SUCCESS; -} - -pi_result piQueueFinish(pi_queue Queue) { - // Wait until command lists attached to the command queue are executed. - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (Queue->UsingImmCmdLists) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock Lock(Queue->Mutex); - - Queue->synchronize(); - } else { - std::unique_lock Lock(Queue->Mutex); - std::vector ZeQueues; - - // execute any command list that may still be open. - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - - // Make a copy of queues to sync and release the lock. - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - std::copy(QueueGroup.second.ZeQueues.begin(), - QueueGroup.second.ZeQueues.end(), - std::back_inserter(ZeQueues)); - - // Remember the last command's event. - auto LastCommandEvent = Queue->LastCommandEvent; - - // Don't hold a lock to the queue's mutex while waiting. - // This allows continue working with the queue from other threads. - // TODO: this currently exhibits some issues in the driver, so - // we control this with an env var. Remove this control when - // we settle one way or the other. - const char *UrRet = std::getenv("UR_L0_QUEUE_FINISH_HOLD_LOCK"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK"); - const bool HoldLock = - UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0); - - if (!HoldLock) { - Lock.unlock(); - } - - for (auto &ZeQueue : ZeQueues) { - if (ZeQueue) - ZE_CALL(zeHostSynchronize, (ZeQueue)); - } - - // Prevent unneeded already finished events to show up in the wait list. - // We can only do so if nothing else was submitted to the queue - // while we were synchronizing it. - if (!HoldLock) { - std::scoped_lock Lock(Queue->Mutex); - if (LastCommandEvent == Queue->LastCommandEvent) { - Queue->LastCommandEvent = nullptr; - } - } else { - Queue->LastCommandEvent = nullptr; - } - } - // Reset signalled command lists and return them back to the cache of - // available command lists. Events in the immediate command lists are cleaned - // up in synchronize(). - if (!Queue->UsingImmCmdLists) { - std::unique_lock Lock(Queue->Mutex); - resetCommandLists(Queue); - } - return PI_SUCCESS; -} +pi_result piQueueFinish(pi_queue Queue) { return pi2ur::piQueueFinish(Queue); } -// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList, -// so this can be left as a no-op. -pi_result piQueueFlush(pi_queue Queue) { - (void)Queue; - return PI_SUCCESS; -} +pi_result piQueueFlush(pi_queue Queue) { return pi2ur::piQueueFlush(Queue); } pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandleDesc, PI_ERROR_INVALID_VALUE); - - // Lock automatically releases when this goes out of scope. - std::shared_lock lock(Queue->Mutex); - - // Get handle to this thread's queue group. 
-  auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
-
-  if (Queue->UsingImmCmdLists) {
-    auto ZeCmdList = ur_cast<ze_command_list_handle_t *>(NativeHandle);
-    // Extract the Level Zero command list handle from the given PI queue
-    *ZeCmdList = QueueGroup.getImmCmdList()->first;
-    *NativeHandleDesc = true;
-  } else {
-    auto ZeQueue = ur_cast<ze_command_queue_handle_t *>(NativeHandle);
-    // Extract a Level Zero compute queue handle from the given PI queue
-    uint32_t QueueGroupOrdinalUnused;
-    *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
-    *NativeHandleDesc = false;
-  }
-  return PI_SUCCESS;
-}
+                                    pi_native_handle *NativeHandle) {
 
-void _pi_queue::pi_queue_group_t::setImmCmdList(
-    ze_command_list_handle_t ZeCommandList) {
-  // An immediate command list was given to us but we don't have the queue
-  // descriptor information. Create a dummy and note that it is not recyclable.
-  ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
-  ImmCmdLists = std::vector<pi_command_list_ptr_t>(
-      1,
-      Queue->CommandListMap
-          .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
-              ZeCommandList,
-              {nullptr, true, false, nullptr, ZeQueueDesc, false}})
-          .first);
+  return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle);
 }
 
 pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
-                                           int32_t NativeHandleDesc,
                                            pi_context Context,
                                            pi_device Device,
                                            bool OwnNativeHandle,
-                                           pi_queue_properties *Properties,
                                            pi_queue *Queue) {
-  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
-  PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
-
-  // The NativeHandleDesc has value 1 if the native handle is an immediate
-  // command list.
-  if (NativeHandleDesc == 1) {
-    std::vector<ze_command_queue_handle_t> ComputeQueues{nullptr};
-    std::vector<ze_command_queue_handle_t> CopyQueues;
-
-    *Queue = new _pi_queue(ComputeQueues, CopyQueues, Context, Device,
-                           OwnNativeHandle, Properties[1]);
-    auto &InitialGroup = (*Queue)->ComputeQueueGroupsByTID.begin()->second;
-    InitialGroup.setImmCmdList(
-        ur_cast<ze_command_list_handle_t>(NativeHandle));
-  } else {
-    auto ZeQueue = ur_cast<ze_command_queue_handle_t>(NativeHandle);
-    // Assume this is the "0" index queue in the compute command-group.
-    std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
-
-    // TODO: see what we can do to correctly initialize PI queue for
-    // compute vs. copy Level-Zero queue. Currently we will send
-    // all commands to the "ZeQueue".
-    std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
-
-    *Queue = new _pi_queue(ZeQueues, ZeroCopyQueues, Context, Device,
-                           OwnNativeHandle, Properties[1]);
-  }
-  (*Queue)->UsingImmCmdLists = (NativeHandleDesc == 1);
-  return PI_SUCCESS;
-}
-
-// If indirect access tracking is enabled then performs reference counting,
-// otherwise just calls zeMemAllocDevice.
-static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
-                                        pi_device Device, size_t Size) {
-  pi_platform Plt = Device->Platform;
-  std::unique_lock ContextsLock(Plt->ContextsMutex,
-                                std::defer_lock);
-  if (IndirectAccessTrackingEnabled) {
-    // Lock the mutex guarding the contexts container in the platform.
-    // This prevents new kernels from being submitted in any context while
-    // we are in the process of allocating memory; this is needed to
-    // properly capture allocations by kernels with indirect access.
-    ContextsLock.lock();
-    // We are going to defer memory release if there are kernels with
-    // indirect access; that is why we explicitly retain the context to be
-    // sure that it is released only after all memory allocations in this
-    // context are released.
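The retain described in the comment above is the very next call; the same pattern repeats in ZeHostMemAllocHelper and in the matching free path later in the file. A tiny standalone model of that ownership rule, with invented types, just to make the alloc/free pairing explicit:

```cpp
#include <cassert>
#include <unordered_set>

// Toy model: with indirect-access tracking, every allocation retains its
// context and every free releases that reference, so the context cannot be
// destroyed while a kernel might still dereference one of its allocations.
struct ToyContext {
  int RefCount = 1; // the application's own reference
  std::unordered_set<void *> MemAllocs;

  void onAlloc(void *Ptr) {
    ++RefCount;            // piContextRetain in the real helper
    MemAllocs.insert(Ptr); // Context->MemAllocs bookkeeping
  }
  void onFree(void *Ptr) {
    assert(MemAllocs.erase(Ptr) == 1);
    --RefCount;            // matching release on the free path
  }
};
```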
- PI_CALL(piContextRetain(Context)); - } - - ze_device_mem_alloc_desc_t ZeDesc = {}; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - ZE_CALL(zeMemAllocDevice, - (Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr)); - - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return PI_SUCCESS; -} - -// If indirect access tracking is enabled then performs reference counting, -// otherwise just calls zeMemAllocHost. -static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context, - size_t Size) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - // Lock the mutex which is guarding contexts container in the platform. - // This prevents new kernels from being submitted in any context while - // we are in the process of allocating a memory, this is needed to - // properly capture allocations by kernels with indirect access. - ContextsLock.lock(); - // We are going to defer memory release if there are kernels with - // indirect access, that is why explicitly retain context to be sure - // that it is released after all memory allocations in this context are - // released. - PI_CALL(piContextRetain(Context)); - } - - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr)); - - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return PI_SUCCESS; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); } pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, void *HostPtr, pi_mem *RetMem, const pi_mem_properties *properties) { - - // TODO: implement support for more access modes - if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || - (Flags & PI_MEM_ACCESS_READ_ONLY))) { - die("piMemBufferCreate: Level-Zero supports read-write and read-only " - "buffer," - "but not other accesses (such as write-only) yet."); - } - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); - - if (properties != nullptr) { - die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); - } - - if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for buffer requires allocation of - // pinned host memory, see: - // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc - // We are however missing such functionality in Level Zero, so we just - // ignore the flag for now. - // - } - - // If USM Import feature is enabled and hostptr is supplied, - // import the hostptr if not already imported into USM. - // Data transfer rate is maximized when both source and destination - // are USM pointers. Promotion of the host pointer to USM thus - // optimizes data transfer performance. 
- bool HostPtrImported = false; - if (ZeUSMImport.Enabled && HostPtr != nullptr && - (Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0) { - // Query memory type of the host pointer - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, HostPtr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // If not shared of any type, we can import the ptr - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { - // Promote the host ptr to USM host memory - ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver; - ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size); - HostPtrImported = true; - } - } - - pi_buffer Buffer = nullptr; - auto HostPtrOrNull = - (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? ur_cast(HostPtr) : nullptr; - try { - Buffer = new _pi_buffer(Context, Size, HostPtrOrNull, HostPtrImported); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Initialize the buffer with user data - if (HostPtr) { - if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || - (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { - - // We don't yet know which device needs this buffer, so make the first - // device in the context be the master, and hold the initial valid - // allocation. - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, - Context->Devices[0])); - if (Buffer->OnHost) { - // Do a host to host copy. - // For an imported HostPtr the copy is unneeded. - if (!HostPtrImported) - memcpy(ZeHandleDst, HostPtr, Size); - } else { - // Initialize the buffer synchronously with immediate offload - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size, - nullptr, 0, nullptr)); - } - } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) { - // Nothing more to do. - } else { - die("piMemBufferCreate: not implemented"); - } - } - - *RetMem = Buffer; - return PI_SUCCESS; + return pi2ur::piMemBufferCreate(Context, Flags, Size, HostPtr, RetMem, + properties); } pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); - // piMemImageGetInfo must be used for images, except for shared params (like - // Context, AccessMode, etc) - PI_ASSERT(ParamName == PI_MEM_CONTEXT || !Mem->isImage(), - PI_ERROR_INVALID_VALUE); - - std::shared_lock Lock(Mem->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - switch (ParamName) { - case PI_MEM_CONTEXT: - return ReturnValue(Mem->Context); - case PI_MEM_SIZE: { - // Get size of the allocation - auto Buffer = ur_cast(Mem); - return ReturnValue(size_t{Buffer->Size}); - } - default: - die("piMemGetInfo: Parameter is not implemented"); - } - - return {}; -} - -pi_result piMemRetain(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - Mem->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piMemGetInfo(Mem, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. 
-static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) { - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - ContextsLock.lock(); - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return PI_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); +pi_result piMemRetain(pi_mem Mem) { return pi2ur::piMemRetain(Mem); } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - - return PI_SUCCESS; -} - -static pi_result USMFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle = true); - -pi_result piMemRelease(pi_mem Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - - if (!Mem->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (Mem->isImage()) { - char *ZeHandleImage; - auto Image = static_cast(Mem); - if (Image->OwnZeMemHandle) { - PI_CALL(Mem->getZeHandle(ZeHandleImage, _pi_mem::write_only)); - auto ZeResult = ZE_CALL_NOCHECK( - zeImageDestroy, (ur_cast(ZeHandleImage))); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - } else { - auto Buffer = static_cast(Mem); - Buffer->free(); - } - delete Mem; - - return PI_SUCCESS; -} - -static pi_result pi2zeImageDesc(const pi_image_format *ImageFormat, - const pi_image_desc *ImageDesc, - ZeStruct &ZeImageDesc) { - ze_image_format_type_t ZeImageFormatType; - size_t ZeImageFormatTypeSize; - switch (ImageFormat->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 32; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; - ZeImageFormatTypeSize = 8; - break; - case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; - ZeImageFormatTypeSize = 16; - break; - case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: - ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; - ZeImageFormatTypeSize = 8; - break; - default: - urPrint("piMemImageCreate: unsupported image data type: data type = %d\n", - 
ImageFormat->image_channel_data_type); - return PI_ERROR_INVALID_VALUE; - } - - // TODO: populate the layout mapping - ze_image_format_layout_t ZeImageFormatLayout; - switch (ImageFormat->image_channel_order) { - case PI_IMAGE_CHANNEL_ORDER_RGBA: - switch (ZeImageFormatTypeSize) { - case 8: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; - break; - case 16: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; - break; - case 32: - ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; - break; - default: - urPrint("piMemImageCreate: unexpected data type Size\n"); - return PI_ERROR_INVALID_VALUE; - } - break; - default: - urPrint("format layout = %d\n", ImageFormat->image_channel_order); - die("piMemImageCreate: unsupported image format layout\n"); - break; - } - - ze_image_format_t ZeFormatDesc = { - ZeImageFormatLayout, ZeImageFormatType, - // TODO: are swizzles deducted from image_format->image_channel_order? - ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G, - ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A}; - - ze_image_type_t ZeImageType; - switch (ImageDesc->image_type) { - case PI_MEM_TYPE_IMAGE1D: - ZeImageType = ZE_IMAGE_TYPE_1D; - break; - case PI_MEM_TYPE_IMAGE2D: - ZeImageType = ZE_IMAGE_TYPE_2D; - break; - case PI_MEM_TYPE_IMAGE3D: - ZeImageType = ZE_IMAGE_TYPE_3D; - break; - case PI_MEM_TYPE_IMAGE1D_ARRAY: - ZeImageType = ZE_IMAGE_TYPE_1DARRAY; - break; - case PI_MEM_TYPE_IMAGE2D_ARRAY: - ZeImageType = ZE_IMAGE_TYPE_2DARRAY; - break; - default: - urPrint("piMemImageCreate: unsupported image type\n"); - return PI_ERROR_INVALID_VALUE; - } - - ZeImageDesc.arraylevels = 0; - ZeImageDesc.flags = 0; - ZeImageDesc.type = ZeImageType; - ZeImageDesc.format = ZeFormatDesc; - ZeImageDesc.width = ur_cast(ImageDesc->image_width); - ZeImageDesc.height = ur_cast(ImageDesc->image_height); - ZeImageDesc.depth = ur_cast(ImageDesc->image_depth); - ZeImageDesc.arraylevels = ur_cast(ImageDesc->image_array_size); - ZeImageDesc.miplevels = ImageDesc->num_mip_levels; - - return PI_SUCCESS; -} +pi_result piMemRelease(pi_mem Mem) { return pi2ur::piMemRelease(Mem); } pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - - ZeStruct ZeImageDesc; - pi_result DescriptionResult = - pi2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); - if (DescriptionResult != PI_SUCCESS) - return DescriptionResult; - - std::shared_lock Lock(Context->Mutex); - - // Currently we have the "0" device in context with mutliple root devices to - // own the image. - // TODO: Implement explicit copying for acessing the image from other devices - // in the context. - pi_device Device = Context->SingleRootDevice ? 
Context->SingleRootDevice - : Context->Devices[0]; - ze_image_handle_t ZeHImage; - ZE_CALL(zeImageCreate, - (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeHImage)); - - try { - auto ZePIImage = new _pi_image(Context, ZeHImage, /*OwnNativeHandle=*/true); - *RetImage = ZePIImage; - -#ifndef NDEBUG - ZePIImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 || - (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) { - // Initialize image synchronously with immediate offload. - // zeCommandListAppendImageCopyFromMemory must not be called from - // simultaneous threads with the same command list handle, so we need - // exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendImageCopyFromMemory, - (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr, - 0, nullptr)); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + + return pi2ur::piMemImageCreate(Context, Flags, ImageFormat, ImageDesc, + HostPtr, RetImage); } pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - std::shared_lock Guard(Mem->Mutex); - char *ZeHandle; - PI_CALL(Mem->getZeHandle(ZeHandle, _pi_mem::read_write)); - *NativeHandle = ur_cast(ZeHandle); - return PI_SUCCESS; + return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, bool ownNativeHandle, pi_mem *Mem) { - PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - - // Get base of the allocation - void *Base; - size_t Size; - void *Ptr = ur_cast(NativeHandle); - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size)); - PI_ASSERT(Ptr == Base, PI_ERROR_INVALID_VALUE); - - ZeStruct ZeMemProps; - ze_device_handle_t ZeDevice = nullptr; - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice)); - - // Check type of the allocation - switch (ZeMemProps.type) { - case ZE_MEMORY_TYPE_HOST: - case ZE_MEMORY_TYPE_SHARED: - case ZE_MEMORY_TYPE_DEVICE: - break; - case ZE_MEMORY_TYPE_UNKNOWN: - // Memory allocation is unrelated to the context - return PI_ERROR_INVALID_CONTEXT; - default: - die("Unexpected memory type"); - } - - pi_device Device = nullptr; - if (ZeDevice) { - Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice); - PI_ASSERT(Context->isValidDevice(Device), PI_ERROR_INVALID_CONTEXT); - } - - try { - *Mem = new _pi_buffer(Context, Size, Device, ur_cast(NativeHandle), - ownNativeHandle); - - pi_platform Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - // If we don't own the native handle then we can't control deallocation of - // that memory so there is no point of keeping track of the memory - // allocation for deferred memory release in the mode when indirect access - // tracking is enabled. - if (IndirectAccessTrackingEnabled && ownNativeHandle) { - // We need to keep track of all memory allocations in the context - ContextsLock.lock(); - // Retain context to be sure that it is released after all memory - // allocations in this context are released. 
- PI_CALL(piContextRetain(Context)); - - Context->MemAllocs.emplace( - std::piecewise_construct, std::forward_as_tuple(Ptr), - std::forward_as_tuple(Context, ownNativeHandle)); - } - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - // Initialize the buffer as necessary - auto Buffer = ur_cast(*Mem); - if (Device) { - // If this allocation is on a device, then we re-use it for the buffer. - // Nothing to do. - } else if (Buffer->OnHost) { - // If this is host allocation and buffer always stays on host there - // nothing more to do. - } else { - // In all other cases (shared allocation, or host allocation that cannot - // represent the buffer in this context) copy the data to a newly - // created device allocation. - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Device)); - - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0, - nullptr)); - } - - return PI_SUCCESS; + return pi2ur::piextMemCreateWithNativeHandle(NativeHandle, Context, + ownNativeHandle, Mem); } pi_result piextMemImageCreateWithNativeHandle( pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, - [[maybe_unused]] const pi_image_format *ImageFormat, - [[maybe_unused]] const pi_image_desc *ImageDesc, pi_mem *RetImage) { + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *RetImage) { PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); @@ -3656,7 +241,7 @@ pi_result piextMemImageCreateWithNativeHandle( std::shared_lock Lock(Context->Mutex); - ze_image_handle_t ZeHImage = ur_cast(NativeHandle); + ze_image_handle_t ZeHImage = pi_cast(NativeHandle); try { auto ZePIImage = new _pi_image(Context, ZeHImage, OwnNativeHandle); @@ -3683,22 +268,7 @@ pi_result piextMemImageCreateWithNativeHandle( pi_result piProgramCreate(pi_context Context, const void *ILBytes, size_t Length, pi_program *Program) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - // NOTE: the Level Zero module creation is also building the program, so we - // are deferring it until the program is ready to be built. - - try { - *Program = new _pi_program(_pi_program::IL, Context, ILBytes, Length); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); } pi_result piProgramCreateWithBinary( @@ -3706,168 +276,26 @@ pi_result piProgramCreateWithBinary( const size_t *Lengths, const unsigned char **Binaries, size_t NumMetadataEntries, const pi_device_binary_property *Metadata, pi_int32 *BinaryStatus, pi_program *Program) { - (void)Metadata; - (void)NumMetadataEntries; - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - // For now we support only one device. 
- if (NumDevices != 1) { - urPrint("piProgramCreateWithBinary: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - if (!Binaries[0] || !Lengths[0]) { - if (BinaryStatus) - *BinaryStatus = PI_ERROR_INVALID_VALUE; - return PI_ERROR_INVALID_VALUE; - } - - size_t Length = Lengths[0]; - auto Binary = Binaries[0]; - - // In OpenCL, clCreateProgramWithBinary() can be used to load any of the - // following: "program executable", "compiled program", or "library of - // compiled programs". In addition, the loaded program can be either - // IL (SPIR-v) or native device code. For now, we assume that - // piProgramCreateWithBinary() is only used to load a "program executable" - // as native device code. - // If we wanted to support all the same cases as OpenCL, we would need to - // somehow examine the binary image to distinguish the cases. Alternatively, - // we could change the PI interface and have the caller pass additional - // information to distinguish the cases. - - try { - *Program = new _pi_program(_pi_program::Native, Context, Binary, Length); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (BinaryStatus) - *BinaryStatus = PI_SUCCESS; - return PI_SUCCESS; + return pi2ur::piProgramCreateWithBinary(Context, NumDevices, DeviceList, + Lengths, Binaries, NumMetadataEntries, + Metadata, BinaryStatus, Program); } pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count, const char **Strings, const size_t *Lengths, pi_program *RetProgram) { - - (void)Context; - (void)Count; - (void)Strings; - (void)Lengths; - (void)RetProgram; - urPrint("piclProgramCreateWithSource: not supported in Level Zero\n"); - return PI_ERROR_INVALID_OPERATION; + return pi2ur::piclProgramCreateWithSource(Context, Count, Strings, Lengths, + RetProgram); } pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Program->RefCount.load()}); - case PI_PROGRAM_INFO_NUM_DEVICES: - // TODO: return true number of devices this program exists for. - return ReturnValue(pi_uint32{1}); - case PI_PROGRAM_INFO_DEVICES: - // TODO: return all devices this program exists for. - return ReturnValue(Program->Context->Devices[0]); - case PI_PROGRAM_INFO_BINARY_SIZES: { - std::shared_lock Guard(Program->Mutex); - size_t SzBinary; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - SzBinary = Program->CodeLength; - } else if (Program->State == _pi_program::Exe) { - ZE_CALL(zeModuleGetNativeBinary, (Program->ZeModule, &SzBinary, nullptr)); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - // This is an array of 1 element, initialized as if it were scalar. - return ReturnValue(size_t{SzBinary}); - } - case PI_PROGRAM_INFO_BINARIES: { - // The caller sets "ParamValue" to an array of pointers, one for each - // device. Since Level Zero supports only one device, there is only one - // pointer. If the pointer is NULL, we don't do anything. Otherwise, we - // copy the program's binary image to the buffer at that pointer. 
- uint8_t **PBinary = ur_cast(ParamValue); - if (!PBinary[0]) - break; - - std::shared_lock Guard(Program->Mutex); - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); - } else if (Program->State == _pi_program::Exe) { - size_t SzBinary = 0; - ZE_CALL(zeModuleGetNativeBinary, - (Program->ZeModule, &SzBinary, PBinary[0])); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - break; - } - case PI_PROGRAM_INFO_NUM_KERNELS: { - std::shared_lock Guard(Program->Mutex); - uint32_t NumKernels; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } else if (Program->State == _pi_program::Exe) { - NumKernels = 0; - ZE_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &NumKernels, nullptr)); - } else { - return PI_ERROR_INVALID_PROGRAM; - } - return ReturnValue(size_t{NumKernels}); - } - case PI_PROGRAM_INFO_KERNEL_NAMES: - try { - std::shared_lock Guard(Program->Mutex); - std::string PINames{""}; - if (Program->State == _pi_program::IL || - Program->State == _pi_program::Native || - Program->State == _pi_program::Object) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } else if (Program->State == _pi_program::Exe) { - uint32_t Count = 0; - ZE_CALL(zeModuleGetKernelNames, (Program->ZeModule, &Count, nullptr)); - std::unique_ptr PNames(new const char *[Count]); - ZE_CALL(zeModuleGetKernelNames, - (Program->ZeModule, &Count, PNames.get())); - for (uint32_t I = 0; I < Count; ++I) { - PINames += (I > 0 ? ";" : ""); - PINames += PNames[I]; - } - } else { - return PI_ERROR_INVALID_PROGRAM; - } - return ReturnValue(PINames.c_str()); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - default: - die("piProgramGetInfo: not implemented"); - } - - return PI_SUCCESS; + return pi2ur::piProgramGetInfo(Program, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, @@ -3876,169 +304,9 @@ pi_result piProgramLink(pi_context Context, pi_uint32 NumDevices, const pi_program *InputPrograms, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData, pi_program *RetProgram) { - // We only support one device with Level Zero currently. - if (NumDevices != 1) { - urPrint("piProgramLink: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - - // We do not support any link flags at this time because the Level Zero API - // does not have any way to pass flags that are specific to linking. - if (Options && *Options != '\0') { - std::string ErrorMessage( - "Level Zero does not support kernel link flags: \""); - ErrorMessage.append(Options); - ErrorMessage.push_back('\"'); - pi_program Program = - new _pi_program(_pi_program::Invalid, Context, ErrorMessage); - *RetProgram = Program; - return PI_ERROR_LINK_PROGRAM_FAILURE; - } - - // Validate input parameters. 
- PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE); - PI_ASSERT(Context->isValidDevice(DeviceList[0]), PI_ERROR_INVALID_DEVICE); - PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - if (NumInputPrograms == 0 || InputPrograms == nullptr) - return PI_ERROR_INVALID_VALUE; - - pi_result PiResult = PI_SUCCESS; - try { - // Acquire a "shared" lock on each of the input programs, and also validate - // that they are all in Object state. - // - // There is no danger of deadlock here even if two threads call - // piProgramLink simultaneously with the same input programs in a different - // order. If we were acquiring these with "exclusive" access, this could - // lead to a classic lock ordering deadlock. However, there is no such - // deadlock potential with "shared" access. There could also be a deadlock - // potential if there was some other code that holds more than one of these - // locks simultaneously with "exclusive" access. However, there is no such - // code like that, so this is also not a danger. - std::vector> Guards(NumInputPrograms); - for (pi_uint32 I = 0; I < NumInputPrograms; I++) { - std::shared_lock Guard(InputPrograms[I]->Mutex); - Guards[I].swap(Guard); - if (InputPrograms[I]->State != _pi_program::Object) { - return PI_ERROR_INVALID_OPERATION; - } - } - - // Previous calls to piProgramCompile did not actually compile the SPIR-V. - // Instead, we postpone compilation until this point, when all the modules - // are linked together. By doing compilation and linking together, the JIT - // compiler is able see all modules and do cross-module optimizations. - // - // Construct a ze_module_program_exp_desc_t which contains information about - // all of the modules that will be linked together. - ZeStruct ZeExtModuleDesc; - std::vector CodeSizes(NumInputPrograms); - std::vector CodeBufs(NumInputPrograms); - std::vector BuildFlagPtrs(NumInputPrograms); - std::vector SpecConstPtrs(NumInputPrograms); - std::vector<_pi_program::SpecConstantShim> SpecConstShims; - SpecConstShims.reserve(NumInputPrograms); - - for (pi_uint32 I = 0; I < NumInputPrograms; I++) { - pi_program Program = InputPrograms[I]; - CodeSizes[I] = Program->CodeLength; - CodeBufs[I] = Program->Code.get(); - BuildFlagPtrs[I] = Program->BuildFlags.c_str(); - SpecConstShims.emplace_back(Program); - SpecConstPtrs[I] = SpecConstShims[I].ze(); - } - - ZeExtModuleDesc.count = NumInputPrograms; - ZeExtModuleDesc.inputSizes = CodeSizes.data(); - ZeExtModuleDesc.pInputModules = CodeBufs.data(); - ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); - ZeExtModuleDesc.pConstants = SpecConstPtrs.data(); - - ZeStruct ZeModuleDesc; - ZeModuleDesc.pNext = &ZeExtModuleDesc; - ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; - - // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1", - // the driver does validation of the API calls, and it expects - // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This - // validation is wrong when using the "ze_module_program_exp_desc_t" - // extension because those fields are supposed to be ignored. As a - // workaround, set both fields to 1. - // - // TODO: Remove this workaround when the driver is fixed. - ZeModuleDesc.pInputModule = reinterpret_cast(1); - ZeModuleDesc.inputSize = 1; - - // We need a Level Zero extension to compile multiple programs together into - // a single Level Zero module. However, we don't need that extension if - // there happens to be only one input program. 
- // - // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the - // Level Zero driver. The driver's "ze_module_program_exp_desc_t" - // extension should work even in the case when there is just one input - // module. However, there is currently a bug in the driver that leads to a - // crash. As a workaround, do not use the extension when there is one - // input module. - // - // TODO: Remove this workaround when the driver is fixed. - if (!DeviceList[0]->Platform->ZeDriverModuleProgramExtensionFound || - (NumInputPrograms == 1)) { - if (NumInputPrograms == 1) { - ZeModuleDesc.pNext = nullptr; - ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0]; - ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0]; - ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0]; - ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0]; - } else { - urPrint("piProgramLink: level_zero driver does not have static linking " - "support."); - return PI_ERROR_INVALID_VALUE; - } - } - - // Call the Level Zero API to compile, link, and create the module. - ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice; - ze_context_handle_t ZeContext = Context->ZeContext; - ze_module_handle_t ZeModule = nullptr; - ze_module_build_log_handle_t ZeBuildLog = nullptr; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &ZeBuildLog)); - - // We still create a _pi_program object even if there is a BUILD_FAILURE - // because we need the object to hold the ZeBuildLog. There is no build - // log created for other errors, so we don't create an object. - PiResult = mapError(ZeResult); - if (ZeResult != ZE_RESULT_SUCCESS && - ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { - return PiResult; - } - - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with piProgramLink - // are supposed to be fully linked and ready to use. Therefore, do an extra - // check now for unresolved symbols. Note that we still create a - // _pi_program if there are unresolved symbols because the ZeBuildLog tells - // which symbols are unresolved. - if (ZeResult == ZE_RESULT_SUCCESS) { - ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); - if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { - PiResult = PI_ERROR_LINK_PROGRAM_FAILURE; - } else if (ZeResult != ZE_RESULT_SUCCESS) { - return mapError(ZeResult); - } - } - - _pi_program::state State = - (PiResult == PI_SUCCESS) ? _pi_program::Exe : _pi_program::Invalid; - *RetProgram = new _pi_program(State, Context, ZeModule, ZeBuildLog); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PiResult; + return pi2ur::piProgramLink(Context, NumDevices, DeviceList, Options, + NumInputPrograms, InputPrograms, PFnNotify, + UserData, RetProgram); } pi_result piProgramCompile( @@ -4046,532 +314,92 @@ pi_result piProgramCompile( const char *Options, pi_uint32 NumInputHeaders, const pi_program *InputHeaders, const char **HeaderIncludeNames, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { - (void)NumInputHeaders; - (void)InputHeaders; - (void)HeaderIncludeNames; - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) - return PI_ERROR_INVALID_VALUE; - - // These aren't supported. 
- PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - - std::scoped_lock Guard(Program->Mutex); - - // It's only valid to compile a program created from IL (we don't support - // programs created from source code). - // - // The OpenCL spec says that the header parameters are ignored when compiling - // IL programs, so we don't validate them. - if (Program->State != _pi_program::IL) - return PI_ERROR_INVALID_OPERATION; - - // We don't compile anything now. Instead, we delay compilation until - // piProgramLink, where we do both compilation and linking as a single step. - // This produces better code because the driver can do cross-module - // optimizations. Therefore, we just remember the compilation flags, so we - // can use them later. - if (Options) - Program->BuildFlags = Options; - Program->State = _pi_program::Object; - - return PI_SUCCESS; + return pi2ur::piProgramCompile(Program, NumDevices, DeviceList, Options, + NumInputHeaders, InputHeaders, + HeaderIncludeNames, PFnNotify, UserData); } pi_result piProgramBuild(pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, const char *Options, void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { - - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) - return PI_ERROR_INVALID_VALUE; - - // We only support build to one device with Level Zero now. - // TODO: we should eventually build to the possibly multiple root - // devices in the context. - if (NumDevices != 1) { - urPrint("piProgramBuild: level_zero supports only one device."); - return PI_ERROR_INVALID_VALUE; - } - - // These aren't supported. - PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); - - std::scoped_lock Guard(Program->Mutex); - // Check if device belongs to associated context. - PI_ASSERT(Program->Context, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(Program->Context->isValidDevice(DeviceList[0]), - PI_ERROR_INVALID_VALUE); - - // It is legal to build a program created from either IL or from native - // device code. - if (Program->State != _pi_program::IL && - Program->State != _pi_program::Native) - return PI_ERROR_INVALID_OPERATION; - - // We should have either IL or native device code. - PI_ASSERT(Program->Code, PI_ERROR_INVALID_PROGRAM); - - // Ask Level Zero to build and load the native code onto the device. - ZeStruct ZeModuleDesc; - _pi_program::SpecConstantShim Shim(Program); - ZeModuleDesc.format = (Program->State == _pi_program::IL) - ? ZE_MODULE_FORMAT_IL_SPIRV - : ZE_MODULE_FORMAT_NATIVE; - ZeModuleDesc.inputSize = Program->CodeLength; - ZeModuleDesc.pInputModule = Program->Code.get(); - ZeModuleDesc.pBuildFlags = Options; - ZeModuleDesc.pConstants = Shim.ze(); - - ze_device_handle_t ZeDevice = DeviceList[0]->ZeDevice; - ze_context_handle_t ZeContext = Program->Context->ZeContext; - ze_module_handle_t ZeModule = nullptr; - - pi_result Result = PI_SUCCESS; - Program->State = _pi_program::Exe; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &Program->ZeBuildLog)); - if (ZeResult != ZE_RESULT_SUCCESS) { - // We adjust pi_program below to avoid attempting to release zeModule when - // RT calls piProgramRelease(). 
- Program->State = _pi_program::Invalid; - Result = mapError(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; - } - } else { - // The call to zeModuleCreate does not report an error if there are - // unresolved symbols because it thinks these could be resolved later via a - // call to zeModuleDynamicLink. However, modules created with - // piProgramBuild are supposed to be fully linked and ready to use. - // Therefore, do an extra check now for unresolved symbols. - ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog); - if (ZeResult != ZE_RESULT_SUCCESS) { - Program->State = _pi_program::Invalid; - Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) - ? PI_ERROR_BUILD_PROGRAM_FAILURE - : mapError(ZeResult); - if (ZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - ZeModule = nullptr; - } - } - } - - // We no longer need the IL / native code. - Program->Code.reset(); - Program->ZeModule = ZeModule; - return Result; + return pi2ur::piProgramBuild(Program, NumDevices, DeviceList, Options, + PFnNotify, UserData); } pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, pi_program_build_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - - std::shared_lock Guard(Program->Mutex); - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - if (ParamName == PI_PROGRAM_BUILD_INFO_BINARY_TYPE) { - pi_program_binary_type Type = PI_PROGRAM_BINARY_TYPE_NONE; - if (Program->State == _pi_program::Object) { - Type = PI_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; - } else if (Program->State == _pi_program::Exe) { - Type = PI_PROGRAM_BINARY_TYPE_EXECUTABLE; - } - return ReturnValue(pi_program_binary_type{Type}); - } - if (ParamName == PI_PROGRAM_BUILD_INFO_OPTIONS) { - // TODO: how to get module build options out of Level Zero? - // For the programs that we compiled we can remember the options - // passed with piProgramCompile/piProgramBuild, but what can we - // return for programs that were built outside and registered - // with piProgramRegister? - return ReturnValue(""); - } else if (ParamName == PI_PROGRAM_BUILD_INFO_LOG) { - // Check first to see if the plugin code recorded an error message. - if (!Program->ErrorMessage.empty()) { - return ReturnValue(Program->ErrorMessage.c_str()); - } - - // Next check if there is a Level Zero build log. - if (Program->ZeBuildLog) { - size_t LogSize = ParamValueSize; - ZE_CALL(zeModuleBuildLogGetString, - (Program->ZeBuildLog, &LogSize, ur_cast(ParamValue))); - if (ParamValueSizeRet) { - *ParamValueSizeRet = LogSize; - } - if (ParamValue) { - // When the program build fails in piProgramBuild(), we delayed cleaning - // up the build log because RT later calls this routine to get the - // failed build log. - // To avoid memory leaks, we should clean up the failed build log here - // because RT does not create sycl::program when piProgramBuild() fails, - // thus it won't call piProgramRelease() to clean up the build log. - if (Program->State == _pi_program::Invalid) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); - Program->ZeBuildLog = nullptr; - } - } - return PI_SUCCESS; - } - - // Otherwise, there is no error. The OpenCL spec says to return an empty - // string if there ws no previous attempt to compile, build, or link the - // program. 
- return ReturnValue(""); - } else { - urPrint("piProgramGetBuildInfo: unsupported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + + return pi2ur::piProgramGetBuildInfo(Program, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piProgramRetain(pi_program Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - Program->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piProgramRetain(Program); } pi_result piProgramRelease(pi_program Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - if (!Program->RefCount.decrementAndTest()) - return PI_SUCCESS; - - delete Program; - - return PI_SUCCESS; + return pi2ur::piProgramRelease(Program); } pi_result piextProgramGetNativeHandle(pi_program Program, pi_native_handle *NativeHandle) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeModule = ur_cast(NativeHandle); - - std::shared_lock Guard(Program->Mutex); - switch (Program->State) { - case _pi_program::Exe: { - *ZeModule = Program->ZeModule; - break; - } - - default: - return PI_ERROR_INVALID_OPERATION; - } - - return PI_SUCCESS; + return pi2ur::piextProgramGetNativeHandle(Program, NativeHandle); } pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, - bool ownNativeHandle, + bool OwnNativeHandle, pi_program *Program) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - auto ZeModule = ur_cast(NativeHandle); - - // We assume here that programs created from a native handle always - // represent a fully linked executable (state Exe) and not an unlinked - // executable (state Object). - - try { - *Program = - new _pi_program(_pi_program::Exe, Context, ZeModule, ownNativeHandle); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; -} - -_pi_program::~_pi_program() { - // According to Level Zero Specification, all kernels and build logs - // must be destroyed before the Module can be destroyed. So, be sure - // to destroy build log before destroying the module. - if (ZeBuildLog) { - ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); - } - - if (ZeModule && OwnZeModule) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); - } -} - -// Check to see if a Level Zero module has any unresolved symbols. -// -// @param ZeModule The module handle to check. -// @param ZeBuildLog If there are unresolved symbols, this build log handle is -// modified to receive information telling which symbols -// are unresolved. -// -// @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved -// symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other -// value indicates there was an error and we cannot tell if symbols are -// resolved. -static ze_result_t -checkUnresolvedSymbols(ze_module_handle_t ZeModule, - ze_module_build_log_handle_t *ZeBuildLog) { - - // First check to see if the module has any imported symbols. If there are - // no imported symbols, it's not possible to have any unresolved symbols. We - // do this check first because we assume it's faster than the call to - // zeModuleDynamicLink below. 
- ZeStruct ZeModuleProps; - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps)); - if (ZeResult != ZE_RESULT_SUCCESS) - return ZeResult; - - // If there are imported symbols, attempt to "link" the module with itself. - // As a side effect, this will return the error - // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols. - if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) { - return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog)); - } - return ZE_RESULT_SUCCESS; + return pi2ur::piextProgramCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Program); } pi_result piKernelCreate(pi_program Program, const char *KernelName, pi_kernel *RetKernel) { - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE); - PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Program->Mutex); - if (Program->State != _pi_program::Exe) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } - - ZeStruct ZeKernelDesc; - ZeKernelDesc.flags = 0; - ZeKernelDesc.pKernelName = KernelName; - - ze_kernel_handle_t ZeKernel; - ZE_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel)); - - try { - *RetKernel = new _pi_kernel(ZeKernel, true, Program); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - PI_CALL((*RetKernel)->initialize()); - return PI_SUCCESS; -} - -pi_result _pi_kernel::initialize() { - // Retain the program and context to show it's used by this kernel. - PI_CALL(piProgramRetain(Program)); - if (IndirectAccessTrackingEnabled) - // TODO: do piContextRetain without the guard - PI_CALL(piContextRetain(Program->Context)); - - // Set up how to obtain kernel properties when needed. - ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) { - ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties)); - }; - - // Cache kernel name. - ZeKernelName.Compute = [this](std::string &Name) { - size_t Size = 0; - ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr)); - char *KernelName = new char[Size]; - ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName)); - Name = KernelName; - delete[] KernelName; - }; - - return PI_SUCCESS; + return pi2ur::piKernelCreate(Program, KernelName, RetKernel); } pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, const void *ArgValue) { - // OpenCL: "the arg_value pointer can be NULL or point to a NULL value - // in which case a NULL value will be used as the value for the argument - // declared as a pointer to global or constant memory in the kernel" - // - // We don't know the type of the argument but it seems that the only time - // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument - // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. - if (ArgSize == sizeof(void *) && ArgValue && - *(void **)(const_cast(ArgValue)) == nullptr) { - ArgValue = nullptr; - } - - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - std::scoped_lock Guard(Kernel->Mutex); - ZE_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), - ur_cast(ArgIndex), ur_cast(ArgSize), - ur_cast(ArgValue))); - - return PI_SUCCESS; + return pi2ur::piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue); } // Special version of piKernelSetArg to accept pi_mem. 
pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, const pi_mem *ArgValue) { - // TODO: the better way would probably be to add a new PI API for - // extracting native PI object from PI handle, and have SYCL - // RT pass that directly to the regular piKernelSetArg (and - // then remove this piextKernelSetArgMemObj). - - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - // We don't yet know the device where this kernel will next be run on. - // Thus we can't know the actual memory allocation that needs to be used. - // Remember the memory object being used as an argument for this kernel - // to process it later when the device is known (at the kernel enqueue). - // - // TODO: for now we have to conservatively assume the access as read-write. - // Improve that by passing SYCL buffer accessor type into - // piextKernelSetArgMemObj. - // - std::scoped_lock Guard(Kernel->Mutex); - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - auto Arg = ArgValue ? *ArgValue : nullptr; - Kernel->PendingArguments.push_back( - {ArgIndex, sizeof(void *), Arg, _pi_mem::read_write}); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgMemObj(Kernel, ArgIndex, ArgValue); } // Special version of piKernelSetArg to accept pi_sampler. pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex, const pi_sampler *ArgValue) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - std::scoped_lock Guard(Kernel->Mutex); - ZE_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), - ur_cast(ArgIndex), sizeof(void *), - &(*ArgValue)->ZeSampler)); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgSampler(Kernel, ArgIndex, ArgValue); } pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - switch (ParamName) { - case PI_KERNEL_INFO_CONTEXT: - return ReturnValue(pi_context{Kernel->Program->Context}); - case PI_KERNEL_INFO_PROGRAM: - return ReturnValue(pi_program{Kernel->Program}); - case PI_KERNEL_INFO_FUNCTION_NAME: - try { - std::string &KernelName = *Kernel->ZeKernelName.operator->(); - return ReturnValue(static_cast(KernelName.c_str())); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - case PI_KERNEL_INFO_NUM_ARGS: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->numKernelArgs}); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Kernel->RefCount.load()}); - case PI_KERNEL_INFO_ATTRIBUTES: - try { - uint32_t Size; - ZE_CALL(zeKernelGetSourceAttributes, (Kernel->ZeKernel, &Size, nullptr)); - char *attributes = new char[Size]; - ZE_CALL(zeKernelGetSourceAttributes, - (Kernel->ZeKernel, &Size, &attributes)); - auto Res = ReturnValue(attributes); - delete[] attributes; - return Res; - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - default: - urPrint("Unsupported ParamName in piKernelGetInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piKernelGetInfo(Kernel, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, pi_kernel_group_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - switch (ParamName) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - struct { - size_t Arr[3]; - } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * - Device->ZeDeviceComputeProperties->maxGroupCountX), - (Device->ZeDeviceComputeProperties->maxGroupSizeY * - Device->ZeDeviceComputeProperties->maxGroupCountY), - (Device->ZeDeviceComputeProperties->maxGroupSizeZ * - Device->ZeDeviceComputeProperties->maxGroupCountZ)}}; - return ReturnValue(GlobalWorkSize); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - // As of right now, L0 is missing API to query kernel and device specific - // max work group size. - return ReturnValue( - pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - struct { - size_t Arr[3]; - } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX, - Kernel->ZeKernelProperties->requiredGroupSizeY, - Kernel->ZeKernelProperties->requiredGroupSizeZ}}; - return ReturnValue(WgSize); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->localMemSize}); - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: - return ReturnValue(pi_uint32{Kernel->ZeKernelProperties->privateMemSize}); - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not " - "implemented\n"); - break; - } - default: - urPrint("Unknown ParamName in piKernelGetGroupInfo: ParamName=%d(0x%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + + return pi2ur::piKernelGetGroupInfo(Kernel, Device, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); } pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device, @@ -4579,57 +407,20 @@ pi_result piKernelGetSubGroupInfo(pi_kernel Kernel, pi_device Device, size_t InputValueSize, const void *InputValue, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Device; - (void)InputValueSize; - (void)InputValue; - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - std::shared_lock Guard(Kernel->Mutex); - if (ParamName == PI_KERNEL_MAX_SUB_GROUP_SIZE) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize}); - } else if (ParamName == PI_KERNEL_MAX_NUM_SUB_GROUPS) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups}); - } else if (ParamName == PI_KERNEL_COMPILE_NUM_SUB_GROUPS) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups}); - } else if (ParamName == PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL) { - ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize}); - } else { - die("piKernelGetSubGroupInfo: 
parameter not implemented"); - return {}; - } - return PI_SUCCESS; + + return pi2ur::piKernelGetSubGroupInfo( + Kernel, Device, ParamName, InputValueSize, InputValue, ParamValueSize, + ParamValue, ParamValueSizeRet); } pi_result piKernelRetain(pi_kernel Kernel) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - Kernel->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piKernelRetain(Kernel); } pi_result piKernelRelease(pi_kernel Kernel) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - - if (!Kernel->RefCount.decrementAndTest()) - return PI_SUCCESS; - - auto KernelProgram = Kernel->Program; - if (Kernel->OwnZeKernel) { - auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - } - if (IndirectAccessTrackingEnabled) { - PI_CALL(piContextRelease(KernelProgram->Context)); - } - // do a release on the program this kernel was part of - PI_CALL(piProgramRelease(KernelProgram)); - delete Kernel; - return PI_SUCCESS; + return pi2ur::piKernelRelease(Kernel); } pi_result @@ -4638,215 +429,9 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock( - Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); - if (GlobalWorkOffset != NULL) { - if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) { - urPrint("No global offset extension found on this driver\n"); - return PI_ERROR_INVALID_VALUE; - } - - ZE_CALL(zeKernelSetGlobalOffsetExp, - (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], - GlobalWorkOffset[2])); - } - - // If there are any pending arguments set them now. - for (auto &Arg : Kernel->PendingArguments) { - // The ArgValue may be a NULL pointer in which case a NULL value is used for - // the kernel argument declared as a pointer to global or constant memory. - char **ZeHandlePtr = nullptr; - if (Arg.Value) { - PI_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, - Queue->Device)); - } - ZE_CALL(zeKernelSetArgumentValue, - (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); - } - Kernel->PendingArguments.clear(); - - ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]; - - // global_work_size of unused dimensions must be set to 1 - PI_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, PI_ERROR_INVALID_VALUE); - PI_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, PI_ERROR_INVALID_VALUE); - - if (LocalWorkSize) { - WG[0] = ur_cast(LocalWorkSize[0]); - WG[1] = ur_cast(LocalWorkSize[1]); - WG[2] = ur_cast(LocalWorkSize[2]); - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. 
- bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE_CALL(zeKernelSuggestGroupSize, - (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], - GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. - uint32_t GroupSize[] = { - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]); - while (GlobalWorkSize[I] % GroupSize[I]) { - --GroupSize[I]; - } - if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { - urPrint("piEnqueueKernelLaunch: can't find a WG size " - "suitable for global work size > UINT32_MAX\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - urPrint("piEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n", - WG[0], WG[1], WG[2]); - } - } - - // TODO: assert if sizes do not fit into 32-bit? - switch (WorkDim) { - case 3: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize[1] / WG[1]); - ZeThreadGroupDimensions.groupCountZ = - ur_cast(GlobalWorkSize[2] / WG[2]); - break; - case 2: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - ur_cast(GlobalWorkSize[1] / WG[1]); - WG[2] = 1; - break; - case 1: - ZeThreadGroupDimensions.groupCountX = - ur_cast(GlobalWorkSize[0] / WG[0]); - WG[1] = WG[2] = 1; - break; - - default: - urPrint("piEnqueueKernelLaunch: unsupported work_dim\n"); - return PI_ERROR_INVALID_VALUE; - } - - // Error handling for non-uniform group size case - if (GlobalWorkSize[0] != - size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 1st dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize[1] != - size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 2nd dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - urPrint("piEnqueueKernelLaunch: invalid work_dim. The range is not a " - "multiple of the group size in the 3rd dimension\n"); - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); - - bool UseCopyEngine = false; - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, true /* AllowBatching */)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - pi_result Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_NDRANGE_KERNEL, CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - // Save the kernel in the event, so that when the event is signalled - // the code can do a piKernelRelease on this kernel. - (*Event)->CommandData = (void *)Kernel; - - // Increment the reference count of the Kernel and indicate that the Kernel is - // in use. Once the event has been signalled, the code in - // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the - // reference count on the kernel, using the kernel saved in CommandData. - PI_CALL(piKernelRetain(Kernel)); - - // Add to list of kernels to be submitted - if (IndirectAccessTrackingEnabled) - Queue->KernelsToBeSubmitted.push_back(Kernel); - - if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) { - // If using immediate commandlists then gathering of indirect - // references and appending to the queue (which means submission) - // must be done together. - std::unique_lock ContextsLock( - Queue->Device->Platform->ContextsMutex, std::defer_lock); - // We are going to submit kernels for execution. If indirect access flag is - // set for a kernel then we need to make a snapshot of existing memory - // allocations in all contexts in the platform. We need to lock the mutex - // guarding the list of contexts in the platform to prevent creation of new - // memory alocations in any context before we submit the kernel for - // execution. - ContextsLock.lock(); - Queue->CaptureIndirectAccesses(); - // Add the command to the command list, which implies submission. - ZE_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); - } else { - // Add the command to the command list for later submission. - // No lock is needed here, unlike the immediate commandlist case above, - // because the kernels are not actually submitted yet. Kernels will be - // submitted only when the comamndlist is closed. Then, a lock is held. - ZE_CALL(zeCommandListAppendLaunchKernel, - (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*Event)->WaitList.Length, - (*Event)->WaitList.ZeEventList)); - } - - urPrint("calling zeCommandListAppendLaunchKernel() with" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList((*Event)->WaitList); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. 
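The launch path retains the kernel and stores it in the event's CommandData so that the matching release happens only once the event completes. A miniature sketch of that ownership hand-off, using invented MiniKernel and MiniEvent types, since the same retain-at-enqueue / release-at-completion pattern recurs throughout the adapter:

#include <cassert>

// Invented miniature ref-counted kernel for the sketch.
struct MiniKernel {
  int RefCount = 1;
  void retain() { ++RefCount; }
  void release() { --RefCount; } // a real implementation destroys at zero
};

struct MiniEvent {
  MiniKernel *CommandData = nullptr; // kernel kept alive until completion
};

void enqueue(MiniEvent &E, MiniKernel &K) {
  K.retain();         // keep the kernel alive while the command is in flight
  E.CommandData = &K; // remembered so completion cleanup can find it
}

void onEventComplete(MiniEvent &E) {
  if (E.CommandData) {
    E.CommandData->release(); // balances the retain done at enqueue time
    E.CommandData = nullptr;
  }
}

int main() {
  MiniKernel K;
  MiniEvent E;
  enqueue(E, K);
  assert(K.RefCount == 2);
  onEventComplete(E);
  assert(K.RefCount == 1);
  return 0;
}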
- if (auto Res = Queue->executeCommandList(CommandList, false, true)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueKernelLaunch( + Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumEventsInWaitList, EventWaitList, OutEvent); } pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -4854,535 +439,42 @@ pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_program Program, bool OwnNativeHandle, pi_kernel *Kernel) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - auto ZeKernel = ur_cast(NativeHandle); - *Kernel = new _pi_kernel(ZeKernel, OwnNativeHandle, Program); - PI_CALL((*Kernel)->initialize()); - return PI_SUCCESS; + return pi2ur::piextKernelCreateWithNativeHandle( + NativeHandle, Context, Program, OwnNativeHandle, Kernel); } pi_result piextKernelGetNativeHandle(pi_kernel Kernel, pi_native_handle *NativeHandle) { - PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Kernel->Mutex); - auto *ZeKernel = ur_cast(NativeHandle); - *ZeKernel = Kernel->ZeKernel; - return PI_SUCCESS; + return pi2ur::piextKernelGetNativeHandle(Kernel, NativeHandle); } // // Events // -pi_result -_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_EVENT); - - std::scoped_lock Lock(Queue->Mutex, - this->Mutex); - - if (!HostVisibleEvent) { - if (Queue->Device->ZeEventsScope != OnDemandHostVisibleProxy) - die("getOrCreateHostVisibleEvent: missing host-visible event"); - - // Submit the command(s) signalling the proxy event to the queue. - // We have to first submit a wait for the device-only event for which this - // proxy is created. - // - // Get a new command list to be used on this call - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, false /* UseCopyEngine */, OkToBatch)) - return Res; - - // Create a "proxy" host-visible event. - auto Res = createEventAndAssociateQueue( - Queue, &HostVisibleEvent, PI_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); - if (Res != PI_SUCCESS) - return Res; - - ZE_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent)); - ZE_CALL(zeCommandListAppendSignalEvent, - (CommandList->first, HostVisibleEvent->ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) - return Res; - } - - ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; - return PI_SUCCESS; -} - -pi_result _pi_event::reset() { - Queue = nullptr; - CleanedUp = false; - Completed = false; - CommandData = nullptr; - CommandType = PI_COMMAND_TYPE_USER; - WaitList = {}; - RefCountExternal = 0; - RefCount.reset(); - CommandList = std::nullopt; - - if (!isHostVisible()) - HostVisibleEvent = nullptr; - - ZE_CALL(zeEventHostReset, (ZeEvent)); - return PI_SUCCESS; -} - -pi_event _pi_context::getEventFromContextCache(bool HostVisible, - bool WithProfiling) { - std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling); - if (Cache->empty()) - return nullptr; - - auto It = Cache->begin(); - pi_event Event = *It; - Cache->erase(It); - // We have to reset event before using it. 
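The context keeps completed events in per-kind caches so they can be recycled instead of recreated. A rough sketch of that bucketing, keyed by host visibility and profiling as above; the names are invented and the real cache additionally resets the underlying Level Zero event before reuse:

#include <array>
#include <mutex>
#include <vector>

struct Event; // opaque stand-in for the sketch

class EventCache {
  // Four buckets: {host-visible, profiling} x {yes, no}.
  std::array<std::vector<Event *>, 4> Buckets;
  std::mutex Mtx;

  static size_t index(bool HostVisible, bool WithProfiling) {
    return (HostVisible ? 2 : 0) + (WithProfiling ? 1 : 0);
  }

public:
  Event *take(bool HostVisible, bool WithProfiling) {
    std::scoped_lock Lock(Mtx);
    auto &B = Buckets[index(HostVisible, WithProfiling)];
    if (B.empty())
      return nullptr; // caller falls back to creating a fresh event
    Event *E = B.back();
    B.pop_back();
    return E; // caller must reset the event before reuse
  }

  void put(Event *E, bool HostVisible, bool WithProfiling) {
    std::scoped_lock Lock(Mtx);
    Buckets[index(HostVisible, WithProfiling)].push_back(E);
  }
};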
- Event->reset(); - return Event; -} - -void _pi_context::addEventToContextCache(pi_event Event) { - std::scoped_lock Lock(EventCacheMutex); - auto Cache = - getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); - Cache->emplace_back(Event); -} - -// Helper function for creating a PI event. -// The "Queue" argument specifies the PI queue where a command is submitted. -// The "HostVisible" argument specifies if event needs to be allocated from -// a host-visible pool. -// -static pi_result EventCreate(pi_context Context, pi_queue Queue, - bool HostVisible, pi_event *RetEvent) { - bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; - - if (auto CachedEvent = - Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { - *RetEvent = CachedEvent; - return PI_SUCCESS; - } - - ze_event_handle_t ZeEvent; - ze_event_pool_handle_t ZeEventPool = {}; - - size_t Index = 0; - - if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled)) - return Res; - - ZeStruct ZeEventDesc; - ZeEventDesc.index = Index; - ZeEventDesc.wait = 0; - - if (HostVisible) { - ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; - } else { - // - // Set the scope to "device" for every event. This is sufficient for - // global device access and peer device access. If needed to be seen on - // the host we are doing special handling, see EventsScope options. - // - // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be - // used in some circumstances. - // - ZeEventDesc.signal = 0; - } - - ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent)); - - try { - PI_ASSERT(RetEvent, PI_ERROR_INVALID_VALUE); - - *RetEvent = new _pi_event(ZeEvent, ZeEventPool, Context, - PI_COMMAND_TYPE_USER, true); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - if (HostVisible) - (*RetEvent)->HostVisibleEvent = *RetEvent; - - return PI_SUCCESS; -} // External PI API entry pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { - pi_result Result = EventCreate(Context, nullptr, true, RetEvent); - (*RetEvent)->RefCountExternal++; - if (Result != PI_SUCCESS) - return Result; - ZE_CALL(zeEventHostSignal, ((*RetEvent)->ZeEvent)); - return PI_SUCCESS; + return pi2ur::piEventCreate(Context, RetEvent); } pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_EVENT_INFO_COMMAND_QUEUE: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(pi_queue{Event->Queue}); - } - case PI_EVENT_INFO_CONTEXT: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(pi_context{Event->Context}); - } - case PI_EVENT_INFO_COMMAND_TYPE: { - std::shared_lock EventLock(Event->Mutex); - return ReturnValue(ur_cast(Event->CommandType)); - } - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - // Check to see if the event's Queue has an open command list due to - // batching. If so, go ahead and close and submit it, because it is - // possible that this is trying to query some event's status that - // is part of the batch. This isn't strictly required, but it seems - // like a reasonable thing to do. - auto Queue = Event->Queue; - if (Queue) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock lock(Queue->Mutex); - const auto &OpenCommandList = Queue->eventOpenCommandList(Event); - if (OpenCommandList != Queue->CommandListMap.end()) { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - - // Level Zero has a much more explicit notion of command submission than - // OpenCL. It doesn't happen unless the user submits a command list. We've - // done it just above so the status is at least PI_EVENT_SUBMITTED. - // - // NOTE: We currently cannot tell if command is currently running, so - // it will always show up "submitted" before it is finally "completed". - // - pi_int32 Result = PI_EVENT_SUBMITTED; - - // Make sure that we query a host-visible event only. - // If one wasn't yet created then don't create it here as well, and - // just conservatively return that event is not yet completed. - std::shared_lock EventLock(Event->Mutex); - auto HostVisibleEvent = Event->HostVisibleEvent; - if (Event->Completed) { - Result = PI_EVENT_COMPLETE; - } else if (HostVisibleEvent) { - ze_result_t ZeResult; - ZeResult = - ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent)); - if (ZeResult == ZE_RESULT_SUCCESS) { - Result = PI_EVENT_COMPLETE; - } - } - return ReturnValue(ur_cast(Result)); - } - case PI_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(pi_uint32{Event->RefCount.load()}); - default: - urPrint("Unsupported ParamName in piEventGetInfo: ParamName=%d(%x)\n", - ParamName, ParamName); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; + return pi2ur::piEventGetInfo(Event, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - std::shared_lock EventLock(Event->Mutex); - if (Event->Queue && - (Event->Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - pi_device Device = - Event->Queue ? Event->Queue->Device : Event->Context->Devices[0]; - - uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; - const uint64_t TimestampMaxValue = - ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - - ze_kernel_timestamp_result_t tsResult; - - switch (ParamName) { - case PI_PROFILING_INFO_COMMAND_START: { - ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); - uint64_t ContextStartTime = - (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution; - return ReturnValue(ContextStartTime); - } - case PI_PROFILING_INFO_COMMAND_END: { - ZE_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); - - uint64_t ContextStartTime = - (tsResult.global.kernelStart & TimestampMaxValue); - uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue); - - // - // Handle a possible wrap-around (the underlying HW counter is < 64-bit). - // Note, it will not report correct time if there were multiple wrap - // arounds, and the longer term plan is to enlarge the capacity of the - // HW timestamps. 
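Because the hardware timestamp counter is narrower than 64 bits, the end value can wrap past the start value; the correction above adds the counter's maximum value once before scaling by the timer resolution. A short worked example of that arithmetic, with assumed valid-bit width and resolution values:

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed device properties, for the example only.
  const uint32_t ValidBits = 32;                     // kernelTimestampValidBits
  const uint64_t TimerResolution = 83;               // ns per tick (assumed)
  const uint64_t MaxValue = (1ULL << ValidBits) - 1; // counter mask

  uint64_t Start = 0xFFFFFF00u & MaxValue; // counter just before the wrap
  uint64_t End = 0x00000100u & MaxValue;   // counter after the wrap

  if (End <= Start) // single wrap-around: shift the end value past the mask
    End += MaxValue;

  std::printf("duration = %llu ns\n",
              (unsigned long long)((End - Start) * TimerResolution));
  return 0;
}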
- // - if (ContextEndTime <= ContextStartTime) { - ContextEndTime += TimestampMaxValue; - } - ContextEndTime *= ZeTimerResolution; - return ReturnValue(ContextEndTime); - } - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No users for this case - // The "command_submit" time is implemented by recording submission - // timestamp with a call to piGetDeviceAndHostTimer before command enqueue. - // - return ReturnValue(uint64_t{0}); - default: - urPrint("piEventGetProfilingInfo: not supported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - - return PI_SUCCESS; -} - -} // extern "C" - -// Perform any necessary cleanup after an event has been signalled. -// This currently makes sure to release any kernel that may have been used by -// the event, updates the last command event in the queue and cleans up all dep -// events of the event. -// If the caller locks queue mutex then it must pass 'true' to QueueLocked. -static pi_result CleanupCompletedEvent(pi_event Event, bool QueueLocked) { - pi_kernel AssociatedKernel = nullptr; - // List of dependent events. - std::list EventsToBeReleased; - pi_queue AssociatedQueue = nullptr; - { - std::scoped_lock EventLock(Event->Mutex); - // Exit early of event was already cleanedup. - if (Event->CleanedUp) - return PI_SUCCESS; - - AssociatedQueue = Event->Queue; - - // Remember the kernel associated with this event if there is one. We are - // going to release it later. - if (Event->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && - Event->CommandData) { - AssociatedKernel = ur_cast(Event->CommandData); - Event->CommandData = nullptr; - } - - // Make a list of all the dependent events that must have signalled - // because this event was dependent on them. - Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - - Event->CleanedUp = true; - } - - auto ReleaseIndirectMem = [](pi_kernel Kernel) { - if (IndirectAccessTrackingEnabled) { - // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as - // kernel execution has finished. This is the place where we need to - // release memory allocations. If kernel is not in use (not submitted by - // some other thread) then release referenced memory allocations. As a - // result, memory can be deallocated and context can be removed from - // container in the platform. That's why we need to lock a mutex here. - pi_platform Plt = Kernel->Program->Context->getPlatform(); - std::scoped_lock ContextsLock(Plt->ContextsMutex); - - if (--Kernel->SubmissionsCount == 0) { - // Kernel is not submitted for execution, release referenced memory - // allocations. - for (auto &MemAlloc : Kernel->MemAllocs) { - // std::pair *, Hash - USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, - MemAlloc->second.OwnZeMemHandle); - } - Kernel->MemAllocs.clear(); - } - } - }; - - // We've reset event data members above, now cleanup resources. - if (AssociatedKernel) { - ReleaseIndirectMem(AssociatedKernel); - PI_CALL(piKernelRelease(AssociatedKernel)); - } - - if (AssociatedQueue) { - { - // Lock automatically releases when this goes out of scope. - std::unique_lock QueueLock(AssociatedQueue->Mutex, - std::defer_lock); - if (!QueueLocked) - QueueLock.lock(); - - // If this event was the LastCommandEvent in the queue, being used - // to make sure that commands were executed in-order, remove this. - // If we don't do this, the event can get released and freed leaving - // a dangling pointer to this event. 
It could also cause unneeded - // already finished events to show up in the wait list. - if (AssociatedQueue->LastCommandEvent == Event) { - AssociatedQueue->LastCommandEvent = nullptr; - } - } - - // Release this event since we explicitly retained it on creation and - // association with queue. Events which don't have associated queue doesn't - // require this release because it means that they are not created using - // createEventAndAssociateQueue, i.e. additional retain was not made. - PI_CALL(piEventReleaseInternal(Event)); - } - - // The list of dependent events will be appended to as we walk it so that this - // algorithm doesn't go recursive due to dependent events themselves being - // dependent on other events forming a potentially very deep tree, and deep - // recursion. That turned out to be a significant problem with the recursive - // code that preceded this implementation. - while (!EventsToBeReleased.empty()) { - pi_event DepEvent = EventsToBeReleased.front(); - DepEvent->Completed = true; - EventsToBeReleased.pop_front(); - - pi_kernel DepEventKernel = nullptr; - { - std::scoped_lock DepEventLock(DepEvent->Mutex); - DepEvent->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( - EventsToBeReleased); - if (IndirectAccessTrackingEnabled) { - // DepEvent has finished, we can release the associated kernel if there - // is one. This is the earliest place we can do this and it can't be - // done twice, so it is safe. Lock automatically releases when this goes - // out of scope. - // TODO: this code needs to be moved out of the guard. - if (DepEvent->CommandType == PI_COMMAND_TYPE_NDRANGE_KERNEL && - DepEvent->CommandData) { - DepEventKernel = ur_cast(DepEvent->CommandData); - DepEvent->CommandData = nullptr; - } - } - } - if (DepEventKernel) { - ReleaseIndirectMem(DepEventKernel); - PI_CALL(piKernelRelease(DepEventKernel)); - } - PI_CALL(piEventReleaseInternal(DepEvent)); - } - - return PI_SUCCESS; -} - -extern "C" { + return pi2ur::piEventGetProfilingInfo(Event, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) { - - if (NumEvents && !EventList) { - return PI_ERROR_INVALID_EVENT; - } - for (uint32_t I = 0; I < NumEvents; I++) { - if (EventList[I]->Queue->Device->ZeEventsScope == - OnDemandHostVisibleProxy) { - // Make sure to add all host-visible "proxy" event signals if needed. - // This ensures that all signalling commands are submitted below and - // thus proxy events can be waited without a deadlock. - // - if (!EventList[I]->hasExternalRefs()) - die("piEventsWait must not be called for an internal event"); - - ze_event_handle_t ZeHostVisibleEvent; - if (auto Res = - EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) - return Res; - } - } - // Submit dependent open command lists for execution, if any - for (uint32_t I = 0; I < NumEvents; I++) { - auto Queue = EventList[I]->Queue; - if (Queue) { - // Lock automatically releases when this goes out of scope. 
- std::scoped_lock lock(Queue->Mutex); - - if (auto Res = Queue->executeAllOpenCommandLists()) - return Res; - } - } - std::unordered_set Queues; - for (uint32_t I = 0; I < NumEvents; I++) { - { - { - std::shared_lock EventLock(EventList[I]->Mutex); - if (!EventList[I]->hasExternalRefs()) - die("piEventsWait must not be called for an internal event"); - - if (!EventList[I]->Completed) { - auto HostVisibleEvent = EventList[I]->HostVisibleEvent; - if (!HostVisibleEvent) - die("The host-visible proxy event missing"); - - ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; - urPrint("ZeEvent = %#llx\n", ur_cast(ZeEvent)); - ZE_CALL(zeHostSynchronize, (ZeEvent)); - EventList[I]->Completed = true; - } - } - if (auto Q = EventList[I]->Queue) { - if (Q->UsingImmCmdLists && Q->isInOrderQueue()) - // Use information about waited event to cleanup completed events in - // the in-order queue. - CleanupEventsInImmCmdLists(EventList[I]->Queue, - /* QueueLocked */ false, - /* QueueSynced */ false, EventList[I]); - else { - // NOTE: we are cleaning up after the event here to free resources - // sooner in case run-time is not calling piEventRelease soon enough. - CleanupCompletedEvent(EventList[I]); - // For the case when we have out-of-order queue or regular command - // lists its more efficient to check fences so put the queue in the - // set to cleanup later. - Queues.insert(Q); - } - } - } - } - - // We waited some events above, check queue for signaled command lists and - // reset them. - for (auto &Q : Queues) { - std::unique_lock Lock(Q->Mutex); - resetCommandLists(Q); - } - return PI_SUCCESS; + return pi2ur::piEventsWait(NumEvents, EventList); } pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, @@ -5390,152 +482,32 @@ pi_result piEventSetCallback(pi_event Event, pi_int32 CommandExecCallbackType, pi_int32 EventCommandStatus, void *UserData), void *UserData) { - (void)Event; - (void)CommandExecCallbackType; - (void)PFnNotify; - (void)UserData; - die("piEventSetCallback: deprecated, to be removed"); - return PI_SUCCESS; + return pi2ur::piEventSetCallback(Event, CommandExecCallbackType, PFnNotify, + UserData); } pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { - (void)Event; - (void)ExecutionStatus; - die("piEventSetStatus: deprecated, to be removed"); - return PI_SUCCESS; + return pi2ur::piEventSetStatus(Event, ExecutionStatus); } -pi_result piEventRetain(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - Event->RefCountExternal++; - Event->RefCount.increment(); - return PI_SUCCESS; -} +pi_result piEventRetain(pi_event Event) { return pi2ur::piEventRetain(Event); } pi_result piEventRelease(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - Event->RefCountExternal--; - PI_CALL(piEventReleaseInternal(Event)); - return PI_SUCCESS; -} - -void _pi_queue::active_barriers::add(pi_event &Event) { - Event->RefCount.increment(); - Events.push_back(Event); -} - -pi_result _pi_queue::active_barriers::clear() { - for (const auto &Event : Events) - PI_CALL(piEventReleaseInternal(Event)); - Events.clear(); - return PI_SUCCESS; -} - -static pi_result piEventReleaseInternal(pi_event Event) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - - if (!Event->RefCount.decrementAndTest()) - return PI_SUCCESS; - - if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP && - Event->CommandData) { - // Free the memory allocated in the piEnqueueMemBufferMap. 
- if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData)) - return Res; - Event->CommandData = nullptr; - } - if (Event->OwnZeEvent) { - if (DisableEventsCaching) { - auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) - return Res; - } - } - // It is possible that host-visible event was never created. - // In case it was check if that's different from this same event - // and release a reference to it. - if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) { - // Decrement ref-count of the host-visible proxy event. - PI_CALL(piEventReleaseInternal(Event->HostVisibleEvent)); - } - - // Save pointer to the queue before deleting/resetting event. - // When we add an event to the cache we need to check whether profiling is - // enabled or not, so we access properties of the queue and that's why queue - // must released later. - auto Queue = Event->Queue; - if (DisableEventsCaching || !Event->OwnZeEvent) { - delete Event; - } else { - Event->Context->addEventToContextCache(Event); - } - - // We intentionally incremented the reference counter when an event is - // created so that we can avoid pi_queue is released before the associated - // pi_event is released. Here we have to decrement it so pi_queue - // can be released successfully. - if (Queue) { - PI_CALL(piQueueReleaseInternal(Queue)); - } - - return PI_SUCCESS; + return pi2ur::piEventRelease(Event); } pi_result piextEventGetNativeHandle(pi_event Event, pi_native_handle *NativeHandle) { - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - { - std::shared_lock Lock(Event->Mutex); - auto *ZeEvent = ur_cast(NativeHandle); - *ZeEvent = Event->ZeEvent; - } - // Event can potentially be in an open command-list, make sure that - // it is submitted for execution to avoid potential deadlock if - // interop app is going to wait for it. - auto Queue = Event->Queue; - if (Queue) { - std::scoped_lock lock(Queue->Mutex); - const auto &OpenCommandList = Queue->eventOpenCommandList(Event); - if (OpenCommandList != Queue->CommandListMap.end()) { - if (auto Res = Queue->executeOpenCommandList( - OpenCommandList->second.isCopy(Queue))) - return Res; - } - } - return PI_SUCCESS; + return pi2ur::piextEventGetNativeHandle(Event, NativeHandle); } pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, pi_event *Event) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - - auto ZeEvent = ur_cast(NativeHandle); - *Event = new _pi_event(ZeEvent, nullptr /* ZeEventPool */, Context, - PI_COMMAND_TYPE_USER, OwnNativeHandle); - - // Assume native event is host-visible, or otherwise we'd - // need to create a host-visible proxy for it. - (*Event)->HostVisibleEvent = *Event; - - // Unlike regular events managed by SYCL RT we don't have to wait for interop - // events completion, and not need to do the their `cleanup()`. This in - // particular guarantees that the extra `piEventRelease` is not called on - // them. That release is needed to match the `piEventRetain` of regular events - // made for waiting for event completion, but not this interop event. 
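For completeness, a hedged usage sketch of wrapping an existing ze_event_handle_t through piextEventCreateWithNativeHandle as declared above. It assumes the PI and Level Zero headers are in scope (includes omitted) and that Ctx and ZeEvent were created elsewhere, so it is a shape-only fragment rather than a complete program:

// Shape-only sketch: wrap an existing Level Zero event as a pi_event.
pi_event wrapNativeEvent(pi_context Ctx, ze_event_handle_t ZeEvent) {
  pi_event Event = nullptr;
  auto Native = reinterpret_cast<pi_native_handle>(ZeEvent);
  // OwnNativeHandle = false: the caller remains responsible for destroying
  // ZeEvent; releasing the pi_event will not destroy the native handle.
  if (piextEventCreateWithNativeHandle(Native, Ctx, /*OwnNativeHandle=*/false,
                                       &Event) != PI_SUCCESS)
    return nullptr;
  // Interop events are treated as host-visible and already "cleaned up", so
  // no extra retain/release bookkeeping is done for them.
  return Event;
}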
- (*Event)->CleanedUp = true; - - return PI_SUCCESS; + return pi2ur::piextEventCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Event); } // @@ -5544,167 +516,23 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, pi_result piSamplerCreate(pi_context Context, const pi_sampler_properties *SamplerProperties, pi_sampler *RetSampler) { - - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE); - - std::shared_lock Lock(Context->Mutex); - - // Have the "0" device in context to own the sampler. Rely on Level-Zero - // drivers to perform migration as necessary for sharing it across multiple - // devices in the context. - // - // TODO: figure out if we instead need explicit copying for acessing - // the sampler from other devices in the context. - // - pi_device Device = Context->Devices[0]; - - ze_sampler_handle_t ZeSampler; - ZeStruct ZeSamplerDesc; - - // Set the default values for the ZeSamplerDesc. - ZeSamplerDesc.isNormalized = PI_TRUE; - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - - // Update the values of the ZeSamplerDesc from the pi_sampler_properties list. - // Default values will be used if any of the following is true: - // a) SamplerProperties list is NULL - // b) SamplerProperties list is missing any properties - - if (SamplerProperties) { - const pi_sampler_properties *CurProperty = SamplerProperties; - - while (*CurProperty != 0) { - switch (*CurProperty) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - pi_bool CurValueBool = ur_cast(*(++CurProperty)); - - if (CurValueBool == PI_TRUE) - ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == PI_FALSE) - ZeSamplerDesc.isNormalized = PI_FALSE; - else { - urPrint("piSamplerCreate: unsupported " - "PI_SAMPLER_NORMALIZED_COORDS value\n"); - return PI_ERROR_INVALID_VALUE; - } - } break; - - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - pi_sampler_addressing_mode CurValueAddressingMode = - ur_cast( - ur_cast(*(++CurProperty))); - - // Level Zero runtime with API version 1.2 and lower has a bug: - // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to - // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to - // border", i.e. logic is flipped. Starting from API version 1.3 this - // problem is going to be fixed. That's why check for API version to set - // an address mode. - ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; - // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - switch (CurValueAddressingMode) { - case PI_SAMPLER_ADDRESSING_MODE_NONE: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; - break; - case PI_SAMPLER_ADDRESSING_MODE_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; - break; - case PI_SAMPLER_ADDRESSING_MODE_CLAMP: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP - : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - break; - case PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? 
ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - : ZE_SAMPLER_ADDRESS_MODE_CLAMP; - break; - case PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; - break; - default: - urPrint("piSamplerCreate: unsupported PI_SAMPLER_ADDRESSING_MODE " - "value\n"); - urPrint("PI_SAMPLER_ADDRESSING_MODE=%d\n", CurValueAddressingMode); - return PI_ERROR_INVALID_VALUE; - } - } break; - - case PI_SAMPLER_PROPERTIES_FILTER_MODE: { - pi_sampler_filter_mode CurValueFilterMode = - ur_cast( - ur_cast(*(++CurProperty))); - - if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; - else { - urPrint("PI_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode); - urPrint( - "piSamplerCreate: unsupported PI_SAMPLER_FILTER_MODE value\n"); - return PI_ERROR_INVALID_VALUE; - } - } break; - - default: - break; - } - CurProperty++; - } - } - - ZE_CALL(zeSamplerCreate, (Context->ZeContext, Device->ZeDevice, - &ZeSamplerDesc, // TODO: translate properties - &ZeSampler)); - - try { - *RetSampler = new _pi_sampler(ZeSampler); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piSamplerCreate(Context, SamplerProperties, RetSampler); } pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Sampler; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - die("piSamplerGetInfo: not implemented"); - return {}; + return pi2ur::piSamplerGetInfo(Sampler, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } pi_result piSamplerRetain(pi_sampler Sampler) { - PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); - - Sampler->RefCount.increment(); - return PI_SUCCESS; + return pi2ur::piSamplerRetain(Sampler); } pi_result piSamplerRelease(pi_sampler Sampler) { - PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); - - if (!Sampler->RefCount.decrementAndTest()) - return PI_SUCCESS; - - auto ZeResult = ZE_CALL_NOCHECK(zeSamplerDestroy, (Sampler->ZeSampler)); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return mapError(ZeResult); - - delete Sampler; - return PI_SUCCESS; + return pi2ur::piSamplerRelease(Sampler); } // @@ -5714,302 +542,17 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - if (EventWaitList) { - PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE); - - bool UseCopyEngine = false; - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList = {}; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &WaitList = (*Event)->WaitList; - auto ZeCommandList = CommandList->first; - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - - ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - // Execute command list asynchronously as the event will be used - // to track down its completion. - return Queue->executeCommandList(CommandList); - } - - { - // If wait-list is empty, then this particular command should wait until - // all previous enqueued commands to the command-queue have completed. - // - // TODO: find a way to do that without blocking the host. - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - if (OutEvent) { - auto Res = createEventAndAssociateQueue( - Queue, OutEvent, PI_COMMAND_TYPE_USER, Queue->CommandListMap.end(), - /* IsInternal */ false); - if (Res != PI_SUCCESS) - return Res; - } - - Queue->synchronize(); - - if (OutEvent) { - Queue->LastCommandEvent = *OutEvent; - - ZE_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent)); - (*OutEvent)->Completed = true; - } - } - - if (!Queue->UsingImmCmdLists) { - std::unique_lock Lock(Queue->Mutex); - resetCommandLists(Queue); - } - - return PI_SUCCESS; + return pi2ur::piEnqueueEventsWait(Queue, NumEventsInWaitList, EventWaitList, + OutEvent); } pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - // Helper function for appending a barrier to a command list. - auto insertBarrierIntoCmdList = [&Queue]( - pi_command_list_ptr_t CmdList, - const _pi_ze_event_list_t &EventWaitList, - pi_event &Event, bool IsInternal) { - // For in-order queue and empty wait-list just use the last command - // event as the barrier event. - if (Queue->isInOrderQueue() && !EventWaitList.Length && - Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { - PI_CALL(piEventRetain(Queue->LastCommandEvent)); - Event = Queue->LastCommandEvent; - return PI_SUCCESS; - } - - if (auto Res = createEventAndAssociateQueue( - Queue, &Event, PI_COMMAND_TYPE_USER, CmdList, IsInternal)) - return Res; - - Event->WaitList = EventWaitList; - - // For in-order queue we don't need a real barrier, just wait for requested - // events in potentially different queues and add a "barrier" event signal - // because it is already guaranteed that previous commands in this queue - // are completed when the signal is started. - // - // TODO: this and other special handling of in-order queues to be - // updated when/if Level Zero adds native support for in-order queues. 
- // - if (Queue->isInOrderQueue() && InOrderBarrierBySignal) { - if (EventWaitList.Length) { - ZE_CALL( - zeCommandListAppendWaitOnEvents, - (CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList)); - } - ZE_CALL(zeCommandListAppendSignalEvent, (CmdList->first, Event->ZeEvent)); - } else { - ZE_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); - } - return PI_SUCCESS; - }; - - // If the queue is in-order then each command in it effectively acts as a - // barrier, so we don't need to do anything except if we were requested - // a "barrier" event to be created. Or if we need to wait for events in - // potentially different queues. - // - if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { - return PI_SUCCESS; - } - - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - - // Indicator for whether batching is allowed. This may be changed later in - // this function, but allow it by default. - bool OkToBatch = true; - - // If we have a list of events to make the barrier from, then we can create a - // barrier on these and use the resulting event as our future barrier. - // We use the same approach if - // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a - // positive value. - // We use the same approach if we have in-order queue because every command - // depends on previous one, so we don't need to insert barrier to multiple - // command lists. - if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || - Queue->isInOrderQueue()) { - // Retain the events as they will be owned by the result event. - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, - /*UseCopyEngine=*/false)) - return Res; - - // Get an arbitrary command-list in the queue. - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, - /*UseCopyEngine=*/false, OkToBatch)) - return Res; - - // Insert the barrier into the command-list and execute. - if (auto Res = - insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)) - return Res; - - if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) - return Res; - - // Because of the dependency between commands in the in-order queue we don't - // need to keep track of any active barriers if we have in-order queue. - if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { - Queue->ActiveBarriers.add(*Event); - } - return PI_SUCCESS; - } - // Since there are no events to explicitly create a barrier for, we are - // inserting a queue-wide barrier. - - // Command list(s) for putting barriers. - std::vector CmdLists; - - // There must be at least one L0 queue. - auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); - auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); - PI_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), - PI_ERROR_INVALID_QUEUE); - - size_t NumQueues = 0; - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) - NumQueues += QueueGroup.second.ZeQueues.size(); - - OkToBatch = true; - // Get an available command list tied to each command queue. We need - // these so a queue-wide barrier can be inserted into each command - // queue. 
- CmdLists.reserve(NumQueues); - for (auto &QueueMap : - {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) { - bool UseCopyEngine = - QueueGroup.second.Type != _pi_queue::queue_type::Compute; - if (Queue->UsingImmCmdLists) { - // If immediate command lists are being used, each will act as their own - // queue, so we must insert a barrier into each. - for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) - if (ImmCmdList != Queue->CommandListMap.end()) - CmdLists.push_back(ImmCmdList); - } else { - for (auto ZeQueue : QueueGroup.second.ZeQueues) { - if (ZeQueue) { - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)) - return Res; - CmdLists.push_back(CmdList); - } - } - } - } - - // If no activity has occurred on the queue then there will be no cmdlists. - // We need one for generating an Event, so create one. - if (CmdLists.size() == 0) { - // Get any available command list. - pi_command_list_ptr_t CmdList; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CmdList, - /*UseCopyEngine=*/false, OkToBatch)) - return Res; - CmdLists.push_back(CmdList); - } - - if (CmdLists.size() > 1) { - // Insert a barrier into each unique command queue using the available - // command-lists. - std::vector EventWaitVector(CmdLists.size()); - for (size_t I = 0; I < CmdLists.size(); ++I) { - if (auto Res = - insertBarrierIntoCmdList(CmdLists[I], _pi_ze_event_list_t{}, - EventWaitVector[I], /*IsInternal*/ true)) - return Res; - } - // If there were multiple queues we need to create a "convergence" event to - // be our active barrier. This convergence event is signalled by a barrier - // on all the events from the barriers we have inserted into each queue. - // Use the first command list as our convergence command list. - pi_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; - - // Create an event list. It will take ownership over all relevant events so - // we relinquish ownership and let it keep all events it needs. - _pi_ze_event_list_t BaseWaitList; - if (auto Res = BaseWaitList.createAndRetainPiZeEventList( - EventWaitVector.size(), EventWaitVector.data(), Queue, - ConvergenceCmdList->second.isCopy(Queue))) - return Res; - - // Insert a barrier with the events from each command-queue into the - // convergence command list. The resulting event signals the convergence of - // all barriers. - if (auto Res = insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, - *Event, IsInternal)) - return Res; - } else { - // If there is only a single queue then insert a barrier and the single - // result event can be used as our active barrier and used as the return - // event. Take into account whether output event is discarded or not. - if (auto Res = insertBarrierIntoCmdList(CmdLists[0], _pi_ze_event_list_t{}, - *Event, IsInternal)) - return Res; - } - - // Execute each command list so the barriers can be encountered. 
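When barriers must cover several underlying Level Zero queues, the pattern above is two-level: append one barrier per command list, then append a single "convergence" barrier that waits on all of the per-queue events. A toy, self-contained sketch of just that structure, with an invented Append callable standing in for zeCommandListAppendBarrier:

#include <cstdio>
#include <functional>
#include <vector>

// Model: a "barrier" returns an event id and conceptually waits on the given
// events first. The callable is a stand-in for recording a real L0 barrier.
using EventId = int;
using AppendBarrier =
    std::function<EventId(int CmdListIndex, const std::vector<EventId> &Wait)>;

// Two-level pattern: one barrier per command list, then a single
// convergence barrier in list 0 that waits on all of them.
EventId makeConvergenceBarrier(int NumCmdLists, const AppendBarrier &Append) {
  if (NumCmdLists == 1)
    return Append(0, {}); // one queue: a single barrier suffices

  std::vector<EventId> PerQueue;
  PerQueue.reserve(NumCmdLists);
  for (int I = 0; I < NumCmdLists; ++I) // 1) barrier in every command list
    PerQueue.push_back(Append(I, {}));

  return Append(0, PerQueue); // 2) convergence barrier in list 0
}

int main() {
  int NextEvent = 0;
  // Toy Append that only hands out event ids; a real implementation would
  // record a Level Zero barrier into the chosen command list.
  AppendBarrier Append = [&](int, const std::vector<EventId> &) {
    return NextEvent++;
  };
  EventId Converged = makeConvergenceBarrier(3, Append);
  std::printf("convergence event id = %d\n", Converged); // prints 3
  return 0;
}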
- for (pi_command_list_ptr_t &CmdList : CmdLists) - if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch)) - return Res; - - if (auto Res = Queue->ActiveBarriers.clear()) - return Res; - Queue->ActiveBarriers.add(*Event); - return PI_SUCCESS; + return pi2ur::piEnqueueEventsWaitWithBarrier(Queue, NumEventsInWaitList, + EventWaitList, OutEvent); } pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, @@ -6018,19 +561,10 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Src->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - char *ZeHandleSrc; - PI_CALL(Src->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_READ, Queue, Dst, - BlockingRead, Size, ZeHandleSrc + Offset, - NumEventsInWaitList, EventWaitList, Event, - /* PreferCopyEngine */ true); + return pi2ur::piEnqueueMemBufferRead(Queue, Src, BlockingRead, Offset, Size, + Dst, NumEventsInWaitList, EventWaitList, + Event); } pi_result piEnqueueMemBufferReadRect( @@ -6041,255 +575,12 @@ pi_result piEnqueueMemBufferReadRect( pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Buffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, - static_cast(Ptr), BufferOffset, HostOffset, Region, - BufferRowPitch, HostRowPitch, BufferSlicePitch, HostSlicePitch, - BlockingRead, NumEventsInWaitList, EventWaitList, Event); -} - -} // extern "C" - -bool _pi_queue::useCopyEngine(bool PreferCopyEngine) const { - auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second; - return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 && - (!isInOrderQueue() || UseCopyEngineForInOrderQueue); -} - -// Wait on all operations in flight on this Queue. -// The caller is expected to hold a lock on the Queue. -// For standard commandlists sync the L0 queues directly. -// For immediate commandlists add barriers to all commandlists associated -// with the Queue. An alternative approach would be to wait on all Events -// associated with the in-flight operations. -// TODO: Event release in immediate commandlist mode is driven by the SYCL -// runtime. Need to investigate whether relase can be done earlier, at sync -// points such as this, to reduce total number of active Events. 
-pi_result _pi_queue::synchronize() { - if (!Healthy) - return PI_SUCCESS; - - auto syncImmCmdList = [](_pi_queue *Queue, pi_command_list_ptr_t ImmCmdList) { - if (ImmCmdList == Queue->CommandListMap.end()) - return PI_SUCCESS; - - pi_event Event; - pi_result Res = - createEventAndAssociateQueue(Queue, &Event, PI_COMMAND_TYPE_USER, - ImmCmdList, /* IsInternal */ false); - if (Res != PI_SUCCESS) - return Res; - auto zeEvent = Event->ZeEvent; - ZE_CALL(zeCommandListAppendBarrier, - (ImmCmdList->first, zeEvent, 0, nullptr)); - ZE_CALL(zeHostSynchronize, (zeEvent)); - Event->Completed = true; - PI_CALL(piEventRelease(Event)); - - // Cleanup all events from the synced command list. - auto EventListToCleanup = std::move(ImmCmdList->second.EventList); - ImmCmdList->second.EventList.clear(); - CleanupEventListFromResetCmdList(EventListToCleanup, true); - return PI_SUCCESS; - }; - - if (LastCommandEvent) { - // For in-order queue just wait for the last command. - // If event is discarded then it can be in reset state or underlying level - // zero handle can have device scope, so we can't synchronize the last - // event. - if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - ZE_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); - } else { - // Otherwise sync all L0 queues/immediate command-lists. - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { - for (auto &QueueGroup : QueueMap) { - if (UsingImmCmdLists) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE_CALL(zeHostSynchronize, (ZeQueue)); - } - } - } - } - LastCommandEvent = nullptr; - } - // With the entire queue synchronized, the active barriers must be done so we - // can remove them. - if (auto Res = ActiveBarriers.clear()) - return Res; - - return PI_SUCCESS; -} - -// Shared by all memory read/write/copy PI interfaces. -// PI interfaces must have queue's and destination buffer's mutexes locked for -// exclusive use and source buffer's mutex locked for shared use on entry. -static pi_result -enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, const void *Src, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent, - bool PreferCopyEngine) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - urPrint("calling zeCommandListAppendMemoryCopy() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length, - WaitList.ZeEventList)); - - if (auto Res = - Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)) - return Res; - - return PI_SUCCESS; -} - -// Shared by all memory read/write/copy rect PI interfaces. -// PI interfaces must have queue's and destination buffer's mutexes locked for -// exclusive use and source buffer's mutex locked for shared use on entry. -static pi_result enqueueMemCopyRectHelper( - pi_command_type CommandType, pi_queue Queue, const void *SrcBuffer, - void *DstBuffer, pi_buff_rect_offset SrcOrigin, - pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, - size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, - size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent, bool PreferCopyEngine) { - - PI_ASSERT(Region && SrcOrigin && DstOrigin && Queue, PI_ERROR_INVALID_VALUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - urPrint("calling zeCommandListAppendMemoryCopy() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - uint32_t SrcOriginX = ur_cast(SrcOrigin->x_bytes); - uint32_t SrcOriginY = ur_cast(SrcOrigin->y_scalar); - uint32_t SrcOriginZ = ur_cast(SrcOrigin->z_scalar); - - uint32_t SrcPitch = SrcRowPitch; - if (SrcPitch == 0) - SrcPitch = ur_cast(Region->width_bytes); - - if (SrcSlicePitch == 0) - SrcSlicePitch = ur_cast(Region->height_scalar) * SrcPitch; - - uint32_t DstOriginX = ur_cast(DstOrigin->x_bytes); - uint32_t DstOriginY = ur_cast(DstOrigin->y_scalar); - uint32_t DstOriginZ = ur_cast(DstOrigin->z_scalar); - - uint32_t DstPitch = DstRowPitch; - if (DstPitch == 0) - DstPitch = ur_cast(Region->width_bytes); - - if (DstSlicePitch == 0) - DstSlicePitch = ur_cast(Region->height_scalar) * DstPitch; - - uint32_t Width = ur_cast(Region->width_bytes); - uint32_t Height = ur_cast(Region->height_scalar); - uint32_t Depth = ur_cast(Region->depth_scalar); - - const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, - Width, Height, Depth}; - const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, - Width, Height, Depth}; - - ZE_CALL(zeCommandListAppendMemoryCopyRegion, - (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, - SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, - WaitList.Length, WaitList.ZeEventList)); - - urPrint("calling zeCommandListAppendMemoryCopyRegion()\n"); - - ZE_CALL(zeCommandListAppendBarrier, (ZeCommandList, ZeEvent, 0, nullptr)); - - urPrint("calling zeCommandListAppendBarrier() with Event %#llx\n", - ur_cast(ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, Blocking, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferReadRect( + Queue, Buffer, BlockingRead, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } -extern "C" { - pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, size_t Offset, size_t Size, const void *Ptr, @@ -6297,20 +588,9 @@ pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemCopyHelper(PI_COMMAND_TYPE_MEM_BUFFER_WRITE, Queue, - ZeHandleDst + Offset, // dst - BlockingWrite, Size, - Ptr, // src - NumEventsInWaitList, EventWaitList, Event, - /* PreferCopyEngine */ true); + return pi2ur::piEnqueueMemBufferWrite(Queue, Buffer, BlockingWrite, Offset, + Size, Ptr, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemBufferWriteRect( @@ -6321,20 +601,10 @@ pi_result piEnqueueMemBufferWriteRect( pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - 
Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, Queue, - const_cast(static_cast(Ptr)), ZeHandleDst, - HostOffset, BufferOffset, Region, HostRowPitch, BufferRowPitch, - HostSlicePitch, BufferSlicePitch, BlockingWrite, NumEventsInWaitList, - EventWaitList, Event); + return pi2ur::piEnqueueMemBufferWriteRect( + Queue, Buffer, BlockingWrite, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, @@ -6342,38 +612,10 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, size_t Size, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - PI_ASSERT(!SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(!DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); - - std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); - - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. - bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - char *ZeHandleSrc; - PI_CALL( - SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - char *ZeHandleDst; - PI_CALL( - DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - return enqueueMemCopyHelper( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset, - false, // blocking - Size, ZeHandleSrc + SrcOffset, NumEventsInWaitList, EventWaitList, Event, - PreferCopyEngine); + return pi2ur::piEnqueueMemBufferCopy(Queue, SrcMem, DstMem, SrcOffset, + DstOffset, Size, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemBufferCopyRect( @@ -6382,133 +624,13 @@ pi_result piEnqueueMemBufferCopyRect( size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(!DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); - - std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); - - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. 
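// A minimal sketch of the engine-selection rule stated in the two comment
// lines above, written as a free-standing predicate. The type and function
// names are assumptions made for illustration; the code that follows derives
// the same boolean inline from SrcBuffer and DstBuffer:
//
//   struct BufferLocation { bool OnHost; };
//   // Prefer a copy engine only when at least one side of the transfer is
//   // host-resident; device-to-device copies stay on the compute engine.
//   static bool preferCopyEngine(BufferLocation Src, BufferLocation Dst) {
//     return Src.OnHost || Dst.OnHost;
//   }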
- bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); - - char *ZeHandleSrc; - PI_CALL( - SrcBuffer->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - char *ZeHandleDst; - PI_CALL( - DstBuffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - return enqueueMemCopyRectHelper( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst, - SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch, - DstSlicePitch, - false, // blocking - NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); -} - -} // extern "C" - -// Default to using compute engine for fill operation, but allow to -// override this with an environment variable. -static bool PreferCopyEngine = [] { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); - return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); -}(); - -// PI interfaces must have queue's and buffer's mutexes locked on entry. -static pi_result -enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr, - const void *Pattern, size_t PatternSize, size_t Size, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - // Pattern size must be a power of two. - PI_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), - PI_ERROR_INVALID_VALUE); - - auto &Device = Queue->Device; - - // Make sure that pattern size matches the capability of the copy queues. - // Check both main and link groups as we don't known which one will be used. - // - if (PreferCopyEngine && Device->hasCopyEngine()) { - if (Device->hasMainCopyEngine() && - Device->QueueGroup[_pi_device::queue_group_info_t::MainCopy] - .ZeProperties.maxMemoryFillPatternSize < PatternSize) { - PreferCopyEngine = false; - } - if (Device->hasLinkCopyEngine() && - Device->QueueGroup[_pi_device::queue_group_info_t::LinkCopy] - .ZeProperties.maxMemoryFillPatternSize < PatternSize) { - PreferCopyEngine = false; - } - } - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - if (!UseCopyEngine) { - // Pattern size must fit the compute queue capabilities. - PI_ASSERT(PatternSize <= - Device->QueueGroup[_pi_device::queue_group_info_t::Compute] - .ZeProperties.maxMemoryFillPatternSize, - PI_ERROR_INVALID_VALUE); - } - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - pi_command_list_ptr_t CommandList{}; - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - ZE_CALL(zeCommandListAppendMemoryFill, - (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, - WaitList.Length, WaitList.ZeEventList)); - - urPrint("calling zeCommandListAppendMemoryFill() with\n" - " ZeEvent %#llx\n", - ur_cast(ZeEvent)); - printZeEventList(WaitList); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferCopyRect( + Queue, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumEventsInWaitList, + EventWaitList, Event); } -extern "C" { - pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, const void *Pattern, size_t PatternSize, size_t Offset, size_t Size, @@ -6516,502 +638,38 @@ pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - return enqueueMemFillHelper(PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue, - ZeHandleDst + Offset, Pattern, PatternSize, Size, - NumEventsInWaitList, EventWaitList, Event); + return pi2ur::piEnqueueMemBufferFill(Queue, Buffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + EventWaitList, Event); } -static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment); - pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent, void **RetMap) { - // TODO: we don't implement read-only or write-only, always read-write. - // assert((map_flags & PI_MAP_READ) != 0); - // assert((map_flags & PI_MAP_WRITE) != 0); - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Buffer = ur_cast(Mem); - - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - ze_event_handle_t ZeEvent = nullptr; - - bool UseCopyEngine = false; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - auto Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_MAP, - Queue->CommandListMap.end(), IsInternal); - if (Res != PI_SUCCESS) - return Res; - - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } - - // Translate the host access mode info. 
- _pi_mem::access_mode_t AccessMode = _pi_mem::unknown; - if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) - AccessMode = _pi_mem::write_only; - else { - if (MapFlags & PI_MAP_READ) { - AccessMode = _pi_mem::read_only; - if (MapFlags & PI_MAP_WRITE) - AccessMode = _pi_mem::read_write; - } else if (MapFlags & PI_MAP_WRITE) - AccessMode = _pi_mem::write_only; - } - PI_ASSERT(AccessMode != _pi_mem::unknown, PI_ERROR_INVALID_VALUE); - - // TODO: Level Zero is missing the memory "mapping" capabilities, so we are - // left to doing new memory allocation and a copy (read) on discrete devices. - // For integrated devices, we have allocated the buffer in host memory so no - // actions are needed here except for synchronizing on incoming events. - // A host-to-host copy is done if a host pointer had been supplied during - // buffer creation on integrated devices. - // - // TODO: for discrete, check if the input buffer is already allocated - // in shared memory and thus is accessible from the host as is. - // Can we get SYCL RT to predict/allocate in shared memory - // from the beginning? - - // For integrated devices the buffer has been allocated in host memory. - if (Buffer->OnHost) { - // Wait on incoming events before doing the copy - if (NumEventsInWaitList > 0) - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - - if (Queue->isInOrderQueue()) - PI_CALL(piQueueFinish(Queue)); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Guard(Buffer->Mutex); - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); - - if (Buffer->MapHostPtr) { - *RetMap = Buffer->MapHostPtr + Offset; - if (ZeHandleSrc != Buffer->MapHostPtr && - AccessMode != _pi_mem::write_only) { - memcpy(*RetMap, ZeHandleSrc + Offset, Size); - } - } else { - *RetMap = ZeHandleSrc + Offset; - } - - auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); - // False as the second value in pair means that mapping was not inserted - // because mapping already exists. - if (!Res.second) { - urPrint("piEnqueueMemBufferMap: duplicate mapping detected\n"); - return PI_ERROR_INVALID_VALUE; - } - - // Signal this event - ZE_CALL(zeEventHostSignal, (ZeEvent)); - (*Event)->Completed = true; - return PI_SUCCESS; - } - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - if (Buffer->MapHostPtr) { - *RetMap = Buffer->MapHostPtr + Offset; - } else { - // TODO: use USM host allocator here - // TODO: Do we even need every map to allocate new host memory? - // In the case when the buffer is "OnHost" we use single allocation. - if (auto Res = ZeHostMemAllocHelper(RetMap, Queue->Context, Size)) - return Res; - } - - // Take a shortcut if the host is not going to read buffer's data. - if (AccessMode == _pi_mem::write_only) { - (*Event)->Completed = true; - } else { - // For discrete devices we need a command list - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // Add the event to the command list. 
- CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - char *ZeHandleSrc; - PI_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, - WaitList.Length, WaitList.ZeEventList)); - - if (auto Res = Queue->executeCommandList(CommandList, BlockingMap)) - return Res; - } - - auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); - // False as the second value in pair means that mapping was not inserted - // because mapping already exists. - if (!Res.second) { - urPrint("piEnqueueMemBufferMap: duplicate mapping detected\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piEnqueueMemBufferMap(Queue, Mem, BlockingMap, MapFlags, Offset, + Size, NumEventsInWaitList, EventWaitList, + OutEvent, RetMap); } pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *OutEvent) { - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Buffer = ur_cast(Mem); - - bool UseCopyEngine = false; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - auto Res = createEventAndAssociateQueue( - Queue, Event, PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - Queue->CommandListMap.end(), IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - } - - _pi_buffer::Mapping MapInfo = {}; - { - // Lock automatically releases when this goes out of scope. - std::scoped_lock Guard(Buffer->Mutex); - auto It = Buffer->Mappings.find(MappedPtr); - if (It == Buffer->Mappings.end()) { - urPrint("piEnqueueMemUnmap: unknown memory mapping\n"); - return PI_ERROR_INVALID_VALUE; - } - MapInfo = It->second; - Buffer->Mappings.erase(It); - - // NOTE: we still have to free the host memory allocated/returned by - // piEnqueueMemBufferMap, but can only do so after the above copy - // is completed. Instead of waiting for It here (blocking), we shall - // do so in piEventRelease called for the pi_event tracking the unmap. - // In the case of an integrated device, the map operation does not allocate - // any memory, so there is nothing to free. This is indicated by a nullptr. - (*Event)->CommandData = - (Buffer->OnHost ? nullptr : (Buffer->MapHostPtr ? nullptr : MappedPtr)); - } - - // For integrated devices the buffer is allocated in host memory. 
- if (Buffer->OnHost) { - // Wait on incoming events before doing the copy - if (NumEventsInWaitList > 0) - PI_CALL(piEventsWait(NumEventsInWaitList, EventWaitList)); - - if (Queue->isInOrderQueue()) - PI_CALL(piQueueFinish(Queue)); - - char *ZeHandleDst; - PI_CALL( - Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - std::scoped_lock Guard(Buffer->Mutex); - if (Buffer->MapHostPtr) - memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size); - - // Signal this event - ZE_CALL(zeEventHostSignal, (ZeEvent)); - (*Event)->Completed = true; - return PI_SUCCESS; - } - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); - - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - CommandList->second.append(*Event); - (*Event)->RefCount.increment(); - - const auto &ZeCommandList = CommandList->first; - - // TODO: Level Zero is missing the memory "mapping" capabilities, so we are - // left to doing copy (write back to the device). - // - // NOTE: Keep this in sync with the implementation of - // piEnqueueMemBufferMap. - - char *ZeHandleDst; - PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - - ZE_CALL(zeCommandListAppendMemoryCopy, - (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size, - ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); - - // Execute command list asynchronously, as the event will be used - // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList)) - return Res; - - return PI_SUCCESS; + return pi2ur::piEnqueueMemUnmap(Queue, Mem, MappedPtr, NumEventsInWaitList, + EventWaitList, OutEvent); } pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - (void)Image; - (void)ParamName; - (void)ParamValueSize; - (void)ParamValue; - (void)ParamValueSizeRet; - - die("piMemImageGetInfo: not implemented"); - return {}; -} - -} // extern "C" - -static pi_result getImageRegionHelper(pi_mem Mem, pi_image_offset Origin, - pi_image_region Region, - ze_image_region_t &ZeRegion) { - - PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); - PI_ASSERT(Origin, PI_ERROR_INVALID_VALUE); - -#ifndef NDEBUG - PI_ASSERT(Mem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - auto Image = static_cast<_pi_image *>(Mem); - ze_image_desc_t &ZeImageDesc = Image->ZeImageDesc; - - PI_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && - Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin->z == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), - PI_ERROR_INVALID_VALUE); - - PI_ASSERT(Region->width && Region->height && Region->depth, - PI_ERROR_INVALID_VALUE); - PI_ASSERT( - (ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region->height == 1 && - Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region->depth == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), - PI_ERROR_INVALID_VALUE); -#endif // !NDEBUG - - uint32_t OriginX = ur_cast(Origin->x); - uint32_t OriginY = ur_cast(Origin->y); - uint32_t OriginZ = ur_cast(Origin->z); - - uint32_t Width = ur_cast(Region->width); - uint32_t Height = ur_cast(Region->height); - uint32_t Depth = ur_cast(Region->depth); - - ZeRegion = {OriginX, OriginY, OriginZ, 
Width, Height, Depth}; - - return PI_SUCCESS; -} - -// Helper function to implement image read/write/copy. -// PI interfaces must have queue's and destination image's mutexes locked for -// exclusive use and source image's mutex locked for shared use on entry. -static pi_result enqueueMemImageCommandHelper( - pi_command_type CommandType, pi_queue Queue, - const void *Src, // image or ptr - void *Dst, // image or ptr - pi_bool IsBlocking, pi_image_offset SrcOrigin, pi_image_offset DstOrigin, - pi_image_region Region, size_t RowPitch, size_t SlicePitch, - pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *OutEvent, bool PreferCopyEngine = false) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // We want to batch these commands to avoid extra submissions (costly) - bool OkToBatch = true; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - if (auto Res = Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, OkToBatch)) - return Res; - - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, CommandType, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - if (CommandType == PI_COMMAND_TYPE_IMAGE_READ) { - pi_mem SrcMem = ur_cast(const_cast(Src)); - - ze_image_region_t ZeSrcRegion; - auto Result = getImageRegionHelper(SrcMem, SrcOrigin, Region, ZeSrcRegion); - if (Result != PI_SUCCESS) - return Result; - - // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. - // Check that SYCL RT did not want pitch larger than default. - (void)RowPitch; - (void)SlicePitch; -#ifndef NDEBUG - PI_ASSERT(SrcMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - - auto SrcImage = static_cast<_pi_image *>(SrcMem); - const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; - PI_ASSERT( - RowPitch == 0 || - // special case RGBA image pitch equal to region's width - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && - RowPitch == 4 * 4 * ZeSrcRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && - RowPitch == 4 * 2 * ZeSrcRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && - RowPitch == 4 * ZeSrcRegion.width), - PI_ERROR_INVALID_IMAGE_SIZE); - PI_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, - PI_ERROR_INVALID_IMAGE_SIZE); -#endif // !NDEBUG - - char *ZeHandleSrc; - PI_CALL( - SrcMem->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyToMemory, - (ZeCommandList, Dst, ur_cast(ZeHandleSrc), - &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); - } else if (CommandType == PI_COMMAND_TYPE_IMAGE_WRITE) { - pi_mem DstMem = ur_cast(Dst); - ze_image_region_t ZeDstRegion; - auto Result = getImageRegionHelper(DstMem, DstOrigin, Region, ZeDstRegion); - if (Result != PI_SUCCESS) - return Result; - - // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. 
- // Check that SYCL RT did not want pitch larger than default. -#ifndef NDEBUG - PI_ASSERT(DstMem->isImage(), PI_ERROR_INVALID_MEM_OBJECT); - - auto DstImage = static_cast<_pi_image *>(DstMem); - const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; - PI_ASSERT( - RowPitch == 0 || - // special case RGBA image pitch equal to region's width - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && - RowPitch == 4 * 4 * ZeDstRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && - RowPitch == 4 * 2 * ZeDstRegion.width) || - (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && - RowPitch == 4 * ZeDstRegion.width), - PI_ERROR_INVALID_IMAGE_SIZE); - PI_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, - PI_ERROR_INVALID_IMAGE_SIZE); -#endif // !NDEBUG - char *ZeHandleDst; - PI_CALL( - DstMem->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyFromMemory, - (ZeCommandList, ur_cast(ZeHandleDst), Src, - &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); - } else if (CommandType == PI_COMMAND_TYPE_IMAGE_COPY) { - pi_mem SrcImage = ur_cast(const_cast(Src)); - pi_mem DstImage = ur_cast(Dst); - - ze_image_region_t ZeSrcRegion; - auto Result = - getImageRegionHelper(SrcImage, SrcOrigin, Region, ZeSrcRegion); - if (Result != PI_SUCCESS) - return Result; - ze_image_region_t ZeDstRegion; - Result = getImageRegionHelper(DstImage, DstOrigin, Region, ZeDstRegion); - if (Result != PI_SUCCESS) - return Result; - - char *ZeHandleSrc; - char *ZeHandleDst; - PI_CALL( - SrcImage->getZeHandle(ZeHandleSrc, _pi_mem::read_only, Queue->Device)); - PI_CALL( - DstImage->getZeHandle(ZeHandleDst, _pi_mem::write_only, Queue->Device)); - ZE_CALL(zeCommandListAppendImageCopyRegion, - (ZeCommandList, ur_cast(ZeHandleDst), - ur_cast(ZeHandleSrc), &ZeDstRegion, - &ZeSrcRegion, ZeEvent, 0, nullptr)); - } else { - urPrint("enqueueMemImageUpdate: unsupported image command type\n"); - return PI_ERROR_INVALID_OPERATION; - } - - if (auto Res = Queue->executeCommandList(CommandList, IsBlocking, OkToBatch)) - return Res; - - return PI_SUCCESS; + return pi2ur::piMemImageGetInfo(Image, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); } -extern "C" { - pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, pi_image_offset Origin, pi_image_region Region, size_t RowPitch, @@ -7019,19 +677,9 @@ pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(Image->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex> LockAll( - SrcLock, Queue->Mutex); - return enqueueMemImageCommandHelper( - PI_COMMAND_TYPE_IMAGE_READ, Queue, - Image, // src - Ptr, // dst - BlockingRead, - Origin, // SrcOrigin - nullptr, // DstOrigin - Region, RowPitch, SlicePitch, NumEventsInWaitList, EventWaitList, Event); + return pi2ur::piEnqueueMemImageRead( + Queue, Image, BlockingRead, Origin, Region, RowPitch, SlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); } pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, @@ -7042,19 +690,9 @@ pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, const pi_event *EventWaitList, pi_event *Event) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex, - Image->Mutex); - return 
enqueueMemImageCommandHelper(PI_COMMAND_TYPE_IMAGE_WRITE, Queue, - Ptr, // src - Image, // dst - BlockingWrite, - nullptr, // SrcOrigin - Origin, // DstOrigin - Region, InputRowPitch, InputSlicePitch, - NumEventsInWaitList, EventWaitList, - Event); + return pi2ur::piEnqueueMemImageWrite( + Queue, Image, BlockingWrite, Origin, Region, InputRowPitch, + InputSlicePitch, Ptr, NumEventsInWaitList, EventWaitList, Event); } pi_result @@ -7062,24 +700,9 @@ piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, pi_image_offset SrcOrigin, pi_image_offset DstOrigin, pi_image_region Region, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::shared_lock SrcLock(SrcImage->Mutex, std::defer_lock); - std::scoped_lock, ur_shared_mutex, - ur_shared_mutex> - LockAll(SrcLock, DstImage->Mutex, Queue->Mutex); - // Copy engine is preferred only for host to device transfer. - // Device to device transfers run faster on compute engines. - // Images are always allocated on device. - bool PreferCopyEngine = false; - return enqueueMemImageCommandHelper( - PI_COMMAND_TYPE_IMAGE_COPY, Queue, SrcImage, DstImage, - false, // is_blocking - SrcOrigin, DstOrigin, Region, - 0, // row pitch - 0, // slice pitch - NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); + return pi2ur::piEnqueueMemImageCopy(Queue, SrcImage, DstImage, SrcOrigin, + DstOrigin, Region, NumEventsInWaitList, + EventWaitList, Event); } pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, @@ -7088,59 +711,18 @@ pi_result piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)Image; - (void)FillColor; - (void)Origin; - (void)Region; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock Lock(Queue->Mutex, - Image->Mutex); - die("piEnqueueMemImageFill: not implemented"); - return {}; + return pi2ur::piEnqueueMemImageFill(Queue, Image, FillColor, Origin, Region, + NumEventsInWaitList, EventWaitList, + Event); } pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, void *BufferCreateInfo, pi_mem *RetMem) { - PI_ASSERT(Buffer && !Buffer->isImage() && - !(static_cast(Buffer))->isSubBuffer(), - PI_ERROR_INVALID_MEM_OBJECT); - - PI_ASSERT(BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION && - BufferCreateInfo && RetMem, - PI_ERROR_INVALID_VALUE); - - std::shared_lock Guard(Buffer->Mutex); - - if (Flags != PI_MEM_FLAGS_ACCESS_RW) { - die("piMemBufferPartition: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - - auto Region = (pi_buffer_region)BufferCreateInfo; - - PI_ASSERT(Region->size != 0u, PI_ERROR_INVALID_BUFFER_SIZE); - PI_ASSERT(Region->origin <= (Region->origin + Region->size), - PI_ERROR_INVALID_VALUE); - - try { - *RetMem = new _pi_buffer(static_cast(Buffer), Region->origin, - Region->size); - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piMemBufferPartition(Buffer, Flags, BufferCreateType, + BufferCreateInfo, RetMem); } pi_result piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), @@ -7150,725 +732,53 @@ pi_result piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { - (void)UserFunc; - (void)Args; - (void)CbArgs; - (void)NumMemObjects; - (void)MemList; - (void)ArgsMemLoc; - (void)NumEventsInWaitList; - (void)EventWaitList; - (void)Event; - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - die("piEnqueueNativeKernel: not implemented"); - return {}; -} - -// Function gets characters between delimeter's in str -// then checks if they are equal to the sub_str. -// returns true if there is at least one instance -// returns false if there are no instances of the name -static bool is_in_separated_string(const std::string &str, char delimiter, - const std::string &sub_str) { - size_t beg = 0; - size_t length = 0; - for (const auto &x : str) { - if (x == delimiter) { - if (str.substr(beg, length) == sub_str) - return true; - - beg += length + 1; - length = 0; - continue; - } - length++; - } - if (length != 0) - if (str.substr(beg, length) == sub_str) - return true; - - return false; + return pi2ur::piEnqueueNativeKernel( + Queue, UserFunc, Args, CbArgs, NumMemObjects, MemList, ArgsMemLoc, + NumEventsInWaitList, EventWaitList, Event); } // TODO: Check if the function_pointer_ret type can be converted to void**. pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, const char *FunctionName, pi_uint64 *FunctionPointerRet) { - (void)Device; - PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); - - std::shared_lock Guard(Program->Mutex); - if (Program->State != _pi_program::Exe) { - return PI_ERROR_INVALID_PROGRAM_EXECUTABLE; - } - - ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeModuleGetFunctionPointer, - (Program->ZeModule, FunctionName, - reinterpret_cast(FunctionPointerRet))); - - // zeModuleGetFunctionPointer currently fails for all - // kernels regardless of if the kernel exist or not - // with ZE_RESULT_ERROR_INVALID_ARGUMENT - // TODO: remove when this is no longer the case - // If zeModuleGetFunctionPointer returns invalid argument, - // fallback to searching through kernel list and return - // PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE if the function exists - // or PI_ERROR_INVALID_KERNEL_NAME if the function does not exist. 
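// The fallback described above is a membership test on the ';'-separated
// string returned by PI_PROGRAM_INFO_KERNEL_NAMES. A minimal sketch of that
// test, assuming std::istringstream/std::getline for the splitting (the
// helper name is illustrative; the plugin performs the split with its own
// helper):
//
//   #include <sstream>
//   #include <string>
//
//   static bool nameInList(const std::string &Names, const std::string &Name) {
//     std::istringstream Stream(Names);
//     std::string Token;
//     while (std::getline(Stream, Token, ';'))
//       if (Token == Name)
//         return true;
//     return false;
//   }
//   // e.g. nameInList("foo;bar;baz", "bar") == true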
- // FunctionPointerRet should always be 0 - if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { - size_t Size; - *FunctionPointerRet = 0; - PI_CALL(piProgramGetInfo(Program, PI_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, - &Size)); - - std::string ClResult(Size, ' '); - PI_CALL(piProgramGetInfo(Program, PI_PROGRAM_INFO_KERNEL_NAMES, - ClResult.size(), &ClResult[0], nullptr)); - - // Get rid of the null terminator and search for kernel_name - // If function can be found return error code to indicate it - // exists - ClResult.pop_back(); - if (is_in_separated_string(ClResult, ';', std::string(FunctionName))) - return PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE; - - return PI_ERROR_INVALID_KERNEL_NAME; - } - - if (ZeResult == ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) { - *FunctionPointerRet = 0; - return PI_ERROR_INVALID_KERNEL_NAME; - } - - return mapError(ZeResult); -} - -enum class USMAllocationForceResidencyType { - // Do not force memory residency at allocation time. - None = 0, - // Force memory resident on the device of allocation at allocation time. - // For host allocation force residency on all devices in a context. - Device = 1, - // Force memory resident on all devices in the context with P2P - // access to the device of allocation. - // For host allocation force residency on all devices in a context. - P2PDevices = 2 -}; - -// Returns the desired USM residency setting -// Input value is of the form 0xHSD, where: -// 4-bits of D control device allocations -// 4-bits of S control shared allocations -// 4-bits of H control host allocations -// Each 4-bit value is holding a USMAllocationForceResidencyType enum value. -// The default is 0x2, i.e. force full residency for device allocations only. -// -static uint32_t USMAllocationForceResidency = [] { - const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); - const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - try { - if (Str) { - // Auto-detect radix to allow more convinient hex base - return std::stoi(Str, nullptr, 0); - } - } catch (...) 
{ - } - return 0x2; -}(); - -// Convert from an integer value to USMAllocationForceResidencyType enum value -static USMAllocationForceResidencyType -USMAllocationForceResidencyConvert(uint32_t Val) { - switch (Val) { - case 1: - return USMAllocationForceResidencyType::Device; - case 2: - return USMAllocationForceResidencyType::P2PDevices; - default: - return USMAllocationForceResidencyType::None; - }; -} - -static USMAllocationForceResidencyType USMHostAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0xf00) >> 8); -}(); -static USMAllocationForceResidencyType USMSharedAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0x0f0) >> 4); -}(); -static USMAllocationForceResidencyType USMDeviceAllocationForceResidency = [] { - return USMAllocationForceResidencyConvert( - (USMAllocationForceResidency & 0x00f)); -}(); - -// Make USM allocation resident as requested -static pi_result -USMAllocationMakeResident(USMAllocationForceResidencyType ForceResidency, - pi_context Context, - pi_device Device, // nullptr for host allocation - void *Ptr, size_t Size) { - if (ForceResidency == USMAllocationForceResidencyType::None) - return PI_SUCCESS; - - std::list Devices; - if (!Device) { - // Host allocation, make it resident on all devices in the context - Devices.insert(Devices.end(), Context->Devices.begin(), - Context->Devices.end()); - } else { - Devices.push_back(Device); - if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) { - ze_bool_t P2P; - for (const auto &D : Context->Devices) { - if (D == Device) - continue; - // TODO: Cache P2P devices for a context - ZE_CALL(zeDeviceCanAccessPeer, (D->ZeDevice, Device->ZeDevice, &P2P)); - if (P2P) - Devices.push_back(D); - } - } - } - for (const auto &D : Devices) { - ZE_CALL(zeContextMakeMemoryResident, - (Context->ZeContext, D->ZeDevice, Ptr, Size)); - } - return PI_SUCCESS; -} - -static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context, - pi_device Device, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - // Check that incorrect bits are not set in the properties. 
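// Worked example for the 0xHSD residency encoding parsed a few lines above
// (0x120 is an arbitrary value chosen for illustration, not a recommended
// setting):
//
//   uint32_t Val    = 0x120;
//   uint32_t Host   = (Val & 0xf00) >> 8; // 1 -> Device
//   uint32_t Shared = (Val & 0x0f0) >> 4; // 2 -> P2PDevices
//   uint32_t Device =  Val & 0x00f;       // 0 -> None
//
// With that setting, host allocations would be made resident on all devices
// in the context, shared allocations on the device of allocation plus its
// P2P-capable peers, and device allocations would not be forced resident.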
- PI_ASSERT(!Properties || *Properties == 0 || - (*Properties == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0), - PI_ERROR_INVALID_VALUE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDesc.pNext = &RelaxedDesc; - } - - ZE_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, - Device->ZeDevice, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMDeviceAllocationForceResidency, Context, Device, - *ResultPtr, Size); - return PI_SUCCESS; -} - -static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context, - pi_device Device, pi_usm_mem_properties *, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZeStruct ZeDevDesc; - ZeDevDesc.flags = 0; - ZeDevDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDevDesc.pNext = &RelaxedDesc; - } - - ZE_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, Size, - Alignment, Device->ZeDevice, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMSharedAllocationForceResidency, Context, Device, - *ResultPtr, Size); - - // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. - return PI_SUCCESS; -} - -static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, - pi_usm_mem_properties *Properties, - size_t Size, pi_uint32 Alignment) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - // Check that incorrect bits are not set in the properties. 
- PI_ASSERT(!Properties || *Properties == 0 || - (*Properties == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0), - PI_ERROR_INVALID_VALUE); - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZE_CALL(zeMemAllocHost, - (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); - - PI_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - PI_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(USMHostAllocationForceResidency, Context, nullptr, - *ResultPtr, Size); - return PI_SUCCESS; -} - -static pi_result USMFreeImpl(pi_context Context, void *Ptr) { - ZE_CALL(zeMemFree, (Context->ZeContext, Ptr)); - return PI_SUCCESS; -} - -// Exception type to pass allocation errors -class UsmAllocationException { - const pi_result Error; - -public: - UsmAllocationException(pi_result Err) : Error{Err} {} - pi_result getError() const { return Error; } -}; - -pi_result USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -pi_result USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, - size_t Size, - pi_uint32 Alignment) { - pi_usm_mem_properties Props[] = {PI_MEM_ALLOC_FLAGS, - PI_MEM_ALLOC_DEVICE_READ_ONLY, 0}; - return USMSharedAllocImpl(ResultPtr, Context, Device, Props, Size, Alignment); -} - -pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); -} - -void *USMMemoryAllocBase::allocate(size_t Size) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } - - return Ptr; -} - -void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, Alignment); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } - return Ptr; -} - -void USMMemoryAllocBase::deallocate(void *Ptr) { - auto Res = USMFreeImpl(Context, Ptr); - if (Res != PI_SUCCESS) { - throw UsmAllocationException(Res); - } + return pi2ur::piextGetDeviceFunctionPointer(Device, Program, FunctionName, + FunctionPointerRet); } pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - PI_CALL(piContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = USMDeviceAllocImpl(ResultPtr, Context, Device, Properties, - Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return PI_ERROR_INVALID_VALUE; - - *ResultPtr = It->second.allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - return PI_SUCCESS; + return pi2ur::piextUSMDeviceAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); } pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; - // Check that incorrect bits are not set in the properties. - if (Properties && *Properties != 0) { - PI_ASSERT(*(Properties) == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0, - PI_ERROR_INVALID_VALUE); - DeviceReadOnly = *(Properties + 1) & PI_MEM_ALLOC_DEVICE_READ_ONLY; - } - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - if (IndirectAccessTrackingEnabled) { - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. 
- PI_CALL(piContextRetain(Context)); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = USMSharedAllocImpl(ResultPtr, Context, Device, Properties, - Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return PI_ERROR_INVALID_VALUE; - - *ResultPtr = It->second.allocate(Size, Alignment); - if (DeviceReadOnly) { - Context->SharedReadOnlyAllocs.insert(*ResultPtr); - } - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; + return pi2ur::piextUSMSharedAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); } pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, pi_usm_mem_properties *Properties, size_t Size, pi_uint32 Alignment) { - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return PI_ERROR_INVALID_VALUE; - - pi_platform Plt = Context->getPlatform(); - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - PI_CALL(piContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. 
- ((Alignment & (Alignment - 1)) != 0)) { - pi_result Res = - USMHostAllocImpl(ResultPtr, Context, Properties, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - return Res; - } - - // There is a single allocator for Host USM allocations, so we don't need to - // find the allocator depending on context as we do for Shared and Device - // allocations. - try { - *ResultPtr = Context->HostMemAllocContext->allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*ResultPtr), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *ResultPtr = nullptr; - return Ex.getError(); - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; -} - -// Helper function to deallocate USM memory, if indirect access support is -// enabled then a caller must lock the platform-level mutex guarding the -// container with contexts because deallocating the memory can turn RefCount of -// a context to 0 and as a result the context being removed from the list of -// tracked contexts. -// If indirect access tracking is not enabled then caller must lock Context -// mutex. -static pi_result USMFreeHelper(pi_context Context, void *Ptr, - bool OwnZeMemHandle) { - if (!OwnZeMemHandle) { - // Memory should not be freed - return PI_SUCCESS; - } - - if (IndirectAccessTrackingEnabled) { - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return PI_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - if (!UseUSMAllocator) { - pi_result Res = USMFreeImpl(Context, Ptr); - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return Res; - } - - // Query the device of the allocation to determine the right allocator context - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer we're freeing to determine the correct - // way to do it(directly or via an allocator) - auto ZeResult = - ZE_CALL_NOCHECK(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // Handle the case that L0 RT was already unloaded - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - } else if (ZeResult) { - return mapError(ZeResult); - } - - // If memory type is host release from host pool - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { - try { - Context->HostMemAllocContext->deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - } - - // Points out an allocation in SharedReadOnlyMemAllocContexts - auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); - - if (!ZeDeviceHandle) { - // The only case where it is OK not have device identified is - // if the memory is not known to the driver. We should not ever get - // this either, probably. - PI_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, - PI_ERROR_INVALID_DEVICE); - } else { - pi_device Device; - // All context member devices or their descendants are of the same platform. - auto Platform = Context->getPlatform(); - Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - - auto DeallocationHelper = - [Context, Device, - Ptr](std::unordered_map - &AllocContextMap) { - try { - auto It = AllocContextMap.find(Device->ZeDevice); - if (It == AllocContextMap.end()) - return PI_ERROR_INVALID_VALUE; - - // The right context is found, deallocate the pointer - It->second.deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return PI_SUCCESS; - }; - - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_SHARED: - // Distinguish device_read_only allocations since they have own pool. - SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); - return DeallocationHelper(SharedReadOnlyAllocsIterator != - Context->SharedReadOnlyAllocs.end() - ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - case ZE_MEMORY_TYPE_DEVICE: - return DeallocationHelper(Context->DeviceMemAllocContexts); - default: - // Handled below - break; - } - } - - pi_result Res = USMFreeImpl(Context, Ptr); - if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { - Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); - } - if (IndirectAccessTrackingEnabled) - PI_CALL(ContextReleaseHelper(Context)); - return Res; + return pi2ur::piextUSMHostAlloc(ResultPtr, Context, Properties, Size, + Alignment); } pi_result piextUSMFree(pi_context Context, void *Ptr) { - pi_platform Plt = Context->getPlatform(); - - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - return USMFreeHelper(Context, Ptr); + return pi2ur::piextUSMFree(Context, Ptr); } pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, size_t ArgSize, const void *ArgValue) { - - PI_CALL(piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue)); - return PI_SUCCESS; + return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); } /// USM Memset API @@ -7886,32 +796,8 @@ pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_int32 Value, size_t Count, pi_uint32 NumEventsInWaitlist, const pi_event *EventsWaitlist, pi_event *Event) { - if (!Ptr) { - return PI_ERROR_INVALID_VALUE; - } - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock Lock(Queue->Mutex); - return enqueueMemFillHelper( - // TODO: do we need a new command type for USM memset? - PI_COMMAND_TYPE_MEM_BUFFER_FILL, Queue, Ptr, - &Value, // It will be interpreted as an 8-bit value, - 1, // which is indicated with this pattern_size==1 - Count, NumEventsInWaitlist, EventsWaitlist, Event); -} - -// Helper function to check if a pointer is a device pointer. 
-static bool IsDevicePointer(pi_context Context, const void *Ptr) { - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); + return pi2ur::piextUSMEnqueueMemset( + Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); } pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, @@ -7920,26 +806,9 @@ pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, void *DstPtr, const pi_event *EventsWaitlist, pi_event *Event) { - if (!DstPtr) { - return PI_ERROR_INVALID_VALUE; - } - - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - std::scoped_lock lock(Queue->Mutex); - - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). - bool PreferCopyEngine = !IsDevicePointer(Queue->Context, SrcPtr) || - !IsDevicePointer(Queue->Context, DstPtr); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - return enqueueMemCopyHelper( - // TODO: do we need a new command type for this? - PI_COMMAND_TYPE_MEM_BUFFER_COPY, Queue, DstPtr, Blocking, Size, SrcPtr, - NumEventsInWaitlist, EventsWaitlist, Event, PreferCopyEngine); + return pi2ur::piextUSMEnqueueMemcpy(Queue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitlist, EventsWaitlist, + Event); } /// Hint to migrate memory to the device @@ -7957,63 +826,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, const pi_event *EventWaitList, pi_event *OutEvent) { - // flags is currently unused so fail if set - PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - bool UseCopyEngine = false; - - // Please note that the following code should be run before the - // subsequent getAvailableCommandList() call so that there is no - // dead-lock from waiting unsubmitted events in an open batch. - // The createAndRetainPiZeEventList() has the proper side-effect - // of submitting batches with dependent events. - // - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList( - NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - // TODO: Change UseCopyEngine argument to 'true' once L0 backend - // support is added - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // TODO: do we need to create a unique command type for this? - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? 
OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &WaitList = (*Event)->WaitList; - const auto &ZeCommandList = CommandList->first; - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } - // TODO: figure out how to translate "flags" - ZE_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Ptr, Size)); - - // TODO: Level Zero does not have a completion "event" with the prefetch API, - // so manually add command to signal our event. - ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - if (auto Res = Queue->executeCommandList(CommandList, false)) - return Res; - - return PI_SUCCESS; + return pi2ur::piextUSMEnqueuePrefetch( + Queue, Ptr, Size, Flags, NumEventsInWaitList, EventWaitList, OutEvent); } /// USM memadvise API to govern behavior of automatic migration mechanisms @@ -8027,59 +841,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size, pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, size_t Length, pi_mem_advice Advice, pi_event *OutEvent) { - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - // Lock automatically releases when this goes out of scope. - std::scoped_lock lock(Queue->Mutex); - - auto ZeAdvice = ur_cast(Advice); - - bool UseCopyEngine = false; - - _pi_ze_event_list_t TmpWaitList; - if (auto Res = TmpWaitList.createAndRetainPiZeEventList(0, nullptr, Queue, - UseCopyEngine)) - return Res; - - // Get a new command list to be used on this call - pi_command_list_ptr_t CommandList{}; - // UseCopyEngine is set to 'false' here. - // TODO: Additional analysis is required to check if this operation will - // run faster on copy engines. - if (auto Res = Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)) - return Res; - - // TODO: do we need to create a unique command type for this? - ze_event_handle_t ZeEvent = nullptr; - pi_event InternalEvent; - bool IsInternal = OutEvent == nullptr; - pi_event *Event = OutEvent ? OutEvent : &InternalEvent; - auto Res = createEventAndAssociateQueue(Queue, Event, PI_COMMAND_TYPE_USER, - CommandList, IsInternal); - if (Res != PI_SUCCESS) - return Res; - ZeEvent = (*Event)->ZeEvent; - (*Event)->WaitList = TmpWaitList; - - const auto &ZeCommandList = CommandList->first; - const auto &WaitList = (*Event)->WaitList; - - if (WaitList.Length) { - ZE_CALL(zeCommandListAppendWaitOnEvents, - (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); - } - ZE_CALL(zeCommandListAppendMemAdvise, - (ZeCommandList, Queue->Device->ZeDevice, Ptr, Length, ZeAdvice)); - - // TODO: Level Zero does not have a completion "event" with the advise API, - // so manually add command to signal our event. 
- ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); - - Queue->executeCommandList(CommandList, false); - - return PI_SUCCESS; + return pi2ur::piextUSMEnqueueMemAdvise(Queue, Ptr, Length, Advice, OutEvent); } /// USM 2D Fill API @@ -8094,25 +857,17 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, /// \param num_events_in_waitlist is the number of events to wait on /// \param events_waitlist is an array of events to wait on /// \param event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue queue, void *ptr, - size_t pitch, size_t pattern_size, - const void *pattern, size_t width, - size_t height, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - std::ignore = queue; - std::ignore = ptr; - std::ignore = pitch; - std::ignore = pattern_size; - std::ignore = pattern; - std::ignore = width; - std::ignore = height; - std::ignore = num_events_in_waitlist; - std::ignore = events_waitlist; - std::ignore = event; - die("piextUSMEnqueueFill2D: not implemented"); - return {}; +__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, + size_t Pitch, size_t PatternSize, + const void *Pattern, size_t Width, + size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueFill2D(Queue, Ptr, Pitch, PatternSize, Pattern, + Width, Height, NumEventsWaitList, + EventsWaitList, Event); } /// USM 2D Memset API @@ -8127,21 +882,16 @@ __SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue queue, void *ptr, /// \param num_events_in_waitlist is the number of events to wait on /// \param events_waitlist is an array of events to wait on /// \param event is the event that represents this operation -__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D( - pi_queue queue, void *ptr, size_t pitch, int value, size_t width, - size_t height, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - std::ignore = queue; - std::ignore = ptr; - std::ignore = pitch; - std::ignore = value; - std::ignore = width; - std::ignore = height; - std::ignore = num_events_in_waitlist; - std::ignore = events_waitlist; - std::ignore = event; - die("piextUSMEnqueueMemset2D: not implemented"); - return {}; +__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, + size_t Pitch, int Value, + size_t Width, size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitlist, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueMemset2D(Queue, Ptr, Pitch, Value, Width, Height, + NumEventsWaitList, EventsWaitlist, + Event); } /// USM 2D Memcpy API @@ -8163,30 +913,10 @@ __SYCL_EXPORT pi_result piextUSMEnqueueMemcpy2D( const void *SrcPtr, size_t SrcPitch, size_t Width, size_t Height, pi_uint32 NumEventsInWaitlist, const pi_event *EventWaitlist, pi_event *Event) { - if (!DstPtr || !SrcPtr) - return PI_ERROR_INVALID_VALUE; - PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); - - pi_buff_rect_offset_struct ZeroOffset{0, 0, 0}; - pi_buff_rect_region_struct Region{Width, Height, 0}; - - std::scoped_lock lock(Queue->Mutex); - - // Device to Device copies are found to execute slower on copy engine - // (versus compute engine). 
- bool PreferCopyEngine = !IsDevicePointer(Queue->Context, SrcPtr) || - !IsDevicePointer(Queue->Context, DstPtr); - - // Temporary option added to use copy engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - - return enqueueMemCopyRectHelper( - // TODO: do we need a new command type for this? - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, Queue, SrcPtr, DstPtr, &ZeroOffset, - &ZeroOffset, &Region, SrcPitch, DstPitch, /*SrcSlicePitch=*/0, - /*DstSlicePitch=*/0, Blocking, NumEventsInWaitlist, EventWaitlist, Event, - PreferCopyEngine); + return pi2ur::piextUSMEnqueueMemcpy2D( + Queue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, + NumEventsInWaitlist, EventWaitlist, Event); } /// API to query information about USM allocated pointers. @@ -8209,61 +939,8 @@ pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, pi_mem_alloc_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - ZE_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - switch (ParamName) { - case PI_MEM_ALLOC_TYPE: { - pi_usm_type MemAllocaType; - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = PI_MEM_TYPE_UNKNOWN; - break; - case ZE_MEMORY_TYPE_HOST: - MemAllocaType = PI_MEM_TYPE_HOST; - break; - case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = PI_MEM_TYPE_DEVICE; - break; - case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = PI_MEM_TYPE_SHARED; - break; - default: - urPrint("piextUSMGetMemAllocInfo: unexpected usm memory type\n"); - return PI_ERROR_INVALID_VALUE; - } - return ReturnValue(MemAllocaType); - } - case PI_MEM_ALLOC_DEVICE: - if (ZeDeviceHandle) { - auto Platform = Context->getPlatform(); - auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - return Device ? ReturnValue(Device) : PI_ERROR_INVALID_VALUE; - } else { - return PI_ERROR_INVALID_VALUE; - } - case PI_MEM_ALLOC_BASE_PTR: { - void *Base; - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); - return ReturnValue(Base); - } - case PI_MEM_ALLOC_SIZE: { - size_t Size; - ZE_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); - return ReturnValue(Size); - } - default: - urPrint("piextUSMGetMemAllocInfo: unsupported ParamName\n"); - return PI_ERROR_INVALID_VALUE; - } - return PI_SUCCESS; + return pi2ur::piextUSMGetMemAllocInfo(Context, Ptr, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); } /// API for writing data from host to a device global variable. 
@@ -8283,32 +960,9 @@ pi_result piextEnqueueDeviceGlobalVariableWrite(
     pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite,
     size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList,
     const pi_event *EventsWaitList, pi_event *Event) {
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-
-  std::scoped_lock lock(Queue->Mutex);
-
-  // Find global variable pointer
-  size_t GlobalVarSize = 0;
-  void *GlobalVarPtr = nullptr;
-  ZE_CALL(zeModuleGetGlobalPointer,
-          (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr));
-  if (GlobalVarSize < Offset + Count) {
-    setErrorMessage("Write device global variable is out of range.",
-                    UR_RESULT_ERROR_INVALID_VALUE);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  // Copy engine is preferred only for host to device transfer.
-  // Device to device transfers run faster on compute engines.
-  bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src);
-
-  // Temporary option added to use copy engine for D2D copy
-  PreferCopyEngine |= UseCopyEngineForD2DCopy;
-
-  return enqueueMemCopyHelper(PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE,
-                              Queue, ur_cast(GlobalVarPtr) + Offset,
-                              BlockingWrite, Count, Src, NumEventsInWaitList,
-                              EventsWaitList, Event, PreferCopyEngine);
+  return pi2ur::piextEnqueueDeviceGlobalVariableWrite(
+      Queue, Program, Name, BlockingWrite, Count, Offset, Src,
+      NumEventsInWaitList, EventsWaitList, Event);
 }
 
 /// API reading data from a device global variable to host.
@@ -8328,32 +982,12 @@ pi_result piextEnqueueDeviceGlobalVariableRead(
     pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead,
     size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList,
     const pi_event *EventsWaitList, pi_event *Event) {
-  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
-  std::scoped_lock lock(Queue->Mutex);
-
-  // Find global variable pointer
-  size_t GlobalVarSize = 0;
-  void *GlobalVarPtr = nullptr;
-  ZE_CALL(zeModuleGetGlobalPointer,
-          (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr));
-  if (GlobalVarSize < Offset + Count) {
-    setErrorMessage("Read from device global variable is out of range.",
-                    UR_RESULT_ERROR_INVALID_VALUE);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  // Copy engine is preferred only for host to device transfer.
-  // Device to device transfers run faster on compute engines.
-  bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst);
+  return pi2ur::piextEnqueueDeviceGlobalVariableRead(
+      Queue, Program, Name, BlockingRead, Count, Offset, Dst,
+      NumEventsInWaitList, EventsWaitList, Event);
 
-  // Temporary option added to use copy engine for D2D copy
-  PreferCopyEngine |= UseCopyEngineForD2DCopy;
-
-  return enqueueMemCopyHelper(
-      PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, Queue, Dst, BlockingRead,
-      Count, ur_cast(GlobalVarPtr) + Offset, NumEventsInWaitList,
-      EventsWaitList, Event, PreferCopyEngine);
 }
 
 /// API for Read from host pipe.
 ///
@@ -8423,65 +1057,21 @@ pi_result piextEnqueueWriteHostPipe(pi_queue Queue, pi_program Program,
 pi_result piKernelSetExecInfo(pi_kernel Kernel, pi_kernel_exec_info ParamName,
                               size_t ParamValueSize, const void *ParamValue) {
-  (void)ParamValueSize;
-  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
-  PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE);
-
-  std::scoped_lock Guard(Kernel->Mutex);
-  if (ParamName == PI_USM_INDIRECT_ACCESS &&
-      *(static_cast(ParamValue)) == PI_TRUE) {
-    // The whole point for users really was to not need to know anything
-    // about the types of allocations kernel uses.
So in DPC++ we always
-    // just set all 3 modes for each kernel.
-    ze_kernel_indirect_access_flags_t IndirectFlags =
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST |
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE |
-        ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED;
-    ZE_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags));
-  } else if (ParamName == PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) {
-    ze_cache_config_flag_t ZeCacheConfig;
-    switch (*(static_cast(ParamValue))) {
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM:
-      ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM;
-      break;
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA:
-      ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA;
-      break;
-    case PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT:
-      ZeCacheConfig = static_cast(0);
-      break;
-    default:
-      // Unexpected cache configuration value.
-      return PI_ERROR_INVALID_VALUE;
-    }
-    ZE_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig););
-  } else {
-    urPrint("piKernelSetExecInfo: unsupported ParamName\n");
-    return PI_ERROR_INVALID_VALUE;
-  }
-  return PI_SUCCESS;
+  return pi2ur::piKernelSetExecInfo(Kernel, ParamName, ParamValueSize,
+                                    ParamValue);
 }
 
 pi_result piextProgramSetSpecializationConstant(pi_program Prog,
-                                                pi_uint32 SpecID, size_t,
+                                                pi_uint32 SpecID, size_t Size,
                                                 const void *SpecValue) {
-  std::scoped_lock Guard(Prog->Mutex);
-
-  // Remember the value of this specialization constant until the program is
-  // built. Note that we only save the pointer to the buffer that contains the
-  // value. The caller is responsible for maintaining storage for this buffer.
-  //
-  // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by
-  // SpecID.
-  Prog->SpecConstants[SpecID] = SpecValue;
-
-  return PI_SUCCESS;
+  return pi2ur::piextProgramSetSpecializationConstant(Prog, SpecID, Size,
+                                                      SpecValue);
 }
 
 const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING;
 
 pi_result piPluginInit(pi_plugin *PluginInit) {
   PI_ASSERT(PluginInit, PI_ERROR_INVALID_VALUE);
 
   // Check that the major version matches in PiVersion and SupportedVersion
@@ -8505,9 +1095,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
 
 pi_result piextPluginGetOpaqueData(void *opaque_data_param,
                                    void **opaque_data_return) {
-  (void)opaque_data_param;
-  (void)opaque_data_return;
-  return PI_ERROR_UNKNOWN;
+  return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return);
 }
 
 // SYCL RT calls this api to notify the end of plugin lifetime.
@@ -8518,388 +1106,12 @@ pi_result piextPluginGetOpaqueData(void *opaque_data_param,
 // It can include all the jobs to tear down resources before
 // the plugin is unloaded from memory.
 pi_result piTearDown(void *PluginParameter) {
-  (void)PluginParameter;
-  bool LeakFound = false;
-  // reclaim pi_platform objects here since we don't have piPlatformRelease.
-  for (pi_platform Platform : *PiPlatformsCache) {
-    delete Platform;
-  }
-  delete PiPlatformsCache;
-  delete PiPlatformsCacheMutex;
-
-  // Print the balance of various create/destroy native calls.
-  // The idea is to verify if the number of create(+) and destroy(-) calls are
-  // matched.
-  if (ZeCallCount && (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) != 0) {
-    // clang-format off
-    //
-    // The format of this table is such that each row accounts for a
-    // specific type of objects, and all elements in the raw except the last
-    // one are allocating objects of that type, while the last element is known
-    // to deallocate objects of that type.
- // - std::vector> CreateDestroySet = { - {"zeContextCreate", "zeContextDestroy"}, - {"zeCommandQueueCreate", "zeCommandQueueDestroy"}, - {"zeModuleCreate", "zeModuleDestroy"}, - {"zeKernelCreate", "zeKernelDestroy"}, - {"zeEventPoolCreate", "zeEventPoolDestroy"}, - {"zeCommandListCreateImmediate", "zeCommandListCreate", "zeCommandListDestroy"}, - {"zeEventCreate", "zeEventDestroy"}, - {"zeFenceCreate", "zeFenceDestroy"}, - {"zeImageCreate", "zeImageDestroy"}, - {"zeSamplerCreate", "zeSamplerDestroy"}, - {"zeMemAllocDevice", "zeMemAllocHost", "zeMemAllocShared", "zeMemFree"}, - }; - - // A sample output aimed below is this: - // ------------------------------------------------------------------------ - // zeContextCreate = 1 \---> zeContextDestroy = 1 - // zeCommandQueueCreate = 1 \---> zeCommandQueueDestroy = 1 - // zeModuleCreate = 1 \---> zeModuleDestroy = 1 - // zeKernelCreate = 1 \---> zeKernelDestroy = 1 - // zeEventPoolCreate = 1 \---> zeEventPoolDestroy = 1 - // zeCommandListCreateImmediate = 1 | - // zeCommandListCreate = 1 \---> zeCommandListDestroy = 1 ---> LEAK = 1 - // zeEventCreate = 2 \---> zeEventDestroy = 2 - // zeFenceCreate = 1 \---> zeFenceDestroy = 1 - // zeImageCreate = 0 \---> zeImageDestroy = 0 - // zeSamplerCreate = 0 \---> zeSamplerDestroy = 0 - // zeMemAllocDevice = 0 | - // zeMemAllocHost = 1 | - // zeMemAllocShared = 0 \---> zeMemFree = 1 - // - // clang-format on - - fprintf(stderr, "ZE_DEBUG=%d: check balance of create/destroy calls\n", - UR_L0_DEBUG_CALL_COUNT); - fprintf(stderr, - "----------------------------------------------------------\n"); - for (const auto &Row : CreateDestroySet) { - int diff = 0; - for (auto I = Row.begin(); I != Row.end();) { - const char *ZeName = *I; - const auto &ZeCount = (*ZeCallCount)[*I]; - - bool First = (I == Row.begin()); - bool Last = (++I == Row.end()); - - if (Last) { - fprintf(stderr, " \\--->"); - diff -= ZeCount; - } else { - diff += ZeCount; - if (!First) { - fprintf(stderr, " | \n"); - } - } - - fprintf(stderr, "%30s = %-5d", ZeName, ZeCount); - } - - if (diff) { - LeakFound = true; - fprintf(stderr, " ---> LEAK = %d", diff); - } - fprintf(stderr, "\n"); - } - - ZeCallCount->clear(); - delete ZeCallCount; - ZeCallCount = nullptr; - } - if (LeakFound) - return PI_ERROR_INVALID_MEM_OBJECT; - - disableZeTracing(); - return PI_SUCCESS; -} - -pi_result _pi_buffer::getZeHandlePtr(char **&ZeHandlePtr, - access_mode_t AccessMode, - pi_device Device) { - char *ZeHandle; - PI_CALL(getZeHandle(ZeHandle, AccessMode, Device)); - ZeHandlePtr = &Allocations[Device].ZeHandle; - return PI_SUCCESS; -} - -size_t _pi_buffer::getAlignment() const { - // Choose an alignment that is at most 64 and is the next power of 2 - // for sizes less than 64. - auto Alignment = Size; - if (Alignment > 32UL) - Alignment = 64UL; - else if (Alignment > 16UL) - Alignment = 32UL; - else if (Alignment > 8UL) - Alignment = 16UL; - else if (Alignment > 4UL) - Alignment = 8UL; - else if (Alignment > 2UL) - Alignment = 4UL; - else if (Alignment > 1UL) - Alignment = 2UL; - else - Alignment = 1UL; - return Alignment; -} - -pi_result _pi_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, - pi_device Device) { - - // NOTE: There might be no valid allocation at all yet and we get - // here from piEnqueueKernelLaunch that would be doing the buffer - // initialization. In this case the Device is not null as kernel - // launch is always on a specific device. 
- if (!Device) - Device = LastDeviceWithValidAllocation; - // If the device is still not selected then use the first one in - // the context of the buffer. - if (!Device) - Device = Context->Devices[0]; - - auto &Allocation = Allocations[Device]; - - // Sub-buffers don't maintain own allocations but rely on parent buffer. - if (isSubBuffer()) { - PI_CALL(SubBuffer.Parent->getZeHandle(ZeHandle, AccessMode, Device)); - ZeHandle += SubBuffer.Origin; - // Still store the allocation info in the PI sub-buffer for - // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to - // be given a pointer to the allocation handle rather than its value. - // - Allocation.ZeHandle = ZeHandle; - Allocation.ReleaseAction = allocation_t::keep; - LastDeviceWithValidAllocation = Device; - return PI_SUCCESS; - } - - // First handle case where the buffer is represented by only - // a single host allocation. - if (OnHost) { - auto &HostAllocation = Allocations[nullptr]; - // The host allocation may already exists, e.g. with imported - // host ptr, or in case of interop buffer. - if (!HostAllocation.ZeHandle) { - if (USMAllocatorConfigInstance.EnableBuffers) { - HostAllocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMHostAlloc(ur_cast(&ZeHandle), Context, nullptr, - Size, getAlignment())); - } else { - HostAllocation.ReleaseAction = allocation_t::free_native; - PI_CALL( - ZeHostMemAllocHelper(ur_cast(&ZeHandle), Context, Size)); - } - HostAllocation.ZeHandle = ZeHandle; - HostAllocation.Valid = true; - } - Allocation = HostAllocation; - Allocation.ReleaseAction = allocation_t::keep; - ZeHandle = Allocation.ZeHandle; - LastDeviceWithValidAllocation = Device; - return PI_SUCCESS; - } - // Reads user setting on how to deal with buffers in contexts where - // all devices have the same root-device. Returns "true" if the - // preference is to have allocate on each [sub-]device and migrate - // normally (copy) to other sub-devices as needed. Returns "false" - // if the preference is to have single root-device allocations - // serve the needs of all [sub-]devices, meaning potentially more - // cross-tile traffic. - // - static const bool SingleRootDeviceBufferMigration = [] { - const char *UrRet = - std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); - const char *PiRet = - std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); - const char *EnvStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - if (EnvStr) - return (std::stoi(EnvStr) != 0); - // The default is to migrate normally, which may not always be the - // best option (depends on buffer access patterns), but is an - // overall win on the set of the available benchmarks. - return true; - }(); - - // Peform actual device allocation as needed. - if (!Allocation.ZeHandle) { - if (!SingleRootDeviceBufferMigration && Context->SingleRootDevice && - Context->SingleRootDevice != Device) { - // If all devices in the context are sub-devices of the same device - // then we reuse root-device allocation by all sub-devices in the - // context. - // TODO: we can probably generalize this and share root-device - // allocations by its own sub-devices even if not all other - // devices in the context have the same root. 
- PI_CALL(getZeHandle(ZeHandle, AccessMode, Context->SingleRootDevice)); - Allocation.ReleaseAction = allocation_t::keep; - Allocation.ZeHandle = ZeHandle; - Allocation.Valid = true; - return PI_SUCCESS; - } else { // Create device allocation - if (USMAllocatorConfigInstance.EnableBuffers) { - Allocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMDeviceAlloc(ur_cast(&ZeHandle), Context, - Device, nullptr, Size, getAlignment())); - } else { - Allocation.ReleaseAction = allocation_t::free_native; - PI_CALL(ZeDeviceMemAllocHelper(ur_cast(&ZeHandle), Context, - Device, Size)); - } - } - Allocation.ZeHandle = ZeHandle; - } else { - ZeHandle = Allocation.ZeHandle; - } - - // If some prior access invalidated this allocation then make it valid again. - if (!Allocation.Valid) { - // LastDeviceWithValidAllocation should always have valid allocation. - if (Device == LastDeviceWithValidAllocation) - die("getZeHandle: last used allocation is not valid"); - - // For write-only access the allocation contents is not going to be used. - // So don't do anything to make it "valid". - bool NeedCopy = AccessMode != _pi_mem::write_only; - // It's also possible that the buffer doesn't have a valid allocation - // yet presumably when it is passed to a kernel that will perform - // it's intialization. - if (NeedCopy && !LastDeviceWithValidAllocation) { - NeedCopy = false; - } - char *ZeHandleSrc = nullptr; - if (NeedCopy) { - PI_CALL(getZeHandle(ZeHandleSrc, _pi_mem::read_only, - LastDeviceWithValidAllocation)); - // It's possible with the single root-device contexts that - // the buffer is represented by the single root-device - // allocation and then skip the copy to itself. - if (ZeHandleSrc == ZeHandle) - NeedCopy = false; - } - - if (NeedCopy) { - // Copy valid buffer data to this allocation. - // TODO: see if we should better use peer's device allocation used - // directly, if that capability is reported with zeDeviceCanAccessPeer, - // instead of maintaining a separate allocation and performing - // explciit copies. - // - // zeCommandListAppendMemoryCopy must not be called from simultaneous - // threads with the same command list handle, so we need exclusive lock. - ze_bool_t P2P = false; - ZE_CALL( - zeDeviceCanAccessPeer, - (Device->ZeDevice, LastDeviceWithValidAllocation->ZeDevice, &P2P)); - if (!P2P) { - // P2P copy is not possible, so copy through the host. - auto &HostAllocation = Allocations[nullptr]; - // The host allocation may already exists, e.g. with imported - // host ptr, or in case of interop buffer. - if (!HostAllocation.ZeHandle) { - void *ZeHandleHost; - if (USMAllocatorConfigInstance.EnableBuffers) { - HostAllocation.ReleaseAction = allocation_t::free; - PI_CALL(piextUSMHostAlloc(&ZeHandleHost, Context, nullptr, Size, - getAlignment())); - } else { - HostAllocation.ReleaseAction = allocation_t::free_native; - PI_CALL(ZeHostMemAllocHelper(&ZeHandleHost, Context, Size)); - } - HostAllocation.ZeHandle = ur_cast(ZeHandleHost); - HostAllocation.Valid = false; - } - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - if (!HostAllocation.Valid) { - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, - HostAllocation.ZeHandle /* Dst */, ZeHandleSrc, Size, - nullptr, 0, nullptr)); - // Mark the host allocation data as valid so it can be reused. - // It will be invalidated below if the current access is not - // read-only. 
- HostAllocation.Valid = true; - } - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandle /* Dst */, - HostAllocation.ZeHandle, Size, nullptr, 0, nullptr)); - } else { - // Perform P2P copy. - std::scoped_lock Lock(Context->ImmediateCommandListMutex); - ZE_CALL(zeCommandListAppendMemoryCopy, - (Context->ZeCommandListInit, ZeHandle /* Dst */, ZeHandleSrc, - Size, nullptr, 0, nullptr)); - } - } - Allocation.Valid = true; - LastDeviceWithValidAllocation = Device; - } - - // Invalidate other allocations that would become not valid if - // this access is not read-only. - if (AccessMode != _pi_mem::read_only) { - for (auto &Alloc : Allocations) { - if (Alloc.first != LastDeviceWithValidAllocation) - Alloc.second.Valid = false; - } - } - - urPrint("getZeHandle(pi_device{%p}) = %p\n", (void *)Device, - (void *)Allocation.ZeHandle); - return PI_SUCCESS; -} - -pi_result _pi_buffer::free() { - for (auto &Alloc : Allocations) { - auto &ZeHandle = Alloc.second.ZeHandle; - // It is possible that the real allocation wasn't made if the buffer - // wasn't really used in this location. - if (!ZeHandle) - continue; - - switch (Alloc.second.ReleaseAction) { - case allocation_t::keep: - break; - case allocation_t::free: { - pi_platform Plt = Context->getPlatform(); - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - PI_CALL(USMFreeHelper(Context, ZeHandle)); - break; - } - case allocation_t::free_native: - PI_CALL(ZeMemFreeHelper(Context, ZeHandle)); - break; - case allocation_t::unimport: - ZeUSMImport.doZeUSMRelease(Context->getPlatform()->ZeDriver, ZeHandle); - break; - default: - die("_pi_buffer::free(): Unhandled release action"); - } - ZeHandle = nullptr; // don't leave hanging pointers - } - return PI_SUCCESS; + return pi2ur::piTearDown(PluginParameter); } pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { - const uint64_t &ZeTimerResolution = - Device->ZeDeviceProperties->timerResolution; - const uint64_t TimestampMaxCount = - ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); - uint64_t DeviceClockCount, Dummy; - - ZE_CALL(zeDeviceGetGlobalTimestamps, - (Device->ZeDevice, HostTime == nullptr ? &Dummy : HostTime, - &DeviceClockCount)); - - if (DeviceTime != nullptr) { - *DeviceTime = (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; - } - return PI_SUCCESS; + return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } #ifdef _WIN32 diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 2634e03cae595..8acc1077eb713 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -25,1330 +25,10 @@ #define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - // Share code between this PI L0 Plugin and UR L0 Adapter +#include "ur/usm_allocator_config.hpp" #include #include #include -// Define the types that are opaque in pi.h in a manner suitabale for Level Zero -// plugin - -struct _pi_platform : public _ur_platform_handle_t { - using _ur_platform_handle_t::_ur_platform_handle_t; - - // Keep track of all contexts in the platform. 
This is needed to manage - // a lifetime of memory allocations in each context when there are kernels - // with indirect access. - // TODO: should be deleted when memory isolation in the context is implemented - // in the driver. - std::list Contexts; - ur_shared_mutex ContextsMutex; -}; - -// Implements memory allocation via L0 RT for USM allocator interface. -class USMMemoryAllocBase : public SystemMemory { -protected: - pi_context Context; - pi_device Device; - // Internal allocation routine which must be implemented for each allocation - // type - virtual pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) = 0; - -public: - USMMemoryAllocBase(pi_context Ctx, pi_device Dev) - : Context{Ctx}, Device{Dev} {} - void *allocate(size_t Size) override final; - void *allocate(size_t Size, size_t Alignment) override final; - void deallocate(void *Ptr) override final; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for shared memory type that is only modified from host. -class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMSharedReadOnlyMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for device memory type -class USMDeviceMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for host memory type -class USMHostMemoryAlloc : public USMMemoryAllocBase { -protected: - pi_result allocateImpl(void **ResultPtr, size_t Size, - pi_uint32 Alignment) override; - -public: - USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {} -}; - -struct _pi_device : _ur_device_handle_t { - using _ur_device_handle_t::_ur_device_handle_t; -}; - -// Structure describing the specific use of a command-list in a queue. -// This is because command-lists are re-used across multiple queues -// in the same context. -struct pi_command_list_info_t { - // The Level-Zero fence that will be signalled at completion. - // Immediate commandlists do not have an associated fence. - // A nullptr for the fence indicates that this is an immediate commandlist. - ze_fence_handle_t ZeFence{nullptr}; - // Record if the fence is in use. - // This is needed to avoid leak of the tracked command-list if the fence - // was not yet signaled at the time all events in that list were already - // completed (we are polling the fence at events completion). The fence - // may be still "in-use" due to sporadic delay in HW. - bool ZeFenceInUse{false}; - - // Indicates if command list is in closed state. This is needed to avoid - // appending commands to the closed command list. - bool IsClosed{false}; - - // Record the queue to which the command list will be submitted. - ze_command_queue_handle_t ZeQueue{nullptr}; - - // Record the queue descriptor fields used when creating the command list - // because we cannot recover these fields from the command list. 
Immediate - // command lists are recycled across queues and then all fields are used. For - // standard command lists only the ordinal is used. For queues created through - // the make_queue API the descriptor is unavailable so a dummy descriptor is - // used and then this entry is marked as not eligible for recycling. - ZeStruct ZeQueueDesc; - bool CanReuse{true}; - - // Helper functions to tell if this is a copy command-list. - bool isCopy(pi_queue Queue) const; - - // Keeps events created by commands submitted into this command-list. - // TODO: use this for explicit wait/cleanup of events at command-list - // completion. - // TODO: use this for optimizing events in the same command-list, e.g. - // only have last one visible to the host. - std::vector EventList{}; - size_t size() const { return EventList.size(); } - void append(pi_event Event) { EventList.push_back(Event); } -}; - -// The map type that would track all command-lists in a queue. -using pi_command_list_map_t = - std::unordered_map; -// The iterator pointing to a specific command-list in use. -using pi_command_list_ptr_t = pi_command_list_map_t::iterator; - -struct _pi_context : _ur_object { - _pi_context(ze_context_handle_t ZeContext, pi_uint32 NumDevices, - const pi_device *Devs, bool OwnZeContext) - : ZeContext{ZeContext}, OwnZeContext{OwnZeContext}, - Devices{Devs, Devs + NumDevices}, SingleRootDevice(getRootDevice()), - ZeCommandListInit{nullptr} { - // NOTE: one must additionally call initialize() to complete - // PI context creation. - } - - // Initialize the PI context. - pi_result initialize(); - - // Finalize the PI context - pi_result finalize(); - - // Return the Platform, which is the same for all devices in the context - pi_platform getPlatform() const; - - // A L0 context handle is primarily used during creation and management of - // resources that may be used by multiple devices. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const ze_context_handle_t ZeContext; - - // Indicates if we own the ZeContext or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeContext; - - // Keep the PI devices this PI context was created for. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const std::vector Devices; - - // Checks if Device is covered by this context. - // For that the Device or its root devices need to be in the context. - bool isValidDevice(pi_device Device) const; - - // If context contains one device or sub-devices of the same device, we want - // to save this device. - // This field is only set at _pi_context creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_context. - const pi_device SingleRootDevice = nullptr; - - // Immediate Level Zero command list for the device in this context, to be - // used for initializations. To be created as: - // - Immediate command list: So any command appended to it is immediately - // offloaded to the device. - // - Synchronous: So implicit synchronization is made inside the level-zero - // driver. - // There will be a list of immediate command lists (for each device) when - // support of the multiple devices per context will be added. - ze_command_list_handle_t ZeCommandListInit; - - // Mutex for the immediate command list. 
Per the Level Zero spec memory copy - // operations submitted to an immediate command list are not allowed to be - // called from simultaneous threads. - ur_mutex ImmediateCommandListMutex; - - // Mutex Lock for the Command List Cache. This lock is used to control both - // compute and copy command list caches. - ur_mutex ZeCommandListCacheMutex; - // Cache of all currently available/completed command/copy lists. - // Note that command-list can only be re-used on the same device. - // - // TODO: explore if we should use root-device for creating command-lists - // as spec says that in that case any sub-device can re-use it: "The - // application must only use the command list for the device, or its - // sub-devices, which was provided during creation." - // - std::unordered_map>>> - ZeComputeCommandListCache; - std::unordered_map>>> - ZeCopyCommandListCache; - - // Retrieves a command list for executing on this device along with - // a fence to be used in tracking the execution of this command list. - // If a command list has been created on this device which has - // completed its commands, then that command list and its associated fence - // will be reused. Otherwise, a new command list and fence will be created for - // running on this device. L0 fences are created on a L0 command queue so the - // caller must pass a command queue to create a new fence for the new command - // list if a command list/fence pair is not available. All Command Lists & - // associated fences are destroyed at Device Release. - // If UseCopyEngine is true, the command will eventually be executed in a - // copy engine. Otherwise, the command will be executed in a compute engine. - // If AllowBatching is true, then the command list returned may already have - // command in it, if AllowBatching is false, any open command lists that - // already exist in Queue will be closed and executed. - // If ForcedCmdQueue is not nullptr, the resulting command list must be tied - // to the contained command queue. This option is ignored if immediate - // command lists are used. - // When using immediate commandlists, retrieves an immediate command list - // for executing on this device. Immediate commandlists are created only - // once for each SYCL Queue and after that they are reused. - pi_result - getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching = false, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); - - // Get index of the free slot in the available pool. If there is no available - // pool then create new one. The HostVisible parameter tells if we need a - // slot for a host-visible event. The ProfilingEnabled tells is we need a - // slot for an event with profiling capabilities. - pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, - bool HostVisible, - bool ProfilingEnabled); - - // Decrement number of events living in the pool upon event destroy - // and return the pool to the cache if there are no unreleased events. - pi_result decrementUnreleasedEventsInPool(pi_event Event); - - // Store USM allocator context(internal allocator structures) - // for USM shared and device allocations. There is 1 allocator context - // per each pair of (context, device) per each memory type. 
- std::unordered_map - DeviceMemAllocContexts; - std::unordered_map - SharedMemAllocContexts; - std::unordered_map - SharedReadOnlyMemAllocContexts; - - // Since L0 native runtime does not distinguisg "shared device_read_only" - // vs regular "shared" allocations, we have keep track of it to use - // proper USMAllocContext when freeing allocations. - std::unordered_set SharedReadOnlyAllocs; - - // Store the host allocator context. It does not depend on any device. - std::unique_ptr HostMemAllocContext; - - // We need to store all memory allocations in the context because there could - // be kernels with indirect access. Kernels with indirect access start to - // reference all existing memory allocations at the time when they are - // submitted to the device. Referenced memory allocations can be released only - // when kernel has finished execution. - std::unordered_map MemAllocs; - - // Get pi_event from cache. - pi_event getEventFromContextCache(bool HostVisible, bool WithProfiling); - - // Add pi_event to cache. - void addEventToContextCache(pi_event); - -private: - // If context contains one device then return this device. - // If context contains sub-devices of the same device, then return this parent - // device. Return nullptr if context consists of several devices which are not - // sub-devices of the same device. We call returned device the root device of - // a context. - // TODO: get rid of this when contexts with multiple devices are supported for - // images. - pi_device getRootDevice() const; - - // Following member variables are used to manage assignment of events - // to event pools. - // - // TODO: Create pi_event_pool class to encapsulate working with pools. - // This will avoid needing the use of maps below, and cleanup the - // pi_context overall. - // - - // The cache of event pools from where new events are allocated from. - // The head event pool is where the next event would be added to if there - // is still some room there. If there is no room in the head then - // the following event pool is taken (guranteed to be empty) and made the - // head. In case there is no next pool, a new pool is created and made the - // head. - // - // Cache of event pools to which host-visible events are added to. - std::vector> ZeEventPoolCache{4}; - auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; - else - return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; - } - - // This map will be used to determine if a pool is full or not - // by storing number of empty slots available in the pool. - std::unordered_map - NumEventsAvailableInEventPool; - // This map will be used to determine number of unreleased events in the pool. - // We use separate maps for number of event slots available in the pool from - // the number of events unreleased in the pool. - // This will help when we try to make the code thread-safe. - std::unordered_map - NumEventsUnreleasedInEventPool; - - // Mutex to control operations on event pool caches and the helper maps - // holding the current pool usage counts. - ur_mutex ZeEventPoolCacheMutex; - - // Mutex to control operations on event caches. - ur_mutex EventCacheMutex; - - // Caches for events. - std::vector> EventCaches{4}; - - // Get the cache of events for a provided scope and profiling mode. - auto getEventCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? 
&EventCaches[0] : &EventCaches[1]; - else - return WithProfiling ? &EventCaches[2] : &EventCaches[3]; - } -}; - -struct _pi_queue : _ur_object { - // ForceComputeIndex, if non-negative, indicates that the queue must be fixed - // to that particular compute CCS. - _pi_queue(std::vector &ComputeQueues, - std::vector &CopyQueues, - pi_context Context, pi_device Device, bool OwnZeCommandQueue, - pi_queue_properties Properties = 0, int ForceComputeIndex = -1); - - using queue_type = _pi_device::queue_group_info_t::type; - - // PI queue is in general a one to many mapping to L0 native queues. - struct pi_queue_group_t { - pi_queue Queue; - pi_queue_group_t() = delete; - - // The Queue argument captures the enclosing PI queue. - // The Type argument specifies the type of this queue group. - // The actual ZeQueues are populated at PI queue construction. - pi_queue_group_t(pi_queue Queue, queue_type Type) - : Queue(Queue), Type(Type) {} - - // The type of the queue group. - queue_type Type; - bool isCopy() const { return Type != queue_type::Compute; } - - // Level Zero command queue handles. - std::vector ZeQueues; - - // Immediate commandlist handles, one per Level Zero command queue handle. - // These are created only once, along with the L0 queues (see above) - // and reused thereafter. - std::vector ImmCmdLists; - - // Return the index of the next queue to use based on a - // round robin strategy and the queue group ordinal. - // If QueryOnly is true then return index values but don't update internal - // index data members of the queue. - uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, - bool QueryOnly = false); - - // Get the ordinal for a command queue handle. - int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue); - - // This function will return one of possibly multiple available native - // queues and the value of the queue group ordinal. - ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal); - - // This function sets an immediate commandlist from the interop interface. - void setImmCmdList(ze_command_list_handle_t); - - // This function returns the next immediate commandlist to use. - pi_command_list_ptr_t &getImmCmdList(); - - // These indices are to filter specific range of the queues to use, - // and to organize round-robin across them. - uint32_t UpperIndex{0}; - uint32_t LowerIndex{0}; - uint32_t NextIndex{0}; - }; - - // Helper class to facilitate per-thread queue groups - // We maintain a hashtable of queue groups if requested to do them per-thread. - // Otherwise it is just single entry used for all threads. - struct pi_queue_group_by_tid_t - : public std::unordered_map { - bool PerThread = false; - - // Returns thread id if doing per-thread, or a generic id that represents - // all the threads. - std::thread::id tid() const { - return PerThread ? std::this_thread::get_id() : std::thread::id(); - } - - // Make the specified queue group be the master - void set(const pi_queue_group_t &QueueGroup) { - const auto &Device = QueueGroup.Queue->Device; - PerThread = Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue; - assert(empty()); - insert({tid(), QueueGroup}); - } - - // Get a queue group to use for this thread - pi_queue_group_t &get() { - assert(!empty()); - auto It = find(tid()); - if (It != end()) { - return It->second; - } - // Add new queue group for this thread initialized from a master entry. 
- auto QueueGroup = begin()->second; - // Create space for queues and immediate commandlists, which are created - // on demand. - QueueGroup.ZeQueues = std::vector( - QueueGroup.ZeQueues.size(), nullptr); - QueueGroup.ImmCmdLists = std::vector( - QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end()); - - std::tie(It, std::ignore) = insert({tid(), QueueGroup}); - return It->second; - } - }; - - // A map of compute groups containing compute queue handles, one per thread. - // When a queue is accessed from multiple host threads, a separate queue group - // is created for each thread. The key used for mapping is the thread ID. - pi_queue_group_by_tid_t ComputeQueueGroupsByTID; - - // A group containing copy queue handles. The main copy engine, if available, - // comes first followed by link copy engines, if available. - // When a queue is accessed from multiple host threads, a separate queue group - // is created for each thread. The key used for mapping is the thread ID. - pi_queue_group_by_tid_t CopyQueueGroupsByTID; - - // Wait for all commandlists associated with this Queue to finish operations. - pi_result synchronize(); - - // Return the queue group to use based on standard/immediate commandlist mode, - // and if immediate mode, the thread-specific group. - pi_queue_group_t &getQueueGroup(bool UseCopyEngine); - - // This function considers multiple factors including copy engine - // availability and user preference and returns a boolean that is used to - // specify if copy engine will eventually be used for a particular command. - bool useCopyEngine(bool PreferCopyEngine = true) const; - - // Keeps the PI context to which this queue belongs. - // This field is only set at _pi_queue creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_queue. - const pi_context Context; - - // Keeps the PI device to which this queue belongs. - // This field is only set at _pi_queue creation time, and cannot change. - // Therefore it can be accessed without holding a lock on this _pi_queue. - const pi_device Device; - - // A queue may use either standard or immediate commandlists. At queue - // construction time this is set based on the device and any env var settings - // that change the default for the device type. When an interop queue is - // constructed, the caller chooses the type of commandlists to use. - bool UsingImmCmdLists; - - // Keeps track of the event associated with the last enqueued command into - // this queue. this is used to add dependency with the last command to add - // in-order semantics and updated with the latest event each time a new - // command is enqueued. - pi_event LastCommandEvent = nullptr; - - // Kernel is not necessarily submitted for execution during - // piEnqueueKernelLaunch, it may be batched. That's why we need to save the - // list of kernels which is going to be submitted but have not been submitted - // yet. This is needed to capture memory allocations for each kernel with - // indirect access in the list at the moment when kernel is really submitted - // for execution. - std::vector KernelsToBeSubmitted; - - // Update map of memory references made by the kernels about to be submitted - void CaptureIndirectAccesses(); - - // Indicates if we own the ZeCommandQueue or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeCommandQueue; - - // Map of all command lists used in this queue. 
- pi_command_list_map_t CommandListMap; - - // Helper data structure to hold all variables related to batching - struct command_batch { - // These two members are used to keep track of how often the - // batching closes and executes a command list before reaching the - // QueueComputeBatchSize limit, versus how often we reach the limit. - // This info might be used to vary the QueueComputeBatchSize value. - pi_uint32 NumTimesClosedEarly = {0}; - pi_uint32 NumTimesClosedFull = {0}; - - // Open command list fields for batching commands into this queue. - pi_command_list_ptr_t OpenCommandList{}; - - // Approximate number of commands that are allowed to be batched for - // this queue. - // Added this member to the queue rather than using a global variable - // so that future implementation could use heuristics to change this on - // a queue specific basis. And by putting it in the queue itself, this - // is thread safe because of the locking of the queue that occurs. - pi_uint32 QueueBatchSize = {0}; - }; - - // ComputeCommandBatch holds data related to batching of non-copy commands. - // CopyCommandBatch holds data related to batching of copy commands. - command_batch ComputeCommandBatch, CopyCommandBatch; - - // Returns true if any commands for this queue are allowed to - // be batched together. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - bool isBatchingAllowed(bool IsCopy) const; - - // Keeps the properties of this queue. - pi_queue_properties Properties; - - // Returns true if the queue is a in-order queue. - bool isInOrderQueue() const; - - // Returns true if the queue has discard events property. - bool isDiscardEvents() const; - - // Returns true if the queue has explicit priority set by user. - bool isPriorityLow() const; - bool isPriorityHigh() const; - - // adjust the queue's batch size, knowing that the current command list - // is being closed with a full batch. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - void adjustBatchSizeForFullBatch(bool IsCopy); - - // adjust the queue's batch size, knowing that the current command list - // is being closed with only a partial batch of commands. - // For copy commands, IsCopy is set to 'true'. - // For non-copy commands, IsCopy is set to 'false'. - void adjustBatchSizeForPartialBatch(bool IsCopy); - - // Helper function to create a new command-list to this queue and associated - // fence tracking its completion. This command list & fence are added to the - // map of command lists in this queue with ZeFenceInUse = false. - // The caller must hold a lock of the queue already. - pi_result - createCommandList(bool UseCopyEngine, pi_command_list_ptr_t &CommandList, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); - - /// @brief Resets the command list and associated fence in the map and removes - /// events from the command list. - /// @param CommandList The caller must verify that this command list and fence - /// have been signalled. - /// @param MakeAvailable If the reset command list should be made available, - /// then MakeAvailable needs to be set to true. - /// @param EventListToCleanup The EventListToCleanup contains a list of - /// events from the command list which need to be cleaned up. - /// @param CheckStatus Hint informing whether we need to check status of the - /// events before removing them from the immediate command list. 
This is - /// needed because immediate command lists are not associated with fences and - /// in general status of the event needs to be checked. - /// @return PI_SUCCESS if successful, PI error code otherwise. - pi_result resetCommandList(pi_command_list_ptr_t CommandList, - bool MakeAvailable, - std::vector &EventListToCleanup, - bool CheckStatus = true); - - // Returns true if an OpenCommandList has commands that need to be submitted. - // If IsCopy is 'true', then the OpenCommandList containing copy commands is - // checked. Otherwise, the OpenCommandList containing compute commands is - // checked. - bool hasOpenCommandList(bool IsCopy) const { - auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch; - return CommandBatch.OpenCommandList != CommandListMap.end(); - } - // Attach a command list to this queue. - // For non-immediate commandlist also close and execute it. - // Note that this command list cannot be appended to after this. - // The "IsBlocking" tells if the wait for completion is required. - // If OKToBatchCommand is true, then this command list may be executed - // immediately, or it may be left open for other future command to be - // batched into. - // If IsBlocking is true, then batching will not be allowed regardless - // of the value of OKToBatchCommand - // - // For immediate commandlists, no close and execute is necessary. - pi_result executeCommandList(pi_command_list_ptr_t CommandList, - bool IsBlocking = false, - bool OKToBatchCommand = false); - - // If there is an open command list associated with this queue, - // close it, execute it, and reset the corresponding OpenCommandList. - // If IsCopy is 'true', then the OpenCommandList containing copy commands is - // executed. Otherwise OpenCommandList containing compute commands is - // executed. - pi_result executeOpenCommandList(bool IsCopy); - - // Gets the open command containing the event, or CommandListMap.end() - pi_command_list_ptr_t eventOpenCommandList(pi_event Event); - - // Wrapper function to execute both OpenCommandLists (Copy and Compute). - // This wrapper is helpful when all 'open' commands need to be executed. - // Call-sites instances: piQuueueFinish, piQueueRelease, etc. - pi_result executeAllOpenCommandLists() { - using IsCopy = bool; - if (auto Res = executeOpenCommandList(IsCopy{false})) - return Res; - if (auto Res = executeOpenCommandList(IsCopy{true})) - return Res; - return PI_SUCCESS; - } - - // Inserts a barrier waiting for all unfinished events in ActiveBarriers into - // CmdList. Any finished events will be removed from ActiveBarriers. - pi_result insertActiveBarriers(pi_command_list_ptr_t &CmdList, - bool UseCopyEngine); - - // A helper structure to keep active barriers of the queue. - // It additionally manages ref-count of events in this list. - struct active_barriers { - std::vector Events; - void add(pi_event &Event); - pi_result clear(); - bool empty() { return Events.empty(); } - std::vector &vector() { return Events; } - }; - // A collection of currently active barriers. - // These should be inserted into a command list whenever an available command - // list is needed for a command. - active_barriers ActiveBarriers; - - // Besides each PI object keeping a total reference count in - // _ur_object::RefCount we keep special track of the queue *external* - // references. This way we are able to tell when the queue is being finished - // externally, and can wait for internal references to complete, and do proper - // cleanup of the queue. 
- // This counter doesn't track the lifetime of a queue object, it only tracks - // the number of external references. I.e. even if it reaches zero a queue - // object may not be destroyed and can be used internally in the plugin. - // That's why we intentionally don't use atomic type for this counter to - // enforce guarding with a mutex all the work involving this counter. - pi_uint32 RefCountExternal{1}; - - // Indicates that the queue is healthy and all operations on it are OK. - bool Healthy{true}; - - // The following data structures and methods are used only for handling - // in-order queue with discard_events property. Some commands in such queue - // may have discarded event. Which means that event is not visible outside of - // the plugin. It is possible to reset and reuse discarded events in the same - // in-order queue because of the dependency between commands. We don't have to - // wait event completion to do this. We use the following 2-event model to - // reuse events inside each command list: - // - // Operation1 = zeCommantListAppendMemoryCopy (signal ze_event1) - // zeCommandListAppendBarrier(wait for ze_event1) - // zeCommandListAppendEventReset(ze_event1) - // # Create new pi_event using ze_event1 and append to the cache. - // - // Operation2 = zeCommandListAppendMemoryCopy (signal ze_event2) - // zeCommandListAppendBarrier(wait for ze_event2) - // zeCommandListAppendEventReset(ze_event2) - // # Create new pi_event using ze_event2 and append to the cache. - // - // # Get pi_event from the beginning of the cache because there are two events - // # there. So it is guaranteed that we do round-robin between two events - - // # event from the last command is appended to the cache. - // Operation3 = zeCommandListAppendMemoryCopy (signal ze_event1) - // # The same ze_event1 is used for Operation1 and Operation3. - // - // When we switch to a different command list we need to signal new event and - // wait for it in the new command list using barrier. - // [CmdList1] - // Operation1 = zeCommantListAppendMemoryCopy (signal event1) - // zeCommandListAppendBarrier(wait for event1) - // zeCommandListAppendEventReset(event1) - // zeCommandListAppendSignalEvent(NewEvent) - // - // [CmdList2] - // zeCommandListAppendBarrier(wait for NewEvent) - // - // This barrier guarantees that command list execution starts only after - // completion of previous command list which signals aforementioned event. It - // allows to reset and reuse same event handles inside all command lists in - // scope of the queue. It means that we need 2 reusable events of each type - // (host-visible and device-scope) per queue at maximum. - - // This data member keeps track of the last used command list and allows to - // handle switch of immediate command lists because immediate command lists - // are never closed unlike regular command lists. - pi_command_list_ptr_t LastUsedCommandList = CommandListMap.end(); - - // Vector of 2 lists of reusable events: host-visible and device-scope. - // They are separated to allow faster access to stored events depending on - // requested type of event. Each list contains events which can be reused - // inside all command lists in the queue as described in the 2-event model. - // Leftover events in the cache are relased at the queue destruction. - std::vector> EventCaches{2}; - - // Get event from the queue's cache. 
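// Editorial aside: a hedged sketch of the round-robin cache lookup described
// here (types and names are illustrative, not the adapter's). An event is
// handed out only when the cache holds at least two entries, because the
// entry appended last belongs to the previous command and must not be reused
// back to back.
#include <list>

struct ReusableEventSketch {}; // stands in for a reusable native event

ReusableEventSketch *
getFromCacheSketch(std::list<ReusableEventSketch *> &Cache) {
  if (Cache.size() < 2)
    return nullptr; // empty, or only the previous command's event is present
  ReusableEventSketch *Ev = Cache.front();
  Cache.pop_front(); // round-robin: reuse the oldest event...
  return Ev;
}

void putToCacheSketch(std::list<ReusableEventSketch *> &Cache,
                      ReusableEventSketch *Ev) {
  Cache.push_back(Ev); // ...and the most recent command's event goes to the back
}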
- // Returns nullptr if the cache doesn't contain any reusable events or if the - // cache contains only one event which corresponds to the previous command and - // can't be used for the current command because we can't use the same event - // two times in a row and have to do round-robin between two events. Otherwise - // it picks an event from the beginning of the cache and returns it. Event - // from the last command is always appended to the end of the list. - pi_event getEventFromQueueCache(bool HostVisible); - - // Put pi_event to the cache. Provided pi_event object is not used by - // any command but its ZeEvent is used by many pi_event objects. - // Commands to wait and reset ZeEvent must be submitted to the queue before - // calling this method. - pi_result addEventToQueueCache(pi_event Event); - - // Append command to provided command list to wait and reset the last event if - // it is discarded and create new pi_event wrapper using the same native event - // and put it to the cache. We call this method after each command submission - // to make native event available to use by next commands. - pi_result resetDiscardedEvent(pi_command_list_ptr_t); - - // Append command to the command list to signal new event if the last event in - // the command list is discarded. While we submit commands in scope of the - // same command list we can reset and reuse events but when we switch to a - // different command list we currently need to signal new event and wait for - // it in the new command list using barrier. - pi_result signalEventFromCmdListIfLastEventDiscarded(pi_command_list_ptr_t); - - // Insert a barrier waiting for the last command event into the beginning of - // command list. This barrier guarantees that command list execution starts - // only after completion of previous command list which signals aforementioned - // event. It allows to reset and reuse same event handles inside all command - // lists in the queue. - pi_result - insertStartBarrierIfDiscardEventsMode(pi_command_list_ptr_t &CmdList); - - // Helper method telling whether we need to reuse discarded event in this - // queue. - bool doReuseDiscardedEvents(); -}; - -struct _pi_mem : _ur_object { - // Keeps the PI context of this memory handle. - pi_context Context; - - // Enumerates all possible types of accesses. - enum access_mode_t { unknown, read_write, read_only, write_only }; - - // Interface of the _pi_mem object - - // Get the Level Zero handle of the current memory object - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device Device = nullptr) = 0; - - // Get a pointer to the Level Zero handle of the current memory object - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device Device = nullptr) = 0; - - // Method to get type of the derived object (image or buffer) - virtual bool isImage() const = 0; - - virtual ~_pi_mem() = default; - -protected: - _pi_mem(pi_context Ctx) : Context{Ctx} {} -}; - -struct _pi_buffer; -using pi_buffer = _pi_buffer *; - -struct _pi_buffer final : _pi_mem { - // Buffer constructor - _pi_buffer(pi_context Context, size_t Size, char *HostPtr, - bool ImportedHostPtr = false) - : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} { - - // We treat integrated devices (physical memory shared with the CPU) - // differently from discrete devices (those with distinct memories). 
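// Editorial aside: an illustrative sketch of the integrated-device check this
// constructor relies on, assuming the Level Zero loader headers are available
// under this include path. Integrated devices share physical memory with the
// host, so the buffer can live in host memory and map/unmap can avoid copies.
#include <level_zero/ze_api.h>

bool isIntegratedDeviceSketch(ze_device_handle_t ZeDevice) {
  ze_device_properties_t Props{};
  Props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  if (zeDeviceGetProperties(ZeDevice, &Props) != ZE_RESULT_SUCCESS)
    return false; // be conservative if the query fails
  return (Props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0;
}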
- // For integrated devices, allocating the buffer in the host memory - // enables automatic access from the device, and makes copying - // unnecessary in the map/unmap operations. This improves performance. - OnHost = Context->Devices.size() == 1 && - Context->Devices[0]->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; - - // Fill the host allocation data. - if (HostPtr) { - MapHostPtr = HostPtr; - // If this host ptr is imported to USM then use this as a host - // allocation for this buffer. - if (ImportedHostPtr) { - Allocations[nullptr].ZeHandle = HostPtr; - Allocations[nullptr].Valid = true; - Allocations[nullptr].ReleaseAction = _pi_buffer::allocation_t::unimport; - } - } - - // This initialization does not end up with any valid allocation yet. - LastDeviceWithValidAllocation = nullptr; - } - - // Sub-buffer constructor - _pi_buffer(pi_buffer Parent, size_t Origin, size_t Size) - : _pi_mem(Parent->Context), Size(Size), SubBuffer{Parent, Origin} {} - - // Interop-buffer constructor - _pi_buffer(pi_context Context, size_t Size, pi_device Device, - char *ZeMemHandle, bool OwnZeMemHandle) - : _pi_mem(Context), Size(Size), SubBuffer{nullptr, 0} { - - // Device == nullptr means host allocation - Allocations[Device].ZeHandle = ZeMemHandle; - Allocations[Device].Valid = true; - Allocations[Device].ReleaseAction = - OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep; - - // Check if this buffer can always stay on host - OnHost = false; - if (!Device) { // Host allocation - if (Context->Devices.size() == 1 && - Context->Devices[0]->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) { - OnHost = true; - MapHostPtr = ZeMemHandle; // map to this allocation - } - } - LastDeviceWithValidAllocation = Device; - } - - // Returns a pointer to the USM allocation representing this PI buffer - // on the specified Device. If Device is nullptr then the returned - // USM allocation is on the device where this buffer was used the latest. - // The returned allocation is always valid, i.e. its contents is - // up-to-date and any data copies needed for that are performed under - // the hood. - // - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device Device = nullptr) override; - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device Device = nullptr) override; - - bool isImage() const override { return false; } - - bool isSubBuffer() const { return SubBuffer.Parent != nullptr; } - - // Frees all allocations made for the buffer. - pi_result free(); - - // Information about a single allocation representing this buffer. - struct allocation_t { - // Level Zero memory handle is really just a naked pointer. - // It is just convenient to have it char * to simplify offset arithmetics. - char *ZeHandle{nullptr}; - // Indicates if this allocation's data is valid. - bool Valid{false}; - // Specifies the action that needs to be taken for this - // allocation at buffer destruction. - enum { - keep, // do nothing, the allocation is not owned by us - unimport, // release of the imported allocation - free, // free from the pooling context (default) - free_native // free with a native call - } ReleaseAction{free}; - }; - - // We maintain multiple allocations on possibly all devices in the context. - // The "nullptr" device identifies a host allocation representing buffer. - // Sub-buffers don't maintain own allocations but rely on parent buffer. 
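// Editorial aside: a simplified, self-contained sketch (placeholder types,
// not the real _pi_buffer) of the lookup rule spelled out above. Sub-buffers
// own no allocations and resolve their pointer through the parent plus an
// offset; regular buffers keep at most one allocation per device, with the
// nullptr key standing for the host allocation.
#include <cstddef>
#include <unordered_map>

struct BufferSketch {
  std::unordered_map<void *, char *> Allocations; // device handle -> USM pointer
  BufferSketch *Parent = nullptr;                 // non-null for sub-buffers
  size_t Origin = 0;                              // sub-buffer offset in the parent

  char *zeHandleFor(void *Device) {
    if (Parent) { // delegate to the parent buffer and apply the offset
      char *Base = Parent->zeHandleFor(Device);
      return Base ? Base + Origin : nullptr;
    }
    auto It = Allocations.find(Device);
    return It == Allocations.end() ? nullptr : It->second;
  }
};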
- std::unordered_map Allocations; - pi_device LastDeviceWithValidAllocation{nullptr}; - - // Flag to indicate that this memory is allocated in host memory. - // Integrated device accesses this memory. - bool OnHost{false}; - - // Tells the host allocation to use for buffer map operations. - char *MapHostPtr{nullptr}; - - // Supplementary data to keep track of the mappings of this buffer - // created with piEnqueueMemBufferMap. - struct Mapping { - // The offset in the buffer giving the start of the mapped region. - size_t Offset; - // The size of the mapped region. - size_t Size; - }; - - // The key is the host pointer representing an active mapping. - // The value is the information needed to maintain/undo the mapping. - std::unordered_map Mappings; - - // The size and alignment of the buffer - size_t Size; - size_t getAlignment() const; - - struct { - _pi_mem *Parent; - size_t Origin; // only valid if Parent != nullptr - } SubBuffer; -}; - -struct _pi_image; -using pi_image = _pi_image *; - -// TODO: add proper support for images on context with multiple devices. -struct _pi_image final : _pi_mem { - // Image constructor - _pi_image(pi_context Ctx, ze_image_handle_t Image, bool OwnNativeHandle) - : _pi_mem(Ctx), ZeImage{Image}, OwnZeMemHandle{OwnNativeHandle} {} - - virtual pi_result getZeHandle(char *&ZeHandle, access_mode_t, - pi_device = nullptr) override { - ZeHandle = ur_cast(ZeImage); - return PI_SUCCESS; - } - virtual pi_result getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, - pi_device = nullptr) override { - ZeHandlePtr = ur_cast(&ZeImage); - return PI_SUCCESS; - } - - bool isImage() const override { return true; } - -#ifndef NDEBUG - // Keep the descriptor of the image (for debugging purposes) - ZeStruct ZeImageDesc; -#endif // !NDEBUG - - // Level Zero image handle. - ze_image_handle_t ZeImage; - - bool OwnZeMemHandle; -}; - -struct _pi_ze_event_list_t { - // List of level zero events for this event list. - ze_event_handle_t *ZeEventList = {nullptr}; - - // List of pi_events for this event list. - pi_event *PiEventList = {nullptr}; - - // length of both the lists. The actual allocation of these lists - // may be longer than this length. This length is the actual number - // of elements in the above arrays that are valid. - pi_uint32 Length = {0}; - - // A mutex is needed for destroying the event list. - // Creation is already thread-safe because we only create the list - // when an event is initially created. However, it might be - // possible to have multiple threads racing to destroy the list, - // so this will be used to make list destruction thread-safe. - ur_mutex PiZeEventListMutex; - - // Initialize this using the array of events in EventList, and retain - // all the pi_events in the created data structure. - // CurQueue is the pi_queue that the command with this event wait - // list is going to be added to. That is needed to flush command - // batches for wait events that are in other queues. - // UseCopyEngine indicates if the next command (the one that this - // event wait-list is for) is going to go to copy or compute - // queue. This is used to properly submit the dependent open - // command-lists. - pi_result createAndRetainPiZeEventList(pi_uint32 EventListLength, - const pi_event *EventList, - pi_queue CurQueue, bool UseCopyEngine); - - // Add all the events in this object's PiEventList to the end - // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data - // structure fields making it look empty. 
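// Editorial aside: a minimal sketch (illustrative types only) of what
// "collect for release and destroy" means for such a wait list. The retained
// events are handed off to the caller's list for later release, and the
// fields are cleared under the destruction mutex so concurrent destroyers
// cannot free the same arrays twice.
#include <list>
#include <mutex>

struct WaitListSketch {
  void **NativeEvents = nullptr; // stands in for the ZeEventList array
  void **Events = nullptr;       // stands in for the PiEventList array
  unsigned Length = 0;
  std::mutex Mutex;

  void collectForRelease(std::list<void *> &EventsToBeReleased) {
    std::scoped_lock<std::mutex> Lock(Mutex);
    for (unsigned I = 0; I < Length; ++I)
      EventsToBeReleased.push_back(Events[I]);
    // A full implementation would also free the two arrays here.
    NativeEvents = nullptr;
    Events = nullptr;
    Length = 0;
  }
};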
- pi_result collectEventsForReleaseAndDestroyPiZeEventList( - std::list &EventsToBeReleased); - - // Had to create custom assignment operator because the mutex is - // not assignment copyable. Just field by field copy of the other - // fields. - _pi_ze_event_list_t &operator=(const _pi_ze_event_list_t &other) { - if (this != &other) { - this->ZeEventList = other.ZeEventList; - this->PiEventList = other.PiEventList; - this->Length = other.Length; - } - return *this; - } -}; - -struct _pi_event : _ur_object { - _pi_event(ze_event_handle_t ZeEvent, ze_event_pool_handle_t ZeEventPool, - pi_context Context, pi_command_type CommandType, bool OwnZeEvent) - : ZeEvent{ZeEvent}, OwnZeEvent{OwnZeEvent}, ZeEventPool{ZeEventPool}, - CommandType{CommandType}, Context{Context}, CommandData{nullptr} {} - - // Level Zero event handle. - ze_event_handle_t ZeEvent; - - // Indicates if we own the ZeEvent or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeEvent; - - // Level Zero event pool handle. - ze_event_pool_handle_t ZeEventPool; - - // In case we use device-only events this holds their host-visible - // counterpart. If this event is itself host-visble then HostVisibleEvent - // points to this event. If this event is not host-visible then this field can - // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI - // event created internally that host will actually be redirected - // to wait/query instead of this PI event. - // - // The HostVisibleEvent is a reference counted PI event and can be used more - // than by just this one event, depending on the mode (see EventsScope). - // - pi_event HostVisibleEvent = {nullptr}; - bool isHostVisible() const { return this == HostVisibleEvent; } - - // Get the host-visible event or create one and enqueue its signal. - pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); - - // Tells if this event is with profiling capabilities. - bool isProfilingEnabled() const { - return !Queue || // tentatively assume user events are profiling enabled - (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; - } - - // Keeps the command-queue and command associated with the event. - // These are NULL for the user events. - pi_queue Queue = {nullptr}; - pi_command_type CommandType; - // Provide direct access to Context, instead of going via queue. - // Not every PI event has a queue, and we need a handle to Context - // to get to event pool related information. - pi_context Context; - - // Opaque data to hold any data needed for CommandType. - void *CommandData; - - // List of events that were in the wait list of the command that will - // signal this event. These events must be retained when the command is - // enqueued, and must then be released when this event has signalled. - // This list must be destroyed once the event has signalled. - _pi_ze_event_list_t WaitList; - - // Command list associated with the pi_event. - std::optional CommandList; - - // Tracks if the needed cleanup was already performed for - // a completed event. This allows to control that some cleanup - // actions are performed only once. - // - bool CleanedUp = {false}; - - // Indicates that this PI event had already completed in the sense - // that no other synchromization is needed. Note that the underlying - // L0 event (if any) is not guranteed to have been signalled, or - // being visible to the host at all. - bool Completed = {false}; - - // Indicates that this event is discarded, i.e. 
it is not visible outside of - // plugin. - bool IsDiscarded = {false}; - - // Besides each PI object keeping a total reference count in - // _ur_object::RefCount we keep special track of the event *external* - // references. This way we are able to tell when the event is not referenced - // externally anymore, i.e. it can't be passed as a dependency event to - // piEnqueue* functions and explicitly waited meaning that we can do some - // optimizations: - // 1. For in-order queues we can reset and reuse event even if it was not yet - // completed by submitting a reset command to the queue (since there are no - // external references, we know that nobody can wait this event somewhere in - // parallel thread or pass it as a dependency which may lead to hang) - // 2. We can avoid creating host proxy event. - // This counter doesn't track the lifetime of an event object. Even if it - // reaches zero an event object may not be destroyed and can be used - // internally in the plugin. - std::atomic RefCountExternal{0}; - - bool hasExternalRefs() { return RefCountExternal != 0; } - - // Reset _pi_event object. - pi_result reset(); -}; - -struct _pi_program : _ur_object { - // Possible states of a program. - typedef enum { - // The program has been created from intermediate language (SPIR-V), but it - // is not yet compiled. - IL, - - // The program has been created by loading native code, but it has not yet - // been built. This is equivalent to an OpenCL "program executable" that - // is loaded via clCreateProgramWithBinary(). - Native, - - // The program was notionally compiled from SPIR-V form. However, since we - // postpone compilation until the module is linked, the internal state - // still represents the module as SPIR-V. - Object, - - // The program has been built or linked, and it is represented as a Level - // Zero module. - Exe, - - // An error occurred during piProgramLink, but we created a _pi_program - // object anyways in order to hold the ZeBuildLog. Note that the ZeModule - // may or may not be nullptr in this state, depending on the error. - Invalid - } state; - - // A utility class that converts specialization constants into the form - // required by the Level Zero driver. - class SpecConstantShim { - public: - SpecConstantShim(pi_program Program) { - ZeSpecConstants.numConstants = Program->SpecConstants.size(); - ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants); - ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants); - - for (auto &SpecConstant : Program->SpecConstants) { - ZeSpecContantsIds.push_back(SpecConstant.first); - ZeSpecContantsValues.push_back(SpecConstant.second); - } - ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data(); - ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data(); - } - - const ze_module_constants_t *ze() { return &ZeSpecConstants; } - - private: - std::vector ZeSpecContantsIds; - std::vector ZeSpecContantsValues; - ze_module_constants_t ZeSpecConstants; - }; - - // Construct a program in IL or Native state. - _pi_program(state St, pi_context Context, const void *Input, size_t Length) - : Context{Context}, OwnZeModule{true}, State{St}, - Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, - ZeBuildLog{nullptr} { - std::memcpy(Code.get(), Input, Length); - } - - // Construct a program in Exe or Invalid state. 
- _pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, - ze_module_build_log_handle_t ZeBuildLog) - : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, - ZeBuildLog{ZeBuildLog} {} - - // Construct a program in Exe state (interop). - _pi_program(state St, pi_context Context, ze_module_handle_t ZeModule, - bool OwnZeModule) - : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, - ZeModule{ZeModule}, ZeBuildLog{nullptr} {} - - // Construct a program in Invalid state with a custom error message. - _pi_program(state St, pi_context Context, const std::string &ErrorMessage) - : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, - State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} - - ~_pi_program(); - - const pi_context Context; // Context of the program. - - // Indicates if we own the ZeModule or it came from interop that - // asked to not transfer the ownership to SYCL RT. - const bool OwnZeModule; - - // This error message is used only in Invalid state to hold a custom error - // message from a call to piProgramLink. - const std::string ErrorMessage; - - state State; - - // In IL and Object states, this contains the SPIR-V representation of the - // module. In Native state, it contains the native code. - std::unique_ptr Code; // Array containing raw IL / native code. - size_t CodeLength{0}; // Size (bytes) of the array. - - // Used only in IL and Object states. Contains the SPIR-V specialization - // constants as a map from the SPIR-V "SpecID" to a buffer that contains the - // associated value. The caller of the PI layer is responsible for - // maintaining the storage of this buffer. - std::unordered_map SpecConstants; - - // Used only in Object state. Contains the build flags from the last call to - // piProgramCompile(). - std::string BuildFlags; - - // The Level Zero module handle. Used primarily in Exe state. - ze_module_handle_t ZeModule; - - // The Level Zero build log from the last call to zeModuleCreate(). - ze_module_build_log_handle_t ZeBuildLog; -}; - -struct _pi_kernel : _ur_object { - _pi_kernel(ze_kernel_handle_t Kernel, bool OwnZeKernel, pi_program Program) - : ZeKernel{Kernel}, OwnZeKernel{OwnZeKernel}, Program{Program}, - MemAllocs{}, SubmissionsCount{0} {} - - // Completed initialization of PI kernel. Must be called after construction. - pi_result initialize(); - - // Returns true if kernel has indirect access, false otherwise. - bool hasIndirectAccess() { - // Currently indirect access flag is set for all kernels and there is no API - // to check if kernel actually indirectly access smth. - return true; - } - - // Level Zero function handle. - ze_kernel_handle_t ZeKernel; - - // Indicates if we own the ZeKernel or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeKernel; - - // Keep the program of the kernel. - pi_program Program; - - // Hash function object for the unordered_set below. - struct Hash { - size_t operator()(const std::pair *P) const { - return std::hash()(P->first); - } - }; - - // If kernel has indirect access we need to make a snapshot of all existing - // memory allocations to defer deletion of these memory allocations to the - // moment when kernel execution has finished. - // We store pointers to the elements because pointers are not invalidated by - // insert/delete for std::unordered_map (iterators are invalidated). 
We need - // to take a snapshot instead of just reference-counting the allocations, - // because picture of active allocations can change during kernel execution - // (new allocations can be added) and we need to know which memory allocations - // were retained by this kernel to release them (and don't touch new - // allocations) at kernel completion. Same kernel may be submitted several - // times and retained allocations may be different at each submission. That's - // why we have a set of memory allocations here and increase ref count only - // once even if kernel is submitted many times. We don't want to know how many - // times and which allocations were retained by each submission. We release - // all allocations in the set only when SubmissionsCount == 0. - std::unordered_set *, Hash> MemAllocs; - - // Counter to track the number of submissions of the kernel. - // When this value is zero, it means that kernel is not submitted for an - // execution - at this time we can release memory allocations referenced by - // this kernel. We can do this when RefCount turns to 0 but it is too late - // because kernels are cached in the context by SYCL RT and they are released - // only during context object destruction. Regular RefCount is not usable to - // track submissions because user/SYCL RT can retain kernel object any number - // of times. And that's why there is no value of RefCount which can mean zero - // submissions. - std::atomic SubmissionsCount; - - // Keeps info about an argument to the kernel enough to set it with - // zeKernelSetArgumentValue. - struct ArgumentInfo { - uint32_t Index; - size_t Size; - const pi_mem Value; - _pi_mem::access_mode_t AccessMode{_pi_mem::unknown}; - }; - // Arguments that still need to be set (with zeKernelSetArgumentValue) - // before kernel is enqueued. - std::vector PendingArguments; - - // Cache of the kernel properties. - ZeCache> ZeKernelProperties; - ZeCache ZeKernelName; -}; - -struct _pi_sampler : _ur_object { - _pi_sampler(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {} - - // Level Zero sampler handle. - ze_sampler_handle_t ZeSampler; -}; - #endif // PI_LEVEL_ZERO_HPP diff --git a/sycl/plugins/level_zero/ur_bindings.hpp b/sycl/plugins/level_zero/ur_bindings.hpp index 0504df7e2f0d9..faaab6d5e925b 100755 --- a/sycl/plugins/level_zero/ur_bindings.hpp +++ b/sycl/plugins/level_zero/ur_bindings.hpp @@ -9,41 +9,3 @@ #include "pi_level_zero.hpp" #include - -// Make the Unified Runtime handles definition complete. -// This is used in various "create" API where new handles are allocated. 
-struct ur_platform_handle_t_ : public _pi_platform { - using _pi_platform::_pi_platform; -}; - -struct ur_device_handle_t_ : public _pi_device { - using _pi_device::_pi_device; -}; - -struct ur_context_handle_t_ : public _pi_context { - using _pi_context::_pi_context; -}; - -struct ur_event_handle_t_ : public _pi_event { - using _pi_event::_pi_event; -}; - -struct ur_program_handle_t_ : public _pi_program { - using _pi_program::_pi_program; -}; - -struct ur_kernel_handle_t_ : public _pi_kernel { - using _pi_kernel::_pi_kernel; -}; - -struct ur_queue_handle_t_ : public _pi_queue { - using _pi_queue::_pi_queue; -}; - -struct ur_sampler_handle_t_ : public _pi_sampler { - using _pi_sampler::_pi_sampler; -}; - -struct ur_mem_handle_t_ : public _pi_mem { - using _pi_mem::_pi_mem; -}; diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 7dd2a7b96bcd3..a4eee6963601e 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -71,6 +71,7 @@ add_sycl_plugin(unified_runtime Threads::Threads UnifiedRuntimeLoader UnifiedRuntime-Headers + LevelZeroLoader-Headers # we need for #include in ur_level_zero_common.h ) # Build level zero adapter @@ -90,7 +91,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_device.hpp" "ur/adapters/level_zero/ur_level_zero_event.hpp" "ur/adapters/level_zero/ur_level_zero_mem.hpp" - "ur/adapters/level_zero/ur_level_zero_module.hpp" + "ur/adapters/level_zero/ur_level_zero_kernel.hpp" "ur/adapters/level_zero/ur_level_zero_platform.hpp" "ur/adapters/level_zero/ur_level_zero_program.hpp" "ur/adapters/level_zero/ur_level_zero_queue.hpp" @@ -101,7 +102,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_device.cpp" "ur/adapters/level_zero/ur_level_zero_event.cpp" "ur/adapters/level_zero/ur_level_zero_mem.cpp" - "ur/adapters/level_zero/ur_level_zero_module.cpp" + "ur/adapters/level_zero/ur_level_zero_kernel.cpp" "ur/adapters/level_zero/ur_level_zero_platform.cpp" "ur/adapters/level_zero/ur_level_zero_program.cpp" "ur/adapters/level_zero/ur_level_zero_queue.cpp" diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2d80f4c4ad20a..5ca4b1b9ae4f6 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -7,7 +7,9 @@ //===------------------------------------------------------------------===// #pragma once +#include "ur/adapters/level_zero/ur_level_zero.hpp" #include "ur_api.h" +#include #include #include @@ -52,6 +54,11 @@ static pi_result ur2piResult(ur_result_t urResult) { }; } +// Helper for one-liner validation +#define PI_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + // Early exits on any error #define HANDLE_ERRORS(urCall) \ if (auto Result = urCall) \ @@ -375,54 +382,135 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } +struct _pi_context : ur_context_handle_t_ {}; + +struct _pi_queue : ur_context_handle_t_ {}; + +struct _pi_program : ur_program_handle_t_ {}; + +struct _pi_kernel : ur_kernel_handle_t_ {}; + +struct _pi_mem : ur_mem_handle_t_ {}; + +struct _pi_buffer : ur_mem_handle_t_ {}; + +struct _pi_image : ur_mem_handle_t_ {}; + +struct _pi_sampler : ur_sampler_handle_t_ {}; + +struct _pi_event : ur_event_handle_t_ {}; + namespace pi2ur { -inline pi_result piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { + 
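// Editorial aside: a hedged illustration (toy types, not the real handles) of
// the aliasing pattern used throughout this header. Each legacy _pi_* type
// derives from the matching ur_*_handle_t_ definition and adds no members, so
// a pi_* handle can be reinterpret_cast to the corresponding ur_* handle when
// a PI entry point forwards to the UR API.
struct ur_thing_handle_t_ { int Data = 0; }; // stands in for a UR handle type
using ur_thing_handle_t = ur_thing_handle_t_ *;

struct _pi_thing : ur_thing_handle_t_ {}; // PI-side alias, no extra members
using pi_thing = _pi_thing *;

int readThroughUrSketch(pi_thing PiHandle) {
  auto UrHandle = reinterpret_cast<ur_thing_handle_t>(PiHandle);
  return UrHandle->Data; // same object, viewed through the UR type
}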
+inline pi_result piTearDown(void *PluginParameter) { + std::ignore = PluginParameter; + HANDLE_ERRORS(urTearDown(nullptr)); + return PI_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +// Platform +inline pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms, + pi_uint32 *NumPlatforms) { urInit(0); - uint32_t Count = num_entries; - auto phPlatforms = reinterpret_cast(platforms); - HANDLE_ERRORS(urPlatformGet(Count, phPlatforms, num_platforms)); + auto phPlatforms = reinterpret_cast(Platforms); + HANDLE_ERRORS(urPlatformGet(NumEntries, phPlatforms, NumPlatforms)); + return PI_SUCCESS; +} + +inline pi_result piextPlatformGetNativeHandle(pi_platform Platform, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + auto UrPlatform = reinterpret_cast(Platform); + + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urPlatformGetNativeHandle(UrPlatform, &UrNativeHandle)); + + *NativeHandle = reinterpret_cast(UrNativeHandle); + + return PI_SUCCESS; +} + +inline pi_result +piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_platform *Platform) { + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_platform_handle_t UrPlatform{}; + ur_native_handle_t UrNativeHandle = + reinterpret_cast(NativeHandle); + urPlatformCreateWithNativeHandle(UrNativeHandle, &UrPlatform); + + *Platform = reinterpret_cast(UrPlatform); + return PI_SUCCESS; } -inline pi_result piPlatformGetInfo(pi_platform platform, +inline pi_result piPlatformGetInfo(pi_platform Platform, pi_platform_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - ur_platform_info_t InfoType; + + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + + ur_platform_info_t UrParamName = {}; switch (ParamName) { - case PI_PLATFORM_INFO_EXTENSIONS: - InfoType = UR_PLATFORM_INFO_NAME; + case PI_PLATFORM_INFO_EXTENSIONS: { + UrParamName = UR_PLATFORM_INFO_EXTENSIONS; break; - case PI_PLATFORM_INFO_NAME: - InfoType = UR_PLATFORM_INFO_NAME; + } + case PI_PLATFORM_INFO_NAME: { + UrParamName = UR_PLATFORM_INFO_NAME; break; - case PI_PLATFORM_INFO_PROFILE: - InfoType = UR_PLATFORM_INFO_PROFILE; + } + case PI_PLATFORM_INFO_PROFILE: { + UrParamName = UR_PLATFORM_INFO_PROFILE; break; - case PI_PLATFORM_INFO_VENDOR: - InfoType = UR_PLATFORM_INFO_VENDOR_NAME; + } + case PI_PLATFORM_INFO_VENDOR: { + UrParamName = UR_PLATFORM_INFO_VENDOR_NAME; break; - case PI_PLATFORM_INFO_VERSION: - InfoType = UR_PLATFORM_INFO_VERSION; + } + case PI_PLATFORM_INFO_VERSION: { + UrParamName = UR_PLATFORM_INFO_VERSION; break; - case PI_EXT_PLATFORM_INFO_BACKEND: - InfoType = UR_PLATFORM_INFO_BACKEND; + } + case PI_EXT_PLATFORM_INFO_BACKEND: { + UrParamName = UR_PLATFORM_INFO_BACKEND; break; + } default: - return PI_ERROR_UNKNOWN; + die("urGetContextInfo: unsuppported ParamName."); } size_t SizeInOut = ParamValueSize; - auto hPlatform = reinterpret_cast(platform); - HANDLE_ERRORS(urPlatformGetInfo(hPlatform, InfoType, SizeInOut, ParamValue, - ParamValueSizeRet)); + auto UrPlatform = reinterpret_cast(Platform); + HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); - ur2piPlatformInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); return PI_SUCCESS; } +inline pi_result 
piextPluginGetOpaqueData(void *opaque_data_param, + void **opaque_data_return) { + (void)opaque_data_param; + (void)opaque_data_return; + return PI_ERROR_UNKNOWN; +} + +// Platform +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Device inline pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, pi_uint32 NumEntries, pi_device *Devices, pi_uint32 *NumDevices) { @@ -444,26 +532,36 @@ inline pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, return PI_ERROR_UNKNOWN; } - uint32_t Count = NumEntries; - auto hPlatform = reinterpret_cast(Platform); - auto phDevices = reinterpret_cast(Devices); - HANDLE_ERRORS(urDeviceGet(hPlatform, Type, Count, phDevices, NumDevices)); + PI_ASSERT(Platform, PI_ERROR_INVALID_PLATFORM); + + auto UrPlatform = reinterpret_cast(Platform); + auto UrDevices = reinterpret_cast(Devices); + HANDLE_ERRORS( + urDeviceGet(UrPlatform, Type, NumEntries, UrDevices, NumDevices)); + return PI_SUCCESS; } inline pi_result piDeviceRetain(pi_device Device) { - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceRetain(hDevice)); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceRetain(UrDevice)); return PI_SUCCESS; } inline pi_result piDeviceRelease(pi_device Device) { - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceRelease(hDevice)); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceRelease(UrDevice)); return PI_SUCCESS; } -inline pi_result piPluginGetLastError(char **) { return PI_SUCCESS; } +inline pi_result piPluginGetLastError(char **message) { + std::ignore = message; + return PI_SUCCESS; +} inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, size_t ParamValueSize, void *ParamValue, @@ -800,9 +898,12 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_ERROR_UNKNOWN; }; + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + size_t SizeInOut = ParamValueSize; - auto hDevice = reinterpret_cast(Device); - HANDLE_ERRORS(urDeviceGetInfo(hDevice, InfoType, SizeInOut, ParamValue, + auto UrDevice = reinterpret_cast(Device); + + HANDLE_ERRORS(urDeviceGetInfo(UrDevice, InfoType, SizeInOut, ParamValue, ParamValueSizeRet)); ur2piDeviceInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); @@ -810,10 +911,43 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return PI_SUCCESS; } +inline pi_result piextDeviceGetNativeHandle(pi_device Device, + pi_native_handle *NativeHandle) { + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + auto UrDevice = reinterpret_cast(Device); + + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urDeviceGetNativeHandle(UrDevice, &UrNativeHandle)); + *NativeHandle = reinterpret_cast(UrNativeHandle); + return PI_SUCCESS; +} + +inline pi_result +piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_platform Platform, pi_device *Device) { + + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t UrNativeDevice = + reinterpret_cast(NativeHandle); + ur_platform_handle_t UrPlatform = + reinterpret_cast(Platform); + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS( + urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, 
UrDevice)); + + return PI_SUCCESS; +} + inline pi_result piDevicePartition( pi_device Device, const pi_device_partition_property *Properties, pi_uint32 NumEntries, pi_device *SubDevices, pi_uint32 *NumSubDevices) { + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + if (!Properties || !Properties[0]) return PI_ERROR_INVALID_VALUE; @@ -860,10 +994,2521 @@ inline pi_result piDevicePartition( ur_device_partition_property_t UrProperties[] = { ur_device_partition_property_t(Property), Value, 0}; - auto hDevice = reinterpret_cast(Device); - auto phSubDevices = reinterpret_cast(SubDevices); - HANDLE_ERRORS(urDevicePartition(hDevice, UrProperties, NumEntries, - phSubDevices, NumSubDevices)); + auto UrDevice = reinterpret_cast(Device); + auto UrSubDevices = reinterpret_cast(SubDevices); + HANDLE_ERRORS(urDevicePartition(UrDevice, UrProperties, NumEntries, + UrSubDevices, NumSubDevices)); + return PI_SUCCESS; +} + +inline pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, + uint64_t *HostTime) { + auto UrDevice = reinterpret_cast(Device); + HANDLE_ERRORS(urDeviceGetGlobalTimestamps(UrDevice, DeviceTime, HostTime)); + return PI_SUCCESS; +} + +inline pi_result +piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? + pi_device_binary *Binaries, pi_uint32 NumBinaries, + pi_uint32 *SelectedBinaryInd) { + + auto UrDevice = reinterpret_cast(Device); + const uint8_t **UrBinaries = + const_cast(reinterpret_cast(Binaries)); + HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries, NumBinaries, + SelectedBinaryInd)); + return PI_SUCCESS; +} + +// Device +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Context +inline pi_result piContextCreate(const pi_context_properties *Properties, + pi_uint32 NumDevices, const pi_device *Devices, + void (*PFnNotify)(const char *ErrInfo, + const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, pi_context *RetContext) { + auto UrDevices = reinterpret_cast(Devices); + + ur_context_handle_t *UrContext = + reinterpret_cast(RetContext); + // TODO: Parse PI Context Properties into UR + ur_context_properties_t UrProperties{}; + HANDLE_ERRORS( + urContextCreate(NumDevices, UrDevices, &UrProperties, UrContext)); + return PI_SUCCESS; +} + +// FIXME: Dummy implementation to prevent link fail +inline pi_result piextContextSetExtendedDeleter( + pi_context Context, pi_context_extended_deleter Function, void *UserData) { + std::ignore = Context; + std::ignore = Function; + std::ignore = UserData; + die("piextContextSetExtendedDeleter: not supported"); + return PI_SUCCESS; +} + +inline pi_result piextContextGetNativeHandle(pi_context Context, + pi_native_handle *NativeHandle) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_native_handle_t UrNativeHandle{}; + HANDLE_ERRORS(urContextGetNativeHandle(UrContext, &UrNativeHandle)); + *NativeHandle = reinterpret_cast(UrNativeHandle); + return PI_SUCCESS; +} + +inline pi_result piextContextCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_uint32 NumDevices, + const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Devices, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(RetContext, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NumDevices, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t NativeContext = + reinterpret_cast(NativeHandle); + ur_context_handle_t *UrContext = + 
reinterpret_cast(RetContext); + HANDLE_ERRORS(urContextCreateWithNativeHandle(NativeContext, UrContext)); + (*UrContext)->OwnZeContext = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_context_handle_t hContext = reinterpret_cast(Context); + ur_context_info_t ContextInfoType{}; + + switch (ParamName) { + case PI_CONTEXT_INFO_DEVICES: { + ContextInfoType = UR_CONTEXT_INFO_DEVICES; + break; + } + case PI_CONTEXT_INFO_PLATFORM: { + die("urGetContextInfo: unsuppported ParamName."); + } + case PI_CONTEXT_INFO_NUM_DEVICES: { + ContextInfoType = UR_CONTEXT_INFO_NUM_DEVICES; + break; + } + case PI_CONTEXT_INFO_PROPERTIES: { + die("urGetContextInfo: unsuppported ParamName."); + } + case PI_CONTEXT_INFO_REFERENCE_COUNT: { + ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; + break; + } + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { + ContextInfoType = UR_CONTEXT_INFO_USM_FILL2D_SUPPORT; + break; + } + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: { + ContextInfoType = UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT; + break; + } + case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: + case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // These queries should be dealt with in context_impl.cpp by calling the + // queries of each device separately and building the intersection set. + die("These queries should have never come here"); + } + default: { + die("piGetContextInfo: unsuppported ParamName."); + } + } + + HANDLE_ERRORS(urContextGetInfo(hContext, ContextInfoType, ParamValueSize, + ParamValue, ParamValueSizeRet)); + return PI_SUCCESS; +} + +inline pi_result piContextRetain(pi_context Context) { + ur_context_handle_t hContext = reinterpret_cast(Context); + + HANDLE_ERRORS(urContextRetain(hContext)); + + return PI_SUCCESS; +} + +inline pi_result piContextRelease(pi_context Context) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + HANDLE_ERRORS(urContextRelease(UrContext)); + return PI_SUCCESS; +} +// Context +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Queue +inline pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, pi_queue *Queue) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + ur_queue_property_t Props{}; + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &Props, UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piextQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + + PI_ASSERT(Properties, PI_ERROR_INVALID_VALUE); + // Expect flags mask to be passed first. + PI_ASSERT(Properties[0] == PI_QUEUE_FLAGS, PI_ERROR_INVALID_VALUE); + + PI_ASSERT(Properties[2] == 0 || + (Properties[2] == PI_QUEUE_COMPUTE_INDEX && Properties[4] == 0), + PI_ERROR_INVALID_VALUE); + + // Check that unexpected bits are not set. 
+ PI_ASSERT(!(Properties[1] & + ~(PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_FLAG_PROFILING_ENABLE | PI_QUEUE_FLAG_ON_DEVICE | + PI_QUEUE_FLAG_ON_DEVICE_DEFAULT | + PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS | + PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW | + PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH)), + PI_ERROR_INVALID_VALUE); + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_queue_property_t props[5]{}; + props[0] = UR_QUEUE_PROPERTIES_FLAGS; + if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) + props[1] |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) + props[1] |= UR_QUEUE_FLAG_PROFILING_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) + props[1] |= UR_QUEUE_FLAG_ON_DEVICE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) + props[1] |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) + props[1] |= UR_QUEUE_FLAG_DISCARD_EVENTS; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) + props[1] |= UR_QUEUE_FLAG_PRIORITY_LOW; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) + props[1] |= UR_QUEUE_FLAG_PRIORITY_HIGH; + + if (Properties[2] != 0) { + props[2] = UR_QUEUE_PROPERTIES_COMPUTE_INDEX; + props[3] = Properties[3]; + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, props, UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + pi_device Device, + bool OwnNativeHandle, + pi_queue *Queue) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_native_handle_t UrNativeHandle = + reinterpret_cast(NativeHandle); + ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); + HANDLE_ERRORS( + urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, UrQueue)); + (*UrQueue)->OwnNativeHandle = OwnNativeHandle; + return PI_SUCCESS; +} + +inline pi_result piextQueueGetNativeHandle(pi_queue Queue, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_native_handle_t UrNativeQueue{}; + HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeQueue)); + + *NativeHandle = reinterpret_cast(UrNativeQueue); + + return PI_SUCCESS; +} + +inline pi_result piQueueRelease(pi_queue Queue) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueRelease(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueFinish(pi_queue Queue) { + // Wait until command lists attached to the command queue are executed. 
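// Editorial aside: a hedged usage sketch of the property-list layout that the
// validation above expects from callers (it assumes the PI header defining
// these macros is on the include path; the helper name is illustrative). The
// list starts with PI_QUEUE_FLAGS and a flag mask, may optionally carry
// PI_QUEUE_COMPUTE_INDEX followed by the index value, and is terminated by 0.
inline pi_result createProfilingQueueSketch(pi_context Context,
                                            pi_device Device,
                                            pi_queue *Queue) {
  pi_queue_properties Props[] = {PI_QUEUE_FLAGS,
                                 PI_QUEUE_FLAG_PROFILING_ENABLE,
                                 PI_QUEUE_COMPUTE_INDEX, 0, // compute index 0
                                 0};
  return pi2ur::piextQueueCreate(Context, Device, Props, Queue);
}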
+ PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueFinish(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_queue_info_t UrParamName{}; + + switch (ParamName) { + case PI_QUEUE_INFO_CONTEXT: { + UrParamName = UR_QUEUE_INFO_CONTEXT; + break; + } + case PI_QUEUE_INFO_DEVICE: { + UrParamName = UR_QUEUE_INFO_DEVICE; + break; + } + case PI_QUEUE_INFO_DEVICE_DEFAULT: { + UrParamName = UR_QUEUE_INFO_DEVICE_DEFAULT; + break; + } + case PI_QUEUE_INFO_PROPERTIES: { + UrParamName = UR_QUEUE_INFO_PROPERTIES; + break; + } + case PI_QUEUE_INFO_REFERENCE_COUNT: { + UrParamName = UR_QUEUE_INFO_REFERENCE_COUNT; + break; + } + case PI_QUEUE_INFO_SIZE: { + UrParamName = UR_QUEUE_INFO_SIZE; + break; + } + case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + UrParamName = UR_EXT_ONEAPI_QUEUE_INFO_EMPTY; + break; + } + default: { + die("Unsupported ParamName in piQueueGetInfo"); + return PI_ERROR_INVALID_VALUE; + } + } + + HANDLE_ERRORS(urQueueGetInfo(UrQueue, UrParamName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piQueueRetain(pi_queue Queue) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueRetain(UrQueue)); + + return PI_SUCCESS; +} + +inline pi_result piQueueFlush(pi_queue Queue) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + HANDLE_ERRORS(urQueueFlush(UrQueue)); + + return PI_SUCCESS; +} + +// Queue +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Program + +inline pi_result piProgramCreate(pi_context Context, const void *ILBytes, + size_t Length, pi_program *Program) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(ILBytes && Length, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_program_properties_t UrProperties{}; + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS(urProgramCreateWithIL(UrContext, ILBytes, Length, &UrProperties, + UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piProgramCreateWithBinary( + pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, + size_t NumMetadataEntries, const pi_device_binary_property *Metadata, + pi_int32 *BinaryStatus, pi_program *Program) { + std::ignore = Metadata; + std::ignore = NumMetadataEntries; + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + // For now we support only one device. 
+ if (NumDevices != 1) { + die("piProgramCreateWithBinary: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + if (!Binaries[0] || !Lengths[0]) { + if (BinaryStatus) + *BinaryStatus = PI_ERROR_INVALID_VALUE; + return PI_ERROR_INVALID_VALUE; + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(DeviceList[0]); + + // TODO: Translate Metadata into Properties? + ur_program_properties_t Properties{}; + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS(urProgramCreateWithBinary(UrContext, UrDevice, Lengths[0], + Binaries[0], &Properties, UrProgram)); + + if (BinaryStatus) + *BinaryStatus = PI_SUCCESS; + + return PI_SUCCESS; +} + +inline pi_result piclProgramCreateWithSource(pi_context Context, + pi_uint32 Count, + const char **Strings, + const size_t *Lengths, + pi_program *RetProgram) { + std::ignore = Context; + std::ignore = Count; + std::ignore = Strings; + std::ignore = Lengths; + std::ignore = RetProgram; + die("piclProgramCreateWithSource: not supported in UR\n"); + return PI_ERROR_INVALID_OPERATION; +} + +inline pi_result piProgramGetInfo(pi_program Program, pi_program_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + ur_program_info_t PropName{}; + + switch (ParamName) { + case PI_PROGRAM_INFO_REFERENCE_COUNT: { + PropName = UR_PROGRAM_INFO_REFERENCE_COUNT; + break; + } + case PI_PROGRAM_INFO_CONTEXT: { + PropName = UR_PROGRAM_INFO_CONTEXT; + break; + } + case PI_PROGRAM_INFO_NUM_DEVICES: { + PropName = UR_PROGRAM_INFO_NUM_DEVICES; + break; + } + case PI_PROGRAM_INFO_DEVICES: { + PropName = UR_PROGRAM_INFO_DEVICES; + break; + } + case PI_PROGRAM_INFO_SOURCE: { + PropName = UR_PROGRAM_INFO_SOURCE; + break; + } + case PI_PROGRAM_INFO_BINARY_SIZES: { + PropName = UR_PROGRAM_INFO_BINARY_SIZES; + break; + } + case PI_PROGRAM_INFO_BINARIES: { + PropName = UR_PROGRAM_INFO_BINARIES; + break; + } + case PI_PROGRAM_INFO_NUM_KERNELS: { + PropName = UR_PROGRAM_INFO_NUM_KERNELS; + break; + } + case PI_PROGRAM_INFO_KERNEL_NAMES: { + PropName = UR_PROGRAM_INFO_KERNEL_NAMES; + break; + } + default: { + die("urProgramGetInfo: not implemented"); + } + } + + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + return PI_SUCCESS; } + +inline pi_result +piProgramLink(pi_context Context, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + pi_uint32 NumInputPrograms, const pi_program *InputPrograms, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData, pi_program *RetProgram) { + // We only support one device with Level Zero currently. + if (NumDevices != 1) { + die("piProgramLink: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + + // Validate input parameters. 
+ PI_ASSERT(DeviceList, PI_ERROR_INVALID_DEVICE); + PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + if (NumInputPrograms == 0 || InputPrograms == nullptr) + return PI_ERROR_INVALID_VALUE; + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + const ur_program_handle_t *UrInputPrograms = + reinterpret_cast(InputPrograms); + ur_program_handle_t *UrProgram = + reinterpret_cast(RetProgram); + + HANDLE_ERRORS(urProgramLink(UrContext, NumInputPrograms, UrInputPrograms, + Options, UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piProgramCompile( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, pi_uint32 NumInputHeaders, + const pi_program *InputHeaders, const char **HeaderIncludeNames, + void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { + + std::ignore = NumInputHeaders; + std::ignore = InputHeaders; + std::ignore = HeaderIncludeNames; + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) + return PI_ERROR_INVALID_VALUE; + + // These aren't supported. + PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + ur_program_info_t PropName = UR_PROGRAM_INFO_CONTEXT; + ur_context_handle_t UrContext{}; + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, sizeof(&UrContext), + &UrContext, nullptr)); + + HANDLE_ERRORS(urProgramCompile(UrContext, UrProgram, Options)); + + return PI_SUCCESS; +} + +inline pi_result +piProgramBuild(pi_program Program, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + if ((NumDevices && !DeviceList) || (!NumDevices && DeviceList)) { + return PI_ERROR_INVALID_VALUE; + } + + // We only support build to one device with Level Zero now. + // TODO: we should eventually build to the possibly multiple root + // devices in the context. + if (NumDevices != 1) { + die("piProgramBuild: level_zero supports only one device."); + return PI_ERROR_INVALID_VALUE; + } + + // These aren't supported. 
+ PI_ASSERT(!PFnNotify && !UserData, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_program_info_t PropName = UR_PROGRAM_INFO_CONTEXT; + ur_context_handle_t UrContext{}; + HANDLE_ERRORS(urProgramGetInfo(UrProgram, PropName, sizeof(&UrContext), + &UrContext, nullptr)); + + HANDLE_ERRORS(urProgramBuild(UrContext, UrProgram, Options)); + + return PI_SUCCESS; +} + +inline pi_result piextProgramSetSpecializationConstant(pi_program Program, + pi_uint32 SpecID, + size_t Size, + const void *SpecValue) { + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + uint32_t Count = 1; + ur_specialization_constant_info_t SpecConstant{}; + SpecConstant.id = SpecID; + SpecConstant.size = Size; + SpecConstant.pValue = SpecValue; + HANDLE_ERRORS( + urProgramSetSpecializationConstants(UrProgram, Count, &SpecConstant)); + + return PI_SUCCESS; +} + +inline pi_result piKernelCreate(pi_program Program, const char *KernelName, + pi_kernel *RetKernel) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(RetKernel, PI_ERROR_INVALID_VALUE); + PI_ASSERT(KernelName, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_kernel_handle_t *UrKernel = + reinterpret_cast(RetKernel); + + HANDLE_ERRORS(urKernelCreate(UrProgram, KernelName, UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageFill(pi_queue Queue, pi_mem Image, const void *FillColor, + const size_t *Origin, const size_t *Region, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + + std::ignore = Image; + std::ignore = FillColor; + std::ignore = Origin; + std::ignore = Region; + std::ignore = NumEventsInWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + die("piEnqueueMemImageFill: not implemented"); + return PI_SUCCESS; +} + +inline pi_result +piEnqueueNativeKernel(pi_queue Queue, void (*UserFunc)(void *), void *Args, + size_t CbArgs, pi_uint32 NumMemObjects, + const pi_mem *MemList, const void **ArgsMemLoc, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + std::ignore = UserFunc; + std::ignore = Args; + std::ignore = CbArgs; + std::ignore = NumMemObjects; + std::ignore = MemList; + std::ignore = ArgsMemLoc; + std::ignore = NumEventsInWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + die("piEnqueueNativeKernel: not implemented"); + return PI_SUCCESS; +} + +inline pi_result piextGetDeviceFunctionPointer(pi_device Device, + pi_program Program, + const char *FunctionName, + pi_uint64 *FunctionPointerRet) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + auto UrDevice = reinterpret_cast(Device); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + void **FunctionPointer = reinterpret_cast(FunctionPointerRet); + + HANDLE_ERRORS(urProgramGetFunctionPointer(UrDevice, UrProgram, FunctionName, + FunctionPointer)); + return PI_SUCCESS; +} + +// Special version of piKernelSetArg to accept pi_mem. +inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_mem *ArgValue) { + + // TODO: the better way would probably be to add a new PI API for + // extracting native PI object from PI handle, and have SYCL + // RT pass that directly to the regular piKernelSetArg (and + // then remove this piextKernelSetArgMemObj). 
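// Editorial aside: a self-contained sketch (toy types) of the deferral
// strategy described in the comments around this entry point: at SetArg time
// only the memory object and argument index are recorded, and the concrete
// device allocation is resolved later, when the target device becomes known
// at kernel enqueue time.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct PendingMemArgSketch {
  uint32_t Index;  // kernel argument index
  void *MemObject; // which buffer was passed for that argument
};

struct KernelArgSketch {
  std::vector<PendingMemArgSketch> PendingArguments;

  void setArgMemObj(uint32_t Index, void *MemObject) {
    PendingArguments.push_back({Index, MemObject}); // defer, device unknown yet
  }

  // At enqueue time the device is known, so allocations can be looked up.
  void bindPendingArgs(std::unordered_map<void *, char *> &AllocationForMem) {
    for (const auto &Arg : PendingArguments) {
      char *DevicePtr = AllocationForMem[Arg.MemObject];
      (void)DevicePtr; // a real adapter would pass this to the native SetArg call
    }
    PendingArguments.clear();
  }
};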
+ + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_mem_handle_t UrMemory = reinterpret_cast(*ArgValue); + + // We don't yet know the device where this kernel will next be run on. + // Thus we can't know the actual memory allocation that needs to be used. + // Remember the memory object being used as an argument for this kernel + // to process it later when the device is known (at the kernel enqueue). + // + // TODO: for now we have to conservatively assume the access as read-write. + // Improve that by passing SYCL buffer accessor type into + // piextKernelSetArgMemObj. + // + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + HANDLE_ERRORS(urKernelSetArgMemObj(UrKernel, ArgIndex, UrMemory)); + return PI_SUCCESS; +} + +inline pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, const void *ArgValue) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelSetArgValue(UrKernel, ArgIndex, ArgSize, ArgValue)); + return PI_SUCCESS; +} + +inline pi_result +piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, pi_program Program, + bool OwnNativeHandle, pi_kernel *Kernel) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_native_handle_t UrNativeKernel = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + std::ignore = Program; + ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); + HANDLE_ERRORS( + urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, UrKernel)); + (*UrKernel)->OwnNativeHandle = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piProgramRetain(pi_program Program) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS( + urProgramRetain(reinterpret_cast(UrProgram))); + + return PI_SUCCESS; +} + +inline pi_result piKernelSetExecInfo(pi_kernel Kernel, + pi_kernel_exec_info ParamName, + size_t ParamValueSize, + const void *ParamValue) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_kernel_exec_info_t propName{}; + switch (ParamName) { + case PI_USM_INDIRECT_ACCESS: { + propName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + break; + } + case PI_USM_PTRS: { + propName = UR_KERNEL_EXEC_INFO_USM_PTRS; + break; + } + case PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG: { + propName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + HANDLE_ERRORS( + urKernelSetExecInfo(UrKernel, propName, ParamValueSize, ParamValue)); + + return PI_SUCCESS; +} + +inline pi_result piextProgramGetNativeHandle(pi_program Program, + pi_native_handle *NativeHandle) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + ur_native_handle_t NativeProgram{}; + HANDLE_ERRORS(urProgramGetNativeHandle(UrProgram, &NativeProgram)); + + *NativeHandle = reinterpret_cast(NativeProgram); + + return PI_SUCCESS; +} + +inline pi_result +piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, bool ownNativeHandle, + pi_program *Program) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); 
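+  // Interop sketch (hypothetical variables, not part of this patch): a
+  // native handle exported from an existing program can be re-wrapped for
+  // the same context, e.g.
+  //   pi_native_handle NH;
+  //   piextProgramGetNativeHandle(ExistingProg, &NH);
+  //   pi_program Wrapped;
+  //   piextProgramCreateWithNativeHandle(NH, Ctx, /*ownNativeHandle=*/false,
+  //                                      &Wrapped);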
+ PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_native_handle_t NativeProgram = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_program_handle_t *UrProgram = + reinterpret_cast(Program); + HANDLE_ERRORS( + urProgramCreateWithNativeHandle(NativeProgram, UrContext, UrProgram)); + return PI_SUCCESS; +} + +inline pi_result piKernelGetInfo(pi_kernel Kernel, pi_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_kernel_info_t UrParamName{}; + switch (ParamName) { + case PI_KERNEL_INFO_FUNCTION_NAME: { + UrParamName = UR_KERNEL_INFO_FUNCTION_NAME; + break; + } + case PI_KERNEL_INFO_NUM_ARGS: { + UrParamName = UR_KERNEL_INFO_NUM_ARGS; + break; + } + case PI_KERNEL_INFO_REFERENCE_COUNT: { + UrParamName = UR_KERNEL_INFO_REFERENCE_COUNT; + break; + } + case PI_KERNEL_INFO_CONTEXT: { + UrParamName = UR_KERNEL_INFO_CONTEXT; + break; + } + case PI_KERNEL_INFO_PROGRAM: { + UrParamName = UR_KERNEL_INFO_PROGRAM; + break; + } + case PI_KERNEL_INFO_ATTRIBUTES: { + UrParamName = UR_KERNEL_INFO_ATTRIBUTES; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + + HANDLE_ERRORS(urKernelGetInfo(UrKernel, UrParamName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, + pi_kernel_group_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + auto UrDevice = reinterpret_cast(Device); + + ur_kernel_group_info_t UrParamName{}; + switch (ParamName) { + case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE; + break; + } + case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + UrParamName = UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE; + break; + } + case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + UrParamName = UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE; + break; + } + // The number of registers used by the compiled kernel (device specific) + case PI_KERNEL_GROUP_INFO_NUM_REGS: { + die("PI_KERNEL_GROUP_INFO_NUM_REGS in piKernelGetGroupInfo not " + "implemented\n"); + break; + } + default: { + die("Unknown ParamName in piKernelGetGroupInfo"); + return PI_ERROR_INVALID_VALUE; + } + } + + HANDLE_ERRORS(urKernelGetGroupInfo(UrKernel, UrDevice, UrParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piKernelRetain(pi_kernel Kernel) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelRetain(UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result piKernelRelease(pi_kernel Kernel) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + 
HANDLE_ERRORS(urKernelRelease(UrKernel)); + + return PI_SUCCESS; +} + +inline pi_result piProgramRelease(pi_program Program) { + + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + HANDLE_ERRORS(urProgramRelease(UrProgram)); + + return PI_SUCCESS; +} + +inline pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, + const void *ArgValue) { + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + + HANDLE_ERRORS(urKernelSetArgValue(UrKernel, ArgIndex, ArgSize, ArgValue)); + + return PI_SUCCESS; +} + +inline pi_result piKernelGetSubGroupInfo( + pi_kernel Kernel, pi_device Device, pi_kernel_sub_group_info ParamName, + size_t InputValueSize, const void *InputValue, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + + std::ignore = InputValueSize; + std::ignore = InputValue; + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + auto UrDevice = reinterpret_cast(Device); + + ur_kernel_sub_group_info_t PropName{}; + switch (ParamName) { + case PI_KERNEL_MAX_SUB_GROUP_SIZE: { + PropName = UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE; + break; + } + case PI_KERNEL_MAX_NUM_SUB_GROUPS: { + PropName = UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS; + break; + } + case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { + PropName = UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS; + break; + } + case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { + PropName = UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL; + break; + } + } + HANDLE_ERRORS(urKernelGetSubGroupInfo(UrKernel, UrDevice, PropName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piProgramGetBuildInfo(pi_program Program, pi_device Device, + pi_program_build_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + auto UrDevice = reinterpret_cast(Device); + + ur_program_build_info_t PropName{}; + switch (ParamName) { + case PI_PROGRAM_BUILD_INFO_STATUS: { + PropName = UR_PROGRAM_BUILD_INFO_STATUS; + break; + } + case PI_PROGRAM_BUILD_INFO_OPTIONS: { + PropName = UR_PROGRAM_BUILD_INFO_OPTIONS; + break; + } + case PI_PROGRAM_BUILD_INFO_LOG: { + PropName = UR_PROGRAM_BUILD_INFO_LOG; + break; + } + case PI_PROGRAM_BUILD_INFO_BINARY_TYPE: { + PropName = UR_PROGRAM_BUILD_INFO_BINARY_TYPE; + break; + } + default: { + die("piProgramGetBuildInfo: not implemented"); + } + } + HANDLE_ERRORS(urProgramGetBuildInfo(UrProgram, UrDevice, PropName, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piextKernelGetNativeHandle(pi_kernel Kernel, + pi_native_handle *NativeHandle) { + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_native_handle_t NativeKernel{}; + HANDLE_ERRORS(urKernelGetNativeHandle(UrKernel, &NativeKernel)); + + *NativeHandle = reinterpret_cast(NativeKernel); + + return PI_SUCCESS; +} + +/// API for writing data from host to a device global variable. 
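+///
+/// A hypothetical call (sketch only; Q, Prog, Ev and HostValue are
+/// illustrative names):
+///   pi_event Ev;
+///   piextEnqueueDeviceGlobalVariableWrite(Q, Prog, "my_device_global",
+///                                          /*BlockingWrite=*/PI_TRUE,
+///                                          sizeof(int), /*Offset=*/0,
+///                                          &HostValue, 0, nullptr, &Ev);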
+/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingWrite is true if the write should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Src is a pointer to where the data must be copied from +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +inline pi_result piextEnqueueDeviceGlobalVariableWrite( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite, + size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + HANDLE_ERRORS(urEnqueueDeviceGlobalVariableWrite( + UrQueue, UrProgram, Name, BlockingWrite, Count, Offset, Src, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +/// API reading data from a device global variable to host. +/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingRead is true if the read should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Dst is a pointer to where the data must be copied to +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +inline pi_result piextEnqueueDeviceGlobalVariableRead( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead, + size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueDeviceGlobalVariableRead( + UrQueue, UrProgram, Name, BlockingRead, Count, Offset, Dst, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Program +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Memory +inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, + size_t Size, void *HostPtr, pi_mem *RetMem, + const pi_mem_properties *properties) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); + + // TODO: implement support for more access modes + if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || + (Flags & PI_MEM_ACCESS_READ_ONLY))) { + die("piMemBufferCreate: Level-Zero supports read-write and read-only " + "buffer," + "but not other accesses (such as write-only) yet."); + } + + if (properties != 
nullptr) { + die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrBufferFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrBufferFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrBufferFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrBufferFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrBufferFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrBufferFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_mem_handle_t *UrBuffer = reinterpret_cast(RetMem); + HANDLE_ERRORS( + urMemBufferCreate(UrContext, UrBufferFlags, Size, HostPtr, UrBuffer)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS( + urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, Alignment, ResultPtr)); + return PI_SUCCESS; +} + +inline pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); + // piMemImageGetInfo must be used for images + + ur_mem_handle_t UrMemory = reinterpret_cast(Mem); + ur_mem_info_t MemInfoType{}; + switch (ParamName) { + case PI_MEM_CONTEXT: { + MemInfoType = UR_MEM_INFO_CONTEXT; + break; + } + case PI_MEM_SIZE: { + MemInfoType = UR_MEM_INFO_SIZE; + break; + } + default: { + die("piMemGetInfo: unsuppported ParamName."); + } + } + HANDLE_ERRORS(urMemGetInfo(UrMemory, MemInfoType, ParamValueSize, ParamValue, + ParamValueSizeRet)); + return PI_SUCCESS; +} + +inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, void *HostPtr, + pi_mem *RetImage) { + + // TODO: implement read-only, write-only + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + die("piMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_image_format_t UrFormat{}; + switch (ImageFormat->image_channel_data_type) { + case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; + break; + } + case 
PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNORM_INT_101010: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; + break; + } + case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; + break; + } + case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; + break; + } + case PI_IMAGE_CHANNEL_TYPE_FLOAT: { + UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; + break; + } + default: { + die("piMemImageCreate: unsuppported image_channel_data_type."); + } + } + switch (ImageFormat->image_channel_order) { + case PI_IMAGE_CHANNEL_ORDER_A: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_A; + break; + } + case PI_IMAGE_CHANNEL_ORDER_R: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_R; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RG: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGB: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGBA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_BGRA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; + break; + } + case PI_IMAGE_CHANNEL_ORDER_ARGB: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; + break; + } + case PI_IMAGE_CHANNEL_ORDER_ABGR: { + UrFormat.channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + break; + } + case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; + break; + } + case PI_IMAGE_CHANNEL_ORDER_LUMINANCE: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; + break; + } + case PI_IMAGE_CHANNEL_ORDER_Rx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_RGBx: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; + break; + } + case PI_IMAGE_CHANNEL_ORDER_sRGBA: { + UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; + break; + } + default: { + die("piMemImageCreate: unsuppported image_channel_data_type."); + } + } + ur_image_desc_t UrDesc{}; + UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc.depth = ImageDesc->image_depth; + UrDesc.height = ImageDesc->image_height; + UrDesc.numMipLevel = ImageDesc->num_mip_levels; + UrDesc.numSamples = ImageDesc->num_samples; + UrDesc.rowPitch = ImageDesc->image_row_pitch; + UrDesc.slicePitch = ImageDesc->image_slice_pitch; + switch (ImageDesc->image_type) { + case PI_MEM_TYPE_BUFFER: { + UrDesc.type = 
UR_MEM_TYPE_BUFFER; + break; + } + case PI_MEM_TYPE_IMAGE2D: { + UrDesc.type = UR_MEM_TYPE_IMAGE2D; + break; + } + case PI_MEM_TYPE_IMAGE3D: { + UrDesc.type = UR_MEM_TYPE_IMAGE3D; + break; + } + case PI_MEM_TYPE_IMAGE2D_ARRAY: { + UrDesc.type = UR_MEM_TYPE_IMAGE2D_ARRAY; + break; + } + case PI_MEM_TYPE_IMAGE1D: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D; + break; + } + case PI_MEM_TYPE_IMAGE1D_ARRAY: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D_ARRAY; + break; + } + case PI_MEM_TYPE_IMAGE1D_BUFFER: { + UrDesc.type = UR_MEM_TYPE_IMAGE1D_BUFFER; + break; + } + default: { + die("piMemImageCreate: unsuppported image_type."); + } + } + UrDesc.width = ImageDesc->image_width; + UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc.arraySize = ImageDesc->image_array_size; + // TODO: UrDesc doesn't have something for ImageDesc->buffer + + ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + HANDLE_ERRORS( + urMemImageCreate(UrContext, UrFlags, &UrFormat, &UrDesc, HostPtr, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, + pi_buffer_create_type BufferCreateType, + void *BufferCreateInfo, pi_mem *RetMem) { + + PI_ASSERT(BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION && + BufferCreateInfo && RetMem, + PI_ERROR_INVALID_VALUE); + + auto Region = (pi_buffer_region)BufferCreateInfo; + PI_ASSERT(Region->size != 0u, PI_ERROR_INVALID_BUFFER_SIZE); + PI_ASSERT(Region->origin <= (Region->origin + Region->size), + PI_ERROR_INVALID_VALUE); + + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_buffer_create_type_t UrBufferCreateType{}; + if (BufferCreateType == PI_BUFFER_CREATE_TYPE_REGION) { + UrBufferCreateType = UR_BUFFER_CREATE_TYPE_REGION; + } + + ur_buffer_region_t UrBufferCreateInfo{}; + UrBufferCreateInfo.origin = Region->origin; + UrBufferCreateInfo.size = Region->size; + ur_mem_handle_t *UrMem = reinterpret_cast(RetMem); + HANDLE_ERRORS(urMemBufferPartition(UrBuffer, UrFlags, UrBufferCreateType, + &UrBufferCreateInfo, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piextMemGetNativeHandle(pi_mem Mem, + pi_native_handle *NativeHandle) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + ur_native_handle_t NativeMem{}; + HANDLE_ERRORS(urMemGetNativeHandle(UrMem, &NativeMem)); + + *NativeHandle = reinterpret_cast(NativeMem); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_mem_handle_t UrImageSrc = reinterpret_cast(SrcImage); + ur_mem_handle_t UrImageDst = reinterpret_cast(DstImage); + + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x, SrcOrigin->y, SrcOrigin->z}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x, DstOrigin->y, DstOrigin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth 
= Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageCopy( + UrQueue, UrImageSrc, UrImageDst, UrSrcOrigin, UrDstOrigin, UrRegion, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + bool OwnNativeHandle, + pi_mem *Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_native_handle_t UrNativeMem = + reinterpret_cast(NativeHandle); + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_mem_handle_t *UrMem = reinterpret_cast(Mem); + // TODO: Pass OwnNativeHandle to the output parameter + // while we get it in interface + (*UrMem)->OwnNativeHandle = OwnNativeHandle; + HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS(urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, + Alignment, ResultPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + if (Properties && *Properties != 0) { + PI_ASSERT(*(Properties) == PI_MEM_ALLOC_FLAGS && *(Properties + 2) == 0, + PI_ERROR_INVALID_VALUE); + } + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + auto UrDevice = reinterpret_cast(Device); + + ur_usm_desc_t USMDesc{}; + ur_usm_pool_handle_t Pool{}; + HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, + Alignment, ResultPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMFree(pi_context Context, void *Ptr) { + ur_context_handle_t UrContext = + reinterpret_cast(Context); + HANDLE_ERRORS(urUSMFree(UrContext, Ptr)); + return PI_SUCCESS; +} + +inline pi_result piMemRetain(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRetain(UrMem)); + + return PI_SUCCESS; +} + +inline pi_result piMemRelease(pi_mem Mem) { + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + HANDLE_ERRORS(urMemRelease(UrMem)); + + return PI_SUCCESS; +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitList is the number of events to wait on +/// @param EventsWaitList is an array of events to wait on +/// @param Event is the event that represents this operation +inline pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, + size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + // flags is currently unused so fail if set + 
PI_ASSERT(Flags == 0, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + // TODO: to map from pi_usm_migration_flags to + // ur_usm_migration_flags_t + // once we have those defined + ur_usm_migration_flags_t UrFlags{}; + HANDLE_ERRORS(urEnqueueUSMPrefetch(UrQueue, Ptr, Size, UrFlags, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the data to be advised +/// @param Length is the size in bytes of the meory to advise +/// @param Advice is device specific advice +/// @param Event is the event that represents this operation +/// +inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, + size_t Length, pi_mem_advice Advice, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + // TODO: to map from pi_mem_advice to ur_mem_advice_t + // once we have those defined + ur_mem_advice_t UrAdvice{}; + HANDLE_ERRORS(urEnqueueUSMMemAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); + + return PI_SUCCESS; +} + +/// USM 2D Fill API +/// +/// \param queue is the queue to submit to +/// \param ptr is the ptr to fill +/// \param pitch is the total width of the destination memory including padding +/// \param pattern is a pointer with the bytes of the pattern to set +/// \param pattern_size is the size in bytes of the pattern +/// \param width is width in bytes of each row to fill +/// \param height is height the columns to fill +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, + size_t PatternSize, const void *Pattern, + size_t Width, size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = Pitch; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + die("piextUSMEnqueueFill2D: not implemented"); + return {}; +} + +inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, + size_t Pitch, int Value, size_t Width, + size_t Height, + pi_uint32 NumEventsWaitList, + const pi_event *EventsWaitList, + pi_event *Event) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = Pitch; + std::ignore = Value; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsWaitList; + std::ignore = EventsWaitList; + std::ignore = Event; + die("piextUSMEnqueueMemset2D: not implemented"); + return PI_SUCCESS; +} + +inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, + pi_mem_alloc_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_usm_alloc_info_t UrParamName{}; + switch (ParamName) { + case PI_MEM_ALLOC_TYPE: 
{ + UrParamName = UR_USM_ALLOC_INFO_TYPE; + break; + } + case PI_MEM_ALLOC_BASE_PTR: { + UrParamName = UR_USM_ALLOC_INFO_BASE_PTR; + break; + } + case PI_MEM_ALLOC_SIZE: { + UrParamName = UR_USM_ALLOC_INFO_SIZE; + break; + } + case PI_MEM_ALLOC_DEVICE: { + UrParamName = UR_USM_ALLOC_INFO_DEVICE; + break; + } + default: { + die("piextUSMGetMemAllocInfo: unsuppported ParamName."); + } + } + + HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet)) + return PI_SUCCESS; +} + +inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { // missing + std::ignore = Image; + std::ignore = ParamName; + std::ignore = ParamValueSize; + std::ignore = ParamValue; + std::ignore = ParamValueSizeRet; + + // TODO: use urMemImageGetInfo + + die("piMemImageGetInfo: not implemented"); + return {}; +} + +/// USM 2D Memcpy API +/// +/// \param queue is the queue to submit to +/// \param blocking is whether this operation should block the host +/// \param dst_ptr is the location the data will be copied +/// \param dst_pitch is the total width of the destination memory including +/// padding +/// \param src_ptr is the data to be copied +/// \param dst_pitch is the total width of the source memory including padding +/// \param width is width in bytes of each row to be copied +/// \param height is height the columns to be copied +/// \param num_events_in_waitlist is the number of events to wait on +/// \param events_waitlist is an array of events to wait on +/// \param event is the event that represents this operation +inline pi_result piextUSMEnqueueMemcpy2D(pi_queue Queue, pi_bool Blocking, + void *DstPtr, size_t DstPitch, + const void *SrcPtr, size_t SrcPitch, + size_t Width, size_t Height, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + if (!DstPtr || !SrcPtr) + return PI_ERROR_INVALID_VALUE; + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy2D( + UrQueue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +// Memory +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Enqueue + +inline pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueKernelLaunch( + UrQueue, UrKernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result 
+piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, + pi_image_offset Origin, pi_image_region Region, + size_t InputRowPitch, size_t InputSlicePitch, + const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageWrite( + UrQueue, UrImage, BlockingWrite, UrOrigin, UrRegion, InputRowPitch, + InputSlicePitch, const_cast(Ptr), NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, pi_bool BlockingRead, + pi_image_offset Origin, pi_image_region Region, + size_t RowPitch, size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrImage = reinterpret_cast(Image); + ur_rect_offset_t UrOrigin{Origin->x, Origin->y, Origin->z}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth; + UrRegion.height = Region->height; + UrRegion.width = Region->width; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemImageRead( + UrQueue, UrImage, BlockingRead, UrOrigin, UrRegion, RowPitch, SlicePitch, + Ptr, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferMap( + pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent, void **RetMap) { + // TODO: we don't implement read-only or write-only, always read-write. 
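+  //
+  // Map/unmap usage sketch (hypothetical values, not part of this patch):
+  //   pi_event MapEv, UnmapEv;
+  //   void *Host = nullptr;
+  //   piEnqueueMemBufferMap(Q, Buf, /*BlockingMap=*/PI_TRUE,
+  //                         PI_MAP_READ | PI_MAP_WRITE, /*Offset=*/0, Size,
+  //                         0, nullptr, &MapEv, &Host);
+  //   ... read or modify Host ...
+  //   piEnqueueMemUnmap(Q, Buf, Host, 0, nullptr, &UnmapEv);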
+ // assert((map_flags & PI_MAP_READ) != 0); + // assert((map_flags & PI_MAP_WRITE) != 0); + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + + ur_map_flags_t UrMapFlags{}; + if (MapFlags & PI_MAP_READ) + UrMapFlags |= UR_MAP_FLAG_READ; + if (MapFlags & PI_MAP_WRITE) + UrMapFlags |= UR_MAP_FLAG_WRITE; + if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) + UrMapFlags |= UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferMap(UrQueue, UrMem, BlockingMap, UrMapFlags, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent, RetMap)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrMem = reinterpret_cast(Mem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemUnmap(UrQueue, UrMem, MappedPtr, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferFill(pi_queue Queue, pi_mem Buffer, + const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, + pi_int32 Value, size_t Count, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Ptr, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + uint32_t Pattern = Value; + size_t PatternSize = sizeof(Pattern); + HANDLE_ERRORS(urEnqueueMemBufferFill( + UrQueue, UrBuffer, + const_cast(reinterpret_cast(&Pattern)), PatternSize, + 0, Count, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + 
PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + ur_rect_offset_t UrSrcOrigin{SrcOrigin->x_bytes, SrcOrigin->y_scalar, + SrcOrigin->z_scalar}; + ur_rect_offset_t UrDstOrigin{DstOrigin->x_bytes, DstOrigin->y_scalar, + DstOrigin->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopyRect( + UrQueue, UrBufferSrc, UrBufferDst, UrSrcOrigin, UrDstOrigin, UrRegion, + SrcRowPitch, SrcSlicePitch, DstRowPitch, DstSlicePitch, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, + pi_mem DstMem, size_t SrcOffset, + size_t DstOffset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(SrcMem && DstMem, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBufferSrc = reinterpret_cast(SrcMem); + ur_mem_handle_t UrBufferDst = reinterpret_cast(DstMem); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferCopy( + UrQueue, UrBufferSrc, UrBufferDst, SrcOffset, DstOffset, Size, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, + void *DstPtr, const void *SrcPtr, + size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueUSMMemcpy(UrQueue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWriteRect( + UrQueue, 
UrBuffer, BlockingWrite, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + const_cast(Ptr), NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, + pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferWrite( + UrQueue, UrBuffer, BlockingWrite, Offset, Size, const_cast(Ptr), + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Buffer, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Buffer); + ur_rect_offset_t UrBufferOffset{BufferOffset->x_bytes, BufferOffset->y_scalar, + BufferOffset->z_scalar}; + ur_rect_offset_t UrHostOffset{HostOffset->x_bytes, HostOffset->y_scalar, + HostOffset->z_scalar}; + ur_rect_region_t UrRegion{}; + UrRegion.depth = Region->depth_scalar; + UrRegion.height = Region->height_scalar; + UrRegion.width = Region->width_bytes; + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferReadRect( + UrQueue, UrBuffer, BlockingRead, UrBufferOffset, UrHostOffset, UrRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, + pi_bool BlockingRead, size_t Offset, + size_t Size, void *Dst, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + PI_ASSERT(Src, PI_ERROR_INVALID_MEM_OBJECT); + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + ur_mem_handle_t UrBuffer = reinterpret_cast(Src); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueMemBufferRead(UrQueue, UrBuffer, BlockingRead, Offset, + Size, Dst, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + 
HANDLE_ERRORS(urEnqueueEventsWaitWithBarrier(UrQueue, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEnqueueEventsWait(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, + pi_event *OutEvent) { + + PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + if (EventsWaitList) { + PI_ASSERT(NumEventsInWaitList > 0, PI_ERROR_INVALID_VALUE); + } + + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); + + HANDLE_ERRORS(urEnqueueEventsWait(UrQueue, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); + + return PI_SUCCESS; +} +// Enqueue +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Events +inline pi_result piEventsWait(pi_uint32 NumEvents, + const pi_event *EventsWaitList) { + if (NumEvents && !EventsWaitList) { + return PI_ERROR_INVALID_EVENT; + } + + const ur_event_handle_t *UrEventsWaitList = + reinterpret_cast(EventsWaitList); + + HANDLE_ERRORS(urEventWait(NumEvents, UrEventsWaitList)); + + return PI_SUCCESS; +} + +inline pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_event_info_t PropName{}; + if (ParamName == PI_EVENT_INFO_COMMAND_QUEUE) { + PropName = UR_EVENT_INFO_COMMAND_QUEUE; + } else if (ParamName == PI_EVENT_INFO_CONTEXT) { + PropName = UR_EVENT_INFO_CONTEXT; + } else if (ParamName == PI_EVENT_INFO_COMMAND_TYPE) { + PropName = UR_EVENT_INFO_COMMAND_TYPE; + } else if (ParamName == PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) { + PropName = UR_EVENT_INFO_COMMAND_EXECUTION_STATUS; + } else if (ParamName == PI_EVENT_INFO_REFERENCE_COUNT) { + PropName = UR_EVENT_INFO_REFERENCE_COUNT; + } else { + return PI_ERROR_INVALID_VALUE; + } + + HANDLE_ERRORS(urEventGetInfo(UrEvent, PropName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piextEventGetNativeHandle(pi_event Event, + pi_native_handle *NativeHandle) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_native_handle_t *UrNativeEvent = + reinterpret_cast(NativeHandle); + HANDLE_ERRORS(urEventGetNativeHandle(UrEvent, UrNativeEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEventGetProfilingInfo(pi_event Event, + pi_profiling_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + + ur_profiling_info_t PropName{}; + switch (ParamName) { + case PI_PROFILING_INFO_COMMAND_QUEUED: { + PropName = UR_PROFILING_INFO_COMMAND_QUEUED; + break; + } + case PI_PROFILING_INFO_COMMAND_SUBMIT: { + PropName = UR_PROFILING_INFO_COMMAND_SUBMIT; + break; + } + case PI_PROFILING_INFO_COMMAND_START: { + PropName = UR_PROFILING_INFO_COMMAND_START; + break; + } + case PI_PROFILING_INFO_COMMAND_END: { + PropName = UR_PROFILING_INFO_COMMAND_END; + break; + } + default: + return PI_ERROR_INVALID_PROPERTY; + } + + HANDLE_ERRORS(urEventGetProfilingInfo(UrEvent, PropName, ParamValueSize, + ParamValue, ParamValueSizeRet)); + + return 
PI_SUCCESS; +} + +inline pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_event_handle_t *UrEvent = reinterpret_cast(RetEvent); + // pass null for the hNativeHandle to use urEventCreateWithNativeHandle + // as urEventCreate + HANDLE_ERRORS(urEventCreateWithNativeHandle(nullptr, UrContext, UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, + pi_context Context, + bool OwnNativeHandle, + pi_event *Event) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + + ur_native_handle_t UrNativeKernel = + reinterpret_cast(NativeHandle); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_event_handle_t *UrEvent = reinterpret_cast(*Event); + HANDLE_ERRORS( + urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); + (*UrEvent)->OwnNativeHandle = OwnNativeHandle; + + return PI_SUCCESS; +} + +inline pi_result piEventSetCallback( + pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + std::ignore = Event; + std::ignore = CommandExecCallbackType; + std::ignore = PFnNotify; + std::ignore = UserData; + die("piEventSetCallback: deprecated, to be removed"); + return PI_SUCCESS; +} + +inline pi_result piEventSetStatus(pi_event Event, pi_int32 ExecutionStatus) { + std::ignore = Event; + std::ignore = ExecutionStatus; + die("piEventSetStatus: deprecated, to be removed"); + return PI_SUCCESS; +} + +inline pi_result piEventRetain(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + HANDLE_ERRORS(urEventRetain(UrEvent)); + + return PI_SUCCESS; +} + +inline pi_result piEventRelease(pi_event Event) { + PI_ASSERT(Event, PI_ERROR_INVALID_EVENT); + + ur_event_handle_t UrEvent = reinterpret_cast(Event); + HANDLE_ERRORS(urEventRelease(UrEvent)); + + return PI_SUCCESS; +} + +// Events +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +// Sampler +inline pi_result piSamplerCreate(pi_context Context, + const pi_sampler_properties *SamplerProperties, + pi_sampler *RetSampler) { + + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_sampler_property_t UrProps[6]{}; + UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; + UrProps[1] = SamplerProperties[1]; + + UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; + if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_NONE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + + UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; + if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_NEAREST) + UrProps[5] = 
UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + else if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_LINEAR) + UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + + ur_sampler_handle_t *UrSampler = + reinterpret_cast(RetSampler); + + HANDLE_ERRORS(urSamplerCreate(UrContext, UrProps, UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + std::ignore = Sampler; + std::ignore = ParamName; + std::ignore = ParamValueSize; + std::ignore = ParamValue; + std::ignore = ParamValueSizeRet; + + die("piSamplerGetInfo: not implemented"); + return PI_SUCCESS; +} + +// Special version of piKernelSetArg to accept pi_sampler. +inline pi_result piextKernelSetArgSampler(pi_kernel Kernel, pi_uint32 ArgIndex, + const pi_sampler *ArgValue) { + ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); + ur_sampler_handle_t UrSampler = + reinterpret_cast(*ArgValue); + + HANDLE_ERRORS(urKernelSetArgSampler(UrKernel, ArgIndex, UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerRetain(pi_sampler Sampler) { + PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); + + ur_sampler_handle_t UrSampler = + reinterpret_cast(Sampler); + + HANDLE_ERRORS(urSamplerRetain(UrSampler)); + + return PI_SUCCESS; +} + +inline pi_result piSamplerRelease(pi_sampler Sampler) { + PI_ASSERT(Sampler, PI_ERROR_INVALID_SAMPLER); + + ur_sampler_handle_t UrSampler = + reinterpret_cast(Sampler); + + HANDLE_ERRORS(urSamplerRelease(UrSampler)); + + return PI_SUCCESS; +} + +// Sampler +/////////////////////////////////////////////////////////////////////////////// + } // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index cb255fbd53229..ba1cb72e8518f 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -8,6 +8,7 @@ #include +// #include "ur/adapters/level_zero/ur_level_zero_common.hpp" #include #include @@ -18,10 +19,10 @@ static void DieUnsupported() { // All PI API interfaces are C interfaces extern "C" { -__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 num_entries, - pi_platform *platforms, - pi_uint32 *num_platforms) { - return pi2ur::piPlatformsGet(num_entries, platforms, num_platforms); +__SYCL_EXPORT pi_result piPlatformsGet(pi_uint32 NumEntries, + pi_platform *Platforms, + pi_uint32 *NumPlatforms) { + return pi2ur::piPlatformsGet(NumEntries, Platforms, NumPlatforms); } __SYCL_EXPORT pi_result piPlatformGetInfo(pi_platform Platform, @@ -65,13 +66,903 @@ __SYCL_EXPORT pi_result piDevicePartition( } // Stub for the not yet supported API -__SYCL_EXPORT pi_result piextDeviceSelectBinary(pi_device, pi_device_binary *, - pi_uint32, pi_uint32 *) { - return PI_ERROR_INVALID_BINARY; +__SYCL_EXPORT pi_result piextDeviceSelectBinary(pi_device Device, + pi_device_binary *Binaries, + pi_uint32 NumBinaries, + pi_uint32 *SelectedBinaryInd) { + return pi2ur::piextDeviceSelectBinary(Device, Binaries, NumBinaries, + SelectedBinaryInd); +} + +__SYCL_EXPORT pi_result +piContextCreate(const pi_context_properties *Properties, pi_uint32 NumDevices, + const pi_device *Devices, + void (*PFnNotify)(const char *ErrInfo, const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, pi_context *RetContext) { + return pi2ur::piContextCreate(Properties, NumDevices, Devices, PFnNotify, + UserData, RetContext); +} + +__SYCL_EXPORT pi_result 
piContextRelease(pi_context Context) { + return pi2ur::piContextRelease(Context); +} + +__SYCL_EXPORT pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, + pi_queue *Queue) { + return pi2ur::piQueueCreate(Context, Device, Flags, Queue); +} + +__SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +__SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { + return pi2ur::piQueueRelease(Queue); +} + +__SYCL_EXPORT pi_result piProgramCreate(pi_context Context, const void *ILBytes, + size_t Length, pi_program *Program) { + return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); +} + +__SYCL_EXPORT pi_result piProgramBuild( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData) { + return pi2ur::piProgramBuild(Program, NumDevices, DeviceList, Options, + PFnNotify, UserData); +} + +__SYCL_EXPORT pi_result piextProgramSetSpecializationConstant( + pi_program Prog, pi_uint32 SpecID, size_t Size, const void *SpecValue) { + return pi2ur::piextProgramSetSpecializationConstant(Prog, SpecID, Size, + SpecValue); +} + +__SYCL_EXPORT pi_result +piProgramLink(pi_context Context, pi_uint32 NumDevices, + const pi_device *DeviceList, const char *Options, + pi_uint32 NumInputPrograms, const pi_program *InputPrograms, + void (*PFnNotify)(pi_program Program, void *UserData), + void *UserData, pi_program *RetProgram) { + return pi2ur::piProgramLink(Context, NumDevices, DeviceList, Options, + NumInputPrograms, InputPrograms, PFnNotify, + UserData, RetProgram); +} + +__SYCL_EXPORT pi_result piKernelCreate(pi_program Program, + const char *KernelName, + pi_kernel *RetKernel) { + return pi2ur::piKernelCreate(Program, KernelName, RetKernel); +} + +// Special version of piKernelSetArg to accept pi_mem. 
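+// For illustration only (Kernel and Buffer below are hypothetical handles, not
+// part of this entry-point table): a caller binding a buffer as kernel
+// argument 0 would invoke it roughly as
+//   pi_result Res = piextKernelSetArgMemObj(Kernel, /*ArgIndex=*/0, &Buffer);
+// where Buffer is a pi_mem created earlier with piMemBufferCreate.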
+__SYCL_EXPORT pi_result piextKernelSetArgMemObj(pi_kernel Kernel, + pi_uint32 ArgIndex, + const pi_mem *ArgValue) { + + return pi2ur::piextKernelSetArgMemObj(Kernel, ArgIndex, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, + size_t ArgSize, const void *ArgValue) { + + return pi2ur::piKernelSetArg(Kernel, ArgIndex, ArgSize, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device, + pi_kernel_group_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piKernelGetGroupInfo(Kernel, Device, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piMemBufferCreate(pi_context Context, + pi_mem_flags Flags, size_t Size, + void *HostPtr, pi_mem *RetMem, + const pi_mem_properties *properties) { + + return pi2ur::piMemBufferCreate(Context, Flags, Size, HostPtr, RetMem, + properties); +} + +__SYCL_EXPORT pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMHostAlloc(ResultPtr, Context, Properties, Size, + Alignment); +} + +__SYCL_EXPORT pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piMemGetInfo(Mem, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, + void *HostPtr, pi_mem *RetImage) { + + return pi2ur::piMemImageCreate(Context, Flags, ImageFormat, ImageDesc, + HostPtr, RetImage); +} + +__SYCL_EXPORT pi_result piMemBufferPartition( + pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, + void *BufferCreateInfo, pi_mem *RetMem) { + return pi2ur::piMemBufferPartition(Buffer, Flags, BufferCreateType, + BufferCreateInfo, RetMem); +} + +__SYCL_EXPORT pi_result +piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +} + +__SYCL_EXPORT pi_result +piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + return pi2ur::piEnqueueMemImageCopy(Queue, SrcImage, DstImage, SrcOrigin, + DstOrigin, Region, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextMemCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool ownNativeHandle, + pi_mem *Mem) { + return pi2ur::piextMemCreateWithNativeHandle(NativeHandle, Context, + ownNativeHandle, Mem); +} + +__SYCL_EXPORT pi_result piEnqueueKernelLaunch( + pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent) { + + return pi2ur::piEnqueueKernelLaunch( + Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, LocalWorkSize, + NumEventsInWaitList, EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageWrite( + pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, pi_image_offset Origin, + pi_image_region Region, size_t InputRowPitch, size_t InputSlicePitch, + const void *Ptr, pi_uint32 NumEventsInWaitList, + 
const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemImageWrite( + Queue, Image, BlockingWrite, Origin, Region, InputRowPitch, + InputSlicePitch, Ptr, NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageRead( + pi_queue Queue, pi_mem Image, pi_bool BlockingRead, pi_image_offset Origin, + pi_image_region Region, size_t RowPitch, size_t SlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueMemImageRead( + Queue, Image, BlockingRead, Origin, Region, RowPitch, SlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextKernelCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, pi_program Program, + bool OwnNativeHandle, pi_kernel *Kernel) { + + return pi2ur::piextKernelCreateWithNativeHandle( + NativeHandle, Context, Program, OwnNativeHandle, Kernel); +} + +__SYCL_EXPORT pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem Mem, + void *MappedPtr, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piEnqueueMemUnmap(Queue, Mem, MappedPtr, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEventsWait(pi_uint32 NumEvents, + const pi_event *EventList) { + + return pi2ur::piEventsWait(NumEvents, EventList); +} + +__SYCL_EXPORT pi_result piQueueFinish(pi_queue Queue) { + return pi2ur::piQueueFinish(Queue); +} + +__SYCL_EXPORT pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piEventGetInfo(Event, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferMap( + pi_queue Queue, pi_mem Mem, pi_bool BlockingMap, pi_map_flags MapFlags, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent, void **RetMap) { + + return pi2ur::piEnqueueMemBufferMap(Queue, Mem, BlockingMap, MapFlags, Offset, + Size, NumEventsInWaitList, EventWaitList, + OutEvent, RetMap); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferFill( + pi_queue Queue, pi_mem Buffer, const void *Pattern, size_t PatternSize, + size_t Offset, size_t Size, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + return pi2ur::piEnqueueMemBufferFill(Queue, Buffer, Pattern, PatternSize, + Offset, Size, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextUSMDeviceAlloc(void **ResultPtr, + pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMDeviceAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); +} + +__SYCL_EXPORT pi_result piKernelRetain(pi_kernel Kernel) { + return pi2ur::piKernelRetain(Kernel); +} + +__SYCL_EXPORT pi_result piKernelRelease(pi_kernel Kernel) { + + return pi2ur::piKernelRelease(Kernel); +} + +__SYCL_EXPORT pi_result piProgramRelease(pi_program Program) { + return pi2ur::piProgramRelease(Program); +} + +__SYCL_EXPORT pi_result piextUSMSharedAlloc(void **ResultPtr, + pi_context Context, + pi_device Device, + pi_usm_mem_properties *Properties, + size_t Size, pi_uint32 Alignment) { + + return pi2ur::piextUSMSharedAlloc(ResultPtr, Context, Device, Properties, + Size, Alignment); +} + +__SYCL_EXPORT pi_result piextUSMFree(pi_context Context, void *Ptr) { + return 
pi2ur::piextUSMFree(Context, Ptr); +} + +__SYCL_EXPORT pi_result piContextRetain(pi_context Context) { + return pi2ur::piContextRetain(Context); +} + +__SYCL_EXPORT pi_result piextKernelSetArgPointer(pi_kernel Kernel, + pi_uint32 ArgIndex, + size_t ArgSize, + const void *ArgValue) { + return pi2ur::piextKernelSetArgPointer(Kernel, ArgIndex, ArgSize, ArgValue); +} + +// Special version of piKernelSetArg to accept pi_sampler. +__SYCL_EXPORT pi_result piextKernelSetArgSampler(pi_kernel Kernel, + pi_uint32 ArgIndex, + const pi_sampler *ArgValue) { + + return pi2ur::piextKernelSetArgSampler(Kernel, ArgIndex, ArgValue); +} + +__SYCL_EXPORT pi_result piKernelGetSubGroupInfo( + pi_kernel Kernel, pi_device Device, pi_kernel_sub_group_info ParamName, + size_t InputValueSize, const void *InputValue, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + + return pi2ur::piKernelGetSubGroupInfo( + Kernel, Device, ParamName, InputValueSize, InputValue, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piQueueGetInfo(Queue, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +/// USM Memset API +/// +/// @param Queue is the queue to submit to +/// @param Ptr is the ptr to memset +/// @param Value is value to set. It is interpreted as an 8-bit value and the +/// upper +/// 24 bits are ignored +/// @param Count is the size in bytes to memset +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +__SYCL_EXPORT pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, + pi_int32 Value, size_t Count, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + return pi2ur::piextUSMEnqueueMemset( + Queue, Ptr, Value, Count, NumEventsInWaitlist, EventsWaitlist, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcMem, pi_mem DstMem, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferCopyRect( + Queue, SrcMem, DstMem, SrcOrigin, DstOrigin, Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcMem, + pi_mem DstMem, size_t SrcOffset, + size_t DstOffset, size_t Size, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueMemBufferCopy(Queue, SrcMem, DstMem, SrcOffset, + DstOffset, Size, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextUSMEnqueueMemcpy(pi_queue Queue, pi_bool Blocking, + void *DstPtr, const void *SrcPtr, + size_t Size, + pi_uint32 NumEventsInWaitlist, + const pi_event *EventsWaitlist, + pi_event *Event) { + + return pi2ur::piextUSMEnqueueMemcpy(Queue, Blocking, DstPtr, SrcPtr, Size, + NumEventsInWaitlist, EventsWaitlist, + Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferWriteRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + 
pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + + return pi2ur::piEnqueueMemBufferWriteRect( + Queue, Buffer, BlockingWrite, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferWrite( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, size_t Offset, + size_t Size, const void *Ptr, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferWrite(Queue, Buffer, BlockingWrite, Offset, + Size, Ptr, NumEventsInWaitList, + EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferReadRect( + pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + + return pi2ur::piEnqueueMemBufferReadRect( + Queue, Buffer, BlockingRead, BufferOffset, HostOffset, Region, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemBufferRead( + pi_queue Queue, pi_mem Src, pi_bool BlockingRead, size_t Offset, + size_t Size, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemBufferRead(Queue, Src, BlockingRead, Offset, Size, + Dst, NumEventsInWaitList, EventWaitList, + Event); +} + +__SYCL_EXPORT pi_result piEnqueueEventsWaitWithBarrier( + pi_queue Queue, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *OutEvent) { + + return pi2ur::piEnqueueEventsWaitWithBarrier(Queue, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +__SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue Queue, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piEnqueueEventsWait(Queue, NumEventsInWaitList, EventWaitList, + OutEvent); +} + +__SYCL_EXPORT pi_result +piextEventGetNativeHandle(pi_event Event, pi_native_handle *NativeHandle) { + + return pi2ur::piextEventGetNativeHandle(Event, NativeHandle); +} + +__SYCL_EXPORT pi_result piEventGetProfilingInfo(pi_event Event, + pi_profiling_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piEventGetProfilingInfo(Event, ParamName, ParamValueSize, + ParamValue, ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piProgramRetain(pi_program Program) { + return pi2ur::piProgramRetain(Program); +} + +__SYCL_EXPORT pi_result piKernelSetExecInfo(pi_kernel Kernel, + pi_kernel_exec_info ParamName, + size_t ParamValueSize, + const void *ParamValue) { + + return pi2ur::piKernelSetExecInfo(Kernel, ParamName, ParamValueSize, + ParamValue); +} + +__SYCL_EXPORT pi_result piKernelGetInfo(pi_kernel Kernel, + pi_kernel_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piKernelGetInfo(Kernel, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piQueueRetain(pi_queue Queue) { + return pi2ur::piQueueRetain(Queue); +} + +__SYCL_EXPORT 
pi_result piQueueFlush(pi_queue Queue) { + return pi2ur::piQueueFlush(Queue); +} + +__SYCL_EXPORT pi_result piMemRetain(pi_mem Mem) { + return pi2ur::piMemRetain(Mem); +} + +__SYCL_EXPORT pi_result piProgramCreateWithBinary( + pi_context Context, pi_uint32 NumDevices, const pi_device *DeviceList, + const size_t *Lengths, const unsigned char **Binaries, + size_t NumMetadataEntries, const pi_device_binary_property *Metadata, + pi_int32 *BinaryStatus, pi_program *Program) { + + return pi2ur::piProgramCreateWithBinary(Context, NumDevices, DeviceList, + Lengths, Binaries, NumMetadataEntries, + Metadata, BinaryStatus, Program); +} + +__SYCL_EXPORT pi_result piclProgramCreateWithSource(pi_context Context, + pi_uint32 Count, + const char **Strings, + const size_t *Lengths, + pi_program *RetProgram) { + return pi2ur::piclProgramCreateWithSource(Context, Count, Strings, Lengths, + RetProgram); +} + +__SYCL_EXPORT pi_result piProgramGetInfo(pi_program Program, + pi_program_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piProgramGetInfo(Program, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piProgramCompile( + pi_program Program, pi_uint32 NumDevices, const pi_device *DeviceList, + const char *Options, pi_uint32 NumInputHeaders, + const pi_program *InputHeaders, const char **HeaderIncludeNames, + void (*PFnNotify)(pi_program Program, void *UserData), void *UserData) { + + return pi2ur::piProgramCompile(Program, NumDevices, DeviceList, Options, + NumInputHeaders, InputHeaders, + HeaderIncludeNames, PFnNotify, UserData); +} + +__SYCL_EXPORT pi_result piProgramGetBuildInfo( + pi_program Program, pi_device Device, pi_program_build_info ParamName, + size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { + + return pi2ur::piProgramGetBuildInfo(Program, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { + + return pi2ur::piEventCreate(Context, RetEvent); +} + +__SYCL_EXPORT pi_result piEventSetCallback( + pi_event Event, pi_int32 CommandExecCallbackType, + void (*PFnNotify)(pi_event Event, pi_int32 EventCommandStatus, + void *UserData), + void *UserData) { + return pi2ur::piEventSetCallback(Event, CommandExecCallbackType, PFnNotify, + UserData); +} + +__SYCL_EXPORT pi_result piEventSetStatus(pi_event Event, + pi_int32 ExecutionStatus) { + return pi2ur::piEventSetStatus(Event, ExecutionStatus); +} + +__SYCL_EXPORT pi_result piEventRetain(pi_event Event) { + return pi2ur::piEventRetain(Event); +} + +__SYCL_EXPORT pi_result piEventRelease(pi_event Event) { + return pi2ur::piEventRelease(Event); +} + +__SYCL_EXPORT pi_result piextEventCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + pi_event *Event) { + return pi2ur::piextEventCreateWithNativeHandle(NativeHandle, Context, + OwnNativeHandle, Event); +} + +__SYCL_EXPORT pi_result piEnqueueMemImageFill( + pi_queue Queue, pi_mem Image, const void *FillColor, const size_t *Origin, + const size_t *Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + + return pi2ur::piEnqueueMemImageFill(Queue, Image, FillColor, Origin, Region, + NumEventsInWaitList, EventWaitList, + Event); +} + +__SYCL_EXPORT pi_result piextPlatformGetNativeHandle( + pi_platform Platform, pi_native_handle *NativeHandle) { + + return pi2ur::piextPlatformGetNativeHandle(Platform, 
NativeHandle); +} + +__SYCL_EXPORT pi_result piextPlatformCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_platform *Platform) { + + return pi2ur::piextPlatformCreateWithNativeHandle(NativeHandle, Platform); +} + +__SYCL_EXPORT pi_result +piextDeviceGetNativeHandle(pi_device Device, pi_native_handle *NativeHandle) { + + return pi2ur::piextDeviceGetNativeHandle(Device, NativeHandle); +} + +__SYCL_EXPORT pi_result piextDeviceCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_platform Platform, pi_device *Device) { + + return pi2ur::piextDeviceCreateWithNativeHandle(NativeHandle, Platform, + Device); +} + +// FIXME: Dummy implementation to prevent link fail +__SYCL_EXPORT pi_result piextContextSetExtendedDeleter( + pi_context Context, pi_context_extended_deleter Function, void *UserData) { + return pi2ur::piextContextSetExtendedDeleter(Context, Function, UserData); +} + +__SYCL_EXPORT pi_result piextContextGetNativeHandle( + pi_context Context, pi_native_handle *NativeHandle) { + + return pi2ur::piextContextGetNativeHandle(Context, NativeHandle); +} + +__SYCL_EXPORT pi_result piextContextCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_uint32 NumDevices, + const pi_device *Devices, bool OwnNativeHandle, pi_context *RetContext) { + return pi2ur::piextContextCreateWithNativeHandle( + NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); +} + +__SYCL_EXPORT pi_result +piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, pi_device Device, + bool OwnNativeHandle, pi_queue *Queue) { + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + +__SYCL_EXPORT pi_result piMemRelease(pi_mem Mem) { + return pi2ur::piMemRelease(Mem); +} + +__SYCL_EXPORT pi_result piEnqueueNativeKernel( + pi_queue Queue, void (*UserFunc)(void *), void *Args, size_t CbArgs, + pi_uint32 NumMemObjects, const pi_mem *MemList, const void **ArgsMemLoc, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { + return pi2ur::piEnqueueNativeKernel( + Queue, UserFunc, Args, CbArgs, NumMemObjects, MemList, ArgsMemLoc, + NumEventsInWaitList, EventWaitList, Event); +} + +__SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( + pi_device Device, pi_program Program, const char *FunctionName, + pi_uint64 *FunctionPointerRet) { + + return pi2ur::piextGetDeviceFunctionPointer(Device, Program, FunctionName, + FunctionPointerRet); +} + +/// Hint to migrate memory to the device +/// +/// @param Queue is the queue to submit to +/// @param Ptr points to the memory to migrate +/// @param Size is the number of bytes to migrate +/// @param Flags is a bitfield used to specify memory migration options +/// @param NumEventsInWaitlist is the number of events to wait on +/// @param EventsWaitlist is an array of events to wait on +/// @param Event is the event that represents this operation +__SYCL_EXPORT pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, + size_t Size, + pi_usm_migration_flags Flags, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, + pi_event *OutEvent) { + + return pi2ur::piextUSMEnqueuePrefetch( + Queue, Ptr, Size, Flags, NumEventsInWaitList, EventWaitList, OutEvent); +} + +/// USM memadvise API to govern behavior of automatic migration mechanisms +/// +/// @param Queue is the 
queue to submit to
+/// @param Ptr is the data to be advised
+/// @param Length is the size in bytes of the memory to advise
+/// @param Advice is device specific advice
+/// @param Event is the event that represents this operation
+///
+__SYCL_EXPORT pi_result piextUSMEnqueueMemAdvise(pi_queue Queue,
+                                                 const void *Ptr, size_t Length,
+                                                 pi_mem_advice Advice,
+                                                 pi_event *OutEvent) {
+
+  return pi2ur::piextUSMEnqueueMemAdvise(Queue, Ptr, Length, Advice, OutEvent);
+}
+
+/// USM 2D Fill API
+///
+/// \param queue is the queue to submit to
+/// \param ptr is the ptr to fill
+/// \param pitch is the total width of the destination memory including padding
+/// \param pattern is a pointer with the bytes of the pattern to set
+/// \param pattern_size is the size in bytes of the pattern
+/// \param width is the width in bytes of each row to fill
+/// \param height is the height in rows of the region to fill
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr,
+                                              size_t Pitch, size_t PatternSize,
+                                              const void *Pattern, size_t Width,
+                                              size_t Height,
+                                              pi_uint32 NumEventsWaitList,
+                                              const pi_event *EventsWaitList,
+                                              pi_event *Event) {
+
+  return pi2ur::piextUSMEnqueueFill2D(Queue, Ptr, Pitch, PatternSize, Pattern,
+                                      Width, Height, NumEventsWaitList,
+                                      EventsWaitList, Event);
+}
+
+/// USM 2D Memset API
+///
+/// \param queue is the queue to submit to
+/// \param ptr is the ptr to fill
+/// \param pitch is the total width of the destination memory including padding
+/// \param value is the value to set; it is interpreted as an 8-bit value and
+/// the upper 24 bits are ignored
+/// \param width is the width in bytes of each row to fill
+/// \param height is the height in rows of the region to fill
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr,
+                                                size_t Pitch, int Value,
+                                                size_t Width, size_t Height,
+                                                pi_uint32 NumEventsWaitList,
+                                                const pi_event *EventsWaitlist,
+                                                pi_event *Event) {
+  return pi2ur::piextUSMEnqueueMemset2D(Queue, Ptr, Pitch, Value, Width, Height,
+                                        NumEventsWaitList, EventsWaitlist,
+                                        Event);
+}
+
+/// API to query information about USM allocated pointers.
+/// Valid Queries:
+/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_usm_type value
+/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if
+/// the queried pointer fell inside an allocation.
+/// Result must fit in void *
+/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's
+/// allocation is in bytes. Result is a size_t.
+/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against
+///
+/// @param Context is the pi_context
+/// @param Ptr is the pointer to query
+/// @param ParamName is the type of query to perform
+/// @param ParamValueSize is the size of the result in bytes
+/// @param ParamValueSizeRet is how many bytes were written
+/// @param ParamValue is the result
+__SYCL_EXPORT pi_result piextUSMGetMemAllocInfo(
+    pi_context Context, const void *Ptr, pi_mem_alloc_info ParamName,
+    size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) {
+  return pi2ur::piextUSMGetMemAllocInfo(Context, Ptr, ParamName, ParamValueSize,
+                                        ParamValue, ParamValueSizeRet);
+}
+
+__SYCL_EXPORT pi_result piextPluginGetOpaqueData(void *opaque_data_param,
+                                                 void **opaque_data_return) {
+  return pi2ur::piextPluginGetOpaqueData(opaque_data_param, opaque_data_return);
+}
+
+__SYCL_EXPORT pi_result piextProgramGetNativeHandle(
+    pi_program Program, pi_native_handle *NativeHandle) {
+
+  return pi2ur::piextProgramGetNativeHandle(Program, NativeHandle);
+}
+
+__SYCL_EXPORT pi_result piextProgramCreateWithNativeHandle(
+    pi_native_handle NativeHandle, // missing
+    pi_context Context, bool ownNativeHandle, pi_program *Program) {
+  return pi2ur::piextProgramCreateWithNativeHandle(NativeHandle, Context,
+                                                   ownNativeHandle, Program);
+}
+
+__SYCL_EXPORT pi_result piSamplerCreate(
+    pi_context Context, const pi_sampler_properties *SamplerProperties,
+    pi_sampler *RetSampler) {
+  return pi2ur::piSamplerCreate(Context, SamplerProperties, RetSampler);
+}
+
+__SYCL_EXPORT pi_result piSamplerGetInfo(pi_sampler Sampler,
+                                         pi_sampler_info ParamName,
+                                         size_t ParamValueSize,
+                                         void *ParamValue,
+                                         size_t *ParamValueSizeRet) {
+  return pi2ur::piSamplerGetInfo(Sampler, ParamName, ParamValueSize, ParamValue,
+                                 ParamValueSizeRet);
+}
+
+__SYCL_EXPORT pi_result piSamplerRetain(pi_sampler Sampler) {
+  return pi2ur::piSamplerRetain(Sampler);
+}
+
+__SYCL_EXPORT pi_result piSamplerRelease(pi_sampler Sampler) {
+  return pi2ur::piSamplerRelease(Sampler);
+}
+
+__SYCL_EXPORT pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName,
+                                          size_t ParamValueSize,
+                                          void *ParamValue,
+                                          size_t *ParamValueSizeRet) {
+  return pi2ur::piMemImageGetInfo(Image, ParamName, ParamValueSize, ParamValue,
+                                  ParamValueSizeRet);
+}
+
+/// USM 2D Memcpy API
+///
+/// \param queue is the queue to submit to
+/// \param blocking is whether this operation should block the host
+/// \param dst_ptr is the location the data will be copied to
+/// \param dst_pitch is the total width of the destination memory including
+/// padding
+/// \param src_ptr is the data to be copied
+/// \param src_pitch is the total width of the source memory including padding
+/// \param width is the width in bytes of each row to be copied
+/// \param height is the height in rows to be copied
+/// \param num_events_in_waitlist is the number of events to wait on
+/// \param events_waitlist is an array of events to wait on
+/// \param event is the event that represents this operation
+__SYCL_EXPORT pi_result piextUSMEnqueueMemcpy2D(
+    pi_queue Queue, pi_bool Blocking, void *DstPtr, size_t DstPitch,
+    const void *SrcPtr, size_t SrcPitch, size_t Width, size_t Height,
+    pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList,
+    pi_event *Event) {
+
+  return pi2ur::piextUSMEnqueueMemcpy2D(
+      Queue, Blocking, DstPtr, DstPitch, SrcPtr, SrcPitch, Width, Height,
+      NumEventsInWaitList, EventsWaitList, Event);
+}
+
+/// API for writing data from host to a device global variable.
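+/// For illustration only (Queue, Prog and HostVal below are hypothetical
+/// handles/values, not defined by this plugin): writing a 4-byte integer into
+/// a device global named "DeviceGlobalVar" might look roughly like
+///   pi_event Done = nullptr;
+///   int HostVal = 42;
+///   pi_result Res = piextEnqueueDeviceGlobalVariableWrite(
+///       Queue, Prog, "DeviceGlobalVar", /*BlockingWrite=*/PI_TRUE,
+///       sizeof(HostVal), /*Offset=*/0, &HostVal,
+///       /*NumEventsInWaitList=*/0, nullptr, &Done);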
+/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingWrite is true if the write should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Src is a pointer to where the data must be copied from +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +pi_result piextEnqueueDeviceGlobalVariableWrite( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingWrite, + size_t Count, size_t Offset, const void *Src, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + return pi2ur::piextEnqueueDeviceGlobalVariableWrite( + Queue, Program, Name, BlockingWrite, Count, Offset, Src, + NumEventsInWaitList, EventsWaitList, Event); +} + +/// API reading data from a device global variable to host. +/// +/// \param Queue is the queue +/// \param Program is the program containing the device global variable +/// \param Name is the unique identifier for the device global variable +/// \param BlockingRead is true if the read should block +/// \param Count is the number of bytes to copy +/// \param Offset is the byte offset into the device global variable to start +/// copying +/// \param Dst is a pointer to where the data must be copied to +/// \param NumEventsInWaitList is a number of events in the wait list +/// \param EventWaitList is the wait list +/// \param Event is the resulting event +pi_result piextEnqueueDeviceGlobalVariableRead( + pi_queue Queue, pi_program Program, const char *Name, pi_bool BlockingRead, + size_t Count, size_t Offset, void *Dst, pi_uint32 NumEventsInWaitList, + const pi_event *EventsWaitList, pi_event *Event) { + + return pi2ur::piextEnqueueDeviceGlobalVariableRead( + Queue, Program, Name, BlockingRead, Count, Offset, Dst, + NumEventsInWaitList, EventsWaitList, Event); +} + +__SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, + uint64_t *DeviceTime, + uint64_t *HostTime) { + return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } // This interface is not in Unified Runtime currently -__SYCL_EXPORT pi_result piTearDown(void *) { return PI_SUCCESS; } +__SYCL_EXPORT pi_result piTearDown(void *PluginParameter) { + return pi2ur::piTearDown(PluginParameter); +} // This interface is not in Unified Runtime currently __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { @@ -107,6 +998,116 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piDeviceGetInfo) _PI_API(piDevicePartition) _PI_API(piextDeviceSelectBinary) + _PI_API(piGetDeviceAndHostTimer) + _PI_API(piextPlatformGetNativeHandle) + _PI_API(piextPlatformCreateWithNativeHandle) + _PI_API(piextDeviceGetNativeHandle) + _PI_API(piextDeviceCreateWithNativeHandle) + + _PI_API(piContextCreate) + _PI_API(piContextRelease) + _PI_API(piContextRetain) + _PI_API(piextContextSetExtendedDeleter) + _PI_API(piextContextGetNativeHandle) + _PI_API(piextContextCreateWithNativeHandle) + + _PI_API(piQueueCreate) + _PI_API(piQueueRelease) + _PI_API(piextQueueCreate) + _PI_API(piQueueFinish) + _PI_API(piQueueGetInfo) + _PI_API(piQueueRetain) + _PI_API(piQueueFlush) + _PI_API(piextQueueGetNativeHandle) + _PI_API(piextQueueCreateWithNativeHandle) + + _PI_API(piProgramCreate) + _PI_API(piProgramBuild) + 
_PI_API(piextProgramGetNativeHandle) + _PI_API(piextProgramCreateWithNativeHandle) + _PI_API(piextProgramSetSpecializationConstant) + _PI_API(piProgramLink) + _PI_API(piKernelCreate) + _PI_API(piextKernelSetArgMemObj) + _PI_API(piextKernelCreateWithNativeHandle) + _PI_API(piProgramRetain) + _PI_API(piKernelSetExecInfo) + _PI_API(piKernelGetInfo) + _PI_API(piKernelSetArg) + _PI_API(piKernelGetGroupInfo) + _PI_API(piKernelRetain) + _PI_API(piKernelRelease) + _PI_API(piProgramRelease) + _PI_API(piextKernelSetArgPointer) + _PI_API(piextKernelSetArgSampler) + _PI_API(piKernelGetSubGroupInfo) + _PI_API(piProgramCreateWithBinary) + _PI_API(piclProgramCreateWithSource) + _PI_API(piProgramGetInfo) + _PI_API(piProgramCompile) + _PI_API(piProgramGetBuildInfo) + _PI_API(piextGetDeviceFunctionPointer) + + _PI_API(piMemBufferCreate) + _PI_API(piMemGetInfo) + _PI_API(piMemBufferPartition) + _PI_API(piEnqueueMemImageCopy) + _PI_API(piextMemGetNativeHandle) + _PI_API(piextMemCreateWithNativeHandle) + _PI_API(piMemRetain) + _PI_API(piextUSMGetMemAllocInfo) + _PI_API(piextUSMEnqueuePrefetch) + _PI_API(piextUSMEnqueueFill2D) + _PI_API(piextUSMEnqueueMemset2D) + _PI_API(piextUSMEnqueueMemAdvise) + _PI_API(piMemRelease) + _PI_API(piMemImageCreate) + _PI_API(piMemImageGetInfo) + _PI_API(piextUSMEnqueueMemcpy2D) + _PI_API(piextEnqueueDeviceGlobalVariableWrite) + _PI_API(piextEnqueueDeviceGlobalVariableRead) + + _PI_API(piextUSMHostAlloc) + _PI_API(piextUSMDeviceAlloc) + _PI_API(piextUSMSharedAlloc) + _PI_API(piextUSMFree) + + _PI_API(piEnqueueKernelLaunch) + _PI_API(piEnqueueMemImageWrite) + _PI_API(piEnqueueMemImageRead) + _PI_API(piEnqueueMemBufferMap) + _PI_API(piEnqueueMemUnmap) + _PI_API(piEnqueueMemBufferFill) + _PI_API(piextUSMEnqueueMemset) + _PI_API(piEnqueueMemBufferCopyRect) + _PI_API(piEnqueueMemBufferCopy) + _PI_API(piextUSMEnqueueMemcpy) + _PI_API(piEnqueueMemBufferWriteRect) + _PI_API(piEnqueueMemBufferWrite) + _PI_API(piEnqueueMemBufferReadRect) + _PI_API(piEnqueueMemBufferRead) + _PI_API(piEnqueueEventsWaitWithBarrier) + _PI_API(piEnqueueEventsWait) + _PI_API(piEnqueueNativeKernel) + _PI_API(piEnqueueMemImageFill) + + _PI_API(piEventSetCallback) + _PI_API(piEventSetStatus) + _PI_API(piEventRetain) + _PI_API(piEventRelease) + _PI_API(piextEventCreateWithNativeHandle) + _PI_API(piEventsWait) + _PI_API(piEventGetInfo) + _PI_API(piextEventGetNativeHandle) + _PI_API(piEventGetProfilingInfo) + _PI_API(piEventCreate) + + _PI_API(piSamplerCreate) + _PI_API(piSamplerGetInfo) + _PI_API(piSamplerRetain) + _PI_API(piSamplerRelease) + + _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) return PI_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index ae7ae6375bea0..51fe4cf9c475b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -15,1572 +15,3 @@ // Define the static class field std::mutex ZeCall::GlobalLock; - -// Trace a call to Level-Zero RT -#define ZE_CALL(ZeName, ZeArgs) \ - { \ - ze_result_t ZeResult = ZeName ZeArgs; \ - if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \ - return ze2urResult(Result); \ - } - -ur_result_t _ur_platform_handle_t::initialize() { - // Cache driver properties - ZeStruct ZeDriverProperties; - ZE_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); - uint32_t DriverVersion = ZeDriverProperties.driverVersion; - // Intel 
Level-Zero GPU driver stores version as: - // | 31 - 24 | 23 - 16 | 15 - 0 | - // | Major | Minor | Build | - auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24); - auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16); - auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF); - ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild; - - ZE_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion)); - ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." + - std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); - - // Cache driver extension properties - uint32_t Count = 0; - ZE_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr)); - - std::vector ZeExtensions(Count); - - ZE_CALL(zeDriverGetExtensionProperties, - (ZeDriver, &Count, ZeExtensions.data())); - - for (auto &extension : ZeExtensions) { - // Check if global offset extension is available - if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, - strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) { - if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) { - ZeDriverGlobalOffsetExtensionFound = true; - } - } - // Check if extension is available for "static linking" (compiling multiple - // SPIR-V modules together into one Level Zero module). - if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME, - strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) { - if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) { - ZeDriverModuleProgramExtensionFound = true; - } - } - zeDriverExtensionMap[extension.name] = extension.version; - } - - // Check if import user ptr into USM feature has been requested. - // If yes, then set up L0 API pointers if the platform supports it. - ZeUSMImport.setZeUSMImport(this); - - return UR_RESULT_SUCCESS; -} - -ur_result_t urPlatformGet( - uint32_t NumEntries, ///< [in] the number of platforms to be added to - ///< phPlatforms. If phPlatforms is not NULL, then - ///< NumEntries should be greater than zero, otherwise - ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. - ur_platform_handle_t - *Platforms, ///< [out][optional][range(0, NumEntries)] array of handle - ///< of platforms. If NumEntries is less than the number of - ///< platforms available, then - ///< ::urPlatformGet shall only retrieve that number of - ///< platforms. - uint32_t *NumPlatforms ///< [out][optional] returns the total number of - ///< platforms available. -) { - static std::once_flag ZeCallCountInitialized; - try { - std::call_once(ZeCallCountInitialized, []() { - if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { - ZeCallCount = new std::map; - } - }); - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - // Setting these environment variables before running zeInit will enable the - // validation layer in the Level Zero loader. - if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - // Enable SYSMAN support for obtaining the PCI address - // and maximum memory bandwidth. - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - setEnvVar("ZES_ENABLE_SYSMAN", "1"); - } - - // TODO: We can still safely recover if something goes wrong during the init. - // Implement handling segfault using sigaction. - - // We must only initialize the driver once, even if piPlatformsGet() is called - // multiple times. Declaring the return value as "static" ensures it's only - // called once. 
- static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); - - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - PI_ASSERT(NumEntries != 0, UR_RESULT_ERROR_INVALID_VALUE); - if (NumPlatforms) - *NumPlatforms = 0; - return UR_RESULT_SUCCESS; - } - - if (ZeResult != ZE_RESULT_SUCCESS) { - urPrint("zeInit: Level Zero initialization failure\n"); - return ze2urResult(ZeResult); - } - - // Cache pi_platforms for reuse in the future - // It solves two problems; - // 1. sycl::platform equality issue; we always return the same pi_platform. - // 2. performance; we can save time by immediately return from cache. - // - - const std::lock_guard Lock{*PiPlatformsCacheMutex}; - if (!PiPlatformCachePopulated) { - try { - // Level Zero does not have concept of Platforms, but Level Zero driver is - // the closest match. - uint32_t ZeDriverCount = 0; - ZE_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); - if (ZeDriverCount == 0) { - PiPlatformCachePopulated = true; - } else { - std::vector ZeDrivers; - ZeDrivers.resize(ZeDriverCount); - - ZE_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); - for (uint32_t I = 0; I < ZeDriverCount; ++I) { - auto Platform = new ur_platform_handle_t_(ZeDrivers[I]); - // Save a copy in the cache for future uses. - PiPlatformsCache->push_back(Platform); - - ur_result_t Result = Platform->initialize(); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - } - PiPlatformCachePopulated = true; - } - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - } - - // Populate returned platforms from the cache. - if (Platforms) { - PI_ASSERT(NumEntries <= PiPlatformsCache->size(), - UR_RESULT_ERROR_INVALID_PLATFORM); - std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); - } - - if (NumPlatforms) { - if (*NumPlatforms == 0) - *NumPlatforms = PiPlatformsCache->size(); - else - *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)NumEntries); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urPlatformGetInfo( - ur_platform_handle_t Platform, ///< [in] handle of the platform - ur_platform_info_t ParamName, ///< [in] type of the info to retrieve - size_t Size, ///< [in] the number of bytes pointed to by pPlatformInfo. - void *ParamValue, ///< [out][optional] array of bytes holding the info. - ///< If Size is not equal to or greater to the real number - ///< of bytes needed to return the info then the - ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and - ///< pPlatformInfo is not used. - size_t *pSizeRet ///< [out][optional] pointer to the actual number of bytes - ///< being queried by pPlatformInfo. -) { - - PI_ASSERT(Platform, UR_RESULT_ERROR_INVALID_PLATFORM); - UrReturnHelper ReturnValue(Size, ParamValue, pSizeRet); - - switch (ParamName) { - case UR_PLATFORM_INFO_NAME: - // TODO: Query Level Zero driver when relevant info is added there. - return ReturnValue("Intel(R) oneAPI Unified Runtime over Level-Zero"); - case UR_PLATFORM_INFO_VENDOR_NAME: - // TODO: Query Level Zero driver when relevant info is added there. - return ReturnValue("Intel(R) Corporation"); - case UR_PLATFORM_INFO_EXTENSIONS: - // Convention adopted from OpenCL: - // "Returns a space-separated list of extension names (the extension - // names themselves do not contain any spaces) supported by the platform. - // Extensions defined here must be supported by all devices associated - // with this platform." 
- // - // TODO: Check the common extensions supported by all connected devices and - // return them. For now, hardcoding some extensions we know are supported by - // all Level Zero devices. - return ReturnValue(ZE_SUPPORTED_EXTENSIONS); - case UR_PLATFORM_INFO_PROFILE: - // TODO: figure out what this means and how is this used - return ReturnValue("FULL_PROFILE"); - case UR_PLATFORM_INFO_VERSION: - // TODO: this should query to zeDriverGetDriverVersion - // but we don't yet have the driver handle here. - // - // From OpenCL 2.1: "This version string has the following format: - // OpenCL. Follow the same notation here. - // - return ReturnValue(Platform->ZeDriverApiVersion.c_str()); - case UR_PLATFORM_INFO_BACKEND: - return ReturnValue(UR_PLATFORM_BACKEND_LEVEL_ZERO); - default: - urPrint("piPlatformGetInfo: unrecognized ParamName\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceGet( - ur_platform_handle_t Platform, ///< [in] handle of the platform instance - ur_device_type_t DeviceType, ///< [in] the type of the devices. - uint32_t NumEntries, ///< [in] the number of devices to be added to - ///< phDevices. If phDevices in not NULL then - ///< NumEntries should be greater than zero, otherwise - ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. - ur_device_handle_t - *Devices, ///< [out][optional][range(0, NumEntries)] array of handle of - ///< devices. If NumEntries is less than the number of devices - ///< available, then platform shall only retrieve that number - ///< of devices. - uint32_t *NumDevices ///< [out][optional] pointer to the number of devices. - ///< pNumDevices will be updated with the total number - ///< of devices available. - -) { - - PI_ASSERT(Platform, UR_RESULT_ERROR_INVALID_PLATFORM); - - auto Res = Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - // Filter available devices based on input DeviceType. - std::vector MatchedDevices; - std::shared_lock Lock(Platform->PiDevicesCacheMutex); - for (auto &D : Platform->PiDevicesCache) { - // Only ever return root-devices from piDevicesGet, but the - // devices cache also keeps sub-devices. - if (D->isSubDevice()) - continue; - - bool Matched = false; - switch (DeviceType) { - case UR_DEVICE_TYPE_ALL: - Matched = true; - break; - case UR_DEVICE_TYPE_GPU: - case UR_DEVICE_TYPE_DEFAULT: - Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU); - break; - case UR_DEVICE_TYPE_CPU: - Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU); - break; - case UR_DEVICE_TYPE_FPGA: - Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA; - break; - case UR_DEVICE_TYPE_MCA: - Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA; - break; - default: - Matched = false; - urPrint("Unknown device type"); - break; - } - if (Matched) - MatchedDevices.push_back(D.get()); - } - - uint32_t ZeDeviceCount = MatchedDevices.size(); - - auto N = std::min(ZeDeviceCount, NumEntries); - if (Devices) - std::copy_n(MatchedDevices.begin(), N, Devices); - - if (NumDevices) { - if (*NumDevices == 0) - *NumDevices = ZeDeviceCount; - else - *NumDevices = N; - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceGetInfo( - ur_device_handle_t Device, ///< [in] handle of the device instance - ur_device_info_t ParamName, ///< [in] type of the info to retrieve - size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. - void *ParamValue, ///< [out][optional] array of bytes holding the info. 
- ///< If propSize is not equal to or greater than the real - ///< number of bytes needed to return the info then the - ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and - ///< pDeviceInfo is not used. - size_t *pSize ///< [out][optional] pointer to the actual size in bytes of - ///< the queried infoType. -) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, ParamValue, pSize); - - ze_device_handle_t ZeDevice = Device->ZeDevice; - - switch ((int)ParamName) { - case UR_DEVICE_INFO_TYPE: { - switch (Device->ZeDeviceProperties->type) { - case ZE_DEVICE_TYPE_GPU: - return ReturnValue(UR_DEVICE_TYPE_GPU); - case ZE_DEVICE_TYPE_CPU: - return ReturnValue(UR_DEVICE_TYPE_CPU); - case ZE_DEVICE_TYPE_FPGA: - return ReturnValue(UR_DEVICE_TYPE_FPGA); - default: - urPrint("This device type is not supported\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } - case UR_DEVICE_INFO_PARENT_DEVICE: - return ReturnValue(Device->RootDevice); - case UR_DEVICE_INFO_PLATFORM: - return ReturnValue(Device->Platform); - case UR_DEVICE_INFO_VENDOR_ID: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->vendorId}); - case UR_DEVICE_INFO_UUID: { - // Intel extension for device UUID. This returns the UUID as - // std::array. For details about this extension, - // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md. - const auto &UUID = Device->ZeDeviceProperties->uuid.id; - return ReturnValue(UUID, sizeof(UUID)); - } - case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); - case UR_DEVICE_INFO_EXTENSIONS: { - // Convention adopted from OpenCL: - // "Returns a space separated list of extension names (the extension - // names themselves do not contain any spaces) supported by the device." - // - // TODO: Use proper mechanism to get this information from Level Zero after - // it is added to Level Zero. - // Hardcoding the few we know are supported by the current hardware. - // - // - std::string SupportedExtensions; - - // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core - // feature in >OpenCL 2.1 - // cl_khr_subgroups - Extension adds support for implementation-controlled - // subgroups. - // cl_intel_subgroups - Extension adds subgroup features, defined by Intel. - // cl_intel_subgroups_short - Extension adds subgroup functions described in - // the cl_intel_subgroups extension to support 16-bit integer data types - // for performance. - // cl_intel_required_subgroup_size - Extension to allow programmers to - // optionally specify the required subgroup size for a kernel function. - // cl_khr_fp16 - Optional half floating-point support. - // cl_khr_fp64 - Support for double floating-point precision. - // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional - // extensions that implement atomic operations on 64-bit signed and - // unsigned integers to locations in __global and __local memory. - // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory - // objects. - // - // Hardcoding some extensions we know are supported by all Level Zero - // devices. 
- SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS); - if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16) - SupportedExtensions += ("cl_khr_fp16 "); - if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64) - SupportedExtensions += ("cl_khr_fp64 "); - if (Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS) - // int64AtomicsSupported indicates support for both. - SupportedExtensions += - ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics "); - if (Device->ZeDeviceImageProperties->maxImageDims3D > 0) - // Supports reading and writing of images. - SupportedExtensions += ("cl_khr_3d_image_writes "); - - // L0 does not tell us if bfloat16 is supported. - // For now, assume ATS and PVC support it. - // TODO: change the way we detect bfloat16 support. - if ((Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 || - (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0) - SupportedExtensions += ("cl_intel_bfloat16_conversions "); - - return ReturnValue(SupportedExtensions.c_str()); - } - case UR_DEVICE_INFO_NAME: - return ReturnValue(Device->ZeDeviceProperties->name); - // zeModuleCreate allows using root device module for sub-devices: - // > The application must only use the module for the device, or its - // > sub-devices, which was provided during creation. - case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: - return ReturnValue(uint32_t{0}); - case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(uint32_t{1}); - case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(uint32_t{1}); - case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - uint32_t MaxComputeUnits = - Device->ZeDeviceProperties->numEUsPerSubslice * - Device->ZeDeviceProperties->numSubslicesPerSlice * - Device->ZeDeviceProperties->numSlices; - - bool RepresentsCSlice = - Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeIndex >= 0; - if (RepresentsCSlice) - MaxComputeUnits /= Device->RootDevice->SubDevices.size(); - - return ReturnValue(uint32_t{MaxComputeUnits}); - } - case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: - // Level Zero spec defines only three dimensions - return ReturnValue(uint32_t{3}); - case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); - case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - struct { - size_t Arr[3]; - } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; - return ReturnValue(MaxGroupSize); - } - case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - struct { - size_t Arr[3]; - } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, - Device->ZeDeviceComputeProperties->maxGroupCountY, - Device->ZeDeviceComputeProperties->maxGroupCountZ}}; - return ReturnValue(MaxGroupCounts); - } - case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->coreClockRate}); - case UR_DEVICE_INFO_ADDRESS_BITS: { - // TODO: To confirm with spec. - return ReturnValue(uint32_t{64}); - } - case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); - case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - uint64_t GlobalMemSize = 0; - // Support to read physicalSize depends on kernel, - // so fallback into reading totalSize if physicalSize - // is not available. 
- for (const auto &ZeDeviceMemoryExtProperty : - Device->ZeDeviceMemoryProperties->second) { - GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize; - } - if (GlobalMemSize == 0) { - for (const auto &ZeDeviceMemoryProperty : - Device->ZeDeviceMemoryProperties->first) { - GlobalMemSize += ZeDeviceMemoryProperty.totalSize; - } - } - return ReturnValue(uint64_t{GlobalMemSize}); - } - case UR_DEVICE_INFO_LOCAL_MEM_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); - case UR_DEVICE_INFO_IMAGE_SUPPORTED: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); - case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); - case UR_DEVICE_INFO_AVAILABLE: - return ReturnValue(uint32_t{ZeDevice ? true : false}); - case UR_DEVICE_INFO_VENDOR: - // TODO: Level-Zero does not return vendor's name at the moment - // only the ID. - return ReturnValue("Intel(R) Corporation"); - case UR_DEVICE_INFO_DRIVER_VERSION: - return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); - case UR_DEVICE_INFO_VERSION: - return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); - case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - return ReturnValue((uint32_t)Device->SubDevices.size()); - } - case UR_DEVICE_INFO_REFERENCE_COUNT: - return ReturnValue(uint32_t{Device->RefCount.load()}); - case UR_DEVICE_INFO_PARTITION_PROPERTIES: { - // SYCL spec says: if this SYCL device cannot be partitioned into at least - // two sub devices then the returned vector must be empty. - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - uint32_t ZeSubDeviceCount = Device->SubDevices.size(); - if (ZeSubDeviceCount < 2) { - return ReturnValue((ur_device_partition_property_t)0); - } - bool PartitionedByCSlice = Device->SubDevices[0]->isCCS(); - - auto ReturnHelper = [&](auto... Partitions) { - struct { - ur_device_partition_property_t Arr[sizeof...(Partitions) + 1]; - } PartitionProperties = { - {Partitions..., ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - }; - - if (ExposeCSliceInAffinityPartitioning) { - if (PartitionedByCSlice) - return ReturnHelper(UR_DEVICE_PARTITION_BY_CSLICE, - UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - - else - return ReturnHelper(UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } else { - return ReturnHelper(PartitionedByCSlice - ? UR_DEVICE_PARTITION_BY_CSLICE - : UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); - } - break; - } - case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: - return ReturnValue(ur_device_affinity_domain_flag_t( - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA | - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE)); - case UR_DEVICE_INFO_PARTITION_TYPE: { - // For root-device there is no partitioning to report. 
- if (!Device->isSubDevice()) - return ReturnValue(ur_device_partition_property_t(0)); - - if (Device->isCCS()) { - struct { - ur_device_partition_property_t Arr[2]; - } PartitionProperties = { - {UR_DEVICE_PARTITION_BY_CSLICE, ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - } - - struct { - ur_device_partition_property_t Arr[3]; - } PartitionProperties = { - {UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - (ur_device_partition_property_t) - UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE, - ur_device_partition_property_t(0)}}; - return ReturnValue(PartitionProperties); - } - - // Everything under here is not supported yet - - case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: - return ReturnValue(""); - case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(uint32_t{true}); - case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); - case UR_DEVICE_INFO_PROFILE: - return ReturnValue("FULL_PROFILE"); - case UR_DEVICE_INFO_BUILT_IN_KERNELS: - // TODO: To find out correct value - return ReturnValue(""); - case UR_DEVICE_INFO_QUEUE_PROPERTIES: - return ReturnValue( - ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | - UR_QUEUE_FLAG_PROFILING_ENABLE)); - case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: - return ReturnValue(ur_device_exec_capability_flag_t{ - UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); - case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(uint32_t{true}); - case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC}); - case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); - case UR_DEVICE_INFO_LOCAL_MEM_TYPE: - return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); - case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: - return ReturnValue(uint32_t{64}); - case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: - return ReturnValue( - uint64_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: - return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: - return ReturnValue( - // TODO[1.0]: how to query cache line-size? - uint32_t{1}); - case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceCacheProperties->cacheSize}); - case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize}); - case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: - // SYCL/OpenCL spec is vague on what this means exactly, but seems to - // be for "alignment requirement (in bits) for sub-buffer offsets." - // An OpenCL implementation returns 8*128, but Level Zero can do just 8, - // meaning unaligned access for values of types larger than 8 bits. 
- return ReturnValue(uint32_t{8}); - case UR_DEVICE_INFO_MAX_SAMPLERS: - return ReturnValue(uint32_t{Device->ZeDeviceImageProperties->maxSamplers}); - case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxReadImageArgs}); - case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: - return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); - case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - uint64_t SingleFPValue = 0; - ze_device_fp_flags_t ZeSingleFPCapabilities = - Device->ZeDeviceModuleProperties->fp32flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { - SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - } - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { - SingleFPValue |= - UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{SingleFPValue}); - } - case UR_DEVICE_INFO_HALF_FP_CONFIG: { - uint64_t HalfFPValue = 0; - ze_device_fp_flags_t ZeHalfFPCapabilities = - Device->ZeDeviceModuleProperties->fp16flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - } - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { - HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{HalfFPValue}); - } - case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - uint64_t DoubleFPValue = 0; - ze_device_fp_flags_t ZeDoubleFPCapabilities = - Device->ZeDeviceModuleProperties->fp64flags; - if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; - } - if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; - } - if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; - } - if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - 
} - if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { - DoubleFPValue |= - UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - } - return ReturnValue(uint64_t{DoubleFPValue}); - } - case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); - case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: - return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); - case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); - case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: - return ReturnValue( - size_t{Device->ZeDeviceImageProperties->maxImageArraySlices}); - // Handle SIMD widths. - // TODO: can we do better than this? - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); - case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: - case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: - return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); - case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); - uint32_t MinSubGroupSize = - Device->ZeDeviceComputeProperties->subGroupSizes[0]; - for (uint32_t I = 1; - I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) { - if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I]) - MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I]; - } - return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize / - MinSubGroupSize); - } - case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // TODO: Not supported yet. Needs to be updated after support is added. - return ReturnValue(uint32_t{false}); - } - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the - // expected return is size_t datatype. size_t can be 8 bytes of data. - return ReturnValue.template operator()( - Device->ZeDeviceComputeProperties->subGroupSizes, - Device->ZeDeviceComputeProperties->numSubGroupSizes); - } - case UR_DEVICE_INFO_IL_VERSION: { - // Set to a space separated list of IL version strings of the form - // _.. 
- // "SPIR-V" is a required IL prefix when the cl_khr_il_program extension is
- // reported.
- uint32_t SpirvVersion =
- Device->ZeDeviceModuleProperties->spirvVersionSupported;
- uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion);
- uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion);
-
- char SpirvVersionString[50];
- int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor,
- SpirvVersionMinor);
- // The returned string contains only Len characters.
- std::string ILVersion(SpirvVersionString, Len);
- return ReturnValue(ILVersion.c_str());
- }
- case UR_DEVICE_INFO_USM_HOST_SUPPORT:
- case UR_DEVICE_INFO_USM_DEVICE_SUPPORT:
- case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
- case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
- case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
- auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) {
- uint64_t Capabilities = 0;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW)
- Capabilities |= UR_EXT_USM_CAPS_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC)
- Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT)
- Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS;
- if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC)
- Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS;
- return Capabilities;
- };
- auto &Props = Device->ZeDeviceMemoryAccessProperties;
- switch (ParamName) {
- case UR_DEVICE_INFO_USM_HOST_SUPPORT:
- return ReturnValue(MapCaps(Props->hostAllocCapabilities));
- case UR_DEVICE_INFO_USM_DEVICE_SUPPORT:
- return ReturnValue(MapCaps(Props->deviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities));
- case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
- return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities));
- default:
- die("piDeviceGetInfo: unexpected ParamName.");
- }
- }
-
- // intel extensions for GPU information
- case UR_DEVICE_INFO_DEVICE_ID:
- return ReturnValue(uint32_t{Device->ZeDeviceProperties->deviceId});
- case UR_DEVICE_INFO_PCI_ADDRESS: {
- if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
- urPrint("Set ZES_ENABLE_SYSMAN=1 to obtain PCI data.\n");
- return UR_RESULT_ERROR_INVALID_VALUE;
- }
- ZesStruct<zes_pci_properties_t> ZeDevicePciProperties;
- ZE_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties));
- constexpr size_t AddressBufferSize = 13;
- char AddressBuffer[AddressBufferSize];
- std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x",
- ZeDevicePciProperties.address.domain,
- ZeDevicePciProperties.address.bus,
- ZeDevicePciProperties.address.device,
- ZeDevicePciProperties.address.function);
- return ReturnValue(AddressBuffer);
- }
-
- case UR_EXT_DEVICE_INFO_FREE_MEMORY: {
- if (getenv("ZES_ENABLE_SYSMAN") == nullptr) {
- setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory",
- UR_RESULT_SUCCESS);
- return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR;
- }
- // Only report device memory which zeMemAllocDevice can allocate from.
- // Currently this is only the one enumerated with ordinal 0.
- uint64_t FreeMemory = 0;
- uint32_t MemCount = 0;
- ZE_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr));
- if (MemCount != 0) {
- std::vector<zes_mem_handle_t> ZesMemHandles(MemCount);
- ZE_CALL(zesDeviceEnumMemoryModules,
- (ZeDevice, &MemCount, ZesMemHandles.data()));
- for (auto &ZesMemHandle : ZesMemHandles) {
- ZesStruct<zes_mem_properties_t> ZesMemProperties;
- ZE_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties));
- // For the root-device report memory from all memory modules since that
- // is what is totally available in the default implicit scaling mode.
- // For sub-devices only report memory local to them.
- if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId ==
- ZesMemProperties.subdeviceId) {
-
- ZesStruct<zes_mem_state_t> ZesMemState;
- ZE_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState));
- FreeMemory += ZesMemState.free;
- }
- }
- }
- return ReturnValue(FreeMemory);
- }
- case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
- // If there are no memory modules, then return 0.
- if (Device->ZeDeviceMemoryProperties->first.empty())
- return ReturnValue(uint32_t{0});
-
- // If there are multiple memory modules on the device, then we have to
- // report the value of the slowest memory.
- auto Comp = [](const ze_device_memory_properties_t &A,
- const ze_device_memory_properties_t &B) -> bool {
- return A.maxClockRate < B.maxClockRate;
- };
- auto MinIt =
- std::min_element(Device->ZeDeviceMemoryProperties->first.begin(),
- Device->ZeDeviceMemoryProperties->first.end(), Comp);
- return ReturnValue(uint32_t{MinIt->maxClockRate});
- }
- case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: {
- // If there are no memory modules, then return 0.
- if (Device->ZeDeviceMemoryProperties->first.empty())
- return ReturnValue(uint32_t{0});
-
- // If there are multiple memory modules on the device, then we have to
- // report the value of the slowest memory.
- auto Comp = [](const ze_device_memory_properties_t &A,
- const ze_device_memory_properties_t &B) -> bool {
- return A.maxBusWidth < B.maxBusWidth;
- };
- auto MinIt =
- std::min_element(Device->ZeDeviceMemoryProperties->first.begin(),
- Device->ZeDeviceMemoryProperties->first.end(), Comp);
- return ReturnValue(uint32_t{MinIt->maxBusWidth});
- }
- case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: {
- if (Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
- .ZeIndex >= 0)
- // Sub-sub-device represents a particular compute index already.
- return ReturnValue(int32_t{1}); - - auto ZeDeviceNumIndices = - Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties.numQueues; - return ReturnValue(int32_t(ZeDeviceNumIndices)); - } break; - case UR_DEVICE_INFO_GPU_EU_COUNT: { - uint32_t count = Device->ZeDeviceProperties->numEUsPerSubslice * - Device->ZeDeviceProperties->numSubslicesPerSlice * - Device->ZeDeviceProperties->numSlices; - return ReturnValue(uint32_t{count}); - } - case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - return ReturnValue( - uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - case UR_EXT_DEVICE_INFO_GPU_SLICES: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); - case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - return ReturnValue( - uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); - case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); - case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); - case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: - // currently not supported in level zero runtime - return UR_RESULT_ERROR_INVALID_VALUE; - case UR_DEVICE_INFO_BFLOAT16: { - // bfloat16 math functions are not yet supported on Intel GPUs. - return ReturnValue(bool{false}); - } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_scope_capability_flags_t result = - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; - - return ReturnValue(result); - } - case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_order_capability_flags_t result = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | - UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; - - return ReturnValue(result); - } - case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // There are no explicit restrictions in L0 programming guide, so assume all - // are supported - ur_memory_scope_capability_flags_t result = - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; - - return ReturnValue(result); - } - - case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - ur_memory_order_capability_flags_t capabilities = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | - UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; - return ReturnValue(capabilities); - } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); - case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(pi_bool{false}); - - // TODO: Implement. 
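// A minimal usage sketch of the two-call getInfo protocol that the switch
// above serves: callers first ask only for the required size, then pass a
// buffer of that size. It assumes the urDeviceGetInfo(device, param, size,
// value, size_ret) entry point declared in ur_api.h; the helper name is
// illustrative only.
#include <string>
#include <vector>
#include <ur_api.h>

static std::string queryDeviceName(ur_device_handle_t Device) {
  size_t Size = 0;
  // First call: query the size needed for UR_DEVICE_INFO_NAME.
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_NAME, 0, nullptr, &Size) !=
      UR_RESULT_SUCCESS)
    return {};
  std::vector<char> Name(Size);
  // Second call: fill the caller-provided buffer.
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_NAME, Size, Name.data(),
                      nullptr) != UR_RESULT_SUCCESS)
    return {};
  return std::string(Name.data());
}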
- default: - urPrint("Unsupported ParamName in piGetDeviceInfo\n"); - urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - return UR_RESULT_SUCCESS; -} - -// UR_L0_USE_COPY_ENGINE can be set to an integer value, or -// a pair of integer values of the form "lower_index:upper_index". -// Here, the indices point to copy engines in a list of all available copy -// engines. -// This functions returns this pair of indices. -// If the user specifies only a single integer, a value of 0 indicates that -// the copy engines will not be used at all. A value of 1 indicates that all -// available copy engines can be used. -const std::pair -getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { - const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); - static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - - // If the environment variable is not set, no copy engines are used when - // immediate commandlists are being used. For standard commandlists all are - // used. - if (!EnvVar) { - if (Device->useImmediateCommandLists()) - return std::pair(0, 0); // Only main copy engine will be used. - return std::pair(0, INT_MAX); // All copy engines will be used. - } - std::string CopyEngineRange = EnvVar; - // Environment variable can be a single integer or a pair of integers - // separated by ":" - auto pos = CopyEngineRange.find(":"); - if (pos == std::string::npos) { - bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0); - if (UseCopyEngine) - return std::pair(0, INT_MAX); // All copy engines can be used. - return std::pair(-1, -1); // No copy engines will be used. - } - int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos)); - int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); - if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || - (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { - urPrint("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " - "default set.\n"); - LowerCopyEngineIndex = 0; - UpperCopyEngineIndex = INT_MAX; - } - return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex); -} - -bool CopyEngineRequested(const ur_device_handle_t &Device) { - int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first; - int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second; - return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1)); -} - -// Whether immediate commandlists will be used for kernel launches and copies. -// The default is standard commandlists. Setting 1 or 2 specifies use of -// immediate commandlists. - -// Get value of immediate commandlists env var setting or -1 if unset. -_ur_device_handle_t::ImmCmdlistMode -_ur_device_handle_t::useImmediateCommandLists() { - // If immediate commandlist setting is not explicitly set, then use the device - // default. - static const int ImmediateCommandlistsSetting = [] { - char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); - char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); - const char *ImmediateCommandlistsSettingStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (!ImmediateCommandlistsSettingStr) - return -1; - return std::stoi(ImmediateCommandlistsSettingStr); - }(); - - if (ImmediateCommandlistsSetting == -1) - // Immediate command lists will be used by default only on Linux PVC. -#ifdef _WIN32 - return NotUsed; -#else - return isPVC() ? 
PerQueue : NotUsed; -#endif - - switch (ImmediateCommandlistsSetting) { - case 0: - return NotUsed; - case 1: - return PerQueue; - case 2: - return PerThreadPerQueue; - default: - return NotUsed; - } -} - -// Get value of device scope events env var setting or default setting -static const EventsScope DeviceEventsSetting = [] { - char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); - char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); - const char *DeviceEventsSettingStr = - UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (DeviceEventsSettingStr) { - // Override the default if user has explicitly chosen the events scope. - switch (std::stoi(DeviceEventsSettingStr)) { - case 0: - return AllHostVisible; - case 1: - return OnDemandHostVisibleProxy; - case 2: - return LastCommandInBatchHostVisible; - default: - // fallthrough to default setting - break; - } - } - // This is our default setting, which is expected to be the fastest - // with the modern GPU drivers. - return AllHostVisible; -}(); - -ur_result_t _ur_device_handle_t::initialize(int SubSubDeviceOrdinal, - int SubSubDeviceIndex) { - - // Maintain various device properties cache. - // Note that we just describe here how to compute the data. - // The real initialization is upon first access. - // - auto ZeDevice = this->ZeDevice; - ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceComputeProperties.Compute = - [ZeDevice](ze_device_compute_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceImageProperties.Compute = - [ZeDevice](ze_device_image_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceModuleProperties.Compute = - [ZeDevice](ze_device_module_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties)); - }; - - ZeDeviceMemoryProperties.Compute = - [ZeDevice]( - std::pair>, - std::vector>> - &Properties) { - uint32_t Count = 0; - ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, - (ZeDevice, &Count, nullptr)); - - auto &PropertiesVector = Properties.first; - auto &PropertiesExtVector = Properties.second; - - PropertiesVector.resize(Count); - PropertiesExtVector.resize(Count); - // Request for extended memory properties be read in - for (uint32_t I = 0; I < Count; ++I) - PropertiesVector[I].pNext = (void *)&PropertiesExtVector[I]; - - ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, - (ZeDevice, &Count, PropertiesVector.data())); - }; - - ZeDeviceMemoryAccessProperties.Compute = - [ZeDevice](ze_device_memory_access_properties_t &Properties) { - ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties, - (ZeDevice, &Properties)); - }; - - ZeDeviceCacheProperties.Compute = - [ZeDevice](ze_device_cache_properties_t &Properties) { - // TODO: Since v1.0 there can be multiple cache properties. - // For now remember the first one, if any. 
- uint32_t Count = 0; - ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, - (ZeDevice, &Count, nullptr)); - if (Count > 0) - Count = 1; - ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, - (ZeDevice, &Count, &Properties)); - }; - - ImmCommandListUsed = this->useImmediateCommandLists(); - - if (ImmCommandListUsed == ImmCmdlistMode::NotUsed) { - ZeEventsScope = DeviceEventsSetting; - } - - uint32_t numQueueGroups = 0; - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); - std::vector> - QueueGroupProperties(numQueueGroups); - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); - - // Initialize ordinal and compute queue group properties - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - i; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties = QueueGroupProperties[i]; - break; - } - } - - // Reinitialize a sub-sub-device with its own ordinal, index. - // Our sub-sub-device representation is currently [Level-Zero sub-device - // handle + Level-Zero compute group/engine index]. Only the specified - // index queue will be used to submit work to the sub-sub-device. - if (SubSubDeviceOrdinal >= 0) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - SubSubDeviceOrdinal; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = - SubSubDeviceIndex; - } else { // Proceed with initialization for root and sub-device - // How is it possible that there are no "compute" capabilities? - if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < - 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - - if (CopyEngineRequested((ur_device_handle_t)this)) { - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (((QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && - (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { - if (QueueGroupProperties[i].numQueues == 1) { - QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::MainCopy].ZeProperties = - QueueGroupProperties[i]; - } else { - QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = - QueueGroupProperties[i]; - break; - } - } - } - if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) - urPrint("NOTE: main blitter/copy engine is not available\n"); - else - urPrint("NOTE: main blitter/copy engine is available\n"); - - if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) - urPrint("NOTE: link blitter/copy engines are not available\n"); - else - urPrint("NOTE: link blitter/copy engines are available\n"); - } - } - - return UR_RESULT_SUCCESS; -} - -// Get the cached PI device created for the L0 device handle. -// Return NULL if no such PI device found. 
-ur_device_handle_t -_ur_platform_handle_t::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { - - ur_result_t Res = populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return nullptr; - } - - // TODO: our sub-sub-device representation is currently [Level-Zero device - // handle + Level-Zero compute group/engine index], so there is now no 1:1 - // mapping from L0 device handle to PI device assumed in this function. Until - // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we - // filter out PI sub-sub-devices. - std::shared_lock Lock(PiDevicesCacheMutex); - auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(), - [&](std::unique_ptr &D) { - return D.get()->ZeDevice == ZeDevice && - (D.get()->RootDevice == nullptr || - D.get()->RootDevice->RootDevice == nullptr); - }); - if (it != PiDevicesCache.end()) { - return (*it).get(); - } - return nullptr; -} - -// Check the device cache and load it if necessary. -ur_result_t _ur_platform_handle_t::populateDeviceCacheIfNeeded() { - std::scoped_lock Lock(PiDevicesCacheMutex); - - if (DeviceCachePopulated) { - return UR_RESULT_SUCCESS; - } - - uint32_t ZeDeviceCount = 0; - ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr)); - - try { - std::vector ZeDevices(ZeDeviceCount); - ZE_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data())); - - for (uint32_t I = 0; I < ZeDeviceCount; ++I) { - std::unique_ptr Device( - new ur_device_handle_t_(ZeDevices[I], (ur_platform_handle_t)this)); - auto Result = Device->initialize(); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - - // Additionally we need to cache all sub-devices too, such that they - // are readily visible to the piextDeviceCreateWithNativeHandle. - // - uint32_t SubDevicesCount = 0; - ZE_CALL(zeDeviceGetSubDevices, - (Device->ZeDevice, &SubDevicesCount, nullptr)); - - auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount]; - ZE_CALL(zeDeviceGetSubDevices, - (Device->ZeDevice, &SubDevicesCount, ZeSubdevices)); - - // Wrap the Level Zero sub-devices into PI sub-devices, and add them to - // cache. - for (uint32_t I = 0; I < SubDevicesCount; ++I) { - std::unique_ptr PiSubDevice( - new ur_device_handle_t_(ZeSubdevices[I], (ur_platform_handle_t)this, - Device.get())); - auto Result = PiSubDevice->initialize(); - if (Result != UR_RESULT_SUCCESS) { - delete[] ZeSubdevices; - return Result; - } - - // collect all the ordinals for the sub-sub-devices - std::vector Ordinals; - - uint32_t numQueueGroups = 0; - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (PiSubDevice->ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - std::vector QueueGroupProperties( - numQueueGroups); - ZE_CALL(zeDeviceGetCommandQueueGroupProperties, - (PiSubDevice->ZeDevice, &numQueueGroups, - QueueGroupProperties.data())); - - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE && - QueueGroupProperties[i].numQueues > 1) { - Ordinals.push_back(i); - } - } - - // If isn't PVC, then submissions to different CCS can be executed on - // the same EUs still, so we cannot treat them as sub-sub-devices. - if (PiSubDevice->isPVC() || ExposeCSliceInAffinityPartitioning) { - // Create PI sub-sub-devices with the sub-device for all the ordinals. - // Each {ordinal, index} points to a specific CCS which constructs - // a sub-sub-device at this point. 
- // - // FIXME: Level Zero creates multiple PiDevices for a single physical - // device when sub-device is partitioned into sub-sub-devices. - // Sub-sub-device is technically a command queue and we should not - // build program for each command queue. PiDevice is probably not the - // right abstraction for a Level Zero command queue. - for (uint32_t J = 0; J < Ordinals.size(); ++J) { - for (uint32_t K = 0; - K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { - std::unique_ptr PiSubSubDevice( - new ur_device_handle_t_(ZeSubdevices[I], - (ur_platform_handle_t)this, - PiSubDevice.get())); - auto Result = PiSubSubDevice->initialize(Ordinals[J], K); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } - - // save pointers to sub-sub-devices for quick retrieval in the - // future. - PiSubDevice->SubDevices.push_back(PiSubSubDevice.get()); - PiDevicesCache.push_back(std::move(PiSubSubDevice)); - } - } - } - - // save pointers to sub-devices for quick retrieval in the future. - Device->SubDevices.push_back(PiSubDevice.get()); - PiDevicesCache.push_back(std::move(PiSubDevice)); - } - delete[] ZeSubdevices; - - // Save the root device in the cache for future uses. - PiDevicesCache.push_back(std::move(Device)); - } - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - DeviceCachePopulated = true; - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceRetain(ur_device_handle_t Device) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - - // The root-device ref-count remains unchanged (always 1). - if (Device->isSubDevice()) { - Device->RefCount.increment(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urDeviceRelease(ur_device_handle_t Device) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - - // Root devices are destroyed during the piTearDown process. - if (Device->isSubDevice()) { - if (Device->RefCount.decrementAndTest()) { - delete Device; - } - } - - return UR_RESULT_SUCCESS; -} - -void ZeUSMImportExtension::setZeUSMImport(_ur_platform_handle_t *Platform) { - // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting - // host ptr import during buffer creation. - const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); - if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0) - return; - - // Check if USM hostptr import feature is available. - ze_driver_handle_t DriverHandle = Platform->ZeDriver; - if (ZE_CALL_NOCHECK( - zeDriverGetExtensionFunctionAddress, - (DriverHandle, "zexDriverImportExternalPointer", - reinterpret_cast(&zexDriverImportExternalPointer))) == 0) { - ZE_CALL_NOCHECK( - zeDriverGetExtensionFunctionAddress, - (DriverHandle, "zexDriverReleaseImportedPointer", - reinterpret_cast(&zexDriverReleaseImportedPointer))); - // Hostptr import/release is turned on because it has been requested - // by the env var, and this platform supports the APIs. - Enabled = true; - // Hostptr import is only possible if piMemBufferCreate receives a - // hostptr as an argument. The SYCL runtime passes a host ptr - // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on. 
- setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1"); - } -} -void ZeUSMImportExtension::doZeUSMImport(ze_driver_handle_t DriverHandle, - void *HostPtr, size_t Size) { - ZE_CALL_NOCHECK(zexDriverImportExternalPointer, - (DriverHandle, HostPtr, Size)); -} -void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, - void *HostPtr) { - ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); -} - -ur_result_t urDevicePartition( - ur_device_handle_t Device, ///< [in] handle of the device to partition. - const ur_device_partition_property_t - *Properties, ///< [in] null-terminated array of <$_device_partition_t - ///< enum, value> pairs. - uint32_t NumDevices, ///< [in] the number of sub-devices. - ur_device_handle_t - *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle - ///< of devices. If NumDevices is less than the number of - ///< sub-devices available, then the function shall only - ///< retrieve that number of sub-devices. - uint32_t *pNumDevicesRet ///< [out][optional] pointer to the number of - ///< sub-devices the device can be partitioned into - ///< according to the partitioning property. -) { - PI_ASSERT(Device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Other partitioning ways are not supported by Level Zero - if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { - if ((Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && - Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { - if (Properties[1] != 0) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Devices cache is normally created in piDevicesGet but still make - // sure that cache is populated. - // - auto Res = Device->Platform->populateDeviceCacheIfNeeded(); - if (Res != UR_RESULT_SUCCESS) { - return Res; - } - - auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { - if (Device->SubDevices.size() == 0) - return 0; - - // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. - // However, if - // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that - // still expose CSlices in partitioning by affinity domain for compatibility - // reasons. - if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && - !ExposeCSliceInAffinityPartitioning) { - if (Device->isSubDevice()) { - return 0; - } - } - if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { - // Not a CSlice-based partitioning. - if (!Device->SubDevices[0]->isCCS()) { - return 0; - } - } - - return Device->SubDevices.size(); - }(); - - // TODO: Consider support for partitioning to <= total sub-devices. - // Currently supported partitioning (by affinity domain/numa) would always - // partition to all sub-devices. 
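// A small sketch of the call sequence urDevicePartition expects: a
// zero-terminated <property, value> array, one call to learn the sub-device
// count, then a second call to fetch the handles (each one is retained by the
// implementation). The wrapper name is illustrative; the entry-point
// signature is the one declared in this hunk.
#include <vector>
#include <ur_api.h>

static ur_result_t partitionByAffinity(ur_device_handle_t Device,
                                       std::vector<ur_device_handle_t> &Subs) {
  const ur_device_partition_property_t Props[] = {
      UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN,
      (ur_device_partition_property_t)
          UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE,
      ur_device_partition_property_t(0)}; // zero terminator

  uint32_t Count = 0;
  // First call: only query how many sub-devices this partitioning yields.
  ur_result_t Res = urDevicePartition(Device, Props, 0, nullptr, &Count);
  if (Res != UR_RESULT_SUCCESS || Count == 0)
    return Res;

  Subs.resize(Count);
  // Second call: retrieve the sub-device handles.
  return urDevicePartition(Device, Props, Count, Subs.data(), nullptr);
}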
- // - if (NumDevices != 0) - PI_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); - - for (uint32_t I = 0; I < NumDevices; I++) { - OutDevices[I] = Device->SubDevices[I]; - // reusing the same pi_device needs to increment the reference count - urDeviceRetain(OutDevices[I]); - } - - if (pNumDevicesRet) { - *pNumDevicesRet = EffectiveNumDevices; - } - return UR_RESULT_SUCCESS; -} - -ur_result_t urInit([[maybe_unused]] ur_device_init_flags_t device_flags) { - return UR_RESULT_SUCCESS; -} - -ur_result_t urTearDown([[maybe_unused]] void *pParams) { - return UR_RESULT_SUCCESS; -} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp index ed815806a2258..5095e168a4a3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp @@ -21,190 +21,12 @@ #include #include "ur_level_zero_common.hpp" - -struct _ur_platform_handle_t; -// using ur_platform_handle_t = _ur_platform_handle_t *; -struct _ur_device_handle_t; -// using ur_device_handle_t = _ur_device_handle_t *; - -struct _ur_platform_handle_t : public _ur_platform { - _ur_platform_handle_t(ze_driver_handle_t Driver) : ZeDriver{Driver} {} - // Performs initialization of a newly constructed PI platform. - ur_result_t initialize(); - - // Level Zero lacks the notion of a platform, but there is a driver, which is - // a pretty good fit to keep here. - ze_driver_handle_t ZeDriver; - - // Cache versions info from zeDriverGetProperties. - std::string ZeDriverVersion; - std::string ZeDriverApiVersion; - ze_api_version_t ZeApiVersion; - - // Cache driver extensions - std::unordered_map zeDriverExtensionMap; - - // Flags to tell whether various Level Zero platform extensions are available. - bool ZeDriverGlobalOffsetExtensionFound{false}; - bool ZeDriverModuleProgramExtensionFound{false}; - - // Cache UR devices for reuse - std::vector> PiDevicesCache; - ur_shared_mutex PiDevicesCacheMutex; - bool DeviceCachePopulated = false; - - // Check the device cache and load it if necessary. - ur_result_t populateDeviceCacheIfNeeded(); - - // Return the PI device from cache that represents given native device. - // If not found, then nullptr is returned. - ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); -}; - -enum EventsScope { - // All events are created host-visible. - AllHostVisible, - // All events are created with device-scope and only when - // host waits them or queries their status that a proxy - // host-visible event is created and set to signal after - // original event signals. - OnDemandHostVisibleProxy, - // All events are created with device-scope and only - // when a batch of commands is submitted for execution a - // last command in that batch is added to signal host-visible - // completion of each command in this batch (the default mode). - LastCommandInBatchHostVisible -}; - -struct _ur_device_handle_t : _ur_object { - _ur_device_handle_t(ze_device_handle_t Device, ur_platform_handle_t Plt, - ur_device_handle_t ParentDevice = nullptr) - : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, - ZeDeviceProperties{}, ZeDeviceComputeProperties{} { - // NOTE: one must additionally call initialize() to complete - // UR device creation. - } - - // The helper structure that keeps info about a command queue groups of the - // device. It is not changed after it is initialized. 
- struct queue_group_info_t { - enum type { - MainCopy, - LinkCopy, - Compute, - Size // must be last - }; - - // Keep the ordinal of the commands group as returned by - // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that - // there is no such queue group available in the Level Zero runtime. - int32_t ZeOrdinal{-1}; - - // Keep the index of the specific queue in this queue group where - // all the command enqueues of the corresponding type should go to. - // The value of "-1" means that no hard binding is defined and - // implementation can choose specific queue index on its own. - int32_t ZeIndex{-1}; - - // Keeps the queue group properties. - ZeStruct ZeProperties; - }; - - std::vector QueueGroup = - std::vector(queue_group_info_t::Size); - - // This returns "true" if a main copy engine is available for use. - bool hasMainCopyEngine() const { - return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0; - } - - // This returns "true" if a link copy engine is available for use. - bool hasLinkCopyEngine() const { - return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0; - } - - // This returns "true" if a main or link copy engine is available for use. - bool hasCopyEngine() const { - return hasMainCopyEngine() || hasLinkCopyEngine(); - } - - // Initialize the entire UR device. - // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute - // command queue ordinal and index respectively, used to initialize - // sub-sub-devices. - ur_result_t initialize(int SubSubDeviceOrdinal = -1, - int SubSubDeviceIndex = -1); - - // Level Zero device handle. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - const ze_device_handle_t ZeDevice; - - // Keep the subdevices that are partitioned from this ur_device_handle_t for - // reuse The order of sub-devices in this vector is repeated from the - // ze_device_handle_t array that are returned from zeDeviceGetSubDevices() - // call, which will always return sub-devices in the fixed same order. - std::vector SubDevices; - - // PI platform to which this device belongs. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - ur_platform_handle_t Platform; - - // Root-device of a sub-device, null if this is not a sub-device. - // This field is only set at _ur_device_handle_t creation time, and cannot - // change. Therefore it can be accessed without holding a lock on this - // _ur_device_handle_t. - const ur_device_handle_t RootDevice; - - enum ImmCmdlistMode { - // Immediate commandlists are not used. - NotUsed = 0, - // One set of compute and copy immediate commandlists per queue. - PerQueue, - // One set of compute and copy immediate commandlists per host thread that - // accesses the queue. - PerThreadPerQueue - }; - // Read env settings to select immediate commandlist mode. - ImmCmdlistMode useImmediateCommandLists(); - - // Returns whether immediate command lists are used on this device. - ImmCmdlistMode ImmCommandListUsed{}; - - // Scope of events used for events on the device - // Can be adjusted with UR_DEVICE_SCOPE_EVENTS - // for non-immediate command lists - EventsScope ZeEventsScope = AllHostVisible; - - bool isSubDevice() { return RootDevice != nullptr; } - - // Is this a Data Center GPU Max series (aka PVC)? 
- // TODO: change to use - // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t - // when that is stable. - bool isPVC() { - return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0 || - (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; - } - - // Does this device represent a single compute slice? - bool isCCS() const { - return QueueGroup[_ur_device_handle_t::queue_group_info_t::Compute] - .ZeIndex >= 0; - } - - // Cache of the immutable device properties. - ZeCache> ZeDeviceProperties; - ZeCache> ZeDeviceComputeProperties; - ZeCache> ZeDeviceImageProperties; - ZeCache> ZeDeviceModuleProperties; - ZeCache>, - std::vector>>> - ZeDeviceMemoryProperties; - ZeCache> - ZeDeviceMemoryAccessProperties; - ZeCache> ZeDeviceCacheProperties; -}; +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_device.hpp" +#include "ur_level_zero_event.hpp" +#include "ur_level_zero_kernel.hpp" +#include "ur_level_zero_mem.hpp" +#include "ur_level_zero_platform.hpp" +#include "ur_level_zero_program.hpp" +#include "ur_level_zero_queue.hpp" +#include "ur_level_zero_sampler.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 16b4c1ef4e582..a26e3412fadca 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -8,9 +8,13 @@ #pragma once #include -#include +#include #include +#include +#include +#include #include +#include #include #include @@ -19,15 +23,6 @@ #include #include "ur/usm_allocator_config.hpp" -#include "ur_level_zero_context.hpp" -#include "ur_level_zero_device.hpp" -#include "ur_level_zero_event.hpp" -#include "ur_level_zero_mem.hpp" -#include "ur_level_zero_module.hpp" -#include "ur_level_zero_platform.hpp" -#include "ur_level_zero_program.hpp" -#include "ur_level_zero_queue.hpp" -#include "ur_level_zero_sampler.hpp" struct _ur_platform_handle_t; @@ -298,6 +293,30 @@ template struct ZesStruct : public T { } }; +// Trace an internal PI call; returns in case of an error. +#define UR_CALL(Call) \ + { \ + if (PrintTrace) \ + fprintf(stderr, "UR ---> %s\n", #Call); \ + ur_result_t Result = (Call); \ + if (PrintTrace) \ + fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + if (Result != UR_RESULT_SUCCESS) \ + return Result; \ + } + +// This function will ensure compatibility with both Linux and Windows for +// setting environment variables. +bool setEnvVar(const char *name, const char *value); + +// Prints to stderr if UR_L0_DEBUG allows it +void urPrint(const char *Format, ...); + +// Helper for one-liner validation +#define UR_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + // Map Level Zero runtime error code to UR error code. ur_result_t ze2urResult(ze_result_t ZeResult); @@ -316,14 +335,14 @@ ur_result_t ze2urResult(ze_result_t ZeResult); // Record for a memory allocation. This structure is used to keep information // for each memory allocation. struct MemAllocRecord : _ur_object { - MemAllocRecord(pi_context Context, bool OwnZeMemHandle = true) + MemAllocRecord(ur_context_handle_t Context, bool OwnZeMemHandle = true) : Context(Context), OwnZeMemHandle(OwnZeMemHandle) {} // Currently kernel can reference memory allocations from different contexts // and we need to know the context of a memory allocation when we release it // in piKernelRelease. 
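// A brief sketch of how the UR_CALL and UR_ASSERT helpers defined above are
// meant to be used inside adapter entry points: validate arguments up front,
// then let UR_CALL trace and propagate any failing sub-call. It assumes a
// translation unit that includes this header; the wrapper itself is
// illustrative, while urContextRetain/urContextRelease are entry points added
// elsewhere in this patch.
static ur_result_t touchContext(ur_context_handle_t Context) {
  UR_ASSERT(Context, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
  UR_CALL(urContextRetain(Context));  // RefCount +1
  UR_CALL(urContextRelease(Context)); // RefCount -1
  return UR_RESULT_SUCCESS;
}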
// TODO: this should go away when memory isolation issue is fixed in the Level // Zero runtime. - pi_context Context; + ur_context_handle_t Context; // Indicates if we own the native memory handle or it came from interop that // asked to not transfer the ownership to SYCL RT. @@ -341,6 +360,130 @@ const bool IndirectAccessTrackingEnabled = [] { return RetVal; }(); +extern const bool UseUSMAllocator; + +// The getInfo*/ReturnHelper facilities provide shortcut way of +// writing return bytes for the various getInfo APIs. +template +ur_result_t urL0getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, + size_t value_size, Assign &&assign_func) { + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return UR_RESULT_SUCCESS; +} + +template +ur_result_t urL0getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + std::ignore = value_size; + *static_cast(param_value) = value; + }; + + return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, + value, sizeof(T), assignment); +} + +template +ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, + value, array_length * sizeof(T), memcpy); +} + +template +ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + if (param_value) { + memset(param_value, 0, param_value_size); + for (uint32_t I = 0; I < array_length; I++) + ((RetType *)param_value)[I] = (RetType)value[I]; + } + if (param_value_size_ret) + *param_value_size_ret = array_length * sizeof(RetType); + return UR_RESULT_SUCCESS; +} + +template <> +inline ur_result_t +urL0getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, const char *value) { + return urL0getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} + +class UrL0ReturnHelperBase { +public: + UrL0ReturnHelperBase(size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + param_value_size_ret(param_value_size_ret) {} + + // A version where in/out info size is represented by a single pointer + // to a value which is updated on return + UrL0ReturnHelperBase(size_t *param_value_size, void *param_value) + : param_value_size(*param_value_size), param_value(param_value), + param_value_size_ret(param_value_size) {} + + // Scalar return value + template ur_result_t operator()(const T &t) { + return getInfo(param_value_size, param_value, param_value_size_ret, t); + } + + // Array return value + template ur_result_t operator()(const T *t, size_t s) { + return urL0getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + + // Array return value where element type is differrent from T + template + ur_result_t operator()(const T *t, size_t s) { + return urL0getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + +protected: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +// A 
version of return helper that returns pi_result and not ur_result_t +class UrL0ReturnHelper : public UrL0ReturnHelperBase { +public: + using UrL0ReturnHelperBase::UrL0ReturnHelperBase; + + template ur_result_t operator()(const T &t) { + return UrL0ReturnHelperBase::operator()(t); + } + // Array return value + template ur_result_t operator()(const T *t, size_t s) { + return UrL0ReturnHelperBase::operator()(t, s); + } + // Array return value where element type is differrent from T + template + ur_result_t operator()(const T *t, size_t s) { + return UrL0ReturnHelperBase::operator()(t, s); + } +}; + const bool ExposeCSliceInAffinityPartitioning = [] { char *UrRet = std::getenv("UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); char *PiRet = @@ -366,7 +509,7 @@ class ZeUSMImportExtension { ZeUSMImportExtension() : Enabled{false} {} - void setZeUSMImport(_ur_platform_handle_t *Platform); + void setZeUSMImport(ur_platform_handle_t_ *Platform); void doZeUSMImport(ze_driver_handle_t DriverHandle, void *HostPtr, size_t Size); void doZeUSMRelease(ze_driver_handle_t DriverHandle, void *HostPtr); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 43b8d9981b039..815a1a5db06cf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -6,4 +6,688 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + uint32_t DeviceCount, ///< [in] the number of devices given in phDevices + const ur_device_handle_t + *Devices, ///< [in][range(0, DeviceCount)] array of handle of devices. + const ur_context_properties_t + *Properties, ///< [in][optional] pointer to context creation properties. + ur_context_handle_t + *RetContext ///< [out] pointer to handle of context object created +) { + std::ignore = Properties; + + ur_platform_handle_t Platform = Devices[0]->Platform; + ZeStruct ContextDesc{}; + + ze_context_handle_t ZeContext{}; + ZE2UR_CALL(zeContextCreate, (Platform->ZeDriver, &ContextDesc, &ZeContext)); + try { + ur_context_handle_t_ *Context = + new ur_context_handle_t_(ZeContext, DeviceCount, Devices, true); + + Context->initialize(); + *RetContext = reinterpret_cast(Context); + if (IndirectAccessTrackingEnabled) { + std::scoped_lock Lock(Platform->ContextsMutex); + Platform->Contexts.push_back(*RetContext); + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRetain( + ur_context_handle_t + Context ///< [in] handle of the context to get a reference of. +) { + Context->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( + ur_context_handle_t Context ///< [in] handle of the context to release. 
+) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) + ContextsLock.lock(); + + return ContextReleaseHelper(Context); +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t Context, ///< [in] handle of the context + ur_context_info_t ContextInfoType, ///< [in] type of the info to retrieve + size_t PropSize, ///< [in] the number of bytes of memory pointed to by + ///< pContextInfo. + void *ContextInfo, ///< [out][optional] array of bytes holding the info. + ///< if propSize is not equal to or greater than the + ///< real number of bytes needed to return the info then + ///< the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pContextInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by ContextInfoType. +) { + std::shared_lock Lock(Context->Mutex); + UrReturnHelper ReturnValue(PropSize, ContextInfo, PropSizeRet); + switch ( + (uint32_t)ContextInfoType) { // cast to avoid warnings on EXT enum values + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(&Context->Devices[0], Context->Devices.size()); + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(uint32_t(Context->Devices.size())); + case UR_EXT_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Context->RefCount.load()}); + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(pi_bool{true}); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM fill is not supported. + return ReturnValue(pi_bool{false}); + case UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(Capabilities); + } + default: + // TODO: implement other parameters + die("urGetContextInfo: unsuppported ParamName."); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( + ur_context_handle_t Context, ///< [in] handle of the context. + ur_native_handle_t *NativeContext ///< [out] a pointer to the native + ///< handle of the context. +) { + *NativeContext = reinterpret_cast(Context->ZeContext); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t + NativeContext, ///< [in] the native handle of the context. + ur_context_handle_t *Context ///< [out] pointer to the handle of the + ///< context object created. +) { + try { + ze_context_handle_t ZeContext = + reinterpret_cast(NativeContext); + ur_context_handle_t_ *UrContext = new ur_context_handle_t_(ZeContext); + UrContext->initialize(); + *Context = reinterpret_cast(UrContext); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( + ur_context_handle_t Context, ///< [in] handle of the context. + ur_context_extended_deleter_t + Deleter, ///< [in] Function pointer to extended deleter. + void *UserData ///< [in][out][optional] pointer to data to be passed to + ///< callback. 
+) { + std::ignore = Context; + std::ignore = Deleter; + std::ignore = UserData; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_context_handle_t_::initialize() { + + // Helper lambda to create various USM allocators for a device. + // Note that the CCS devices and their respective subdevices share a + // common ze_device_handle and therefore, also share USM allocators. + auto createUSMAllocators = [this](ur_device_handle_t Device) { + SharedMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMSharedMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Shared])); + + SharedReadOnlyMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMSharedReadOnlyMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance + .Configs[usm_settings::MemType::SharedReadOnly])); + + DeviceMemAllocContexts.emplace( + std::piecewise_construct, std::make_tuple(Device->ZeDevice), + std::make_tuple( + std::unique_ptr(new USMDeviceMemoryAlloc( + reinterpret_cast(this), + reinterpret_cast(Device))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Device])); + }; + + // Recursive helper to call createUSMAllocators for all sub-devices + std::function createUSMAllocatorsRecursive; + createUSMAllocatorsRecursive = + [createUSMAllocators, + &createUSMAllocatorsRecursive](ur_device_handle_t Device) -> void { + createUSMAllocators(Device); + for (auto &SubDevice : Device->SubDevices) + createUSMAllocatorsRecursive(SubDevice); + }; + + // Create USM allocator context for each pair (device, context). + // + for (auto &Device : Devices) { + createUSMAllocatorsRecursive(Device); + } + // Create USM allocator context for host. Device and Shared USM allocations + // are device-specific. Host allocations are not device-dependent therefore + // we don't need a map with device as key. + HostMemAllocContext = std::make_unique( + std::unique_ptr( + new USMHostMemoryAlloc(reinterpret_cast(this))), + USMAllocatorConfigInstance.Configs[usm_settings::MemType::Host]); + + // We may allocate memory to this root device so create allocators. + if (SingleRootDevice && + DeviceMemAllocContexts.find(SingleRootDevice->ZeDevice) == + DeviceMemAllocContexts.end()) { + createUSMAllocators(SingleRootDevice); + } + + // Create the immediate command list to be used for initializations. + // Created as synchronous so level-zero performs implicit synchronization and + // there is no need to query for completion in the plugin + // + // TODO: we use Device[0] here as the single immediate command-list + // for buffer creation and migration. Initialization is in + // in sync and is always performed to Devices[0] as well but + // D2D migartion, if no P2P, is broken since it should use + // immediate command-list for the specfic devices, and this single one. + // + ur_device_handle_t Device = SingleRootDevice ? SingleRootDevice : Devices[0]; + + // Prefer to use copy engine for initialization copies, + // if available and allowed (main copy engine with index 0). 
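
The recursive helper above is how the adapter reaches every sub-device in the hierarchy so that each level gets its own USM allocator entry. As an illustration of that traversal pattern on its own, a minimal standalone sketch using a simplified stand-in device type (DemoDevice below is purely illustrative, not an adapter type):

#include <functional>
#include <iostream>
#include <vector>

// Simplified stand-in for a device that may have sub-devices.
struct DemoDevice {
  int Id;
  std::vector<DemoDevice *> SubDevices;
};

int main() {
  DemoDevice Sub0{1, {}}, Sub1{2, {}};
  DemoDevice Root{0, {&Sub0, &Sub1}};

  // Leaf action applied to every device, analogous to createUSMAllocators.
  auto CreateAllocators = [](DemoDevice *D) {
    std::cout << "create allocators for device " << D->Id << "\n";
  };

  // std::function lets the lambda refer to itself, enabling recursion
  // over the whole device tree.
  std::function<void(DemoDevice *)> CreateRecursive = [&](DemoDevice *D) {
    CreateAllocators(D);
    for (auto *Sub : D->SubDevices)
      CreateRecursive(Sub);
  };

  CreateRecursive(&Root);
  return 0;
}
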
+ ZeStruct ZeCommandQueueDesc; + const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); + ZeCommandQueueDesc.ordinal = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeOrdinal; + if (Range.first >= 0 && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeOrdinal != -1) + ZeCommandQueueDesc.ordinal = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeOrdinal; + + ZeCommandQueueDesc.index = 0; + ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + ZE2UR_CALL( + zeCommandListCreateImmediate, + (ZeContext, Device->ZeDevice, &ZeCommandQueueDesc, &ZeCommandListInit)); + return UR_RESULT_SUCCESS; +} + +ur_device_handle_t ur_context_handle_t_::getRootDevice() const { + assert(Devices.size() > 0); + + if (Devices.size() == 1) + return Devices[0]; + + // Check if we have context with subdevices of the same device (context + // may include root device itself as well) + ur_device_handle_t ContextRootDevice = + Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0]; + + // For context with sub subdevices, the ContextRootDevice might still + // not be the root device. + // Check whether the ContextRootDevice is the subdevice or root device. + if (ContextRootDevice->isSubDevice()) { + ContextRootDevice = ContextRootDevice->RootDevice; + } + + for (auto &Device : Devices) { + if ((!Device->RootDevice && Device != ContextRootDevice) || + (Device->RootDevice && Device->RootDevice != ContextRootDevice)) { + ContextRootDevice = nullptr; + break; + } + } + return ContextRootDevice; +} + +// Helper function to release the context, a caller must lock the platform-level +// mutex guarding the container with contexts because the context can be removed +// from the list of tracked contexts. +ur_result_t ContextReleaseHelper(ur_context_handle_t Context) { + + if (!Context->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + if (IndirectAccessTrackingEnabled) { + ur_platform_handle_t Plt = Context->getPlatform(); + auto &Contexts = Plt->Contexts; + auto It = std::find(Contexts.begin(), Contexts.end(), Context); + if (It != Contexts.end()) + Contexts.erase(It); + } + ze_context_handle_t DestroyZeContext = + Context->OwnZeContext ? Context->ZeContext : nullptr; + + // Clean up any live memory associated with Context + ur_result_t Result = Context->finalize(); + + // We must delete Context first and then destroy zeContext because + // Context deallocation requires ZeContext in some member deallocation of + // pi_context. + delete Context; + + // Destruction of some members of pi_context uses L0 context + // and therefore it must be valid at that point. + // Technically it should be placed to the destructor of pi_context + // but this makes API error handling more complex. + if (DestroyZeContext) { + auto ZeResult = ZE_CALL_NOCHECK(zeContextDestroy, (DestroyZeContext)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + + return Result; +} + +ur_platform_handle_t ur_context_handle_t_::getPlatform() const { + return Devices[0]->Platform; +} + +ur_result_t ur_context_handle_t_::finalize() { + // This function is called when pi_context is deallocated, piContextRelease. + // There could be some memory that may have not been deallocated. + // For example, event and event pool caches would be still alive. 
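
ContextReleaseHelper above captures a recurring teardown order in this adapter: decrement the reference count, let only the last release run finalization, delete the C++ wrapper while the native handle is still alive, and destroy the Level Zero handle last, and only if it is owned. A minimal sketch of that order with generic types (DemoObject and demoRelease are illustrative names, not adapter code):

#include <atomic>
#include <cstdio>

// Simplified stand-in for an adapter object wrapping a native handle.
struct DemoObject {
  std::atomic<unsigned> RefCount{1};
  void *NativeHandle = nullptr; // would be e.g. a ze_context_handle_t
  bool OwnsNativeHandle = true;
};

// Returns true when the object was actually destroyed.
bool demoRelease(DemoObject *Obj) {
  // Only the caller that drops the count to zero performs the teardown.
  if (Obj->RefCount.fetch_sub(1) != 1)
    return false;

  // Remember the native handle before deleting the wrapper, because member
  // destruction may still need the handle to be valid.
  void *ToDestroy = Obj->OwnsNativeHandle ? Obj->NativeHandle : nullptr;
  delete Obj;

  // Destroy the native handle last, and only if we own it.
  if (ToDestroy)
    std::printf("destroy native handle %p\n", ToDestroy);
  return true;
}

int main() {
  auto *Obj = new DemoObject();
  Obj->RefCount.fetch_add(1); // a second owner
  demoRelease(Obj);           // first release: object stays alive
  demoRelease(Obj);           // second release: teardown runs
  return 0;
}
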
+ + if (!DisableEventsCaching) { + std::scoped_lock Lock(EventCacheMutex); + for (auto &EventCache : EventCaches) { + for (auto &Event : EventCache) { + auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + delete Event; + } + EventCache.clear(); + } + } + { + std::scoped_lock Lock(ZeEventPoolCacheMutex); + for (auto &ZePoolCache : ZeEventPoolCache) { + for (auto &ZePool : ZePoolCache) { + auto ZeResult = ZE_CALL_NOCHECK(zeEventPoolDestroy, (ZePool)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + ZePoolCache.clear(); + } + } + + // Destroy the command list used for initializations + auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListInit)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + + std::scoped_lock Lock(ZeCommandListCacheMutex); + for (auto &List : ZeComputeCommandListCache) { + for (ze_command_list_handle_t &ZeCommandList : List.second) { + if (ZeCommandList) + if (ZeCommandList) { + auto ZeResult = + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + } + for (auto &List : ZeCopyCommandListCache) { + for (ze_command_list_handle_t &ZeCommandList : List.second) { + if (ZeCommandList) { + auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( + ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, + bool ProfilingEnabled) { + // Lock while updating event pool machinery. + std::scoped_lock Lock(ZeEventPoolCacheMutex); + + std::list *ZePoolCache = + getZeEventPoolCache(HostVisible, ProfilingEnabled); + + if (!ZePoolCache->empty()) { + if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { + if (DisableEventsCaching) { + // Remove full pool from the cache if events caching is disabled. + ZePoolCache->erase(ZePoolCache->begin()); + } else { + // If event caching is enabled then we don't destroy events so there is + // no need to remove pool from the cache and add it back when it has + // available slots. Just keep it in the tail of the cache so that all + // pools can be destroyed during context destruction. + ZePoolCache->push_front(nullptr); + } + } + } + if (ZePoolCache->empty()) { + ZePoolCache->push_back(nullptr); + } + + // We shall be adding an event to the front pool. 
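
Every native destroy in finalize() goes through ZE_CALL_NOCHECK and treats ZE_RESULT_ERROR_UNINITIALIZED as benign, because the Level Zero library may already be unloaded when the context is torn down at process exit. A sketch of that check factored into a single helper (destroyIgnoringUnload is a hypothetical name; the patch inlines the check at each call site):

#include <level_zero/ze_api.h>

// Call a Level Zero destroy entry point and ignore the error reported when
// the driver library has already been unloaded at process teardown.
template <typename Handle>
ze_result_t destroyIgnoringUnload(ze_result_t (*Destroy)(Handle), Handle H) {
  ze_result_t Result = Destroy(H);
  return Result == ZE_RESULT_ERROR_UNINITIALIZED ? ZE_RESULT_SUCCESS : Result;
}

// Possible usage, mirroring the inlined checks above:
//   destroyIgnoringUnload(zeEventPoolDestroy, ZePool);
//   destroyIgnoringUnload(zeCommandListDestroy, ZeCommandList);
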
+ ze_event_pool_handle_t *ZePool = &ZePoolCache->front(); + Index = 0; + // Create one event ZePool per MaxNumEventsPerPool events + if (*ZePool == nullptr) { + ZeStruct ZeEventPoolDesc; + ZeEventPoolDesc.count = MaxNumEventsPerPool; + ZeEventPoolDesc.flags = 0; + if (HostVisible) + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + if (ProfilingEnabled) + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); + + std::vector ZeDevices; + std::for_each( + Devices.begin(), Devices.end(), + [&](const ur_device_handle_t &D) { ZeDevices.push_back(D->ZeDevice); }); + + ZE2UR_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, + ZeDevices.size(), &ZeDevices[0], ZePool)); + NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1; + NumEventsUnreleasedInEventPool[*ZePool] = 1; + } else { + Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool]; + --NumEventsAvailableInEventPool[*ZePool]; + ++NumEventsUnreleasedInEventPool[*ZePool]; + } + Pool = *ZePool; + return UR_RESULT_SUCCESS; +} + +ur_event_handle_t +ur_context_handle_t_::getEventFromContextCache(bool HostVisible, + bool WithProfiling) { + std::scoped_lock Lock(EventCacheMutex); + auto Cache = getEventCache(HostVisible, WithProfiling); + if (Cache->empty()) + return nullptr; + + auto It = Cache->begin(); + ur_event_handle_t Event = *It; + Cache->erase(It); + // We have to reset event before using it. + Event->reset(); + return Event; +} + +void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { + std::scoped_lock Lock(EventCacheMutex); + auto Cache = + getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); + Cache->emplace_back(Event); +} + +ur_result_t +ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { + std::shared_lock EventLock(Event->Mutex, std::defer_lock); + std::scoped_lock> LockAll( + ZeEventPoolCacheMutex, EventLock); + if (!Event->ZeEventPool) { + // This must be an interop event created on a users's pool. + // Do nothing. + return UR_RESULT_SUCCESS; + } + + std::list *ZePoolCache = + getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); + + // Put the empty pool to the cache of the pools. + if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) + die("Invalid event release: event pool doesn't have unreleased events"); + if (--NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) { + if (ZePoolCache->front() != Event->ZeEventPool) { + ZePoolCache->push_back(Event->ZeEventPool); + } + NumEventsAvailableInEventPool[Event->ZeEventPool] = MaxNumEventsPerPool; + } + + return UR_RESULT_SUCCESS; +} + +// Get value of the threshold for number of events in immediate command lists. +// If number of events in the immediate command list exceeds this threshold then +// cleanup process for those events is executed. +static const size_t ImmCmdListsEventCleanupThreshold = [] { + const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( + "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + static constexpr int Default = 20; + if (!ImmCmdListsEventCleanupThresholdStr) + return Default; + + int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr); + + // Basically disable threshold if negative value is provided. + if (Threshold < 0) + return INT_MAX; + + return Threshold; +}(); + +// Get value of the threshold for number of active command lists allowed before +// we start heuristically cleaning them up. 
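
Both cleanup thresholds follow the same convention: an unset variable falls back to a built-in default, a negative value effectively disables the limit, and any other value is used as-is. A standalone sketch of that convention as one reusable helper (thresholdFromEnv is a hypothetical name; the patch keeps a separate lambda per variable):

#include <climits>
#include <cstdlib>

// Read an integer threshold from the environment:
// - unset    -> Default
// - negative -> INT_MAX (the threshold is effectively disabled)
// - else     -> the parsed value
static int thresholdFromEnv(const char *Name, int Default) {
  const char *Value = std::getenv(Name);
  if (!Value)
    return Default;
  int Threshold = std::atoi(Value);
  return Threshold < 0 ? INT_MAX : Threshold;
}

// Usage matching the variables defined above:
//   static const size_t CleanupThreshold = thresholdFromEnv(
//       "SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD", 20);
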
+static const size_t CmdListsCleanupThreshold = [] { + const char *CmdListsCleanupThresholdStr = + std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); + static constexpr int Default = 20; + if (!CmdListsCleanupThresholdStr) + return Default; + + int Threshold = std::atoi(CmdListsCleanupThresholdStr); + + // Basically disable threshold if negative value is provided. + if (Threshold < 0) + return INT_MAX; + + return Threshold; +}(); + +// Retrieve an available command list to be used in a PI call. +ur_result_t ur_context_handle_t_::getAvailableCommandList( + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, bool AllowBatching, + ze_command_queue_handle_t *ForcedCmdQueue) { + // Immediate commandlists have been pre-allocated and are always available. + if (Queue->Device->ImmCommandListUsed) { + CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList(); + if (CommandList->second.EventList.size() > + ImmCmdListsEventCleanupThreshold) { + std::vector EventListToCleanup; + Queue->resetCommandList(CommandList, false, EventListToCleanup); + CleanupEventListFromResetCmdList(EventListToCleanup, true); + } + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + return Res; + return UR_RESULT_SUCCESS; + } else { + // Cleanup regular command-lists if there are too many. + // It handles the case that the queue is not synced to the host + // for a long time and we want to reclaim the command-lists for + // use by other queues. + if (Queue->CommandListMap.size() > CmdListsCleanupThreshold) { + resetCommandLists(Queue); + } + } + + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Handle batching of commands + // First see if there is an command-list open for batching commands + // for this queue. + if (Queue->hasOpenCommandList(UseCopyEngine)) { + if (AllowBatching) { + CommandList = CommandBatch.OpenCommandList; + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + return UR_RESULT_SUCCESS; + } + // If this command isn't allowed to be batched or doesn't match the forced + // command queue, then we need to go ahead and execute what is already in + // the batched list, and then go on to process this. On exit from + // executeOpenCommandList OpenCommandList will be invalidated. + if (auto Res = Queue->executeOpenCommandList(UseCopyEngine)) + return Res; + // Note that active barriers do not need to be inserted here as they will + // have been enqueued into the command-list when they were created. + } + + // Create/Reuse the command list, because in Level Zero commands are added to + // the command lists, and later are then added to the command queue. + // Each command list is paired with an associated fence to track when the + // command list is available for reuse. + ur_result_t pi_result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + + // Initally, we need to check if a command list has already been created + // on this device that is available for use. If so, then reuse that + // Level-Zero Command List and Fence for this PI call. + { + // Make sure to acquire the lock before checking the size, or there + // will be a race condition. + std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for every + // unique ZeDevice. + auto &ZeCommandListCache = + UseCopyEngine + ? 
Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + + for (auto ZeCommandListIt = ZeCommandListCache.begin(); + ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { + auto &ZeCommandList = *ZeCommandListIt; + auto it = Queue->CommandListMap.find(ZeCommandList); + if (it != Queue->CommandListMap.end()) { + if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) + continue; + CommandList = it; + if (CommandList->second.ZeFence != nullptr) + CommandList->second.ZeFenceInUse = true; + } else { + // If there is a command list available on this context, but it + // wasn't yet used in this queue then create a new entry in this + // queue's map to hold the fence and other associated command + // list information. + auto &QGroup = Queue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal; + auto &ZeCommandQueue = ForcedCmdQueue + ? *ForcedCmdQueue + : QGroup.getZeQueue(&QueueGroupOrdinal); + if (ForcedCmdQueue) + QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); + + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + CommandList = Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, false, + ZeCommandQueue, + QueueGroupOrdinal}) + .first; + } + ZeCommandListCache.erase(ZeCommandListIt); + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) + return Res; + if (auto Res = Queue->insertActiveBarriers(CommandList, UseCopyEngine)) + return Res; + return UR_RESULT_SUCCESS; + } + } + + // If there are no available command lists in the cache, then we check for + // command lists that have already signalled, but have not been added to the + // available list yet. Each command list has a fence associated which tracks + // if a command list has completed dispatch of its commands and is ready for + // reuse. If a command list is found to have been signalled, then the + // command list & fence are reset and we return. + for (auto it = Queue->CommandListMap.begin(); + it != Queue->CommandListMap.end(); ++it) { + // Make sure this is the command list type needed. + if (UseCopyEngine != it->second.isCopy(Queue)) + continue; + + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); + if (ZeResult == ZE_RESULT_SUCCESS) { + std::vector EventListToCleanup; + Queue->resetCommandList(it, false, EventListToCleanup); + CleanupEventListFromResetCmdList(EventListToCleanup, + true /* QueueLocked */); + CommandList = it; + CommandList->second.ZeFenceInUse = true; + if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) + return Res; + return UR_RESULT_SUCCESS; + } + } + + // If there are no available command lists nor signalled command lists, + // then we must create another command list. 
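
The loop above recycles command lists by polling their fences: a list whose fence has signalled is reset and handed back out instead of allocating a new one. A reduced sketch of that polling step using plain Level Zero calls (the adapter's resetCommandList additionally collects the events recorded on the list, which is omitted here):

#include <level_zero/ze_api.h>

// Non-blocking check whether a previously submitted command list has finished,
// so its list/fence pair can be reset and reused. Assumes valid handles.
static bool tryRecycle(ze_command_list_handle_t CmdList,
                       ze_fence_handle_t Fence) {
  // ZE_RESULT_NOT_READY means the commands are still executing.
  if (zeFenceQueryStatus(Fence) != ZE_RESULT_SUCCESS)
    return false;
  // The fence has signalled: reset both objects for the next submission.
  if (zeFenceReset(Fence) != ZE_RESULT_SUCCESS)
    return false;
  return zeCommandListReset(CmdList) == ZE_RESULT_SUCCESS;
}
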
+ pi_result = Queue->createCommandList(UseCopyEngine, CommandList); + CommandList->second.ZeFenceInUse = true; + return pi_result; +} + +bool ur_context_handle_t_::isValidDevice(ur_device_handle_t Device) const { + while (Device) { + if (std::find(Devices.begin(), Devices.end(), Device) != Devices.end()) + return true; + Device = Device->RootDevice; + } + return false; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 28b4bf599b8a0..8cb8a94124b6a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -7,8 +7,235 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_queue.hpp" +#include + +struct ur_context_handle_t_ : _ur_object { + ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices, + const ur_device_handle_t *Devs, bool OwnZeContext) + : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices}, + OwnZeContext{OwnZeContext} {} + + ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} + + // A L0 context handle is primarily used during creation and management of + // resources that may be used by multiple devices. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + const ze_context_handle_t ZeContext{}; + + // Keep the PI devices this PI context was created for. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + // const std::vector Devices; + std::vector Devices; + + // Indicates if we own the ZeContext or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnZeContext = false; + + // Immediate Level Zero command list for the device in this context, to be + // used for initializations. To be created as: + // - Immediate command list: So any command appended to it is immediately + // offloaded to the device. + // - Synchronous: So implicit synchronization is made inside the level-zero + // driver. + // There will be a list of immediate command lists (for each device) when + // support of the multiple devices per context will be added. + ze_command_list_handle_t ZeCommandListInit{}; + + // Mutex for the immediate command list. Per the Level Zero spec memory copy + // operations submitted to an immediate command list are not allowed to be + // called from simultaneous threads. + ur_mutex ImmediateCommandListMutex; + + // Mutex Lock for the Command List Cache. This lock is used to control both + // compute and copy command list caches. + ur_mutex ZeCommandListCacheMutex; + + // If context contains one device or sub-devices of the same device, we want + // to save this device. + // This field is only set at _pi_context creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_context. + ur_device_handle_t SingleRootDevice = nullptr; + + // Cache of all currently available/completed command/copy lists. + // Note that command-list can only be re-used on the same device. 
+ // + // TODO: explore if we should use root-device for creating command-lists + // as spec says that in that case any sub-device can re-use it: "The + // application must only use the command list for the device, or its + // sub-devices, which was provided during creation." + // + std::unordered_map> + ZeComputeCommandListCache; + std::unordered_map> + ZeCopyCommandListCache; + + // Store USM allocator context(internal allocator structures) + // for USM shared and device allocations. There is 1 allocator context + // per each pair of (context, device) per each memory type. + std::unordered_map + DeviceMemAllocContexts; + std::unordered_map + SharedMemAllocContexts; + std::unordered_map + SharedReadOnlyMemAllocContexts; + + // Since L0 native runtime does not distinguisg "shared device_read_only" + // vs regular "shared" allocations, we have keep track of it to use + // proper USMAllocContext when freeing allocations. + std::unordered_set SharedReadOnlyAllocs; + + // Store the host allocator context. It does not depend on any device. + std::unique_ptr HostMemAllocContext; + + // We need to store all memory allocations in the context because there could + // be kernels with indirect access. Kernels with indirect access start to + // reference all existing memory allocations at the time when they are + // submitted to the device. Referenced memory allocations can be released only + // when kernel has finished execution. + std::unordered_map MemAllocs; + + // Following member variables are used to manage assignment of events + // to event pools. + // + // TODO: Create pi_event_pool class to encapsulate working with pools. + // This will avoid needing the use of maps below, and cleanup the + // pi_context overall. + // -struct _ur_context_handle_t : _ur_object { - _ur_context_handle_t() {} + // The cache of event pools from where new events are allocated from. + // The head event pool is where the next event would be added to if there + // is still some room there. If there is no room in the head then + // the following event pool is taken (guranteed to be empty) and made the + // head. In case there is no next pool, a new pool is created and made the + // head. + // + // Cache of event pools to which host-visible events are added to. + std::vector> ZeEventPoolCache{4}; + + // This map will be used to determine if a pool is full or not + // by storing number of empty slots available in the pool. + std::unordered_map + NumEventsAvailableInEventPool; + // This map will be used to determine number of unreleased events in the pool. + // We use separate maps for number of event slots available in the pool from + // the number of events unreleased in the pool. + // This will help when we try to make the code thread-safe. + std::unordered_map + NumEventsUnreleasedInEventPool; + + // Mutex to control operations on event pool caches and the helper maps + // holding the current pool usage counts. + ur_mutex ZeEventPoolCacheMutex; + + // Mutex to control operations on event caches. + ur_mutex EventCacheMutex; + + // Caches for events. + std::vector> EventCaches{4}; + + // Initialize the PI context. + ur_result_t initialize(); + + // If context contains one device then return this device. + // If context contains sub-devices of the same device, then return this parent + // device. Return nullptr if context consists of several devices which are not + // sub-devices of the same device. We call returned device the root device of + // a context. 
+ // TODO: get rid of this when contexts with multiple devices are supported for + // images. + ur_device_handle_t getRootDevice() const; + + // Finalize the PI context + ur_result_t finalize(); + + // Return the Platform, which is the same for all devices in the context + ur_platform_handle_t getPlatform() const; + + // Get index of the free slot in the available pool. If there is no available + // pool then create new one. The HostVisible parameter tells if we need a + // slot for a host-visible event. The ProfilingEnabled tells is we need a + // slot for an event with profiling capabilities. + ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, + bool HostVisible, + bool ProfilingEnabled); + + // Get pi_event from cache. + ur_event_handle_t getEventFromContextCache(bool HostVisible, + bool WithProfiling); + + // Add pi_event to cache. + void addEventToContextCache(ur_event_handle_t); + + auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { + if (HostVisible) + return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; + else + return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + } + + // Decrement number of events living in the pool upon event destroy + // and return the pool to the cache if there are no unreleased events. + ur_result_t decrementUnreleasedEventsInPool(ur_event_handle_t Event); + + // Retrieves a command list for executing on this device along with + // a fence to be used in tracking the execution of this command list. + // If a command list has been created on this device which has + // completed its commands, then that command list and its associated fence + // will be reused. Otherwise, a new command list and fence will be created for + // running on this device. L0 fences are created on a L0 command queue so the + // caller must pass a command queue to create a new fence for the new command + // list if a command list/fence pair is not available. All Command Lists & + // associated fences are destroyed at Device Release. + // If UseCopyEngine is true, the command will eventually be executed in a + // copy engine. Otherwise, the command will be executed in a compute engine. + // If AllowBatching is true, then the command list returned may already have + // command in it, if AllowBatching is false, any open command lists that + // already exist in Queue will be closed and executed. + // If ForcedCmdQueue is not nullptr, the resulting command list must be tied + // to the contained command queue. This option is ignored if immediate + // command lists are used. + // When using immediate commandlists, retrieves an immediate command list + // for executing on this device. Immediate commandlists are created only + // once for each SYCL Queue and after that they are reused. + ur_result_t + getAvailableCommandList(ur_queue_handle_t Queue, + ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, bool AllowBatching = false, + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + + // Checks if Device is covered by this context. + // For that the Device or its root devices need to be in the context. + bool isValidDevice(ur_device_handle_t Device) const; + +private: + // Get the cache of events for a provided scope and profiling mode. + auto getEventCache(bool HostVisible, bool WithProfiling) { + if (HostVisible) + return WithProfiling ? &EventCaches[0] : &EventCaches[1]; + else + return WithProfiling ? 
&EventCaches[2] : &EventCaches[3]; + } }; + +// Helper function to release the context, a caller must lock the platform-level +// mutex guarding the container with contexts because the context can be removed +// from the list of tracked contexts. +ur_result_t ContextReleaseHelper(ur_context_handle_t Context); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 4ef5d989ca953..8983835ad0811 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -7,3 +7,1259 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_device.hpp" +#include +#include +#include + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + ur_device_type_t DeviceType, ///< [in] the type of the devices. + uint32_t NumEntries, ///< [in] the number of devices to be added to + ///< phDevices. If phDevices in not NULL then + ///< NumEntries should be greater than zero, otherwise + ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. + ur_device_handle_t + *Devices, ///< [out][optional][range(0, NumEntries)] array of handle of + ///< devices. If NumEntries is less than the number of devices + ///< available, then platform shall only retrieve that number + ///< of devices. + uint32_t *NumDevices ///< [out][optional] pointer to the number of devices. + ///< pNumDevices will be updated with the total number + ///< of devices available. + +) { + + auto Res = Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + // Filter available devices based on input DeviceType. + std::vector MatchedDevices; + std::shared_lock Lock(Platform->PiDevicesCacheMutex); + for (auto &D : Platform->PiDevicesCache) { + // Only ever return root-devices from piDevicesGet, but the + // devices cache also keeps sub-devices. + if (D->isSubDevice()) + continue; + + bool Matched = false; + switch (DeviceType) { + case UR_DEVICE_TYPE_ALL: + Matched = true; + break; + case UR_DEVICE_TYPE_GPU: + case UR_DEVICE_TYPE_DEFAULT: + Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU); + break; + case UR_DEVICE_TYPE_CPU: + Matched = (D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_CPU); + break; + case UR_DEVICE_TYPE_FPGA: + Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_FPGA; + break; + case UR_DEVICE_TYPE_MCA: + Matched = D->ZeDeviceProperties->type == ZE_DEVICE_TYPE_MCA; + break; + default: + Matched = false; + urPrint("Unknown device type"); + break; + } + if (Matched) + MatchedDevices.push_back(D.get()); + } + + uint32_t ZeDeviceCount = MatchedDevices.size(); + + auto N = std::min(ZeDeviceCount, NumEntries); + if (Devices) + std::copy_n(MatchedDevices.begin(), N, Devices); + + if (NumDevices) { + if (*NumDevices == 0) + *NumDevices = ZeDeviceCount; + else + *NumDevices = N; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( + ur_device_handle_t Device, ///< [in] handle of the device instance + ur_device_info_t ParamName, ///< [in] type of the info to retrieve + size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. + void *ParamValue, ///< [out][optional] array of bytes holding the info. 
+ ///< If propSize is not equal to or greater than the real + ///< number of bytes needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pDeviceInfo is not used. + size_t *pSize ///< [out][optional] pointer to the actual size in bytes of + ///< the queried infoType. +) { + UrReturnHelper ReturnValue(propSize, ParamValue, pSize); + + ze_device_handle_t ZeDevice = Device->ZeDevice; + + switch ((int)ParamName) { + case UR_DEVICE_INFO_TYPE: { + switch (Device->ZeDeviceProperties->type) { + case ZE_DEVICE_TYPE_GPU: + return ReturnValue(UR_DEVICE_TYPE_GPU); + case ZE_DEVICE_TYPE_CPU: + return ReturnValue(UR_DEVICE_TYPE_CPU); + case ZE_DEVICE_TYPE_FPGA: + return ReturnValue(UR_DEVICE_TYPE_FPGA); + default: + urPrint("This device type is not supported\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + case UR_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(Device->RootDevice); + case UR_DEVICE_INFO_PLATFORM: + return ReturnValue(Device->Platform); + case UR_DEVICE_INFO_VENDOR_ID: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->vendorId}); + case UR_DEVICE_INFO_UUID: { + // Intel extension for device UUID. This returns the UUID as + // std::array. For details about this extension, + // see sycl/doc/extensions/supported/sycl_ext_intel_device_info.md. + const auto &UUID = Device->ZeDeviceProperties->uuid.id; + return ReturnValue(UUID, sizeof(UUID)); + } + case UR_DEVICE_INFO_ATOMIC_64: + return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); + case UR_DEVICE_INFO_EXTENSIONS: { + // Convention adopted from OpenCL: + // "Returns a space separated list of extension names (the extension + // names themselves do not contain any spaces) supported by the device." + // + // TODO: Use proper mechanism to get this information from Level Zero after + // it is added to Level Zero. + // Hardcoding the few we know are supported by the current hardware. + // + // + std::string SupportedExtensions; + + // cl_khr_il_program - OpenCL 2.0 KHR extension for SPIR-V support. Core + // feature in >OpenCL 2.1 + // cl_khr_subgroups - Extension adds support for implementation-controlled + // subgroups. + // cl_intel_subgroups - Extension adds subgroup features, defined by Intel. + // cl_intel_subgroups_short - Extension adds subgroup functions described in + // the cl_intel_subgroups extension to support 16-bit integer data types + // for performance. + // cl_intel_required_subgroup_size - Extension to allow programmers to + // optionally specify the required subgroup size for a kernel function. + // cl_khr_fp16 - Optional half floating-point support. + // cl_khr_fp64 - Support for double floating-point precision. + // cl_khr_int64_base_atomics, cl_khr_int64_extended_atomics - Optional + // extensions that implement atomic operations on 64-bit signed and + // unsigned integers to locations in __global and __local memory. + // cl_khr_3d_image_writes - Extension to enable writes to 3D image memory + // objects. + // + // Hardcoding some extensions we know are supported by all Level Zero + // devices. 
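
Like the other info queries in this adapter, variable-length results such as the extension string are returned through the usual two-call convention: query the required size with a null buffer, then allocate and query the data. A minimal caller-side sketch for UR_DEVICE_INFO_EXTENSIONS, assuming a valid ur_device_handle_t and the ur_api.h header (the exact header path may differ in-tree):

#include <string>
#include <vector>
#include <ur_api.h>

// Query the space-separated extension list of a device using the two-call
// pattern (size query, then data query).
static std::string queryExtensions(ur_device_handle_t Device) {
  size_t Size = 0;
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_EXTENSIONS, 0, nullptr, &Size) !=
          UR_RESULT_SUCCESS ||
      Size == 0)
    return {};

  std::vector<char> Buffer(Size);
  if (urDeviceGetInfo(Device, UR_DEVICE_INFO_EXTENSIONS, Buffer.size(),
                      Buffer.data(), nullptr) != UR_RESULT_SUCCESS)
    return {};
  return std::string(Buffer.data());
}
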
+ SupportedExtensions += (ZE_SUPPORTED_EXTENSIONS); + if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP16) + SupportedExtensions += ("cl_khr_fp16 "); + if (Device->ZeDeviceModuleProperties->flags & ZE_DEVICE_MODULE_FLAG_FP64) + SupportedExtensions += ("cl_khr_fp64 "); + if (Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS) + // int64AtomicsSupported indicates support for both. + SupportedExtensions += + ("cl_khr_int64_base_atomics cl_khr_int64_extended_atomics "); + if (Device->ZeDeviceImageProperties->maxImageDims3D > 0) + // Supports reading and writing of images. + SupportedExtensions += ("cl_khr_3d_image_writes "); + + // L0 does not tell us if bfloat16 is supported. + // For now, assume ATS and PVC support it. + // TODO: change the way we detect bfloat16 support. + if ((Device->ZeDeviceProperties->deviceId & 0xfff) == 0x201 || + (Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0) + SupportedExtensions += ("cl_intel_bfloat16_conversions "); + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_NAME: + return ReturnValue(Device->ZeDeviceProperties->name); + // zeModuleCreate allows using root device module for sub-devices: + // > The application must only use the module for the device, or its + // > sub-devices, which was provided during creation. + case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: + return ReturnValue(uint32_t{0}); + case UR_DEVICE_INFO_COMPILER_AVAILABLE: + return ReturnValue(uint32_t{1}); + case UR_DEVICE_INFO_LINKER_AVAILABLE: + return ReturnValue(uint32_t{1}); + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + uint32_t MaxComputeUnits = + Device->ZeDeviceProperties->numEUsPerSubslice * + Device->ZeDeviceProperties->numSubslicesPerSlice * + Device->ZeDeviceProperties->numSlices; + + bool RepresentsCSlice = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0; + if (RepresentsCSlice) + MaxComputeUnits /= Device->RootDevice->SubDevices.size(); + + return ReturnValue(uint32_t{MaxComputeUnits}); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: + // Level Zero spec defines only three dimensions + return ReturnValue(uint32_t{3}); + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t Arr[3]; + } MaxGroupSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, + Device->ZeDeviceComputeProperties->maxGroupSizeY, + Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; + return ReturnValue(MaxGroupSize); + } + case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t Arr[3]; + } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, + Device->ZeDeviceComputeProperties->maxGroupCountY, + Device->ZeDeviceComputeProperties->maxGroupCountZ}}; + return ReturnValue(MaxGroupCounts); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->coreClockRate}); + case UR_DEVICE_INFO_ADDRESS_BITS: { + // TODO: To confirm with spec. 
+ return ReturnValue(uint32_t{64}); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: + return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + uint64_t GlobalMemSize = 0; + for (const auto &ZeDeviceMemoryExtProperty : + Device->ZeDeviceMemoryProperties->second) { + GlobalMemSize += ZeDeviceMemoryExtProperty.physicalSize; + } + return ReturnValue(uint64_t{GlobalMemSize}); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); + case UR_DEVICE_INFO_IMAGE_SUPPORTED: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: + return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); + case UR_DEVICE_INFO_AVAILABLE: + return ReturnValue(uint32_t{ZeDevice ? true : false}); + case UR_DEVICE_INFO_VENDOR: + // TODO: Level-Zero does not return vendor's name at the moment + // only the ID. + return ReturnValue("Intel(R) Corporation"); + case UR_DEVICE_INFO_DRIVER_VERSION: + return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); + case UR_DEVICE_INFO_VERSION: + return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + return ReturnValue((uint32_t)Device->SubDevices.size()); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Device->RefCount.load()}); + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + // SYCL spec says: if this SYCL device cannot be partitioned into at least + // two sub devices then the returned vector must be empty. + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + uint32_t ZeSubDeviceCount = Device->SubDevices.size(); + if (ZeSubDeviceCount < 2) { + return ReturnValue((ur_device_partition_property_t)0); + } + bool PartitionedByCSlice = Device->SubDevices[0]->isCCS(); + + auto ReturnHelper = [&](auto... Partitions) { + struct { + ur_device_partition_property_t Arr[sizeof...(Partitions) + 1]; + } PartitionProperties = { + {Partitions..., ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + }; + + if (ExposeCSliceInAffinityPartitioning) { + if (PartitionedByCSlice) + return ReturnHelper(UR_DEVICE_PARTITION_BY_CSLICE, + UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + + else + return ReturnHelper(UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else { + return ReturnHelper(PartitionedByCSlice + ? UR_DEVICE_PARTITION_BY_CSLICE + : UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } + break; + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: + return ReturnValue(ur_device_affinity_domain_flag_t( + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA | + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE)); + case UR_DEVICE_INFO_PARTITION_TYPE: { + // For root-device there is no partitioning to report. 
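
The partition-properties cases above return a zero-terminated array by wrapping it in a local struct, so the return helper can copy it by value like any scalar. A standalone sketch of that wrapping trick with plain integers (returnValue below is a heavily simplified stand-in for the adapter's return helper, with no size checking):

#include <cstdio>
#include <cstring>

// Copy a value of known size into a caller-provided buffer, the way the
// return helper does (greatly simplified).
template <typename T> void returnValue(void *Dst, const T &Value) {
  std::memcpy(Dst, &Value, sizeof(T));
}

int main() {
  // Wrapping a fixed-size, zero-terminated array in a struct lets it be
  // returned by value through the same templated path as scalars.
  struct {
    int Arr[3];
  } Properties = {{42, 7, 0}}; // trailing 0 terminates the list

  int Out[3] = {};
  returnValue(Out, Properties);
  std::printf("%d %d %d\n", Out[0], Out[1], Out[2]);
  return 0;
}
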
+ if (!Device->isSubDevice()) + return ReturnValue(ur_device_partition_property_t(0)); + + if (Device->isCCS()) { + struct { + ur_device_partition_property_t Arr[2]; + } PartitionProperties = { + {UR_DEVICE_PARTITION_BY_CSLICE, ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + } + + struct { + ur_device_partition_property_t Arr[3]; + } PartitionProperties = { + {UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + (ur_device_partition_property_t) + UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE, + ur_device_partition_property_t(0)}}; + return ReturnValue(PartitionProperties); + } + + // Everything under here is not supported yet + + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: + return ReturnValue(""); + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: + return ReturnValue(uint32_t{true}); + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); + case UR_DEVICE_INFO_PROFILE: + return ReturnValue("FULL_PROFILE"); + case UR_DEVICE_INFO_BUILT_IN_KERNELS: + // TODO: To find out correct value + return ReturnValue(""); + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: + return ReturnValue(ur_device_exec_capability_flag_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); + case UR_DEVICE_INFO_ENDIAN_LITTLE: + return ReturnValue(uint32_t{true}); + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_ECC}); + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: + return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: + return ReturnValue(uint32_t{64}); + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: + return ReturnValue( + uint64_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: + return ReturnValue( + // TODO[1.0]: how to query cache line-size? + uint32_t{1}); + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: + return ReturnValue(uint64_t{Device->ZeDeviceCacheProperties->cacheSize}); + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceModuleProperties->maxArgumentsSize}); + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: + // SYCL/OpenCL spec is vague on what this means exactly, but seems to + // be for "alignment requirement (in bits) for sub-buffer offsets." + // An OpenCL implementation returns 8*128, but Level Zero can do just 8, + // meaning unaligned access for values of types larger than 8 bits. 
+ return ReturnValue(uint32_t{8}); + case UR_DEVICE_INFO_MAX_SAMPLERS: + return ReturnValue(uint32_t{Device->ZeDeviceImageProperties->maxSamplers}); + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxReadImageArgs}); + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: + return ReturnValue( + uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + uint64_t SingleFPValue = 0; + ze_device_fp_flags_t ZeSingleFPCapabilities = + Device->ZeDeviceModuleProperties->fp32flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { + SingleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{SingleFPValue}); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + uint64_t HalfFPValue = 0; + ze_device_fp_flags_t ZeHalfFPCapabilities = + Device->ZeDeviceModuleProperties->fp16flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { + HalfFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{HalfFPValue}); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + uint64_t DoubleFPValue = 0; + ze_device_fp_flags_t ZeDoubleFPCapabilities = + Device->ZeDeviceModuleProperties->fp64flags; + if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + } + if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + } + if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + } + if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { + DoubleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + } + if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { + DoubleFPValue |= 
UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + } + return ReturnValue(uint64_t{DoubleFPValue}); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: + return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims3D}); + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceImageProperties->maxImageBufferSize}); + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: + return ReturnValue( + size_t{Device->ZeDeviceImageProperties->maxImageArraySlices}); + // Handle SIMD widths. + // TODO: can we do better than this? + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 1); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 4); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 8); + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: + return ReturnValue(Device->ZeDeviceProperties->physicalEUSimdWidth / 2); + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Max_num_sub_Groups = maxTotalGroupSize/min(set of subGroupSizes); + uint32_t MinSubGroupSize = + Device->ZeDeviceComputeProperties->subGroupSizes[0]; + for (uint32_t I = 1; + I < Device->ZeDeviceComputeProperties->numSubGroupSizes; I++) { + if (MinSubGroupSize > Device->ZeDeviceComputeProperties->subGroupSizes[I]) + MinSubGroupSize = Device->ZeDeviceComputeProperties->subGroupSizes[I]; + } + return ReturnValue(Device->ZeDeviceComputeProperties->maxTotalGroupSize / + MinSubGroupSize); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // TODO: Not supported yet. Needs to be updated after support is added. + return ReturnValue(uint32_t{false}); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the + // expected return is size_t datatype. size_t can be 8 bytes of data. + return ReturnValue.template operator()( + Device->ZeDeviceComputeProperties->subGroupSizes, + Device->ZeDeviceComputeProperties->numSubGroupSizes); + } + case UR_DEVICE_INFO_IL_VERSION: { + // Set to a space separated list of IL version strings of the form + // _.. + // "SPIR-V" is a required IL prefix when cl_khr_il_progam extension is + // reported. 
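
The single-, half- and double-precision cases above repeat the same translation from ze_device_fp_flags_t bits to UR capability flags. A sketch of that translation factored into one helper; the flag names are the ones used above, while the factoring itself is only an illustration and not part of the patch:

#include <cstdint>
#include <level_zero/ze_api.h>
#include <ur_api.h>

// Translate Level Zero FP capability bits into UR FP capability flags.
static uint64_t mapFpFlags(ze_device_fp_flags_t Ze) {
  uint64_t Ur = 0;
  if (Ze & ZE_DEVICE_FP_FLAG_DENORM)
    Ur |= UR_FP_CAPABILITY_FLAG_DENORM;
  if (Ze & ZE_DEVICE_FP_FLAG_INF_NAN)
    Ur |= UR_FP_CAPABILITY_FLAG_INF_NAN;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUND_TO_INF)
    Ur |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF;
  if (Ze & ZE_DEVICE_FP_FLAG_FMA)
    Ur |= UR_FP_CAPABILITY_FLAG_FMA;
  if (Ze & ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT)
    Ur |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT;
  return Ur;
}

// Possible usage in the switch above:
//   return ReturnValue(mapFpFlags(Device->ZeDeviceModuleProperties->fp32flags));
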
+ uint32_t SpirvVersion = + Device->ZeDeviceModuleProperties->spirvVersionSupported; + uint32_t SpirvVersionMajor = ZE_MAJOR_VERSION(SpirvVersion); + uint32_t SpirvVersionMinor = ZE_MINOR_VERSION(SpirvVersion); + + char SpirvVersionString[50]; + int Len = sprintf(SpirvVersionString, "SPIR-V_%d.%d ", SpirvVersionMajor, + SpirvVersionMinor); + // returned string to contain only len number of characters. + std::string ILVersion(SpirvVersionString, Len); + return ReturnValue(ILVersion.c_str()); + } + case UR_DEVICE_INFO_USM_HOST_SUPPORT: + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { + uint64_t Capabilities = 0; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) + Capabilities |= UR_EXT_USM_CAPS_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) + Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT) + Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS; + if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC) + Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS; + return Capabilities; + }; + auto &Props = Device->ZeDeviceMemoryAccessProperties; + switch (ParamName) { + case UR_DEVICE_INFO_USM_HOST_SUPPORT: + return ReturnValue(MapCaps(Props->hostAllocCapabilities)); + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: + return ReturnValue(MapCaps(Props->deviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedSingleDeviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedCrossDeviceAllocCapabilities)); + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: + return ReturnValue(MapCaps(Props->sharedSystemAllocCapabilities)); + default: + die("urDeviceGetInfo: unexpected ParamName."); + } + } + + // intel extensions for GPU information + case UR_DEVICE_INFO_DEVICE_ID: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->deviceId}); + case UR_DEVICE_INFO_PCI_ADDRESS: { + if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { + urPrint("Set SYCL_ENABLE_PCI=1 to obtain PCI data.\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + ZesStruct ZeDevicePciProperties; + ZE2UR_CALL(zesDevicePciGetProperties, (ZeDevice, &ZeDevicePciProperties)); + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + std::snprintf(AddressBuffer, AddressBufferSize, "%04x:%02x:%02x.%01x", + ZeDevicePciProperties.address.domain, + ZeDevicePciProperties.address.bus, + ZeDevicePciProperties.address.device, + ZeDevicePciProperties.address.function); + return ReturnValue(AddressBuffer); + } + + case UR_EXT_DEVICE_INFO_FREE_MEMORY: { + if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { + setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", + UR_RESULT_SUCCESS); + return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR; + } + // Only report device memory which zeMemAllocDevice can allocate from. + // Currently this is only the one enumerated with ordinal 0. 
+ uint64_t FreeMemory = 0; + uint32_t MemCount = 0; + ZE2UR_CALL(zesDeviceEnumMemoryModules, (ZeDevice, &MemCount, nullptr)); + if (MemCount != 0) { + std::vector ZesMemHandles(MemCount); + ZE2UR_CALL(zesDeviceEnumMemoryModules, + (ZeDevice, &MemCount, ZesMemHandles.data())); + for (auto &ZesMemHandle : ZesMemHandles) { + ZesStruct ZesMemProperties; + ZE2UR_CALL(zesMemoryGetProperties, (ZesMemHandle, &ZesMemProperties)); + // For root-device report memory from all memory modules since that + // is what totally available in the default implicit scaling mode. + // For sub-devices only report memory local to them. + if (!Device->isSubDevice() || Device->ZeDeviceProperties->subdeviceId == + ZesMemProperties.subdeviceId) { + + ZesStruct ZesMemState; + ZE2UR_CALL(zesMemoryGetState, (ZesMemHandle, &ZesMemState)); + FreeMemory += ZesMemState.free; + } + } + } + return ReturnValue(FreeMemory); + } + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + // If there are not any memory modules then return 0. + if (Device->ZeDeviceMemoryProperties->first.empty()) + return ReturnValue(uint32_t{0}); + + // If there are multiple memory modules on the device then we have to report + // the value of the slowest memory. + auto Comp = [](const ze_device_memory_properties_t &A, + const ze_device_memory_properties_t &B) -> bool { + return A.maxClockRate < B.maxClockRate; + }; + auto MinIt = + std::min_element(Device->ZeDeviceMemoryProperties->first.begin(), + Device->ZeDeviceMemoryProperties->first.end(), Comp); + return ReturnValue(uint32_t{MinIt->maxClockRate}); + } + case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: { + // If there are not any memory modules then return 0. + if (Device->ZeDeviceMemoryProperties->first.empty()) + return ReturnValue(uint32_t{0}); + + // If there are multiple memory modules on the device then we have to report + // the value of the slowest memory. + auto Comp = [](const ze_device_memory_properties_t &A, + const ze_device_memory_properties_t &B) -> bool { + return A.maxBusWidth < B.maxBusWidth; + }; + auto MinIt = + std::min_element(Device->ZeDeviceMemoryProperties->first.begin(), + Device->ZeDeviceMemoryProperties->first.end(), Comp); + return ReturnValue(uint32_t{MinIt->maxBusWidth}); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + if (Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0) + // Sub-sub-device represents a particular compute index already. 
+ return ReturnValue(int32_t{1}); + + auto ZeDeviceNumIndices = + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.numQueues; + return ReturnValue(int32_t(ZeDeviceNumIndices)); + } break; + case UR_DEVICE_INFO_GPU_EU_COUNT: { + uint32_t count = Device->ZeDeviceProperties->numEUsPerSubslice * + Device->ZeDeviceProperties->numSubslicesPerSlice * + Device->ZeDeviceProperties->numSlices; + return ReturnValue(uint32_t{count}); + } + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + return ReturnValue( + uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); + case UR_EXT_DEVICE_INFO_GPU_SLICES: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + return ReturnValue( + uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); + case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); + case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); + case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: + // currently not supported in level zero runtime + return UR_RESULT_ERROR_INVALID_VALUE; + case UR_DEVICE_INFO_BFLOAT16: { + // bfloat16 math functions are not yet supported on Intel GPUs. + return ReturnValue(bool{false}); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_scope_capability_flags_t result = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; + + return ReturnValue(result); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_order_capability_flags_t result = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + + return ReturnValue(result); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // There are no explicit restrictions in L0 programming guide, so assume all + // are supported + ur_memory_scope_capability_flags_t result = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; + + return ReturnValue(result); + } + + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL | + UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; + return ReturnValue(capabilities); + } + + default: + urPrint("Unsupported ParamName in urGetDeviceInfo\n"); + urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or +// a pair of integer values of the form "lower_index:upper_index". +// Here, the indices point to copy engines in a list of all available copy +// engines. 
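+// For example (illustrative settings): SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE=1
+// allows all copy engines, =0 disables them, and =1:3 restricts the plugin to
+// the copy engines with indices 1 through 3 in that list.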
+// This function returns this pair of indices.
+// If the user specifies only a single integer, a value of 0 indicates that
+// the copy engines will not be used at all. A value of 1 indicates that all
+// available copy engines can be used.
+const std::pair<int, int>
+getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) {
+  static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE");
+  // If the environment variable is not set, no copy engines are used when
+  // immediate commandlists are being used. For standard commandlists all are
+  // used.
+  if (!EnvVar) {
+    if (Device->ImmCommandListUsed)
+      return std::pair(-1, -1);   // No copy engines can be used.
+    return std::pair(0, INT_MAX); // All copy engines will be used.
+  }
+  std::string CopyEngineRange = EnvVar;
+  // The environment variable can be a single integer or a pair of integers
+  // separated by ":".
+  auto pos = CopyEngineRange.find(":");
+  if (pos == std::string::npos) {
+    bool UseCopyEngine = (std::stoi(CopyEngineRange) != 0);
+    if (UseCopyEngine)
+      return std::pair(0, INT_MAX); // All copy engines can be used.
+    return std::pair(-1, -1);       // No copy engines will be used.
+  }
+  int LowerCopyEngineIndex = std::stoi(CopyEngineRange.substr(0, pos));
+  int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1));
+  if ((LowerCopyEngineIndex > UpperCopyEngineIndex) ||
+      (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) {
+    urPrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, "
+            "default set.\n");
+    LowerCopyEngineIndex = 0;
+    UpperCopyEngineIndex = INT_MAX;
+  }
+  return std::pair(LowerCopyEngineIndex, UpperCopyEngineIndex);
+}
+
+bool CopyEngineRequested(const ur_device_handle_t &Device) {
+  int LowerCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).first;
+  int UpperCopyQueueIndex = getRangeOfAllowedCopyEngines(Device).second;
+  return ((LowerCopyQueueIndex != -1) || (UpperCopyQueueIndex != -1));
+}
+
+// Whether immediate commandlists will be used for kernel launches and copies.
+// The default is standard commandlists. Setting 1 or 2 specifies use of
+// immediate commandlists. Note: when immediate commandlists are used then
+// device-only events must be either AllHostVisible or OnDemandHostVisibleProxy.
+// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS).
+
+// Get value of immediate commandlists env var setting or -1 if unset
+ur_device_handle_t_::ImmCmdlistMode
+ur_device_handle_t_::useImmediateCommandLists() {
+  // If the immediate commandlist setting is not explicitly set, then use the
+  // device default.
+  static const int ImmediateCommandlistsSetting = [] {
+    const char *ImmediateCommandlistsSettingStr =
+        std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS");
+    if (!ImmediateCommandlistsSettingStr)
+      return -1;
+    return std::stoi(ImmediateCommandlistsSettingStr);
+  }();
+
+  if (ImmediateCommandlistsSetting == -1)
+    // Change this to PerQueue as default after more testing.
+ return NotUsed; + switch (ImmediateCommandlistsSetting) { + case 0: + return NotUsed; + case 1: + return PerQueue; + case 2: + return PerThreadPerQueue; + default: + return NotUsed; + } +} + +ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, + int SubSubDeviceIndex) { + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); + std::vector> + QueueGroupProperties(numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); + + // Initialize ordinal and compute queue group properties + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + i; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties = QueueGroupProperties[i]; + break; + } + } + + // Reinitialize a sub-sub-device with its own ordinal, index. + // Our sub-sub-device representation is currently [Level-Zero sub-device + // handle + Level-Zero compute group/engine index]. Only the specified + // index queue will be used to submit work to the sub-sub-device. + if (SubSubDeviceOrdinal >= 0) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + SubSubDeviceOrdinal; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = + SubSubDeviceIndex; + } else { // Proceed with initialization for root and sub-device + // How is it possible that there are no "compute" capabilities? + if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < + 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (CopyEngineRequested((ur_device_handle_t)this)) { + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (((QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && + (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { + if (QueueGroupProperties[i].numQueues == 1) { + QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::MainCopy].ZeProperties = + QueueGroupProperties[i]; + } else { + QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = + QueueGroupProperties[i]; + break; + } + } + } + if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) + urPrint("NOTE: main blitter/copy engine is not available\n"); + else + urPrint("NOTE: main blitter/copy engine is available\n"); + + if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) + urPrint("NOTE: link blitter/copy engines are not available\n"); + else + urPrint("NOTE: link blitter/copy engines are available\n"); + } + } + + // Maintain various device properties cache. + // Note that we just describe here how to compute the data. + // The real initialization is upon first access. 
+ // + auto ZeDevice = this->ZeDevice; + ZeDeviceProperties.Compute = [ZeDevice](ze_device_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceComputeProperties.Compute = + [ZeDevice](ze_device_compute_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetComputeProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceImageProperties.Compute = + [ZeDevice](ze_device_image_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetImageProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceModuleProperties.Compute = + [ZeDevice](ze_device_module_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetModuleProperties, (ZeDevice, &Properties)); + }; + + ZeDeviceMemoryProperties.Compute = + [ZeDevice]( + std::pair>, + std::vector>> + &Properties) { + uint32_t Count = 0; + ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, + (ZeDevice, &Count, nullptr)); + + auto &PropertiesVector = Properties.first; + auto &PropertiesExtVector = Properties.second; + + PropertiesVector.resize(Count); + PropertiesExtVector.resize(Count); + // Request for extended memory properties be read in + for (uint32_t I = 0; I < Count; ++I) + PropertiesVector[I].pNext = (void *)&PropertiesExtVector[I]; + + ZE_CALL_NOCHECK(zeDeviceGetMemoryProperties, + (ZeDevice, &Count, PropertiesVector.data())); + }; + + ZeDeviceMemoryAccessProperties.Compute = + [ZeDevice](ze_device_memory_access_properties_t &Properties) { + ZE_CALL_NOCHECK(zeDeviceGetMemoryAccessProperties, + (ZeDevice, &Properties)); + }; + + ZeDeviceCacheProperties.Compute = + [ZeDevice](ze_device_cache_properties_t &Properties) { + // TODO: Since v1.0 there can be multiple cache properties. + // For now remember the first one, if any. + uint32_t Count = 0; + ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, + (ZeDevice, &Count, nullptr)); + if (Count > 0) + Count = 1; + ZE_CALL_NOCHECK(zeDeviceGetCacheProperties, + (ZeDevice, &Count, &Properties)); + }; + + ImmCommandListUsed = this->useImmediateCommandLists(); + + if (ImmCommandListUsed == ImmCmdlistMode::NotUsed) { + ZeEventsScope = DeviceEventsSetting; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRetain(ur_device_handle_t Device) { + + // The root-device ref-count remains unchanged (always 1). + if (Device->isSubDevice()) { + Device->RefCount.increment(); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t urDeviceRelease(ur_device_handle_t Device) { + // Root devices are destroyed during the piTearDown process. + if (Device->isSubDevice()) { + if (Device->RefCount.decrementAndTest()) { + delete Device; + } + } + + return UR_RESULT_SUCCESS; +} + +void ZeUSMImportExtension::setZeUSMImport(ur_platform_handle_t_ *Platform) { + // Whether env var SYCL_USM_HOSTPTR_IMPORT has been set requesting + // host ptr import during buffer creation. + const char *USMHostPtrImportStr = std::getenv("SYCL_USM_HOSTPTR_IMPORT"); + if (!USMHostPtrImportStr || std::atoi(USMHostPtrImportStr) == 0) + return; + + // Check if USM hostptr import feature is available. 
+ ze_driver_handle_t DriverHandle = Platform->ZeDriver; + if (ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (DriverHandle, "zexDriverImportExternalPointer", + reinterpret_cast(&zexDriverImportExternalPointer))) == 0) { + ZE_CALL_NOCHECK( + zeDriverGetExtensionFunctionAddress, + (DriverHandle, "zexDriverReleaseImportedPointer", + reinterpret_cast(&zexDriverReleaseImportedPointer))); + // Hostptr import/release is turned on because it has been requested + // by the env var, and this platform supports the APIs. + Enabled = true; + // Hostptr import is only possible if piMemBufferCreate receives a + // hostptr as an argument. The SYCL runtime passes a host ptr + // only when SYCL_HOST_UNIFIED_MEMORY is enabled. Therefore we turn it on. + setEnvVar("SYCL_HOST_UNIFIED_MEMORY", "1"); + } +} +void ZeUSMImportExtension::doZeUSMImport(ze_driver_handle_t DriverHandle, + void *HostPtr, size_t Size) { + ZE_CALL_NOCHECK(zexDriverImportExternalPointer, + (DriverHandle, HostPtr, Size)); +} +void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, + void *HostPtr) { + ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( + ur_device_handle_t Device, ///< [in] handle of the device to partition. + const ur_device_partition_property_t + *Properties, ///< [in] null-terminated array of <$_device_partition_t + ///< enum, value> pairs. + uint32_t NumDevices, ///< [in] the number of sub-devices. + ur_device_handle_t + *OutDevices, ///< [out][optional][range(0, NumDevices)] array of handle + ///< of devices. If NumDevices is less than the number of + ///< sub-devices available, then the function shall only + ///< retrieve that number of sub-devices. + uint32_t *NumDevicesRet ///< [out][optional] pointer to the number of + ///< sub-devices the device can be partitioned into + ///< according to the partitioning property. +) { + // Other partitioning ways are not supported by Level Zero + if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + if ((Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NEXT_PARTITIONABLE && + Properties[1] != UR_DEVICE_AFFINITY_DOMAIN_FLAG_NUMA)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { + if (Properties[1] != 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Devices cache is normally created in piDevicesGet but still make + // sure that cache is populated. + // + auto Res = Device->Platform->populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + + auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { + if (Device->SubDevices.size() == 0) + return 0; + + // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. + // However, if + // SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // still expose CSlices in partitioning by affinity domain for compatibility + // reasons. + if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && + !ExposeCSliceInAffinityPartitioning) { + if (Device->isSubDevice()) { + return 0; + } + } + if (Properties[0] == UR_DEVICE_PARTITION_BY_CSLICE) { + // Not a CSlice-based partitioning. + if (!Device->SubDevices[0]->isCCS()) { + return 0; + } + } + + return Device->SubDevices.size(); + }(); + + // TODO: Consider support for partitioning to <= total sub-devices. 
+ // Currently supported partitioning (by affinity domain/numa) would always + // partition to all sub-devices. + // + if (NumDevices != 0) + UR_ASSERT(NumDevices == EffectiveNumDevices, UR_RESULT_ERROR_INVALID_VALUE); + + for (uint32_t I = 0; I < NumDevices; I++) { + OutDevices[I] = Device->SubDevices[I]; + // reusing the same pi_device needs to increment the reference count + urDeviceRetain(OutDevices[I]); + } + + if (NumDevicesRet) { + *NumDevicesRet = EffectiveNumDevices; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t + Device, ///< [in] handle of the device to select binary for. + const uint8_t **BinaryArray, ///< [in] the array of binaries to select from. + uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. + ///< Must greater than or equal to zero otherwise + ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. + uint32_t + *SelectedBinary ///< [out] the index of the selected binary in the input + ///< array of binaries. If a suitable binary was not + ///< found the function returns ${X}_INVALID_BINARY. +) { + // TODO: this is a bare-bones implementation for choosing a device image + // that would be compatible with the targeted device. An AOT-compiled + // image is preferred over SPIR-V for known devices (i.e. Intel devices) + // The implementation makes no effort to differentiate between multiple images + // for the given device, and simply picks the first one compatible. + // + // Real implementation will use the same mechanism OpenCL ICD dispatcher + // uses. Something like: + // PI_VALIDATE_HANDLE_RETURN_HANDLE(ctx, PI_ERROR_INVALID_CONTEXT); + // return context->dispatch->piextDeviceSelectIR( + // ctx, images, num_images, selected_image); + // where context->dispatch is set to the dispatch table provided by PI + // plugin for platform/device the ctx was created for. + + // Look for GEN binary, which we known can only be handled by Level-Zero now. + const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; + + pi_device_binary *Binaries = + reinterpret_cast(const_cast(BinaryArray)); + + uint32_t *SelectedBinaryInd = SelectedBinary; + + // Find the appropriate device image, fallback to spirv if not found + constexpr uint32_t InvalidInd = std::numeric_limits::max(); + uint32_t Spirv = InvalidInd; + + for (uint32_t i = 0; i < NumBinaries; ++i) { + if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { + *SelectedBinaryInd = i; + return UR_RESULT_SUCCESS; + } + if (strcmp(Binaries[i]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + Spirv = i; + } + // Points to a spirv image, if such indeed was found + if ((*SelectedBinaryInd = Spirv) != InvalidInd) + return UR_RESULT_SUCCESS; + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( + ur_device_handle_t Device, ///< [in] handle of the device. + ur_native_handle_t + *NativeDevice ///< [out] a pointer to the native handle of the device. +) { + *NativeDevice = reinterpret_cast(Device->ZeDevice); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( + ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + ur_device_handle_t + *Device ///< [out] pointer to the handle of the device object created. 
+) { + auto ZeDevice = ur_cast(NativeDevice); + + // The SYCL spec requires that the set of devices must remain fixed for the + // duration of the application's execution. We assume that we found all of the + // Level Zero devices when we initialized the platforms/devices cache, so the + // "NativeHandle" must already be in the cache. If it is not, this must not be + // a valid Level Zero device. + // + // TODO: maybe we should populate cache of platforms if it wasn't already. + // For now assert that is was populated. + UR_ASSERT(PiPlatformCachePopulated, UR_RESULT_ERROR_INVALID_VALUE); + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + + ur_device_handle_t Dev = nullptr; + for (ur_platform_handle_t ThePlatform : *PiPlatformsCache) { + Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); + if (Dev) { + // Check that the input Platform, if was given, matches the found one. + UR_ASSERT(!Platform || Platform == ThePlatform, + UR_RESULT_ERROR_INVALID_PLATFORM); + break; + } + } + + if (Dev == nullptr) + return UR_RESULT_ERROR_INVALID_VALUE; + + *Device = Dev; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( + ur_device_handle_t Device, ///< [in] handle of the device instance + uint64_t *DeviceTimestamp, ///< [out][optional] pointer to the Device's + ///< global timestamp that correlates with the + ///< Host's global timestamp value + uint64_t *HostTimestamp ///< [out][optional] pointer to the Host's global + ///< timestamp that correlates with the Device's + ///< global timestamp value +) { + const uint64_t &ZeTimerResolution = + Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxCount = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + uint64_t DeviceClockCount, Dummy; + + ZE2UR_CALL(zeDeviceGetGlobalTimestamps, + (Device->ZeDevice, + HostTimestamp == nullptr ? &Dummy : HostTimestamp, + &DeviceClockCount)); + + if (DeviceTimestamp != nullptr) { + *DeviceTimestamp = + (DeviceClockCount & TimestampMaxCount) * ZeTimerResolution; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index ecbc9cc6a21d2..09e942a6441b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -7,4 +7,160 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" + +enum EventsScope { + // All events are created host-visible. + AllHostVisible, + // All events are created with device-scope and only when + // host waits them or queries their status that a proxy + // host-visible event is created and set to signal after + // original event signals. + OnDemandHostVisibleProxy, + // All events are created with device-scope and only + // when a batch of commands is submitted for execution a + // last command in that batch is added to signal host-visible + // completion of each command in this batch (the default mode). 
+ LastCommandInBatchHostVisible +}; + +struct ur_device_handle_t_ : _ur_object { + ur_device_handle_t_(ze_device_handle_t Device, ur_platform_handle_t Plt, + ur_device_handle_t ParentDevice = nullptr) + : ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice}, + ZeDeviceProperties{}, ZeDeviceComputeProperties{} { + // NOTE: one must additionally call initialize() to complete + // UR device creation. + } + + // The helper structure that keeps info about a command queue groups of the + // device. It is not changed after it is initialized. + struct queue_group_info_t { + enum type { + MainCopy, + LinkCopy, + Compute, + Size // must be last + }; + + // Keep the ordinal of the commands group as returned by + // zeDeviceGetCommandQueueGroupProperties. A value of "-1" means that + // there is no such queue group available in the Level Zero runtime. + int32_t ZeOrdinal{-1}; + + // Keep the index of the specific queue in this queue group where + // all the command enqueues of the corresponding type should go to. + // The value of "-1" means that no hard binding is defined and + // implementation can choose specific queue index on its own. + int32_t ZeIndex{-1}; + + // Keeps the queue group properties. + ZeStruct ZeProperties; + }; + + std::vector QueueGroup = + std::vector(queue_group_info_t::Size); + + // This returns "true" if a main copy engine is available for use. + bool hasMainCopyEngine() const { + return QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal >= 0; + } + + // This returns "true" if a link copy engine is available for use. + bool hasLinkCopyEngine() const { + return QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal >= 0; + } + + // This returns "true" if a main or link copy engine is available for use. + bool hasCopyEngine() const { + return hasMainCopyEngine() || hasLinkCopyEngine(); + } + + // Initialize the entire UR device. + // Optional param `SubSubDeviceOrdinal` `SubSubDeviceIndex` are the compute + // command queue ordinal and index respectively, used to initialize + // sub-sub-devices. + ur_result_t initialize(int SubSubDeviceOrdinal = -1, + int SubSubDeviceIndex = -1); + + // Level Zero device handle. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + const ze_device_handle_t ZeDevice; + + // Keep the subdevices that are partitioned from this ur_device_handle_t for + // reuse The order of sub-devices in this vector is repeated from the + // ze_device_handle_t array that are returned from zeDeviceGetSubDevices() + // call, which will always return sub-devices in the fixed same order. + std::vector SubDevices; + + // PI platform to which this device belongs. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + ur_platform_handle_t Platform; + + // Root-device of a sub-device, null if this is not a sub-device. + // This field is only set at _ur_device_handle_t creation time, and cannot + // change. Therefore it can be accessed without holding a lock on this + // _ur_device_handle_t. + const ur_device_handle_t RootDevice; + + enum ImmCmdlistMode { + // Immediate commandlists are not used. + NotUsed = 0, + // One set of compute and copy immediate commandlists per queue. + PerQueue, + // One set of compute and copy immediate commandlists per host thread that + // accesses the queue. 
+ PerThreadPerQueue + }; + // Read env settings to select immediate commandlist mode. + ImmCmdlistMode useImmediateCommandLists(); + + // Returns whether immediate command lists are used on this device. + ImmCmdlistMode ImmCommandListUsed{}; + + // Scope of events used for events on the device + // Can be adjusted with SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS + // for non-immediate command lists + EventsScope ZeEventsScope = AllHostVisible; + + bool isSubDevice() { return RootDevice != nullptr; } + + // Is this a Data Center GPU Max series (aka PVC). + bool isPVC() { return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; } + + // Does this device represent a single compute slice? + bool isCCS() const { + return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeIndex >= 0; + } + + // Cache of the immutable device properties. + ZeCache> ZeDeviceProperties; + ZeCache> ZeDeviceComputeProperties; + ZeCache> ZeDeviceImageProperties; + ZeCache> ZeDeviceModuleProperties; + ZeCache>, + std::vector>>> + ZeDeviceMemoryProperties; + ZeCache> + ZeDeviceMemoryAccessProperties; + ZeCache> ZeDeviceCacheProperties; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 2889db7884b0e..318a931d608f3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -6,4 +6,1171 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" +#include + +void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { + urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); + + for (uint32_t I = 0; I < UrZeEventList.Length; I++) { + urPrint(" %#llx", ur_cast(UrZeEventList.ZeEventList[I])); + } + + urPrint("\n"); +} + +// This is an experimental option that allows the use of multiple command lists +// when submitting barriers. The default is 0. +static const bool UseMultipleCmdlistBarriers = [] { + const char *UseMultipleCmdlistBarriersFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + if (!UseMultipleCmdlistBarriersFlag) + return true; + return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; +}(); + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + if (EventWaitList) { + bool UseCopyEngine = false; + + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList = {}; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &WaitList = (*Event)->WaitList; + auto ZeCommandList = CommandList->first; + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + // Execute command list asynchronously as the event will be used + // to track down its completion. + return Queue->executeCommandList(CommandList); + } + + { + // If wait-list is empty, then this particular command should wait until + // all previous enqueued commands to the command-queue have completed. + // + // TODO: find a way to do that without blocking the host. + + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + if (OutEvent) { + UR_CALL(createEventAndAssociateQueue(Queue, OutEvent, + UR_EXT_COMMAND_TYPE_USER, + Queue->CommandListMap.end(), + /* IsInternal */ false)); + } + + Queue->synchronize(); + + if (OutEvent) { + Queue->LastCommandEvent = reinterpret_cast(*OutEvent); + + ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent)); + (*OutEvent)->Completed = true; + } + } + + if (!Queue->Device->ImmCommandListUsed) { + std::unique_lock Lock(Queue->Mutex); + resetCommandLists(Queue); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating that + ///< all previously enqueued commands must be complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + // Helper function for appending a barrier to a command list. + auto insertBarrierIntoCmdList = + [&Queue](ur_command_list_ptr_t CmdList, + const _ur_ze_event_list_t &EventWaitList, + ur_event_handle_t &Event, bool IsInternal) { + UR_CALL(createEventAndAssociateQueue( + Queue, &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, IsInternal)); + + Event->WaitList = EventWaitList; + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + return UR_RESULT_SUCCESS; + }; + + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + // Indicator for whether batching is allowed. 
This may be changed later in + // this function, but allow it by default. + bool OkToBatch = true; + + // If we have a list of events to make the barrier from, then we can create a + // barrier on these and use the resulting event as our future barrier. + // We use the same approach if + // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a + // positive value. + // We use the same approach if we have in-order queue because every command + // depends on previous one, so we don't need to insert barrier to multiple + // command lists. + if (NumEventsInWaitList || !UseMultipleCmdlistBarriers || + Queue->isInOrderQueue()) { + // Retain the events as they will be owned by the result event. + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, false /*UseCopyEngine=*/)); + + // Get an arbitrary command-list in the queue. + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + + // Insert the barrier into the command-list and execute. + UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)); + + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + + // Because of the dependency between commands in the in-order queue we don't + // need to keep track of any active barriers if we have in-order queue. + if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue()) { + auto UREvent = reinterpret_cast(*Event); + Queue->ActiveBarriers.add(UREvent); + } + return UR_RESULT_SUCCESS; + } + + // Since there are no events to explicitly create a barrier for, we are + // inserting a queue-wide barrier. + + // Command list(s) for putting barriers. + std::vector CmdLists; + + // There must be at least one L0 queue. + auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get(); + auto &CopyGroup = Queue->CopyQueueGroupsByTID.get(); + UR_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(), + UR_RESULT_ERROR_INVALID_QUEUE); + + size_t NumQueues = 0; + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + NumQueues += QueueGroup.second.ZeQueues.size(); + + OkToBatch = true; + // Get an available command list tied to each command queue. We need + // these so a queue-wide barrier can be inserted into each command + // queue. + CmdLists.reserve(NumQueues); + for (auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) { + bool UseCopyEngine = + QueueGroup.second.Type != ur_queue_handle_t_::queue_type::Compute; + if (Queue->Device->ImmCommandListUsed) { + // If immediate command lists are being used, each will act as their own + // queue, so we must insert a barrier into each. + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + if (ImmCmdList != Queue->CommandListMap.end()) + CmdLists.push_back(ImmCmdList); + } else { + for (auto ZeQueue : QueueGroup.second.ZeQueues) { + if (ZeQueue) { + ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)); + CmdLists.push_back(CmdList); + } + } + } + } + + // If no activity has occurred on the queue then there will be no cmdlists. + // We need one for generating an Event, so create one. + if (CmdLists.size() == 0) { + // Get any available command list. 
+ ur_command_list_ptr_t CmdList; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + CmdLists.push_back(CmdList); + } + + if (CmdLists.size() > 1) { + // Insert a barrier into each unique command queue using the available + // command-lists. + std::vector EventWaitVector(CmdLists.size()); + for (size_t I = 0; I < CmdLists.size(); ++I) { + UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{}, + EventWaitVector[I], + true /*IsInternal*/)); + } + // If there were multiple queues we need to create a "convergence" event to + // be our active barrier. This convergence event is signalled by a barrier + // on all the events from the barriers we have inserted into each queue. + // Use the first command list as our convergence command list. + ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0]; + + // Create an event list. It will take ownership over all relevant events so + // we relinquish ownership and let it keep all events it needs. + _ur_ze_event_list_t BaseWaitList; + UR_CALL(BaseWaitList.createAndRetainUrZeEventList( + EventWaitVector.size(), + reinterpret_cast(EventWaitVector.data()), + Queue, ConvergenceCmdList->second.isCopy(Queue))); + + // Insert a barrier with the events from each command-queue into the + // convergence command list. The resulting event signals the convergence of + // all barriers. + UR_CALL(insertBarrierIntoCmdList(ConvergenceCmdList, BaseWaitList, *Event, + IsInternal)); + } else { + // If there is only a single queue then insert a barrier and the single + // result event can be used as our active barrier and used as the return + // event. Take into account whether output event is discarded or not. + UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{}, *Event, + IsInternal)); + } + + // Execute each command list so the barriers can be encountered. + for (ur_command_list_ptr_t &CmdList : CmdLists) + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + + UR_CALL(Queue->ActiveBarriers.clear()); + auto UREvent = reinterpret_cast(*Event); + Queue->ActiveBarriers.add(UREvent); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( + ur_event_handle_t Event, ///< [in] handle of the event object + ur_event_info_t PropName, ///< [in] the name of the event property to query + size_t PropValueSize, ///< [in] size in bytes of the event property value + void *PropValue, ///< [out][optional] value of the event property + size_t + *PropValueSizeRet ///< [out][optional] bytes returned in event property +) { + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + + switch (PropName) { + case UR_EVENT_INFO_COMMAND_QUEUE: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_queue_handle_t{Event->UrQueue}); + } + case UR_EVENT_INFO_CONTEXT: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_context_handle_t{Event->Context}); + } + case UR_EVENT_INFO_COMMAND_TYPE: { + std::shared_lock EventLock(Event->Mutex); + return ReturnValue(ur_cast(Event->CommandType)); + } + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: { + // Check to see if the event's Queue has an open command list due to + // batching. If so, go ahead and close and submit it, because it is + // possible that this is trying to query some event's status that + // is part of the batch. This isn't strictly required, but it seems + // like a reasonable thing to do. 
+ auto UrQueue = Event->UrQueue; + if (UrQueue) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(UrQueue->Mutex); + const auto &OpenCommandList = UrQueue->eventOpenCommandList(Event); + if (OpenCommandList != UrQueue->CommandListMap.end()) { + UR_CALL(UrQueue->executeOpenCommandList( + OpenCommandList->second.isCopy(UrQueue))); + } + } + + // Level Zero has a much more explicit notion of command submission than + // OpenCL. It doesn't happen unless the user submits a command list. We've + // done it just above so the status is at least PI_EVENT_RUNNING. + uint32_t Result = ur_cast(UR_EVENT_STATUS_RUNNING); + + // Make sure that we query a host-visible event only. + // If one wasn't yet created then don't create it here as well, and + // just conservatively return that event is not yet completed. + std::shared_lock EventLock(Event->Mutex); + auto HostVisibleEvent = Event->HostVisibleEvent; + if (Event->Completed) { + Result = UR_EVENT_STATUS_COMPLETE; + } else if (HostVisibleEvent) { + ze_result_t ZeResult; + ZeResult = + ZE_CALL_NOCHECK(zeEventQueryStatus, (HostVisibleEvent->ZeEvent)); + if (ZeResult == ZE_RESULT_SUCCESS) { + Result = UR_EVENT_STATUS_COMPLETE; + } + } + return ReturnValue(Result); + return UR_RESULT_SUCCESS; + } + case UR_EVENT_INFO_REFERENCE_COUNT: { + return ReturnValue(Event->RefCount.load()); + } + default: + urPrint("Unsupported ParamName in urEventGetInfo: ParamName=%d(%x)\n", + PropName, PropName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t Event, ///< [in] handle of the event object + ur_profiling_info_t + PropName, ///< [in] the name of the profiling property to query + size_t + PropValueSize, ///< [in] size in bytes of the profiling property value + void *PropValue, ///< [out][optional] value of the profiling property + size_t *PropValueSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes returned in propValue +) { + std::shared_lock EventLock(Event->Mutex); + if (Event->UrQueue && + (Event->UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + ur_device_handle_t Device = + Event->UrQueue ? Event->UrQueue->Device : Event->Context->Devices[0]; + + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + + ze_kernel_timestamp_result_t tsResult; + + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_START: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); + uint64_t ContextStartTime = + (tsResult.global.kernelStart & TimestampMaxValue) * ZeTimerResolution; + return ReturnValue(ContextStartTime); + } + case UR_PROFILING_INFO_COMMAND_END: { + ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult)); + + uint64_t ContextStartTime = + (tsResult.global.kernelStart & TimestampMaxValue); + uint64_t ContextEndTime = (tsResult.global.kernelEnd & TimestampMaxValue); + + // + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. 
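+    // For example (illustrative numbers): with 32 valid timestamp bits,
+    // TimestampMaxValue is 2^32 - 1; a masked start of 0xFFFFFF00 and a
+    // masked end of 0x100 would otherwise give END < START, so adding
+    // TimestampMaxValue to the end value restores the ordering before the
+    // result is scaled by the timer resolution.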
+ // + if (ContextEndTime <= ContextStartTime) { + ContextEndTime += TimestampMaxValue; + } + ContextEndTime *= ZeTimerResolution; + return ReturnValue(ContextEndTime); + } + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No users for this case + // TODO: Implement commmand submission time when needed, + // by recording device timestamp (using zeDeviceGetGlobalTimestamps) + // before submitting command to device + return ReturnValue(uint64_t{0}); + default: + urPrint("urEventGetProfilingInfo: not supported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( + ze_event_handle_t &ZeHostVisibleEvent) { + + std::scoped_lock Lock(UrQueue->Mutex, + this->Mutex); + + if (!HostVisibleEvent) { + if (UrQueue->Device->ZeEventsScope != OnDemandHostVisibleProxy) + die("getOrCreateHostVisibleEvent: missing host-visible event"); + + // Submit the command(s) signalling the proxy event to the queue. + // We have to first submit a wait for the device-only event for which this + // proxy is created. + // + // Get a new command list to be used on this call + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + ur_command_list_ptr_t CommandList{}; + UR_CALL(UrQueue->Context->getAvailableCommandList( + UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch)) + + // Create a "proxy" host-visible event. + UR_CALL(createEventAndAssociateQueue( + UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + + UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) + } + + ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventWait( + uint32_t NumEvents, ///< [in] number of events in the event list + const ur_event_handle_t + *EventWaitList ///< [in][range(0, numEvents)] pointer to a list of + ///< events to wait for completion +) { + for (uint32_t I = 0; I < NumEvents; I++) { + if (EventWaitList[I]->UrQueue->Device->ZeEventsScope == + OnDemandHostVisibleProxy) { + // Make sure to add all host-visible "proxy" event signals if needed. + // This ensures that all signalling commands are submitted below and + // thus proxy events can be waited without a deadlock. + // + ur_event_handle_t_ *Event = + ur_cast(EventWaitList[I]); + if (!Event->hasExternalRefs()) + die("urEventsWait must not be called for an internal event"); + + ze_event_handle_t ZeHostVisibleEvent; + if (auto Res = Event->getOrCreateHostVisibleEvent(ZeHostVisibleEvent)) + return Res; + } + } + // Submit dependent open command lists for execution, if any + for (uint32_t I = 0; I < NumEvents; I++) { + ur_event_handle_t_ *Event = ur_cast(EventWaitList[I]); + auto UrQueue = Event->UrQueue; + if (UrQueue) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(UrQueue->Mutex); + + UR_CALL(UrQueue->executeAllOpenCommandLists()); + } + } + std::unordered_set Queues; + for (uint32_t I = 0; I < NumEvents; I++) { + { + ur_event_handle_t_ *Event = + ur_cast(EventWaitList[I]); + { + std::shared_lock EventLock(Event->Mutex); + if (!Event->hasExternalRefs()) + die("piEventsWait must not be called for an internal event"); + + if (!Event->Completed) { + auto HostVisibleEvent = Event->HostVisibleEvent; + if (!HostVisibleEvent) + die("The host-visible proxy event missing"); + + ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; + urPrint("ZeEvent = %#llx\n", ur_cast(ZeEvent)); + ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + Event->Completed = true; + } + } + if (auto Q = Event->UrQueue) { + if (Q->Device->ImmCommandListUsed && Q->isInOrderQueue()) + // Use information about waited event to cleanup completed events in + // the in-order queue. + CleanupEventsInImmCmdLists( + Event->UrQueue, false /* QueueLocked */, false /* QueueSynced */, + reinterpret_cast(Event)); + else { + // NOTE: we are cleaning up after the event here to free resources + // sooner in case run-time is not calling piEventRelease soon enough. + CleanupCompletedEvent(reinterpret_cast(Event)); + // For the case when we have out-of-order queue or regular command + // lists its more efficient to check fences so put the queue in the + // set to cleanup later. + Queues.insert(Q); + } + } + } + } + + // We waited some events above, check queue for signaled command lists and + // reset them. + for (auto &Q : Queues) { + std::unique_lock Lock(Q->Mutex); + resetCommandLists(Q); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain( + ur_event_handle_t Event ///< [in] handle of the event object +) { + Event->RefCountExternal++; + Event->RefCount.increment(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease( + ur_event_handle_t Event ///< [in] handle of the event object +) { + Event->RefCountExternal--; + UR_CALL(urEventReleaseInternal(Event)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t Event, ///< [in] handle of the event. + ur_native_handle_t + *NativeEvent ///< [out] a pointer to the native handle of the event. +) { + { + std::shared_lock Lock(Event->Mutex); + auto *ZeEvent = ur_cast(NativeEvent); + *ZeEvent = Event->ZeEvent; + } + // Event can potentially be in an open command-list, make sure that + // it is submitted for execution to avoid potential deadlock if + // interop app is going to wait for it. + auto Queue = Event->UrQueue; + if (Queue) { + std::scoped_lock lock(Queue->Mutex); + const auto &OpenCommandList = Queue->eventOpenCommandList(Event); + if (OpenCommandList != Queue->CommandListMap.end()) { + UR_CALL( + Queue->executeOpenCommandList(OpenCommandList->second.isCopy(Queue))); + } + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_event_handle_t + *Event ///< [out] pointer to the handle of the event object created. +) { + UR_CALL(EventCreate(Context, nullptr, true, Event)); + + (*Event)->RefCountExternal++; + ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t NativeEvent, ///< [in] the native handle of the event. 
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    ur_event_handle_t
+        *Event ///< [out] pointer to the handle of the event object created.
+) {
+
+  // We don't have urEventCreate, so use this check for now to know that
+  // the call comes from piEventCreate().
+  if (NativeEvent == nullptr) {
+    UR_CALL(EventCreate(Context, nullptr, true, Event));
+
+    (*Event)->RefCountExternal++;
+    ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
+    return UR_RESULT_SUCCESS;
+  }
+
+  auto ZeEvent = ur_cast<ze_event_handle_t>(NativeEvent);
+  ur_event_handle_t_ *UrEvent{};
+  try {
+    UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */,
+                                     Context, UR_EXT_COMMAND_TYPE_USER, true);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  // Assume the native event is host-visible, or otherwise we'd
+  // need to create a host-visible proxy for it.
+  UrEvent->HostVisibleEvent = reinterpret_cast<ur_event_handle_t>(UrEvent);
+
+  // Unlike regular events managed by the SYCL RT, we don't have to wait for
+  // interop event completion, and don't need to do their `cleanup()`. This in
+  // particular guarantees that the extra `piEventRelease` is not called on
+  // them. That release is needed to match the `piEventRetain` of regular
+  // events made for waiting for event completion, but not this interop event.
+  UrEvent->CleanedUp = true;
+
+  *Event = reinterpret_cast<ur_event_handle_t>(UrEvent);
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(
+    ur_event_handle_t Event, ///< [in] handle of the event object
+    ur_execution_info_t ExecStatus, ///< [in] execution status of the event
+    ur_event_callback_t Notify, ///< [in] execution status of the event
+    void *UserData ///< [in][out][optional] pointer to data to be passed to
+                   ///< callback.
+) {
+  std::ignore = Event;
+  std::ignore = ExecStatus;
+  std::ignore = Notify;
+  std::ignore = UserData;
+  urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__);
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
+  if (!Event->RefCount.decrementAndTest())
+    return UR_RESULT_SUCCESS;
+
+  if (Event->CommandType == UR_COMMAND_MEM_UNMAP && Event->CommandData) {
+    // Free the memory allocated in the piEnqueueMemBufferMap.
+    if (auto Res = ZeMemFreeHelper(Event->Context, Event->CommandData))
+      return Res;
+    Event->CommandData = nullptr;
+  }
+  if (Event->OwnNativeHandle) {
+    if (DisableEventsCaching) {
+      auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
+      // Gracefully handle the case that L0 was already unloaded.
+      if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
+        return ze2urResult(ZeResult);
+      auto Context = Event->Context;
+      if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
+        return Res;
+    }
+  }
+  // It is possible that the host-visible event was never created.
+  // In case it was, check whether it is different from this same event
+  // and release a reference to it.
+  if (Event->HostVisibleEvent && Event->HostVisibleEvent != Event) {
+    // Decrement ref-count of the host-visible proxy event.
+    UR_CALL(urEventReleaseInternal(Event->HostVisibleEvent));
+  }
+
+  // Save a pointer to the queue before deleting/resetting the event.
+  // When we add an event to the cache we need to check whether profiling is
+  // enabled or not, so we access properties of the queue and that's why the
+  // queue must be released later.
+ auto Queue = Event->UrQueue; + if (DisableEventsCaching || !Event->OwnNativeHandle) { + delete Event; + } else { + Event->Context->addEventToContextCache(Event); + } + + // We intentionally incremented the reference counter when an event is + // created so that we can avoid pi_queue is released before the associated + // pi_event is released. Here we have to decrement it so pi_queue + // can be released successfully. + if (Queue) { + UR_CALL(urQueueReleaseInternal(Queue)); + } + + return UR_RESULT_SUCCESS; +} + +// Helper function to implement zeHostSynchronize. +// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. +// This allows for a much more responsive debugging of hangs. +// +template +ze_result_t zeHostSynchronizeImpl(Func Api, T Handle) { + if (!UrL0Debug) { + return Api(Handle, UINT64_MAX); + } + + ze_result_t R; + while ((R = Api(Handle, 1000)) == ZE_RESULT_NOT_READY) + ; + return R; +} + +// Template function to do various types of host synchronizations. +// This is intended to be used instead of direct calls to specific +// Level-Zero synchronization APIs. +// +template ze_result_t zeHostSynchronize(T Handle); +template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle) { + return zeHostSynchronizeImpl(zeEventHostSynchronize, Handle); +} +template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle) { + return zeHostSynchronizeImpl(zeCommandQueueSynchronize, Handle); +} + +// Perform any necessary cleanup after an event has been signalled. +// This currently makes sure to release any kernel that may have been used by +// the event, updates the last command event in the queue and cleans up all dep +// events of the event. +// If the caller locks queue mutex then it must pass 'true' to QueueLocked. +ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked) { + ur_kernel_handle_t AssociatedKernel = nullptr; + // List of dependent events. + std::list EventsToBeReleased; + ur_queue_handle_t AssociatedQueue = nullptr; + { + std::scoped_lock EventLock(Event->Mutex); + // Exit early of event was already cleanedup. + if (Event->CleanedUp) + return UR_RESULT_SUCCESS; + + AssociatedQueue = Event->UrQueue; + + // Remember the kernel associated with this event if there is one. We are + // going to release it later. + if (Event->CommandType == UR_COMMAND_KERNEL_LAUNCH && Event->CommandData) { + AssociatedKernel = + reinterpret_cast(Event->CommandData); + Event->CommandData = nullptr; + } + + // Make a list of all the dependent events that must have signalled + // because this event was dependent on them. + Event->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + + Event->CleanedUp = true; + } + + auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) { + if (IndirectAccessTrackingEnabled) { + // piKernelRelease is called by CleanupCompletedEvent(Event) as soon as + // kernel execution has finished. This is the place where we need to + // release memory allocations. If kernel is not in use (not submitted by + // some other thread) then release referenced memory allocations. As a + // result, memory can be deallocated and context can be removed from + // container in the platform. That's why we need to lock a mutex here. + ur_platform_handle_t Plt = Kernel->Program->Context->getPlatform(); + std::scoped_lock ContextsLock(Plt->ContextsMutex); + + if (--Kernel->SubmissionsCount == 0) { + // Kernel is not submitted for execution, release referenced memory + // allocations. 
+ for (auto &MemAlloc : Kernel->MemAllocs) { + // std::pair *, Hash + USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, + MemAlloc->second.OwnZeMemHandle); + } + Kernel->MemAllocs.clear(); + } + } + }; + + // We've reset event data members above, now cleanup resources. + if (AssociatedKernel) { + ReleaseIndirectMem(AssociatedKernel); + UR_CALL(urKernelRelease(AssociatedKernel)); + } + + if (AssociatedQueue) { + { + // Lock automatically releases when this goes out of scope. + std::unique_lock QueueLock(AssociatedQueue->Mutex, + std::defer_lock); + if (!QueueLocked) + QueueLock.lock(); + + // If this event was the LastCommandEvent in the queue, being used + // to make sure that commands were executed in-order, remove this. + // If we don't do this, the event can get released and freed leaving + // a dangling pointer to this event. It could also cause unneeded + // already finished events to show up in the wait list. + if (AssociatedQueue->LastCommandEvent == Event) { + AssociatedQueue->LastCommandEvent = nullptr; + } + } + + // Release this event since we explicitly retained it on creation and + // association with queue. Events which don't have associated queue doesn't + // require this release because it means that they are not created using + // createEventAndAssociateQueue, i.e. additional retain was not made. + UR_CALL(urEventReleaseInternal(Event)); + } + + // The list of dependent events will be appended to as we walk it so that this + // algorithm doesn't go recursive due to dependent events themselves being + // dependent on other events forming a potentially very deep tree, and deep + // recursion. That turned out to be a significant problem with the recursive + // code that preceded this implementation. + while (!EventsToBeReleased.empty()) { + ur_event_handle_t DepEvent = EventsToBeReleased.front(); + DepEvent->Completed = true; + EventsToBeReleased.pop_front(); + + ur_kernel_handle_t DepEventKernel = nullptr; + { + std::scoped_lock DepEventLock(DepEvent->Mutex); + DepEvent->WaitList.collectEventsForReleaseAndDestroyPiZeEventList( + EventsToBeReleased); + if (IndirectAccessTrackingEnabled) { + // DepEvent has finished, we can release the associated kernel if there + // is one. This is the earliest place we can do this and it can't be + // done twice, so it is safe. Lock automatically releases when this goes + // out of scope. + // TODO: this code needs to be moved out of the guard. + if (DepEvent->CommandType == UR_COMMAND_KERNEL_LAUNCH && + DepEvent->CommandData) { + DepEventKernel = + reinterpret_cast(DepEvent->CommandData); + DepEvent->CommandData = nullptr; + } + } + } + if (DepEventKernel) { + ReleaseIndirectMem(DepEventKernel); + // UR_CALL(piKernelRelease(DepEventKernel)); + } + UR_CALL(urEventReleaseInternal(DepEvent)); + } + + return UR_RESULT_SUCCESS; +} + +// Helper function for creating a PI event. +// The "Queue" argument specifies the PI queue where a command is submitted. +// The "HostVisible" argument specifies if event needs to be allocated from +// a host-visible pool. 
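// A minimal stand-alone sketch of the raw Level Zero calls that the EventCreate
// helper below wraps via the context's event-pool cache. The hContext/hDevice
// handles are assumed to have been created elsewhere; error checking is omitted.
#include <level_zero/ze_api.h>

void createHostVisibleEventSketch(ze_context_handle_t hContext,
                                  ze_device_handle_t hDevice) {
  // One pool provides a fixed number of event slots.
  ze_event_pool_desc_t PoolDesc = {ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
                                   ZE_EVENT_POOL_FLAG_HOST_VISIBLE,
                                   /*count*/ 256};
  ze_event_pool_handle_t Pool;
  zeEventPoolCreate(hContext, &PoolDesc, 1, &hDevice, &Pool);

  // Each event occupies one index in the pool; host signal scope makes its
  // completion visible to the host without a proxy event.
  ze_event_desc_t EventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
                               /*index*/ 0,
                               /*signal scope*/ ZE_EVENT_SCOPE_FLAG_HOST,
                               /*wait scope*/ 0};
  ze_event_handle_t Event;
  zeEventCreate(Pool, &EventDesc, &Event);
}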
+// +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool HostVisible, ur_event_handle_t *RetEvent) { + + bool ProfilingEnabled = + !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + + if (auto CachedEvent = + Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { + *RetEvent = CachedEvent; + return UR_RESULT_SUCCESS; + } + + ze_event_handle_t ZeEvent; + ze_event_pool_handle_t ZeEventPool = {}; + + size_t Index = 0; + + if (auto Res = Context->getFreeSlotInExistingOrNewPool( + ZeEventPool, Index, HostVisible, ProfilingEnabled)) + return Res; + + ZeStruct ZeEventDesc; + ZeEventDesc.index = Index; + ZeEventDesc.wait = 0; + + if (HostVisible) { + ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + } else { + // + // Set the scope to "device" for every event. This is sufficient for + // global device access and peer device access. If needed to be seen on + // the host we are doing special handling, see EventsScope options. + // + // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be + // used in some circumstances. + // + ZeEventDesc.signal = 0; + } + + ZE2UR_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent)); + + try { + *RetEvent = new ur_event_handle_t_( + ZeEvent, ZeEventPool, reinterpret_cast(Context), + UR_EXT_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (HostVisible) + (*RetEvent)->HostVisibleEvent = + reinterpret_cast(*RetEvent); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_event_handle_t_::reset() { + UrQueue = nullptr; + CleanedUp = false; + Completed = false; + CommandData = nullptr; + CommandType = UR_EXT_COMMAND_TYPE_USER; + WaitList = {}; + RefCountExternal = 0; + RefCount.reset(); + CommandList = std::nullopt; + + if (!isHostVisible()) + HostVisibleEvent = nullptr; + + ZE2UR_CALL(zeEventHostReset, (ZeEvent)); + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( + uint32_t EventListLength, const ur_event_handle_t *EventList, + ur_queue_handle_t CurQueue, bool UseCopyEngine) { + this->Length = 0; + this->ZeEventList = nullptr; + this->UrEventList = nullptr; + + if (CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr) { + if (CurQueue->Device->ImmCommandListUsed) { + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents()) { + // If queue is in-order with discarded events and if + // new command list is different from the last used command list then + // signal new event from the last immediate command list. We are going + // to insert a barrier in the new command list waiting for that event. + auto QueueGroup = CurQueue->getQueueGroup(UseCopyEngine); + uint32_t QueueGroupOrdinal, QueueIndex; + auto NextIndex = + QueueGroup.getQueueIndex(&QueueGroupOrdinal, &QueueIndex, + /*QueryOnly */ true); + auto NextImmCmdList = QueueGroup.ImmCmdLists[NextIndex]; + if (CurQueue->LastUsedCommandList != CurQueue->CommandListMap.end() && + CurQueue->LastUsedCommandList != NextImmCmdList) { + CurQueue->signalEventFromCmdListIfLastEventDiscarded( + CurQueue->LastUsedCommandList); + } + } + } else { + // Ensure LastCommandEvent's batch is submitted if it is differrent + // from the one this command is going to. If we reuse discarded events + // then signalEventFromCmdListIfLastEventDiscarded will be called at batch + // close if needed. 
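// The in-order handling in this function boils down to: when the queue is
// in-order, also wait on the event of the previously enqueued command. A
// self-contained sketch of that wait-list construction (the types are
// simplified stand-ins for the adapter's own):
#include <vector>

struct SketchEvent {};

struct SketchQueue {
  bool IsInOrder = true;
  SketchEvent *LastCommandEvent = nullptr; // event of the last enqueued command
};

// Build the wait list for a new command: the user-provided events plus, for an
// in-order queue, the event of the previously enqueued command.
std::vector<SketchEvent *>
buildWaitList(SketchQueue &Q, const std::vector<SketchEvent *> &UserWaitList) {
  std::vector<SketchEvent *> WaitList = UserWaitList;
  if (Q.IsInOrder && Q.LastCommandEvent)
    WaitList.push_back(Q.LastCommandEvent); // enforce in-order execution
  return WaitList;
}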
+ const auto &OpenCommandList = + CurQueue->eventOpenCommandList(CurQueue->LastCommandEvent); + if (OpenCommandList != CurQueue->CommandListMap.end() && + OpenCommandList->second.isCopy(CurQueue) != UseCopyEngine) { + + if (auto Res = CurQueue->executeOpenCommandList( + OpenCommandList->second.isCopy(CurQueue))) + return Res; + } + } + } + + // For in-order queues, every command should be executed only after the + // previous command has finished. The event associated with the last + // enqueued command is added into the waitlist to ensure in-order semantics. + bool IncludeLastCommandEvent = + CurQueue->isInOrderQueue() && CurQueue->LastCommandEvent != nullptr; + + // If the last event is discarded then we already have a barrier waiting for + // that event, so must not include the last command event into the wait + // list because it will cause waiting for event which was reset. + if (ReuseDiscardedEvents && CurQueue->isDiscardEvents() && + CurQueue->LastCommandEvent && CurQueue->LastCommandEvent->IsDiscarded) + IncludeLastCommandEvent = false; + + try { + uint32_t TmpListLength = 0; + + if (IncludeLastCommandEvent) { + this->ZeEventList = new ze_event_handle_t[EventListLength + 1]; + this->UrEventList = new ur_event_handle_t[EventListLength + 1]; + std::shared_lock Lock(CurQueue->LastCommandEvent->Mutex); + this->ZeEventList[0] = CurQueue->LastCommandEvent->ZeEvent; + this->UrEventList[0] = CurQueue->LastCommandEvent; + TmpListLength = 1; + } else if (EventListLength > 0) { + this->ZeEventList = new ze_event_handle_t[EventListLength]; + this->UrEventList = new ur_event_handle_t[EventListLength]; + } + + if (EventListLength > 0) { + for (uint32_t I = 0; I < EventListLength; I++) { + { + std::shared_lock Lock(EventList[I]->Mutex); + if (EventList[I]->Completed) + continue; + + // Poll of the host-visible events. + auto HostVisibleEvent = EventList[I]->HostVisibleEvent; + if (FilterEventWaitList && HostVisibleEvent) { + auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, + (HostVisibleEvent->ZeEvent)); + if (Res == ZE_RESULT_SUCCESS) { + // Event has already completed, don't put it into the list + continue; + } + } + } + + auto Queue = EventList[I]->UrQueue; + if (Queue) { + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. Additionally lock the Queue if it + // is different from CurQueue. + // TODO: rework this to avoid deadlock when another thread is + // locking the same queues but in a different order. + auto Lock = ((Queue == CurQueue) + ? std::unique_lock() + : std::unique_lock(Queue->Mutex)); + + // If the event that is going to be waited is in an open batch + // different from where this next command is going to be added, + // then we have to force execute of that open command-list + // to avoid deadlocks. + // + const auto &OpenCommandList = + Queue->eventOpenCommandList(EventList[I]); + if (OpenCommandList != Queue->CommandListMap.end()) { + + if (Queue == CurQueue && + OpenCommandList->second.isCopy(Queue) == UseCopyEngine) { + // Don't force execute the batch yet since the new command + // is going to the same open batch as the dependent event. + } else { + if (auto Res = Queue->executeOpenCommandList( + OpenCommandList->second.isCopy(Queue))) + return Res; + } + } + } else { + // There is a dependency on an interop-event. + // Similarily to the above to avoid dead locks ensure that + // execution of all prior commands in the current command- + // batch is visible to the host. 
This may not be the case + // when we intended to have only last command in the batch + // produce host-visible event, e.g. + // + // event0 = interop event + // event1 = command1 (already in batch, no deps) + // event2 = command2 (is being added, dep on event0) + // event3 = signal host-visible event for the batch + // event1.wait() + // event0.signal() + // + // Make sure that event1.wait() will wait for a host-visible + // event that is signalled before the command2 is enqueued. + if (CurQueue->Device->ZeEventsScope != AllHostVisible) { + CurQueue->executeAllOpenCommandLists(); + } + } + + std::shared_lock Lock(EventList[I]->Mutex); + this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; + this->UrEventList[TmpListLength] = EventList[I]; + TmpListLength += 1; + } + } + + this->Length = TmpListLength; + + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + for (uint32_t I = 0; I < this->Length; I++) { + this->UrEventList[I]->RefCount.increment(); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( + std::list &EventsToBeReleased) { + // acquire a lock before reading the length and list fields. + // Acquire the lock, copy the needed data locally, and reset + // the fields, then release the lock. + // Only then do we do the actual actions to release and destroy, + // holding the lock for the minimum time necessary. + uint32_t LocLength = 0; + ze_event_handle_t *LocZeEventList = nullptr; + ur_event_handle_t *LocPiEventList = nullptr; + + { + // acquire the lock and copy fields locally + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(this->UrZeEventListMutex); + + LocLength = Length; + LocZeEventList = ZeEventList; + LocPiEventList = UrEventList; + + Length = 0; + ZeEventList = nullptr; + UrEventList = nullptr; + + // release lock by ending scope. + } + + for (uint32_t I = 0; I < LocLength; I++) { + // Add the event to be released to the list + EventsToBeReleased.push_back(LocPiEventList[I]); + } + + if (LocZeEventList != nullptr) { + delete[] LocZeEventList; + } + if (LocPiEventList != nullptr) { + delete[] LocPiEventList; + } + + return UR_RESULT_SUCCESS; +} + +// Tells if this event is with profiling capabilities. +bool ur_event_handle_t_::isProfilingEnabled() const { + return !UrQueue || // tentatively assume user events are profiling enabled + (UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 64443b6d5575c..6acbd7459ef83 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -7,8 +7,265 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_queue.hpp" + +extern "C" { +ur_result_t urEventReleaseInternal(ur_event_handle_t Event); +ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, + bool HostVisible, ur_event_handle_t *RetEvent); +} // extern "C" + +// This is an experimental option that allows to disable caching of events in +// the context. 
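// The event-related controls defined next are all initialized with the same
// idiom: an immediately-invoked lambda reads an environment variable once and
// converts it to a typed default. A minimal sketch of the idiom (the variable
// name here is only an example, not one the adapter reads):
#include <cstdlib>

static const bool ExampleControl = [] {
  const char *Flag = std::getenv("EXAMPLE_CONTROL"); // hypothetical variable
  if (!Flag)
    return false;              // default when the variable is not set
  return std::atoi(Flag) != 0; // any non-zero value enables the control
}();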
+const bool DisableEventsCaching = [] { + const char *DisableEventsCachingFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); + if (!DisableEventsCachingFlag) + return false; + return std::stoi(DisableEventsCachingFlag) != 0; +}(); + +// This is an experimental option that allows reset and reuse of uncompleted +// events in the in-order queue with discard_events property. +const bool ReuseDiscardedEvents = [] { + const char *ReuseDiscardedEventsFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); + if (!ReuseDiscardedEventsFlag) + return true; + return std::stoi(ReuseDiscardedEventsFlag) > 0; +}(); + +// Maximum number of events that can be present in an event ZePool is captured +// here. Setting it to 256 gave best possible performance for several +// benchmarks. +const uint32_t MaxNumEventsPerPool = [] { + const auto MaxNumEventsPerPoolEnv = + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + uint32_t Result = + MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + if (Result <= 0) + Result = 256; + return Result; +}(); + +const bool FilterEventWaitList = [] { + const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); + const bool RetVal = Ret ? std::stoi(Ret) : 1; + return RetVal; +}(); + +struct _ur_ze_event_list_t { + // List of level zero events for this event list. + ze_event_handle_t *ZeEventList = {nullptr}; + + // List of pi_events for this event list. + ur_event_handle_t *UrEventList = {nullptr}; + + // length of both the lists. The actual allocation of these lists + // may be longer than this length. This length is the actual number + // of elements in the above arrays that are valid. + uint32_t Length = {0}; + + // A mutex is needed for destroying the event list. + // Creation is already thread-safe because we only create the list + // when an event is initially created. However, it might be + // possible to have multiple threads racing to destroy the list, + // so this will be used to make list destruction thread-safe. + ur_mutex UrZeEventListMutex; + + // Initialize this using the array of events in EventList, and retain + // all the pi_events in the created data structure. + // CurQueue is the pi_queue that the command with this event wait + // list is going to be added to. That is needed to flush command + // batches for wait events that are in other queues. + // UseCopyEngine indicates if the next command (the one that this + // event wait-list is for) is going to go to copy or compute + // queue. This is used to properly submit the dependent open + // command-lists. + ur_result_t createAndRetainUrZeEventList(uint32_t EventListLength, + const ur_event_handle_t *EventList, + ur_queue_handle_t CurQueue, + bool UseCopyEngine); + + // Add all the events in this object's UrEventList to the end + // of the list EventsToBeReleased. Destroy pi_ze_event_list_t data + // structure fields making it look empty. + ur_result_t collectEventsForReleaseAndDestroyPiZeEventList( + std::list &EventsToBeReleased); + + // Had to create custom assignment operator because the mutex is + // not assignment copyable. Just field by field copy of the other + // fields. 
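// Why the hand-written operator= that follows is needed: the contained mutex
// makes the struct non-copy-assignable, so only the data fields are copied and
// each object keeps its own mutex. The same idea in a self-contained form
// (illustrative names only):
#include <mutex>

struct ListWithMutex {
  int *Data = nullptr;
  unsigned Length = 0;
  std::mutex M; // non-copyable, so the default operator= would be deleted

  ListWithMutex &operator=(const ListWithMutex &Other) {
    if (this != &Other) {
      Data = Other.Data;     // shallow-copy only the payload fields,
      Length = Other.Length; // leaving this object's own mutex untouched
    }
    return *this;
  }
};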
+ _ur_ze_event_list_t &operator=(const _ur_ze_event_list_t &other) { + if (this != &other) { + this->ZeEventList = other.ZeEventList; + this->UrEventList = other.UrEventList; + this->Length = other.Length; + } + return *this; + } +}; + +void printZeEventList(const _ur_ze_event_list_t &PiZeEventList); + +struct ur_event_handle_t_ : _ur_object { + ur_event_handle_t_(ze_event_handle_t ZeEvent, + ze_event_pool_handle_t ZeEventPool, + ur_context_handle_t Context, ur_command_t CommandType, + bool OwnZeEvent) + : ZeEvent{ZeEvent}, ZeEventPool{ZeEventPool}, Context{Context}, + CommandType{CommandType}, CommandData{nullptr} { + OwnNativeHandle = OwnZeEvent; + } + + // Level Zero event handle. + ze_event_handle_t ZeEvent; + + // Level Zero event pool handle. + ze_event_pool_handle_t ZeEventPool; -struct _ur_event_handle_t : _ur_object { - _ur_event_handle_t() {} + // In case we use device-only events this holds their host-visible + // counterpart. If this event is itself host-visble then HostVisibleEvent + // points to this event. If this event is not host-visible then this field can + // be: 1) null, meaning that a host-visible event wasn't yet created 2) a PI + // event created internally that host will actually be redirected + // to wait/query instead of this PI event. + // + // The HostVisibleEvent is a reference counted PI event and can be used more + // than by just this one event, depending on the mode (see EventsScope). + // + ur_event_handle_t HostVisibleEvent = {nullptr}; + bool isHostVisible() const { + return this == + const_cast( + reinterpret_cast(HostVisibleEvent)); + } + + // Provide direct access to Context, instead of going via queue. + // Not every PI event has a queue, and we need a handle to Context + // to get to event pool related information. + ur_context_handle_t Context; + + // Keeps the command-queue and command associated with the event. + // These are NULL for the user events. + ur_queue_handle_t UrQueue = {nullptr}; + ur_command_t CommandType; + + // Opaque data to hold any data needed for CommandType. + void *CommandData; + + // Command list associated with the pi_event. + std::optional CommandList; + + // List of events that were in the wait list of the command that will + // signal this event. These events must be retained when the command is + // enqueued, and must then be released when this event has signalled. + // This list must be destroyed once the event has signalled. + _ur_ze_event_list_t WaitList; + + // Tracks if the needed cleanup was already performed for + // a completed event. This allows to control that some cleanup + // actions are performed only once. + // + bool CleanedUp = {false}; + + // Indicates that this PI event had already completed in the sense + // that no other synchromization is needed. Note that the underlying + // L0 event (if any) is not guranteed to have been signalled, or + // being visible to the host at all. + bool Completed = {false}; + + // Indicates that this event is discarded, i.e. it is not visible outside of + // plugin. + bool IsDiscarded = {false}; + + // Besides each PI object keeping a total reference count in + // _ur_object::RefCount we keep special track of the event *external* + // references. This way we are able to tell when the event is not referenced + // externally anymore, i.e. it can't be passed as a dependency event to + // piEnqueue* functions and explicitly waited meaning that we can do some + // optimizations: + // 1. 
For in-order queues we can reset and reuse event even if it was not yet + // completed by submitting a reset command to the queue (since there are no + // external references, we know that nobody can wait this event somewhere in + // parallel thread or pass it as a dependency which may lead to hang) + // 2. We can avoid creating host proxy event. + // This counter doesn't track the lifetime of an event object. Even if it + // reaches zero an event object may not be destroyed and can be used + // internally in the plugin. + std::atomic RefCountExternal{0}; + + bool hasExternalRefs() { return RefCountExternal != 0; } + + // Reset _pi_event object. + ur_result_t reset(); + + // Tells if this event is with profiling capabilities. + bool isProfilingEnabled() const; + + // Get the host-visible event or create one and enqueue its signal. + ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); }; + +// Helper function to implement zeHostSynchronize. +// The behavior is to avoid infinite wait during host sync under ZE_DEBUG. +// This allows for a much more responsive debugging of hangs. +// +template +ze_result_t zeHostSynchronizeImpl(Func Api, T Handle); + +// Template function to do various types of host synchronizations. +// This is intended to be used instead of direct calls to specific +// Level-Zero synchronization APIs. +// +template ze_result_t zeHostSynchronize(T Handle); +template <> ze_result_t zeHostSynchronize(ze_event_handle_t Handle); +template <> ze_result_t zeHostSynchronize(ze_command_queue_handle_t Handle); + +// Perform any necessary cleanup after an event has been signalled. +// This currently makes sure to release any kernel that may have been used by +// the event, updates the last command event in the queue and cleans up all dep +// events of the event. +// If the caller locks queue mutex then it must pass 'true' to QueueLocked. +ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, + bool QueueLocked = false); + +// Get value of device scope events env var setting or default setting +static const EventsScope DeviceEventsSetting = [] { + const char *DeviceEventsSettingStr = + std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); + if (DeviceEventsSettingStr) { + // Override the default if user has explicitly chosen the events scope. + switch (std::stoi(DeviceEventsSettingStr)) { + case 0: + return AllHostVisible; + case 1: + return OnDemandHostVisibleProxy; + case 2: + return LastCommandInBatchHostVisible; + default: + // fallthrough to default setting + break; + } + } + // This is our default setting, which is expected to be the fastest + // with the modern GPU drivers. + return AllHostVisible; +}(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp new file mode 100644 index 0000000000000..2a69a905c8e84 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -0,0 +1,771 @@ +//===--------- ur_level_zero_kernel.cpp - Level Zero Adapter ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "ur_level_zero_kernel.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify + ///< the global and work-group work-items + const size_t + *GlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned + ///< values that specify the offset used to + ///< calculate the global ID of a work-item + const size_t *GlobalWorkSize, ///< [in] pointer to an array of workDim + ///< unsigned values that specify the number + ///< of global work-items in workDim that + ///< will execute the kernel function + const size_t + *LocalWorkSize, ///< [in][optional] pointer to an array of workDim + ///< unsigned values that specify the number of local + ///< work-items forming a work-group that will execute + ///< the kernel function. If nullptr, the runtime + ///< implementation will choose the work-group size. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock( + Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); + if (GlobalWorkOffset != NULL) { + if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) { + urPrint("No global offset extension found on this driver\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL(zeKernelSetGlobalOffsetExp, + (Kernel->ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + GlobalWorkOffset[2])); + } + + // If there are any pending arguments set them now. + for (auto &Arg : Kernel->PendingArguments) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. 
+ char **ZeHandlePtr = nullptr; + if (Arg.Value) { + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, + Queue->Device)); + } + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + } + Kernel->PendingArguments.clear(); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + + // global_work_size of unused dimensions must be set to 1 + UR_ASSERT(WorkDim == 3 || GlobalWorkSize[2] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(WorkDim >= 2 || GlobalWorkSize[1] == 1, + UR_RESULT_ERROR_INVALID_VALUE); + if (LocalWorkSize) { + // L0 + UR_ASSERT(LocalWorkSize[0] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[1] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[2] < std::numeric_limits::max(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[0] = static_cast(LocalWorkSize[0]); + WG[1] = static_cast(LocalWorkSize[1]); + WG[2] = static_cast(LocalWorkSize[2]); + } else { + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (Kernel->ZeKernel, GlobalWorkSize[0], GlobalWorkSize[1], + GlobalWorkSize[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. + uint32_t GroupSize[] = { + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = std::min(size_t(GroupSize[I]), GlobalWorkSize[I]); + while (GlobalWorkSize[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize[I] / GroupSize[I] > UINT32_MAX) { + urPrint("urEnqueueKernelLaunch: can't find a WG size " + "suitable for global work size > UINT32_MAX\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + urPrint("urEnqueueKernelLaunch: using computed WG size = {%d, %d, %d}\n", + WG[0], WG[1], WG[2]); + } + } + + // TODO: assert if sizes do not fit into 32-bit? + + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + static_cast(GlobalWorkSize[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + urPrint("urEnqueueKernelLaunch: unsupported work_dim\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. 
The range is not a " + "multiple of the group size in the 1st dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. The range is not a " + "multiple of the group size in the 2nd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + urPrint("urEnqueueKernelLaunch: invalid work_dim. The range is not a " + "multiple of the group size in the 3rd dimension\n"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, true /* AllowBatching */)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, + CommandList, IsInternal)); + + ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + + // Save the kernel in the event, so that when the event is signalled + // the code can do a piKernelRelease on this kernel. + (*OutEvent)->CommandData = (void *)Kernel; + + // Increment the reference count of the Kernel and indicate that the Kernel is + // in use. Once the event has been signalled, the code in + // CleanupCompletedEvent(Event) will do a piReleaseKernel to update the + // reference count on the kernel, using the kernel saved in CommandData. + UR_CALL(urKernelRetain(Kernel)); + + // Add to list of kernels to be submitted + if (IndirectAccessTrackingEnabled) + Queue->KernelsToBeSubmitted.push_back(Kernel); + + if (Queue->Device->ImmCommandListUsed && IndirectAccessTrackingEnabled) { + // If using immediate commandlists then gathering of indirect + // references and appending to the queue (which means submission) + // must be done together. + std::unique_lock ContextsLock( + Queue->Device->Platform->ContextsMutex, std::defer_lock); + // We are going to submit kernels for execution. If indirect access flag is + // set for a kernel then we need to make a snapshot of existing memory + // allocations in all contexts in the platform. We need to lock the mutex + // guarding the list of contexts in the platform to prevent creation of new + // memory alocations in any context before we submit the kernel for + // execution. + ContextsLock.lock(); + Queue->CaptureIndirectAccesses(); + // Add the command to the command list, which implies submission. + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + } else { + // Add the command to the command list for later submission. + // No lock is needed here, unlike the immediate commandlist case above, + // because the kernels are not actually submitted yet. Kernels will be + // submitted only when the comamndlist is closed. Then, a lock is held. 
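// The ZeThreadGroupDimensions passed to zeCommandListAppendLaunchKernel below
// were computed earlier as GlobalWorkSize / WG, with an exact-divisibility
// check per dimension. A self-contained sketch of that computation:
#include <cstddef>
#include <cstdint>

// Returns false when the global size is not a multiple of the group size in
// some dimension, mirroring the UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE path.
bool computeGroupCounts(const size_t GlobalWorkSize[3], const uint32_t WG[3],
                        uint32_t GroupCount[3]) {
  for (int I = 0; I < 3; ++I) {
    if (WG[I] == 0 || GlobalWorkSize[I] % WG[I] != 0)
      return false;
    GroupCount[I] = static_cast<uint32_t>(GlobalWorkSize[I] / WG[I]);
  }
  return true;
}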
+ ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + } + + urPrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList((*OutEvent)->WaitList); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList, false, true)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + ur_program_handle_t Program, ///< [in] handle of the program containing the + ///< device global variable. + const char + *Name, ///< [in] the unique identifier for the device global variable. + bool BlockingWrite, ///< [in] indicates if this operation should block. + size_t Count, ///< [in] the number of bytes to copy. + size_t Offset, ///< [in] the byte offset into the device global variable to + ///< start copying. + const void *Src, ///< [in] pointer to where the data must be copied from. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list. + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::scoped_lock lock(Queue->Mutex); + + // Find global variable pointer + size_t GlobalVarSize = 0; + void *GlobalVarPtr = nullptr; + ZE2UR_CALL(zeModuleGetGlobalPointer, + (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + if (GlobalVarSize < Offset + Count) { + setErrorMessage("Write device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_UNKNOWN; + } + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper(UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE, Queue, + ur_cast(GlobalVarPtr) + Offset, + BlockingWrite, Count, Src, NumEventsInWaitList, + EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + ur_program_handle_t Program, ///< [in] handle of the program containing the + ///< device global variable. + const char + *Name, ///< [in] the unique identifier for the device global variable. + bool BlockingRead, ///< [in] indicates if this operation should block. + size_t Count, ///< [in] the number of bytes to copy. + size_t Offset, ///< [in] the byte offset into the device global variable to + ///< start copying. + void *Dst, ///< [in] pointer to where the data must be copied to. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list. + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. 
If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + + std::scoped_lock lock(Queue->Mutex); + + // Find global variable pointer + size_t GlobalVarSize = 0; + void *GlobalVarPtr = nullptr; + ZE2UR_CALL(zeModuleGetGlobalPointer, + (Program->ZeModule, Name, &GlobalVarSize, &GlobalVarPtr)); + if (GlobalVarSize < Offset + Count) { + setErrorMessage("Read from device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE); + return UR_RESULT_ERROR_UNKNOWN; + } + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper( + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ, Queue, Dst, BlockingRead, Count, + ur_cast(GlobalVarPtr) + Offset, NumEventsInWaitList, + EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t Program, ///< [in] handle of the program instance + const char *KernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *RetKernel ///< [out] pointer to handle of kernel object created. +) { + std::shared_lock Guard(Program->Mutex); + if (Program->State != ur_program_handle_t_::state::Exe) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } + + ZeStruct ZeKernelDesc; + ZeKernelDesc.flags = 0; + ZeKernelDesc.pKernelName = KernelName; + + ze_kernel_handle_t ZeKernel; + ZE2UR_CALL(zeKernelCreate, (Program->ZeModule, &ZeKernelDesc, &ZeKernel)); + + try { + ur_kernel_handle_t_ *UrKernel = + new ur_kernel_handle_t_(ZeKernel, true, Program); + *RetKernel = reinterpret_cast(UrKernel); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + UR_CALL((*RetKernel)->initialize()); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize, ///< [in] size of argument type + const void + *PArgValue ///< [in] argument value represented as matching arg type. +) { + // OpenCL: "the arg_value pointer can be NULL or point to a NULL value + // in which case a NULL value will be used as the value for the argument + // declared as a pointer to global or constant memory in the kernel" + // + // We don't know the type of the argument but it seems that the only time + // SYCL RT would send a pointer to NULL in 'arg_value' is when the argument + // is a NULL pointer. Treat a pointer to NULL in 'arg_value' as a NULL. 
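// A hedged sketch of the normalization this function performs: a pointer-sized
// argument whose pointed-to value is null is forwarded to the driver as a
// plain null argument (the helper name below is illustrative only).
#include <cstddef>

const void *normalizeKernelArgValue(size_t ArgSize, const void *ArgValue) {
  if (ArgSize == sizeof(void *) && ArgValue &&
      *static_cast<void *const *>(ArgValue) == nullptr)
    return nullptr; // treat a pointer-to-null as a null argument
  return ArgValue;
}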
+ if (ArgSize == sizeof(void *) && PArgValue && + *(void **)(const_cast(PArgValue)) == nullptr) { + PArgValue = nullptr; + } + + std::scoped_lock Guard(Kernel->Mutex); + ZE2UR_CALL(zeKernelSetArgumentValue, + (Kernel->ZeKernel, ArgIndex, ArgSize, PArgValue)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize ///< [in] size of the local buffer to be allocated by the + ///< runtime +) { + std::ignore = Kernel; + std::ignore = ArgIndex; + std::ignore = ArgSize; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_kernel_info_t ParamName, ///< [in] name of the Kernel property to query + size_t PropSize, ///< [in] the size of the Kernel property value. + void *KernelInfo, ///< [in,out][optional] array of bytes holding the kernel + ///< info property. If propSize is not equal to or + ///< greater than the real number of bytes needed to + ///< return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pKernelInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. +) { + + UrL0ReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + switch (ParamName) { + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(ur_context_handle_t{Kernel->Program->Context}); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(ur_program_handle_t{Kernel->Program}); + case UR_KERNEL_INFO_FUNCTION_NAME: + try { + std::string &KernelName = *Kernel->ZeKernelName.operator->(); + return ReturnValue(static_cast(KernelName.c_str())); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->numKernelArgs}); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Kernel->RefCount.load()}); + case UR_KERNEL_INFO_ATTRIBUTES: + try { + uint32_t Size; + ZE2UR_CALL(zeKernelGetSourceAttributes, + (Kernel->ZeKernel, &Size, nullptr)); + char *attributes = new char[Size]; + ZE2UR_CALL(zeKernelGetSourceAttributes, + (Kernel->ZeKernel, &Size, &attributes)); + auto Res = ReturnValue(attributes); + delete[] attributes; + return Res; + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + default: + urPrint("Unsupported ParamName in urKernelGetInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_kernel_group_info_t + ParamName, ///< [in] name of the work Group property to query + size_t + ParamValueSize, ///< [in] size of the Kernel Work Group property value + void *ParamValue, ///< [in,out][optional][range(0, propSize)] value of the + ///< Kernel Work Group property. 
+ size_t *ParamValueSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. +) { + UrL0ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + switch (ParamName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + // TODO: To revisit after level_zero/issues/262 is resolved + struct { + size_t Arr[3]; + } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, + Device->ZeDeviceComputeProperties->maxGroupSizeY, + Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; + return ReturnValue(WorkSize); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + // As of right now, L0 is missing API to query kernel and device specific + // max work group size. + return ReturnValue( + pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + struct { + size_t Arr[3]; + } WgSize = {{Kernel->ZeKernelProperties->requiredGroupSizeX, + Kernel->ZeKernelProperties->requiredGroupSizeY, + Kernel->ZeKernelProperties->requiredGroupSizeZ}}; + return ReturnValue(WgSize); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->localMemSize}); + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + return ReturnValue(size_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize}); + } + default: { + urPrint("Unknown ParamName in urKernelGetGroupInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSubGroupInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the Kernel object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_kernel_sub_group_info_t + PropName, ///< [in] name of the SubGroup property to query + size_t PropSize, ///< [in] size of the Kernel SubGroup property value + void *PropValue, ///< [in,out][range(0, propSize)][optional] value of the + ///< Kernel SubGroup property. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. 
+) { + std::ignore = Device; + + UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); + + std::shared_lock Guard(Kernel->Mutex); + if (PropName == UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxSubgroupSize}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->maxNumSubgroups}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredNumSubGroups}); + } else if (PropName == UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL) { + ReturnValue(uint32_t{Kernel->ZeKernelProperties->requiredSubgroupSize}); + } else { + die("urKernelGetSubGroupInfo: parameter not implemented"); + return {}; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain( + ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to retain +) { + Kernel->RefCount.increment(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRelease( + ur_kernel_handle_t Kernel ///< [in] handle for the Kernel to release +) { + if (!Kernel->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + auto KernelProgram = Kernel->Program; + if (Kernel->OwnNativeHandle) { + auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (Kernel->ZeKernel)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + if (IndirectAccessTrackingEnabled) { + UR_CALL(urContextRelease(KernelProgram->Context)); + } + // do a release on the program this kernel was part of + UR_CALL(urProgramRelease(KernelProgram)); + delete Kernel; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + const void *ArgValue ///< [in][optional] SVM pointer to memory location + ///< holding the argument value. If null then argument + ///< value is considered null. +) { + std::ignore = Kernel; + std::ignore = ArgIndex; + std::ignore = ArgValue; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + size_t ArgSize, ///< [in] size of argument type + const void *ArgValue ///< [in][optional] SVM pointer to memory location + ///< holding the argument value. If null then argument + ///< value is considered null. +) { + UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, ArgSize, ArgValue)); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + ur_kernel_exec_info_t PropName, ///< [in] name of the execution attribute + size_t PropSize, ///< [in] size in byte the attribute value + const void *PropValue ///< [in][range(0, propSize)] pointer to memory + ///< location holding the property value. +) { + std::scoped_lock Guard(Kernel->Mutex); + if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && + *(static_cast(PropValue)) == PI_TRUE) { + // The whole point for users really was to not need to know anything + // about the types of allocations kernel uses. 
So in DPC++ we always + // just set all 3 modes for each kernel. + ze_kernel_indirect_access_flags_t IndirectFlags = + ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; + ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags)); + } else if (PropName == UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) { + ze_cache_config_flag_t ZeCacheConfig{}; + auto CacheConfig = *(static_cast(PropValue)); + if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; + else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) + ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_DATA; + else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT) + ZeCacheConfig = static_cast(0); + else + // Unexpected cache configuration value. + return UR_RESULT_ERROR_INVALID_VALUE; + ZE2UR_CALL(zeKernelSetCacheConfig, (Kernel->ZeKernel, ZeCacheConfig);); + } else { + urPrint("urKernelSetExecInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + ur_sampler_handle_t ArgValue ///< [in] handle of Sampler object. +) { + std::scoped_lock Guard(Kernel->Mutex); + ZE2UR_CALL(zeKernelSetArgumentValue, + (ur_cast(Kernel->ZeKernel), ArgIndex, + sizeof(void *), &ArgValue->ZeSampler)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] + ur_mem_handle_t ArgValue ///< [in][optional] handle of Memory object. +) { + std::scoped_lock Guard(Kernel->Mutex); + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. + + ur_mem_handle_t_ *UrMem = ur_cast(ArgValue); + + auto Arg = UrMem ? UrMem : nullptr; + Kernel->PendingArguments.push_back( + {ArgIndex, sizeof(void *), Arg, ur_mem_handle_t_::read_write}); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel. + ur_native_handle_t + *NativeKernel ///< [out] a pointer to the native handle of the kernel. +) { + std::shared_lock Guard(Kernel->Mutex); + + *NativeKernel = reinterpret_cast(Kernel->ZeKernel); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_kernel_handle_t * + RetKernel ///< [out] pointer to the handle of the kernel object created. +) { + ze_kernel_handle_t ZeKernel = ur_cast(NativeKernel); + ur_kernel_handle_t_ *Kernel = nullptr; + try { + Kernel = new ur_kernel_handle_t_(ZeKernel, + false, // OwnZeKernel + Context); + *RetKernel = reinterpret_cast(Kernel); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + UR_CALL(Kernel->initialize()); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_kernel_handle_t_::initialize() { + // Retain the program and context to show it's used by this kernel. 
+ UR_CALL(urProgramRetain(Program)); + + if (IndirectAccessTrackingEnabled) + // TODO: do piContextRetain without the guard + UR_CALL(urContextRetain(Program->Context)); + + // Set up how to obtain kernel properties when needed. + ZeKernelProperties.Compute = [this](ze_kernel_properties_t &Properties) { + ZE_CALL_NOCHECK(zeKernelGetProperties, (ZeKernel, &Properties)); + }; + + // Cache kernel name. + ZeKernelName.Compute = [this](std::string &Name) { + size_t Size = 0; + ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, nullptr)); + char *KernelName = new char[Size]; + ZE_CALL_NOCHECK(zeKernelGetName, (ZeKernel, &Size, KernelName)); + Name = KernelName; + delete[] KernelName; + }; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in] array of specialization constant value + ///< descriptions +) { + std::ignore = Kernel; + std::ignore = Count; + std::ignore = SpecConstants; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp new file mode 100644 index 0000000000000..db7b87a6f6f82 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.hpp @@ -0,0 +1,97 @@ +//===--------- ur_level_zero_kernel.hpp - Level Zero Adapter ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "ur_level_zero_common.hpp" +#include "ur_level_zero_mem.hpp" +#include + +struct ur_kernel_handle_t_ : _ur_object { + ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle, + ur_program_handle_t Program) + : Program{Program}, ZeKernel{Kernel}, SubmissionsCount{0}, MemAllocs{} { + OwnNativeHandle = OwnZeHandle; + } + + ur_kernel_handle_t_(ze_kernel_handle_t Kernel, bool OwnZeHandle, + ur_context_handle_t Context) + : Context{Context}, ZeKernel{Kernel}, SubmissionsCount{0}, MemAllocs{} { + OwnNativeHandle = OwnZeHandle; + } + + // Keep the program of the kernel. + ur_context_handle_t Context; + + // Keep the program of the kernel. + ur_program_handle_t Program; + + // Level Zero function handle. + ze_kernel_handle_t ZeKernel; + + // Counter to track the number of submissions of the kernel. + // When this value is zero, it means that kernel is not submitted for an + // execution - at this time we can release memory allocations referenced by + // this kernel. We can do this when RefCount turns to 0 but it is too late + // because kernels are cached in the context by SYCL RT and they are released + // only during context object destruction. Regular RefCount is not usable to + // track submissions because user/SYCL RT can retain kernel object any number + // of times. And that's why there is no value of RefCount which can mean zero + // submissions. + std::atomic SubmissionsCount; + + // Returns true if kernel has indirect access, false otherwise. 
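// The ZeCache members declared at the end of this struct are lazily computed:
// initialize() (shown earlier) installs a Compute callback, and the cached
// value is produced on first access. A stand-alone sketch of that idea
// (ZeCache itself is defined in the adapter's common headers; this is only an
// illustration):
#include <functional>
#include <optional>

template <typename T> struct LazyCacheSketch {
  std::function<void(T &)> Compute; // set once, runs on first access
  std::optional<T> Value;

  T &get() {
    if (!Value) {
      Value.emplace(); // default-construct the cached value
      Compute(*Value); // fill it exactly once
    }
    return *Value;
  }
};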
+ bool hasIndirectAccess() { + // Currently indirect access flag is set for all kernels and there is no API + // to check if kernel actually indirectly access smth. + return true; + } + + // Hash function object for the unordered_set below. + struct Hash { + size_t operator()(const std::pair *P) const { + return std::hash()(P->first); + } + }; + + // If kernel has indirect access we need to make a snapshot of all existing + // memory allocations to defer deletion of these memory allocations to the + // moment when kernel execution has finished. + // We store pointers to the elements because pointers are not invalidated by + // insert/delete for std::unordered_map (iterators are invalidated). We need + // to take a snapshot instead of just reference-counting the allocations, + // because picture of active allocations can change during kernel execution + // (new allocations can be added) and we need to know which memory allocations + // were retained by this kernel to release them (and don't touch new + // allocations) at kernel completion. Same kernel may be submitted several + // times and retained allocations may be different at each submission. That's + // why we have a set of memory allocations here and increase ref count only + // once even if kernel is submitted many times. We don't want to know how many + // times and which allocations were retained by each submission. We release + // all allocations in the set only when SubmissionsCount == 0. + std::unordered_set *, Hash> MemAllocs; + + // Completed initialization of PI kernel. Must be called after construction. + ur_result_t initialize(); + + // Keeps info about an argument to the kernel enough to set it with + // zeKernelSetArgumentValue. + struct ArgumentInfo { + uint32_t Index; + size_t Size; + // const ur_mem_handle_t_ *Value; + ur_mem_handle_t_ *Value; + ur_mem_handle_t_::access_mode_t AccessMode{ur_mem_handle_t_::unknown}; + }; + // Arguments that still need to be set (with zeKernelSetArgumentValue) + // before kernel is enqueued. + std::vector PendingArguments; + + // Cache of the kernel properties. + ZeCache> ZeKernelProperties; + ZeCache ZeKernelName; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 080cb2eb5d201..e2b0b597eb2b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -6,7 +6,2530 @@ // //===-----------------------------------------------------------------===// -#include "ur_level_zero_mem.hpp" +#include +#include +#include + +#include "ur_level_zero.hpp" +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_event.hpp" +#include + +// Default to using compute engine for fill operation, but allow to +// override this with an environment variable. +static bool PreferCopyEngine = [] { + const char *Env = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return Env ? std::stoi(Env) != 0 : false; +}(); + +// Helper function to check if a pointer is a device pointer. 
+bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_DEVICE); +} + +// Shared by all memory read/write/copy PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. +ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, + const void *Src, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, + bool PreferCopyEngine) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + urPrint("calling zeCommandListAppendMemoryCopy() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, Dst, Src, Size, ZeEvent, WaitList.Length, + WaitList.ZeEventList)); + + UR_CALL(Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// Shared by all memory read/write/copy rect PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. +ur_result_t enqueueMemCopyRectHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, bool PreferCopyEngine) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + urPrint("calling zeCommandListAppendMemoryCopy() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + uint32_t SrcOriginX = ur_cast(SrcOrigin.x); + uint32_t SrcOriginY = ur_cast(SrcOrigin.y); + uint32_t SrcOriginZ = ur_cast(SrcOrigin.z); + + uint32_t SrcPitch = SrcRowPitch; + if (SrcPitch == 0) + SrcPitch = ur_cast(Region.width); + + if (SrcSlicePitch == 0) + SrcSlicePitch = ur_cast(Region.height) * SrcPitch; + + uint32_t DstOriginX = ur_cast(DstOrigin.x); + uint32_t DstOriginY = ur_cast(DstOrigin.y); + uint32_t DstOriginZ = ur_cast(DstOrigin.z); + + uint32_t DstPitch = DstRowPitch; + if (DstPitch == 0) + DstPitch = ur_cast(Region.width); + + if (DstSlicePitch == 0) + DstSlicePitch = ur_cast(Region.height) * DstPitch; + + uint32_t Width = ur_cast(Region.width); + uint32_t Height = ur_cast(Region.height); + uint32_t Depth = ur_cast(Region.depth); + + const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, + Width, Height, Depth}; + const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, + Width, Height, Depth}; + + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (ZeCommandList, DstBuffer, &ZeDstRegion, DstPitch, DstSlicePitch, + SrcBuffer, &ZeSrcRegion, SrcPitch, SrcSlicePitch, nullptr, + WaitList.Length, WaitList.ZeEventList)); + + urPrint("calling zeCommandListAppendMemoryCopyRegion()\n"); + + ZE2UR_CALL(zeCommandListAppendBarrier, (ZeCommandList, ZeEvent, 0, nullptr)); + + urPrint("calling zeCommandListAppendBarrier() with Event %#llx\n", + ur_cast(ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, Blocking, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// PI interfaces must have queue's and buffer's mutexes locked on entry. +static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Ptr, + const void *Pattern, size_t PatternSize, + size_t Size, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent) { + // Pattern size must be a power of two. + UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), + UR_RESULT_ERROR_INVALID_VALUE); + auto &Device = Queue->Device; + + // Make sure that pattern size matches the capability of the copy queues. + // Check both main and link groups as we don't known which one will be used. + // + if (PreferCopyEngine && Device->hasCopyEngine()) { + if (Device->hasMainCopyEngine() && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeProperties.maxMemoryFillPatternSize < PatternSize) { + PreferCopyEngine = false; + } + if (Device->hasLinkCopyEngine() && + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::LinkCopy] + .ZeProperties.maxMemoryFillPatternSize < PatternSize) { + PreferCopyEngine = false; + } + } + + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + if (!UseCopyEngine) { + // Pattern size must fit the compute queue capabilities. 
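// Aside: the pattern-size rules enforced around this fill path reduce to two
// checks, shown here as a self-contained sketch. The helper names are
// hypothetical, not adapter APIs.
#include <cstddef>

// The fill path above only accepts a non-zero, power-of-two pattern size.
static bool isValidFillPatternSize(size_t PatternSize) {
  return PatternSize > 0 && (PatternSize & (PatternSize - 1)) == 0;
}

// A queue group can only be used when its maxMemoryFillPatternSize is large
// enough; otherwise the code falls back to another engine.
static bool engineCanHandlePattern(size_t PatternSize,
                                   size_t MaxMemoryFillPatternSize) {
  return PatternSize <= MaxMemoryFillPatternSize;
}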
+ UR_ASSERT( + PatternSize <= + Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + } + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + ur_command_list_ptr_t CommandList{}; + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, ZeEvent, + WaitList.Length, WaitList.ZeEventList)); + + urPrint("calling zeCommandListAppendMemoryFill() with\n" + " ZeEvent %#llx\n", + ur_cast(ZeEvent)); + printZeEventList(WaitList); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is enabled then performs reference counting, +// otherwise just calls zeMemAllocHost. +static ur_result_t ZeHostMemAllocHelper(void **ResultPtr, + ur_context_handle_t UrContext, + size_t Size) { + ur_platform_handle_t Plt = UrContext->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. 
+ UR_CALL(urContextRetain(UrContext)); + } + + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (UrContext->ZeContext, &ZeDesc, Size, 1, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + UrContext->MemAllocs.emplace( + std::piecewise_construct, std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple( + reinterpret_cast(UrContext))); + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t getImageRegionHelper(_ur_image *Mem, + ur_rect_offset_t *Origin, + ur_rect_region_t *Region, + ze_image_region_t &ZeRegion) { + UR_ASSERT(Mem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(Origin, UR_RESULT_ERROR_INVALID_VALUE); + + auto UrImage = static_cast<_ur_image *>(Mem); + ze_image_desc_t &ZeImageDesc = UrImage->ZeImageDesc; + +#ifndef NDEBUG + UR_ASSERT(Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && + Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), + UR_RESULT_ERROR_INVALID_VALUE); + + UR_ASSERT(Region->width && Region->height && Region->depth, + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT( + (ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region->height == 1 && + Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_3D), + UR_RESULT_ERROR_INVALID_VALUE); +#endif // !NDEBUG + + uint32_t OriginX = ur_cast(Origin->x); + uint32_t OriginY = ur_cast(Origin->y); + uint32_t OriginZ = ur_cast(Origin->z); + + uint32_t Width = ur_cast(Region->width); + uint32_t Height = ur_cast(Region->height); + uint32_t Depth = ur_cast(Region->depth); + + ZeRegion = {OriginX, OriginY, OriginZ, Width, Height, Depth}; + + return UR_RESULT_SUCCESS; +} + +// Helper function to implement image read/write/copy. +// PI interfaces must have queue's and destination image's mutexes locked for +// exclusive use and source image's mutex locked for shared use on entry. +static ur_result_t enqueueMemImageCommandHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, + const void *Src, // image or ptr + void *Dst, // image or ptr + pi_bool IsBlocking, ur_rect_offset_t *SrcOrigin, + ur_rect_offset_t *DstOrigin, ur_rect_region_t *Region, size_t RowPitch, + size_t SlicePitch, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, + bool PreferCopyEngine = false) { + bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // We want to batch these commands to avoid extra submissions (costly) + bool OkToBatch = true; + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine, OkToBatch)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + if (CommandType == UR_COMMAND_MEM_IMAGE_READ) { + _ur_image *SrcMem = ur_cast<_ur_image *>(const_cast(Src)); + + ze_image_region_t ZeSrcRegion; + UR_CALL(getImageRegionHelper(SrcMem, SrcOrigin, Region, ZeSrcRegion)); + + // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. + std::ignore = RowPitch; + std::ignore = SlicePitch; + UR_ASSERT(SrcMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto SrcImage = SrcMem; + const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; + UR_ASSERT( + RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeSrcRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeSrcRegion.width), + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcMem->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, + (ZeCommandList, Dst, ur_cast(ZeHandleSrc), + &ZeSrcRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); + } else if (CommandType == UR_COMMAND_MEM_IMAGE_WRITE) { + _ur_image *DstMem = ur_cast<_ur_image *>(Dst); + ze_image_region_t ZeDstRegion; + UR_CALL(getImageRegionHelper(DstMem, DstOrigin, Region, ZeDstRegion)); + + // TODO: Level Zero does not support row_pitch/slice_pitch for images yet. + // Check that SYCL RT did not want pitch larger than default. 
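// Aside: the RowPitch/SlicePitch asserts used for image reads and writes in
// this file accept either zero (meaning "use the default") or the tightly
// packed value for the image's pixel size. A self-contained sketch of that
// rule, with hypothetical names:
#include <cstddef>
#include <cstdint>

static bool pitchesMatchPackedImage(size_t RowPitch, size_t SlicePitch,
                                    uint32_t BytesPerPixel, uint32_t Width,
                                    uint32_t Height) {
  const size_t PackedRowPitch = static_cast<size_t>(BytesPerPixel) * Width;
  const bool RowPitchOk = (RowPitch == 0) || (RowPitch == PackedRowPitch);
  const bool SlicePitchOk =
      (SlicePitch == 0) || (SlicePitch == RowPitch * Height);
  return RowPitchOk && SlicePitchOk;
}
// For example, a 2D RGBA float image (16 bytes per pixel) of width W only
// passes with RowPitch == 0 or RowPitch == 16 * W.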
+ UR_ASSERT(DstMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto DstImage = static_cast<_ur_image *>(DstMem); + const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; + UR_ASSERT( + RowPitch == 0 || + // special case RGBA image pitch equal to region's width + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 && + RowPitch == 4 * 4 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 && + RowPitch == 4 * 2 * ZeDstRegion.width) || + (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && + RowPitch == 4 * ZeDstRegion.width), + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, + UR_RESULT_ERROR_INVALID_IMAGE_SIZE); + + char *ZeHandleDst = nullptr; + UR_CALL(DstMem->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, + (ZeCommandList, ur_cast(ZeHandleDst), Src, + &ZeDstRegion, ZeEvent, WaitList.Length, WaitList.ZeEventList)); + } else if (CommandType == UR_COMMAND_MEM_IMAGE_COPY) { + _ur_image *SrcImage = ur_cast<_ur_image *>(const_cast(Src)); + _ur_image *DstImage = ur_cast<_ur_image *>(Dst); + + ze_image_region_t ZeSrcRegion; + UR_CALL(getImageRegionHelper(SrcImage, SrcOrigin, Region, ZeSrcRegion)); + ze_image_region_t ZeDstRegion; + UR_CALL(getImageRegionHelper(DstImage, DstOrigin, Region, ZeDstRegion)); + + char *ZeHandleSrc = nullptr; + char *ZeHandleDst = nullptr; + UR_CALL(SrcImage->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + UR_CALL(DstImage->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + ZE2UR_CALL(zeCommandListAppendImageCopyRegion, + (ZeCommandList, ur_cast(ZeHandleDst), + ur_cast(ZeHandleSrc), &ZeDstRegion, + &ZeSrcRegion, ZeEvent, 0, nullptr)); + } else { + urPrint("enqueueMemImageUpdate: unsupported image command type\n"); + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + UR_CALL(Queue->executeCommandList(CommandList, IsBlocking, OkToBatch)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being read + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + ur_mem_handle_t_ *Src = ur_cast(hBuffer); + + std::shared_lock SrcLock(Src->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, Queue->Mutex); + + char *ZeHandleSrc = nullptr; + UR_CALL(Src->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_READ, Queue, pDst, + blockingRead, size, ZeHandleSrc + offset, + numEventsInWaitList, phEventWaitList, phEvent, + true /* PreferCopyEngine */); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being written + const void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemCopyHelper(UR_COMMAND_MEM_BUFFER_WRITE, Queue, + ZeHandleDst + offset, // dst + blockingWrite, size, + pSrc, // src + numEventsInWaitList, phEventWaitList, phEvent, + true /* PreferCopyEngine */); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOffset, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOffset, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t bufferRowPitch, ///< [in] length of each row in bytes in the buffer + ///< object + size_t bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< buffer object being read + size_t hostRowPitch, ///< [in] length of each row in bytes in the host + ///< memory region pointed by dst + size_t hostSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< host memory region pointed by dst + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::shared_lock SrcLock(Buffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex> LockAll( + SrcLock, Queue->Mutex); + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_READ_RECT, Queue, ZeHandleSrc, pDst, bufferOffset, + hostOffset, region, bufferRowPitch, hostRowPitch, bufferSlicePitch, + hostSlicePitch, blockingRead, numEventsInWaitList, phEventWaitList, + phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t hBuffer, ///< [in] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOffset, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOffset, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t bufferRowPitch, ///< [in] length of each row in bytes in the buffer + ///< object + size_t bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< buffer object being written + size_t hostRowPitch, ///< [in] length of each row in bytes in the host + ///< memory region pointed by src + size_t hostSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< host memory region pointed by src + void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< points to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *phEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + ur_mem_handle_t_ *Buffer = ur_cast(hBuffer); + + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, Queue, + const_cast(static_cast(pSrc)), ZeHandleDst, + hostOffset, bufferOffset, region, hostRowPitch, bufferRowPitch, + hostSlicePitch, bufferSlicePitch, blockingWrite, numEventsInWaitList, + phEventWaitList, phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t BufferSrc, ///< [in] handle of the src buffer object + ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object + size_t SrcOffset, ///< [in] offset into hBufferSrc to begin copying from + size_t DstOffset, ///< [in] offset info hBufferDst to begin copying into + size_t Size, ///< [in] size in bytes of data being copied + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. 
+ ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); + _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); + + UR_ASSERT(!SrcBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!DstBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + char *ZeHandleDst = nullptr; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + return enqueueMemCopyHelper( + UR_COMMAND_MEM_BUFFER_COPY, Queue, ZeHandleDst + DstOffset, + false, // blocking + Size, ZeHandleSrc + SrcOffset, NumEventsInWaitList, EventWaitList, + OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t BufferSrc, ///< [in] handle of the source buffer object + ur_mem_handle_t BufferDst, ///< [in] handle of the dest buffer object + ur_rect_offset_t SrcOrigin, ///< [in] 3D offset in the source buffer + ur_rect_offset_t DstOrigin, ///< [in] 3D offset in the destination buffer + ur_rect_region_t SrcRegion, ///< [in] source 3D rectangular region + ///< descriptor: width, height, depth + size_t SrcRowPitch, ///< [in] length of each row in bytes in the source + ///< buffer object + size_t SrcSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< source buffer object + size_t DstRowPitch, ///< [in] length of each row in bytes in the destination + ///< buffer object + size_t DstSlicePitch, ///< [in] length of each 2D slice in bytes in the + ///< destination buffer object + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + _ur_buffer *SrcBuffer = ur_cast<_ur_buffer *>(BufferSrc); + _ur_buffer *DstBuffer = ur_cast<_ur_buffer *>(BufferDst); + + UR_ASSERT(!SrcBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!DstBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, DstBuffer->Mutex, Queue->Mutex); + + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. 
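// Aside: the engine-selection heuristic used for buffer copies, condensed
// into a single hypothetical helper. This is a sketch of the logic in the
// surrounding code, not an adapter API.
static bool preferCopyEngineForCopy(bool SrcOnHost, bool DstOnHost,
                                    bool ForceCopyEngineForD2DCopy) {
  // Host <-> device transfers tend to benefit from the copy engine, while
  // device-to-device copies usually run faster on the compute engine unless
  // an experimental override asks for the copy engine anyway.
  return SrcOnHost || DstOnHost || ForceCopyEngineForD2DCopy;
}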
+ bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost); + + char *ZeHandleSrc = nullptr; + UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, + Queue->Device)); + char *ZeHandleDst = nullptr; + UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + return enqueueMemCopyRectHelper( + UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, ZeHandleSrc, ZeHandleDst, + SrcOrigin, DstOrigin, SrcRegion, SrcRowPitch, DstRowPitch, SrcSlicePitch, + DstSlicePitch, + false, // blocking + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buffer, ///< [in] handle of the buffer object + const void *Pattern, ///< [in] pointer to the fill pattern + size_t PatternSize, ///< [in] size in bytes of the pattern + size_t Offset, ///< [in] offset into the buffer + size_t Size, ///< [in] fill size in bytes, must be a multiple of patternSize + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + char *ZeHandleDst = nullptr; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper(UR_COMMAND_MEM_BUFFER_FILL, Queue, + ZeHandleDst + Offset, Pattern, PatternSize, Size, + NumEventsInWaitList, EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object + bool BlockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in + ///< the 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + size_t RowPitch, ///< [in] length of each row in bytes + size_t SlicePitch, ///< [in] length of each 2D slice of the 3D image + void *Dst, ///< [in] pointer to host memory where image is to be read into + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. 
+) { + std::scoped_lock Lock(Queue->Mutex, + Image->Mutex); + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_READ, Queue, Image, Dst, BlockingRead, &Origin, + nullptr, &Region, RowPitch, SlicePitch, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Image, ///< [in] handle of the image object + bool + BlockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t Origin, ///< [in] defines the (x,y,z) offset in pixels in + ///< the 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + size_t RowPitch, ///< [in] length of each row in bytes + size_t SlicePitch, ///< [in] length of each 2D slice of the 3D image + void *Src, ///< [in] pointer to host memory where image is to be read into + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock Lock(Queue->Mutex, + Image->Mutex); + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_WRITE, Queue, Src, Image, BlockingWrite, nullptr, + &Origin, &Region, RowPitch, SlicePitch, NumEventsInWaitList, + EventWaitList, OutEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t ImageSrc, ///< [in] handle of the src image object + ur_mem_handle_t ImageDst, ///< [in] handle of the dest image object + ur_rect_offset_t SrcOrigin, ///< [in] defines the (x,y,z) offset in pixels + ///< in the source 1D, 2D, or 3D image + ur_rect_offset_t DstOrigin, ///< [in] defines the (x,y,z) offset in pixels + ///< in the destination 1D, 2D, or 3D image + ur_rect_region_t Region, ///< [in] defines the (width, height, depth) in + ///< pixels of the 1D, 2D, or 3D image + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::shared_lock SrcLock(ImageSrc->Mutex, std::defer_lock); + std::scoped_lock, ur_shared_mutex, + ur_shared_mutex> + LockAll(SrcLock, ImageDst->Mutex, Queue->Mutex); + // Copy engine is preferred only for host to device transfer. + // Device to device transfers run faster on compute engines. + // Images are always allocated on device. 
+ bool PreferCopyEngine = false; + return enqueueMemImageCommandHelper( + UR_COMMAND_MEM_IMAGE_COPY, Queue, ImageSrc, ImageDst, + false, // is_blocking + &SrcOrigin, &DstOrigin, &Region, + 0, // row pitch + 0, // slice pitch + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Buf, ///< [in] handle of the buffer object + bool BlockingMap, ///< [in] indicates blocking (true), non-blocking (false) + ur_map_flags_t MapFlags, ///< [in] flags for read, write, readwrite mapping + size_t Offset, ///< [in] offset in bytes of the buffer region being mapped + size_t Size, ///< [in] size in bytes of the buffer region being mapped + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent, ///< [in,out][optional] return an event object that + ///< identifies this particular command instance. + void **RetMap ///< [in,out] return mapped pointer. TODO: move it before + ///< numEventsInWaitList? +) { + + auto Buffer = ur_cast<_ur_buffer *>(Buf); + + UR_ASSERT(!Buffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + ze_event_handle_t ZeEvent = nullptr; + + bool UseCopyEngine = false; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + UR_CALL( + createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, + Queue->CommandListMap.end(), IsInternal)); + + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } + + // Translate the host access mode info. + ur_mem_handle_t_::access_mode_t AccessMode = ur_mem_handle_t_::unknown; + if (MapFlags & UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION) + AccessMode = ur_mem_handle_t_::write_only; + else { + if (MapFlags & UR_MAP_FLAG_READ) { + AccessMode = ur_mem_handle_t_::read_only; + if (MapFlags & UR_MAP_FLAG_WRITE) + AccessMode = ur_mem_handle_t_::read_write; + } else if (MapFlags & UR_MAP_FLAG_WRITE) + AccessMode = ur_mem_handle_t_::write_only; + } + + UR_ASSERT(AccessMode != ur_mem_handle_t_::unknown, + UR_RESULT_ERROR_INVALID_VALUE); + + // TODO: Level Zero is missing the memory "mapping" capabilities, so we are + // left to doing new memory allocation and a copy (read) on discrete devices. + // For integrated devices, we have allocated the buffer in host memory so no + // actions are needed here except for synchronizing on incoming events. + // A host-to-host copy is done if a host pointer had been supplied during + // buffer creation on integrated devices. + // + // TODO: for discrete, check if the input buffer is already allocated + // in shared memory and thus is accessible from the host as is. + // Can we get SYCL RT to predict/allocate in shared memory + // from the beginning? + + // For integrated devices the buffer has been allocated in host memory. 
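// Aside: a simplified, self-contained sketch of the two mapping strategies
// discussed above. FakeBuffer and mapRegion are illustrative stand-ins, and
// the "device" storage here is ordinary host memory so the example stays
// runnable; the real code uses Level Zero allocations and command lists.
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct FakeBuffer {
  char *Storage; // the buffer's backing allocation
  bool OnHost;   // true when the allocation is already host-accessible
};

static void *mapRegion(FakeBuffer &Buf, size_t Offset, size_t Size,
                       bool HostWillRead) {
  if (Buf.OnHost)
    return Buf.Storage + Offset; // integrated case: hand back a direct pointer

  // Discrete case: stage through a host allocation and copy only if the host
  // is actually going to read the mapped region.
  void *Staging = std::malloc(Size);
  if (Staging && HostWillRead)
    std::memcpy(Staging, Buf.Storage + Offset, Size);
  return Staging; // must be written back and freed at unmap time
}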
+ if (Buffer->OnHost) { + // Wait on incoming events before doing the copy + if (NumEventsInWaitList > 0) + UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + + if (Queue->isInOrderQueue()) + UR_CALL(urQueueFinish(Queue)); + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Guard(Buffer->Mutex); + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + + if (Buffer->MapHostPtr) { + *RetMap = Buffer->MapHostPtr + Offset; + if (ZeHandleSrc != Buffer->MapHostPtr && + AccessMode != ur_mem_handle_t_::write_only) { + memcpy(*RetMap, ZeHandleSrc + Offset, Size); + } + } else { + *RetMap = ZeHandleSrc + Offset; + } + + auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); + // False as the second value in pair means that mapping was not inserted + // because mapping already exists. + if (!Res.second) { + urPrint("urEnqueueMemBufferMap: duplicate mapping detected\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Signal this event + ZE2UR_CALL(zeEventHostSignal, (ZeEvent)); + (*Event)->Completed = true; + return UR_RESULT_SUCCESS; + } + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + if (Buffer->MapHostPtr) { + *RetMap = Buffer->MapHostPtr + Offset; + } else { + // TODO: use USM host allocator here + // TODO: Do we even need every map to allocate new host memory? + // In the case when the buffer is "OnHost" we use single allocation. + UR_CALL(ZeHostMemAllocHelper(RetMap, Queue->Context, Size)); + } + + // Take a shortcut if the host is not going to read buffer's data. + if (AccessMode == ur_mem_handle_t_::write_only) { + (*Event)->Completed = true; + } else { + // For discrete devices we need a command list + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // Add the event to the command list. + CommandList->second.append(reinterpret_cast(*Event)); + (*Event)->RefCount.increment(); + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + char *ZeHandleSrc; + UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, + WaitList.Length, WaitList.ZeEventList)); + + UR_CALL(Queue->executeCommandList(CommandList, BlockingMap)); + } + + auto Res = Buffer->Mappings.insert({*RetMap, {Offset, Size}}); + // False as the second value in pair means that mapping was not inserted + // because mapping already exists. + if (!Res.second) { + urPrint("urEnqueueMemBufferMap: duplicate mapping detected\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_mem_handle_t Mem, ///< [in] handle of the memory (buffer or image) object + void *MappedPtr, ///< [in] mapped host address + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. 
+ ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + UR_ASSERT(!Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + auto Buffer = ur_cast<_ur_buffer *>(Mem); + + bool UseCopyEngine = false; + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, + Queue->CommandListMap.end(), + IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + } + + _ur_buffer::Mapping MapInfo = {}; + { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Guard(Buffer->Mutex); + auto It = Buffer->Mappings.find(MappedPtr); + if (It == Buffer->Mappings.end()) { + urPrint("urEnqueueMemUnmap: unknown memory mapping\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + MapInfo = It->second; + Buffer->Mappings.erase(It); + + // NOTE: we still have to free the host memory allocated/returned by + // piEnqueueMemBufferMap, but can only do so after the above copy + // is completed. Instead of waiting for It here (blocking), we shall + // do so in piEventRelease called for the pi_event tracking the unmap. + // In the case of an integrated device, the map operation does not allocate + // any memory, so there is nothing to free. This is indicated by a nullptr. + (*Event)->CommandData = + (Buffer->OnHost ? nullptr : (Buffer->MapHostPtr ? nullptr : MappedPtr)); + } + + // For integrated devices the buffer is allocated in host memory. + if (Buffer->OnHost) { + // Wait on incoming events before doing the copy + if (NumEventsInWaitList > 0) + UR_CALL(urEventWait(NumEventsInWaitList, EventWaitList)); + + if (Queue->isInOrderQueue()) + UR_CALL(urQueueFinish(Queue)); + + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + std::scoped_lock Guard(Buffer->Mutex); + if (Buffer->MapHostPtr) + memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size); + + // Signal this event + ZE2UR_CALL(zeEventHostSignal, (ZeEvent)); + (*Event)->Completed = true; + return UR_RESULT_SUCCESS; + } + + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); + + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + reinterpret_cast(Queue), CommandList, UseCopyEngine)); + + CommandList->second.append(reinterpret_cast(*Event)); + (*Event)->RefCount.increment(); + + const auto &ZeCommandList = CommandList->first; + + // TODO: Level Zero is missing the memory "mapping" capabilities, so we are + // left to doing copy (write back to the device). + // + // NOTE: Keep this in sync with the implementation of + // piEnqueueMemBufferMap. 
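// Aside: the unmap counterpart of the earlier mapping sketch, again with
// illustrative types and names only. At unmap time a staged host copy is
// written back into the buffer and the staging allocation is freed; a buffer
// that was mapped in place needs no copy.
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct FakeMapping {
  size_t Offset;
  size_t Size;
};

static void unmapRegion(char *BufferStorage, bool MappedInPlace,
                        void *MappedPtr, const FakeMapping &Map) {
  if (!MappedInPlace) {
    std::memcpy(BufferStorage + Map.Offset, MappedPtr, Map.Size); // write back
    std::free(MappedPtr); // release the staging allocation made at map time
  }
  // Event signaling and command-list handling are omitted from this sketch.
}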
+ + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, + MapInfo.Size, ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemset( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + void *Ptr, ///< [in] pointer to USM memory object + int8_t ByteValue, ///< [in] byte value to fill + size_t Count, ///< [in] size in bytes to be set + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t *Event ///< [in,out][optional] return an event object that + ///< identifies this particular command instance. +) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = ByteValue; + std::ignore = Count; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking copy + void *Dst, ///< [in] pointer to the destination USM memory object + const void *Src, ///< [in] pointer to the source USM memory object + size_t Size, ///< [in] size in bytes to be copied + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + std::scoped_lock lock(Queue->Mutex); + + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyHelper( // TODO: do we need a new command type for this? 
+ UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src, + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be fetched + ur_usm_migration_flags_t Flags, ///< [in] USM prefetch flags + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + bool UseCopyEngine = false; + + // Please note that the following code should be run before the + // subsequent getAvailableCommandList() call so that there is no + // dead-lock from waiting unsubmitted events in an open batch. + // The createAndRetainUrZeEventList() has the proper side-effect + // of submitting batches with dependent events. + // + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + // TODO: Change UseCopyEngine argument to 'true' once L0 backend + // support is added + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // TODO: do we need to create a unique command type for this? + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &WaitList = (*Event)->WaitList; + const auto &ZeCommandList = CommandList->first; + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Mem, Size)); + + // TODO: Level Zero does not have a completion "event" with the prefetch API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + UR_CALL(Queue->executeCommandList(CommandList, false)); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemAdvise( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be advised + ur_mem_advice_t Advice, ///< [in] USM memory advice + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular command instance. +) { + // Lock automatically releases when this goes out of scope. 
+ std::scoped_lock lock(Queue->Mutex); + + auto ZeAdvice = ur_cast(Advice); + + bool UseCopyEngine = false; + + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList(0, nullptr, Queue, + UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + // UseCopyEngine is set to 'false' here. + // TODO: Additional analysis is required to check if this operation will + // run faster on copy engines. + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + UseCopyEngine)); + + // TODO: do we need to create a unique command type for this? + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_EXT_COMMAND_TYPE_USER, + CommandList, IsInternal)); + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; + + const auto &ZeCommandList = CommandList->first; + const auto &WaitList = (*Event)->WaitList; + + if (WaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (ZeCommandList, Queue->Device->ZeDevice, Mem, Size, ZeAdvice)); + + // TODO: Level Zero does not have a completion "event" with the advise API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent)); + + Queue->executeCommandList(CommandList, false); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + void *Mem, ///< [in] pointer to memory to be filled. + size_t Pitch, ///< [in] the total width of the destination memory including + ///< padding. + size_t PatternSize, ///< [in] the size in bytes of the pattern. + const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. + size_t Width, ///< [in] the width in bytes of each row to fill. + size_t Height, ///< [in] the height of the columns to fill. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::ignore = Queue; + std::ignore = Mem; + std::ignore = Pitch; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = OutEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemset2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + void *Mem, ///< [in] pointer to memory to be filled. + size_t Pitch, ///< [in] the total width of the destination memory including + ///< padding. + int Value, ///< [in] the value to fill into the region in pMem. + size_t Width, ///< [in] the width in bytes of each row to set. + size_t Height, ///< [in] the height of the columns to set. 
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + std::ignore = Queue; + std::ignore = Mem; + std::ignore = Pitch; + std::ignore = Value; + std::ignore = Width; + std::ignore = Height; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = OutEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t Queue, ///< [in] handle of the queue to submit to. + bool Blocking, ///< [in] indicates if this operation should block the host. + void *Dst, ///< [in] pointer to memory where data will be copied. + size_t DstPitch, ///< [in] the total width of the source memory including + ///< padding. + const void *Src, ///< [in] pointer to memory to be copied. + size_t SrcPitch, ///< [in] the total width of the source memory including + ///< padding. + size_t Width, ///< [in] the width in bytes of each row to be copied. + size_t Height, ///< [in] the height of columns to be copied. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *Event ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + + ur_rect_offset_t ZeroOffset{0, 0, 0}; + ur_rect_region_t Region{Width, Height, 0}; + + std::scoped_lock lock(Queue->Mutex); + + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + + return enqueueMemCopyRectHelper( // TODO: do we need a new command type for + // this? 
+ UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset, + Region, SrcPitch, DstPitch, 0, /*SrcSlicePitch=*/ + 0, /*DstSlicePitch=*/ + Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description + void *Host, ///< [in] pointer to the buffer data + ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created +) { + ze_image_format_type_t ZeImageFormatType; + size_t ZeImageFormatTypeSize; + switch (ImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_FLOAT: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_FLOAT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UINT; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_UNORM; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 32; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SINT; + ZeImageFormatTypeSize = 8; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 16; + break; + } + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8: { + ZeImageFormatType = ZE_IMAGE_FORMAT_TYPE_SNORM; + ZeImageFormatTypeSize = 8; + break; + } + default: + urPrint("urMemImageCreate: unsupported image data type: data type = %d\n", + ImageFormat->channelType); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // TODO: populate the layout mapping + ze_image_format_layout_t ZeImageFormatLayout; + switch (ImageFormat->channelOrder) { + case UR_IMAGE_CHANNEL_ORDER_RGBA: { + switch (ZeImageFormatTypeSize) { + case 8: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8; + break; + case 16: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16; + break; + case 32: + ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; + break; + default: + urPrint("urMemImageCreate: unexpected data type Size\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + break; + } + default: + urPrint("format layout = %d\n", ImageFormat->channelOrder); + die("urMemImageCreate: unsupported image format layout\n"); + break; + } + + ze_image_format_t ZeFormatDesc = { + ZeImageFormatLayout, ZeImageFormatType, + // TODO: are swizzles deducted from image_format->image_channel_order? 
+      ZE_IMAGE_FORMAT_SWIZZLE_R, ZE_IMAGE_FORMAT_SWIZZLE_G,
+      ZE_IMAGE_FORMAT_SWIZZLE_B, ZE_IMAGE_FORMAT_SWIZZLE_A};
+
+  ze_image_type_t ZeImageType;
+  switch (ImageDesc->type) {
+  case UR_MEM_TYPE_IMAGE1D:
+    ZeImageType = ZE_IMAGE_TYPE_1D;
+    break;
+  case UR_MEM_TYPE_IMAGE2D:
+    ZeImageType = ZE_IMAGE_TYPE_2D;
+    break;
+  case UR_MEM_TYPE_IMAGE3D:
+    ZeImageType = ZE_IMAGE_TYPE_3D;
+    break;
+  case UR_MEM_TYPE_IMAGE1D_ARRAY:
+    ZeImageType = ZE_IMAGE_TYPE_1DARRAY;
+    break;
+  case UR_MEM_TYPE_IMAGE2D_ARRAY:
+    ZeImageType = ZE_IMAGE_TYPE_2DARRAY;
+    break;
+  default:
+    urPrint("urMemImageCreate: unsupported image type\n");
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  }
+
+  ZeStruct<ze_image_desc_t> ZeImageDesc;
+  ZeImageDesc.arraylevels = ZeImageDesc.flags = 0;
+  ZeImageDesc.type = ZeImageType;
+  ZeImageDesc.format = ZeFormatDesc;
+  ZeImageDesc.width = ur_cast<uint64_t>(ImageDesc->width);
+  ZeImageDesc.height = ur_cast<uint32_t>(ImageDesc->height);
+  ZeImageDesc.depth = ur_cast<uint32_t>(ImageDesc->depth);
+  ZeImageDesc.arraylevels = ur_cast<uint32_t>(ImageDesc->arraySize);
+  ZeImageDesc.miplevels = ImageDesc->numMipLevel;
+
+  std::shared_lock<ur_shared_mutex> Lock(Context->Mutex);
+
+  // Currently, in a context with multiple root devices, the "0" device owns
+  // the image.
+  // TODO: Implement explicit copying for accessing the image from other
+  // devices in the context.
+  ur_device_handle_t Device = Context->SingleRootDevice
+                                  ? Context->SingleRootDevice
+                                  : Context->Devices[0];
+  ze_image_handle_t ZeImage;
+  ZE2UR_CALL(zeImageCreate,
+             (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage));
+
+  try {
+    auto UrImage =
+        new _ur_image(ur_cast<ur_context_handle_t>(Context), ZeImage);
+    *Mem = reinterpret_cast<ur_mem_handle_t>(UrImage);
+
+#ifndef NDEBUG
+    UrImage->ZeImageDesc = ZeImageDesc;
+#endif // !NDEBUG
+
+    if ((Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0 ||
+        (Flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) != 0) {
+      // Initialize the image synchronously with immediate offload.
+      // zeCommandListAppendImageCopyFromMemory must not be called from
+      // simultaneous threads with the same command list handle, so we need an
+      // exclusive lock.
+      std::scoped_lock<ur_mutex> Lock(Context->ImmediateCommandListMutex);
+      ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory,
+                 (Context->ZeCommandListInit, ZeImage, Host, nullptr, nullptr,
+                  0, nullptr));
+    }
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    ur_mem_flags_t Flags, ///< [in] allocation and usage information flags
+    size_t Size, ///< [in] size in bytes of the memory object to be allocated
+    void *Host,  ///< [in][optional] pointer to the buffer data
+    ur_mem_handle_t
+        *RetBuffer ///< [out] pointer to handle of the memory buffer created
+) {
+  if (Flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
+    // Having PI_MEM_FLAGS_HOST_PTR_ALLOC for a buffer requires allocation of
+    // pinned host memory, see:
+    // sycl/doc/extensions/supported/sycl_ext_oneapi_use_pinned_host_memory_property.asciidoc
+    // We are however missing such functionality in Level Zero, so we just
+    // ignore the flag for now.
+    //
+  }
+
+  // If the USM Import feature is enabled and a hostptr is supplied,
+  // import the hostptr if it is not already imported into USM.
+  // Data transfer rate is maximized when both source and destination
+  // are USM pointers. Promotion of the host pointer to USM thus
+  // optimizes data transfer performance.
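+  // Illustrative usage (hypothetical application code; the names Ctx, HostBuf
+  // and Buf are made up): a regular malloc'ed pointer passed with
+  // UR_MEM_FLAG_USE_HOST_POINTER is a candidate for this import path, e.g.
+  //   void *HostBuf = malloc(Size);
+  //   urMemBufferCreate(Ctx, UR_MEM_FLAG_USE_HOST_POINTER, Size, HostBuf, &Buf);
+  // When USM import is enabled, HostBuf may be promoted to USM host memory so
+  // that later copies between it and device allocations take the faster
+  // USM-to-USM path; the promotion is undone when the buffer is released.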
+ bool HostPtrImported = false; + if (ZeUSMImport.Enabled && Host != nullptr && + (Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0) { + // Query memory type of the host pointer + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Host, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // If not shared of any type, we can import the ptr + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) { + // Promote the host ptr to USM host memory + ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver; + ZeUSMImport.doZeUSMImport(driverHandle, Host, Size); + HostPtrImported = true; + } + } + + _ur_buffer *Buffer = nullptr; + auto HostPtrOrNull = (Flags & UR_MEM_FLAG_USE_HOST_POINTER) + ? reinterpret_cast(Host) + : nullptr; + try { + Buffer = new _ur_buffer(Context, Size, HostPtrOrNull, HostPtrImported); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Initialize the buffer with user data + if (Host) { + if ((Flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0 || + (Flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) != 0) { + + // We don't yet know which device needs this buffer, so make the first + // device in the context be the master, and hold the initial valid + // allocation. + char *ZeHandleDst; + UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Context->Devices[0])); + if (Buffer->OnHost) { + // Do a host to host copy. + // For an imported HostPtr the copy is unneeded. + if (!HostPtrImported) + memcpy(ZeHandleDst, Host, Size); + } else { + // Initialize the buffer synchronously with immediate offload + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (Context->ZeCommandListInit, ZeHandleDst, Host, Size, + nullptr, 0, nullptr)); + } + } else if (Flags == 0 || (Flags == UR_MEM_FLAG_READ_WRITE)) { + // Nothing more to do. + } else + die("urMemBufferCreate: not implemented"); + } + + *RetBuffer = reinterpret_cast(Buffer); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain( + ur_mem_handle_t Mem ///< [in] handle of the memory object to get access +) { + Mem->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( + ur_mem_handle_t Mem ///< [in] handle of the memory object to release +) { + if (!Mem->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + if (Mem->isImage()) { + char *ZeHandleImage; + UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); + auto ZeResult = ZE_CALL_NOCHECK( + zeImageDestroy, (ur_cast(ZeHandleImage))); + // Gracefully handle the case that L0 was already unloaded. 
+ if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } else { + auto Buffer = reinterpret_cast<_ur_buffer *>(Mem); + Buffer->free(); + } + delete Mem; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t + Buffer, ///< [in] handle of the buffer object to allocate from + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + ur_buffer_create_type_t BufferCreateType, ///< [in] buffer creation type + ur_buffer_region_t + *BufferCreateInfo, ///< [in] pointer to buffer create region information + ur_mem_handle_t + *RetMem ///< [out] pointer to the handle of sub buffer created +) { + UR_ASSERT(Buffer && !Buffer->isImage() && + !(static_cast<_ur_buffer *>(Buffer))->isSubBuffer(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + std::shared_lock Guard(Buffer->Mutex); + + if (Flags != UR_MEM_FLAG_READ_WRITE) { + die("urMemBufferPartition: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + + try { + auto partitionedBuffer = + new _ur_buffer(static_cast<_ur_buffer *>(Buffer), + BufferCreateInfo->origin, BufferCreateInfo->size); + *RetMem = reinterpret_cast(partitionedBuffer); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( + ur_mem_handle_t Mem, ///< [in] handle of the mem. + ur_native_handle_t + *NativeMem ///< [out] a pointer to the native handle of the mem. +) { + std::shared_lock Guard(Mem->Mutex); + char *ZeHandle = nullptr; + UR_CALL(Mem->getZeHandle(ZeHandle, ur_mem_handle_t_::read_write)); + *NativeMem = ur_cast(ZeHandle); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_handle_t + *Mem ///< [out] pointer to the handle of the mem object created. +) { + std::shared_lock Lock(Context->Mutex); + + // TODO: Get OwnNativeHandle from the output parameter while we get it in + // interface + bool OwnNativeHandle = (*Mem)->OwnNativeHandle; + + // Get base of the allocation + void *Base = nullptr; + size_t Size = 0; + void *Ptr = ur_cast(NativeMem); + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, &Size)); + UR_ASSERT(Ptr == Base, UR_RESULT_ERROR_INVALID_VALUE); + + ZeStruct ZeMemProps; + ze_device_handle_t ZeDevice = nullptr; + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemProps, &ZeDevice)); + + // Check type of the allocation + switch (ZeMemProps.type) { + case ZE_MEMORY_TYPE_HOST: + case ZE_MEMORY_TYPE_SHARED: + case ZE_MEMORY_TYPE_DEVICE: + break; + case ZE_MEMORY_TYPE_UNKNOWN: + // Memory allocation is unrelated to the context + return UR_RESULT_ERROR_INVALID_CONTEXT; + default: + die("Unexpected memory type"); + } + + ur_device_handle_t Device{}; + if (ZeDevice) { + Device = Context->getPlatform()->getDeviceFromNativeHandle(ZeDevice); + UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_CONTEXT); + } + + _ur_buffer *Buffer = nullptr; + try { + Buffer = new _ur_buffer(Context, Device, Size); + *Mem = reinterpret_cast(Buffer); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + // If we don't own the native handle then we can't control deallocation of + // that memory so there is no point of keeping track of the memory + // allocation for deferred memory release in the mode when indirect access + // tracking is enabled. + if (IndirectAccessTrackingEnabled && OwnNativeHandle) { + // We need to keep track of all memory allocations in the context + ContextsLock.lock(); + // Retain context to be sure that it is released after all memory + // allocations in this context are released. + UR_CALL(urContextRetain(Context)); + + Context->MemAllocs.emplace( + std::piecewise_construct, std::forward_as_tuple(Ptr), + std::forward_as_tuple(Context, + true /*ownNativeHandle, how do we pass it here? or + do we move all this logic to pi2ur? */ + )); + } + + if (Device) { + // If this allocation is on a device, then we re-use it for the buffer. + // Nothing to do. + } else if (Buffer->OnHost) { + // If this is host allocation and buffer always stays on host there + // nothing more to do. + } else { + // In all other cases (shared allocation, or host allocation that cannot + // represent the buffer in this context) copy the data to a newly + // created device allocation. + char *ZeHandleDst; + UR_CALL( + Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Device)); + + // zeCommandListAppendMemoryCopy must not be called from simultaneous + // threads with the same command list handle, so we need exclusive lock. + std::scoped_lock Lock(Context->ImmediateCommandListMutex); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0, + nullptr)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( + ur_mem_handle_t Memory, ///< [in] handle to the memory object being queried. + ur_mem_info_t MemInfoType, ///< [in] type of the info to retrieve. + size_t PropSize, ///< [in] the number of bytes of memory pointed to by + ///< pMemInfo. + void *MemInfo, ///< [out][optional] array of bytes holding the info. + ///< If propSize is less than the real number of bytes + ///< needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pMemInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by pMemInfo. +) { + UR_ASSERT(!Memory->isImage(), UR_RESULT_ERROR_INVALID_VALUE); + + auto Buffer = reinterpret_cast<_ur_buffer *>(Memory); + std::shared_lock Lock(Buffer->Mutex); + UrReturnHelper ReturnValue(PropSize, MemInfo, PropSizeRet); + + switch (MemInfoType) { + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(Buffer->UrContext); + } + case UR_MEM_INFO_SIZE: { + // Get size of the allocation + return ReturnValue(size_t{Buffer->Size}); + } + default: { + die("urMemGetInfo: Parameter is not implemented"); + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( + ur_mem_handle_t Memory, ///< [in] handle to the image object being queried. + ur_image_info_t ImgInfoType, ///< [in] type of image info to retrieve. + size_t PropSize, ///< [in] the number of bytes of memory pointer to by + ///< pImgInfo. + void *ImgInfo, ///< [out][optional] array of bytes holding the info. 
+ ///< If propSize is less than the real number of bytes + ///< needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pImgInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data queried by pImgInfo. +) { + std::ignore = Memory; + std::ignore = ImgInfoType; + std::ignore = PropSize; + std::ignore = ImgInfo; + std::ignore = PropSizeRet; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Align, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM host memory object +) { + std::ignore = Pool; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Align > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_usm_mem_flags_t *USMFlag = &USMDesc->flags; + std::ignore = USMFlag; + + ur_platform_handle_t Plt = Context->getPlatform(); + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Align & (Align - 1)) != 0)) { + ur_usm_mem_flags_t Properties{}; + ur_result_t Res = + USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + // There is a single allocator for Host USM allocations, so we don't need to + // find the allocator depending on context as we do for Shared and Device + // allocations. 
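+  // For contrast (see urUSMDeviceAlloc/urUSMSharedAlloc below), device and
+  // shared allocations go through per-device allocator maps keyed by the
+  // native device handle, roughly:
+  //   auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice);
+  //   *RetMem = It->second.allocate(Size, Alignment);
+  // while all host allocations come from the single HostMemAllocContext used
+  // in the try-block below.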
+ try { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Alignment, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM device memory object +) { + std::ignore = Pool; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_usm_mem_flags_t *USMProp = &USMDesc->flags; + std::ignore = USMProp; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. 
+ ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = + USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + uint32_t Alignment, ///< [in] alignment of the USM memory object + void **RetMem ///< [out] pointer to USM shared memory object +) { + std::ignore = Pool; + + ur_usm_mem_flags_t *Properties = &USMDesc->flags; + + // See if the memory is going to be read-only on the device. + bool DeviceReadOnly = false; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + if (IndirectAccessTrackingEnabled) { + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. 
+ ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, Properties, + Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (DeviceReadOnly) { + Context->SharedReadOnlyAllocs.insert(*RetMem); + } + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t Context, ///< [in] handle of the context object + void *Mem ///< [in] pointer to USM memory object +) { + ur_platform_handle_t Plt = Context->getPlatform(); + + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + return USMFreeHelper(Context, Mem); +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( + ur_context_handle_t Context, ///< [in] handle of the context object + const void *Ptr, ///< [in] pointer to USM memory object + ur_usm_alloc_info_t + PropName, ///< [in] the name of the USM allocation property to query + size_t PropValueSize, ///< [in] size in bytes of the USM allocation property + ///< value + void *PropValue, ///< [out][optional] value of the USM allocation property + size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM + ///< allocation property +) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + switch (PropName) { + case UR_USM_ALLOC_INFO_TYPE: { + pi_usm_type MemAllocaType; + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + MemAllocaType = PI_MEM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + MemAllocaType = PI_MEM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + MemAllocaType = PI_MEM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + MemAllocaType = PI_MEM_TYPE_SHARED; + break; + default: + urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(MemAllocaType); + } + case UR_USM_ALLOC_INFO_DEVICE: + if (ZeDeviceHandle) { + auto Platform = Context->getPlatform(); + auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + return Device ? 
ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + case UR_USM_ALLOC_INFO_BASE_PTR: { + void *Base; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); + return ReturnValue(Base); + } + case UR_USM_ALLOC_INFO_SIZE: { + size_t Size; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); + return ReturnValue(Size); + } + default: + urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + return UR_RESULT_SUCCESS; +} + +void *USMMemoryAllocBase::allocate(size_t Size) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + + return Ptr; +} + +void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, Alignment); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + return Ptr; +} + +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } +} + +ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, + size_t Size, + uint32_t Alignment) { + ur_usm_mem_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, + Alignment); +} + +ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); +} + +enum class USMAllocationForceResidencyType { + // [Default] Do not force memory residency at allocation time. + None = 0, + // Force memory resident on the device of allocation at allocation time. + // For host allocation force residency on all devices in a context. + Device = 1, + // Force memory resident on all devices in the context with P2P + // access to the device of allocation. + // For host allocation force residency on all devices in a context. 
+ P2PDevices = 2 +}; + +// Returns the desired USM residency setting +static USMAllocationForceResidencyType USMAllocationForceResidency = [] { + const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + if (!Str) + return USMAllocationForceResidencyType::None; + switch (std::atoi(Str)) { + case 1: + return USMAllocationForceResidencyType::Device; + case 2: + return USMAllocationForceResidencyType::P2PDevices; + default: + return USMAllocationForceResidencyType::None; + }; +}(); + +// Make USM allocation resident as requested +static ur_result_t USMAllocationMakeResident( + ur_context_handle_t Context, + ur_device_handle_t Device, // nullptr for host allocation + void *Ptr, size_t Size) { + + std::list Devices; + + if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) + return UR_RESULT_SUCCESS; + else if (!Device) { + // Host allocation, make it resident on all devices in the context + Devices.insert(Devices.end(), Context->Devices.begin(), + Context->Devices.end()); + } else { + Devices.push_back(Device); + if (USMAllocationForceResidency == + USMAllocationForceResidencyType::P2PDevices) { + ze_bool_t P2P; + for (const auto &D : Context->Devices) { + if (D == Device) + continue; + // TODO: Cache P2P devices for a context + ZE2UR_CALL(zeDeviceCanAccessPeer, + (D->ZeDevice, Device->ZeDevice, &P2P)); + if (P2P) + Devices.push_back(D); + } + } + } + for (const auto &D : Devices) { + ZE2UR_CALL(zeContextMakeMemoryResident, + (Context->ZeContext, D->ZeDevice, Ptr, Size)); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = PoolDesc; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, + Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + return UR_RESULT_SUCCESS; +} + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_mem_flags_t *, + size_t Size, uint32_t Alignment) { + + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + 
ZeHostDesc.flags = 0; + ZeStruct ZeDevDesc; + ZeDevDesc.flags = 0; + ZeDevDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDevDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, + Size, Alignment, Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + + // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. + return UR_RESULT_SUCCESS; +} + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + ContextsLock.lock(); + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + + return UR_RESULT_SUCCESS; +} bool ShouldUseUSMAllocator() { // Enable allocator by default if it's not explicitly disabled @@ -15,4 +2538,535 @@ bool ShouldUseUSMAllocator() { const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return Ret == nullptr; } -const bool UseUSMAllocator = ShouldUseUSMAllocator(); \ No newline at end of file + +const bool UseUSMAllocator = ShouldUseUSMAllocator(); + +// Helper function to deallocate USM memory, if indirect access support is +// enabled then a caller must lock the platform-level mutex guarding the +// container with contexts because deallocating the memory can turn RefCount of +// a context to 0 and as a result the context being removed from the list of +// tracked contexts. +// If indirect access tracking is not enabled then caller must lock Context +// mutex. +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle) { + if (!OwnZeMemHandle) { + // Memory should not be freed + return UR_RESULT_SUCCESS; + } + + if (IndirectAccessTrackingEnabled) { + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. 
+ return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + if (!UseUSMAllocator) { + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; + } + + // Query the device of the allocation to determine the right allocator context + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer we're freeing to determine the correct + // way to do it(directly or via an allocator) + auto ZeResult = + ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // Handle the case that L0 RT was already unloaded + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } else if (ZeResult) { + return ze2urResult(ZeResult); + } + + // If memory type is host release from host pool + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { + try { + Context->HostMemAllocContext->deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } + + // Points out an allocation in SharedReadOnlyMemAllocContexts + auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); + + if (!ZeDeviceHandle) { + // The only case where it is OK not have device identified is + // if the memory is not known to the driver. We should not ever get + // this either, probably. + UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_DEVICE); + } else { + ur_device_handle_t Device; + // All context member devices or their descendants are of the same platform. + auto Platform = Context->getPlatform(); + Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); + + auto DeallocationHelper = + [Context, Device, + Ptr](std::unordered_map + &AllocContextMap) { + try { + auto It = AllocContextMap.find(Device->ZeDevice); + if (It == AllocContextMap.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + // The right context is found, deallocate the pointer + It->second.deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + }; + + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_SHARED: + // Distinguish device_read_only allocations since they have own pool. + SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); + return DeallocationHelper(SharedReadOnlyAllocsIterator != + Context->SharedReadOnlyAllocs.end() + ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + case ZE_MEMORY_TYPE_DEVICE: + return DeallocationHelper(Context->DeviceMemAllocContexts); + default: + // Handled below + break; + } + } + + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { + Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; +} + +// If indirect access tracking is enabled then performs reference counting, +// otherwise just calls zeMemAllocDevice. +static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, + ur_context_handle_t Context, + ur_device_handle_t Device, + size_t Size) { + ur_platform_handle_t Plt = Device->Platform; + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + // Lock the mutex which is guarding contexts container in the platform. + // This prevents new kernels from being submitted in any context while + // we are in the process of allocating a memory, this is needed to + // properly capture allocations by kernels with indirect access. + ContextsLock.lock(); + // We are going to defer memory release if there are kernels with + // indirect access, that is why explicitly retain context to be sure + // that it is released after all memory allocations in this context are + // released. + UR_CALL(urContextRetain(Context)); + } + + ze_device_mem_alloc_desc_t ZeDesc = {}; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, 1, + Device->ZeDevice, ResultPtr)); + + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*ResultPtr), + std::forward_as_tuple(Context)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, + ur_device_handle_t Device) { + + // NOTE: There might be no valid allocation at all yet and we get + // here from piEnqueueKernelLaunch that would be doing the buffer + // initialization. In this case the Device is not null as kernel + // launch is always on a specific device. + if (!Device) + Device = LastDeviceWithValidAllocation; + // If the device is still not selected then use the first one in + // the context of the buffer. + if (!Device) + Device = UrContext->Devices[0]; + + auto &Allocation = Allocations[Device]; + + // Sub-buffers don't maintain own allocations but rely on parent buffer. + if (isSubBuffer()) { + UR_CALL(SubBuffer.Parent->getZeHandle(ZeHandle, AccessMode, Device)); + ZeHandle += SubBuffer.Origin; + // Still store the allocation info in the PI sub-buffer for + // getZeHandlePtr to work. At least zeKernelSetArgumentValue needs to + // be given a pointer to the allocation handle rather than its value. + // + Allocation.ZeHandle = ZeHandle; + Allocation.ReleaseAction = allocation_t::keep; + LastDeviceWithValidAllocation = Device; + return UR_RESULT_SUCCESS; + } + + // First handle case where the buffer is represented by only + // a single host allocation. + if (OnHost) { + auto &HostAllocation = Allocations[nullptr]; + // The host allocation may already exists, e.g. with imported + // host ptr, or in case of interop buffer. 
+    if (!HostAllocation.ZeHandle) {
+      if (USMAllocatorConfigInstance.EnableBuffers) {
+        HostAllocation.ReleaseAction = allocation_t::free;
+        ur_usm_desc_t USMDesc{};
+        ur_usm_pool_handle_t Pool{};
+        UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, getAlignment(),
+                               reinterpret_cast<void **>(&ZeHandle)));
+      } else {
+        HostAllocation.ReleaseAction = allocation_t::free_native;
+        UR_CALL(ZeHostMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
+                                     UrContext, Size));
+      }
+      HostAllocation.ZeHandle = ZeHandle;
+      HostAllocation.Valid = true;
+    }
+    Allocation = HostAllocation;
+    Allocation.ReleaseAction = allocation_t::keep;
+    ZeHandle = Allocation.ZeHandle;
+    LastDeviceWithValidAllocation = Device;
+    return UR_RESULT_SUCCESS;
+  }
+  // Reads the user setting on how to deal with buffers in contexts where
+  // all devices have the same root-device. Returns "true" if the
+  // preference is to allocate on each [sub-]device and migrate
+  // normally (copy) to other sub-devices as needed. Returns "false"
+  // if the preference is to have single root-device allocations
+  // serve the needs of all [sub-]devices, meaning potentially more
+  // cross-tile traffic.
+  //
+  static const bool SingleRootDeviceBufferMigration = [] {
+    const char *EnvStr =
+        std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION");
+    if (EnvStr)
+      return (std::stoi(EnvStr) != 0);
+    // The default is to migrate normally, which may not always be the
+    // best option (depends on buffer access patterns), but is an
+    // overall win on the set of the available benchmarks.
+    return true;
+  }();
+
+  // Perform the actual device allocation as needed.
+  if (!Allocation.ZeHandle) {
+    if (!SingleRootDeviceBufferMigration && UrContext->SingleRootDevice &&
+        UrContext->SingleRootDevice != Device) {
+      // If all devices in the context are sub-devices of the same device
+      // then we reuse the root-device allocation for all sub-devices in the
+      // context.
+      // TODO: we can probably generalize this and share root-device
+      // allocations with their own sub-devices even if not all other
+      // devices in the context have the same root.
+      UR_CALL(getZeHandle(ZeHandle, AccessMode, UrContext->SingleRootDevice));
+      Allocation.ReleaseAction = allocation_t::keep;
+      Allocation.ZeHandle = ZeHandle;
+      Allocation.Valid = true;
+      return UR_RESULT_SUCCESS;
+    } else { // Create device allocation
+      if (USMAllocatorConfigInstance.EnableBuffers) {
+        Allocation.ReleaseAction = allocation_t::free;
+        ur_usm_desc_t USMDesc{};
+        ur_usm_pool_handle_t Pool{};
+        UR_CALL(urUSMDeviceAlloc(UrContext, Device, &USMDesc, Pool, Size,
+                                 getAlignment(),
+                                 reinterpret_cast<void **>(&ZeHandle)));
+      } else {
+        Allocation.ReleaseAction = allocation_t::free_native;
+        UR_CALL(ZeDeviceMemAllocHelper(reinterpret_cast<void **>(&ZeHandle),
+                                       UrContext, Device, Size));
+      }
+    }
+    Allocation.ZeHandle = ZeHandle;
+  } else {
+    ZeHandle = Allocation.ZeHandle;
+  }
+
+  // If some prior access invalidated this allocation then make it valid again.
+  if (!Allocation.Valid) {
+    // LastDeviceWithValidAllocation should always have a valid allocation.
+    if (Device == LastDeviceWithValidAllocation)
+      die("getZeHandle: last used allocation is not valid");
+
+    // For write-only access the allocation contents are not going to be used,
+    // so don't do anything to make them "valid".
+    bool NeedCopy = AccessMode != ur_mem_handle_t_::write_only;
+    // It's also possible that the buffer doesn't have a valid allocation
+    // yet, presumably when it is passed to a kernel that will perform
+    // its initialization.
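+    // Worked example (hypothetical devices A and B in the same context): the
+    // buffer is first written by a kernel on A, so A holds the only valid
+    // allocation. A later read_write access on B takes this path: NeedCopy is
+    // true and the data is copied from A's allocation either directly (if
+    // zeDeviceCanAccessPeer reports P2P access) or staged through the host
+    // allocation; B then becomes LastDeviceWithValidAllocation and A's copy
+    // is invalidated further below.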
+    if (NeedCopy && !LastDeviceWithValidAllocation) {
+      NeedCopy = false;
+    }
+    char *ZeHandleSrc = nullptr;
+    if (NeedCopy) {
+      UR_CALL(getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
+                          LastDeviceWithValidAllocation));
+      // It's possible with single root-device contexts that
+      // the buffer is represented by the single root-device
+      // allocation, in which case we skip the copy to itself.
+      if (ZeHandleSrc == ZeHandle)
+        NeedCopy = false;
+    }
+
+    if (NeedCopy) {
+      // Copy valid buffer data to this allocation.
+      // TODO: see if we should better use the peer's device allocation
+      // directly, if that capability is reported with zeDeviceCanAccessPeer,
+      // instead of maintaining a separate allocation and performing
+      // explicit copies.
+      //
+      // zeCommandListAppendMemoryCopy must not be called from simultaneous
+      // threads with the same command list handle, so we need an exclusive
+      // lock.
+      ze_bool_t P2P = false;
+      ZE2UR_CALL(
+          zeDeviceCanAccessPeer,
+          (Device->ZeDevice, LastDeviceWithValidAllocation->ZeDevice, &P2P));
+      if (!P2P) {
+        // P2P copy is not possible, so copy through the host.
+        auto &HostAllocation = Allocations[nullptr];
+        // The host allocation may already exist, e.g. with an imported
+        // host ptr, or in case of an interop buffer.
+        if (!HostAllocation.ZeHandle) {
+          void *ZeHandleHost;
+          if (USMAllocatorConfigInstance.EnableBuffers) {
+            HostAllocation.ReleaseAction = allocation_t::free;
+            ur_usm_desc_t USMDesc{};
+            ur_usm_pool_handle_t Pool{};
+            UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size,
+                                   getAlignment(), &ZeHandleHost));
+          } else {
+            HostAllocation.ReleaseAction = allocation_t::free_native;
+            UR_CALL(ZeHostMemAllocHelper(&ZeHandleHost, UrContext, Size));
+          }
+          HostAllocation.ZeHandle = reinterpret_cast<char *>(ZeHandleHost);
+          HostAllocation.Valid = false;
+        }
+        std::scoped_lock<ur_mutex> Lock(UrContext->ImmediateCommandListMutex);
+        if (!HostAllocation.Valid) {
+          ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                     (UrContext->ZeCommandListInit, HostAllocation.ZeHandle,
+                      ZeHandleSrc, Size, nullptr, 0, nullptr));
+          // Mark the host allocation data as valid so it can be reused.
+          // It will be invalidated below if the current access is not
+          // read-only.
+          HostAllocation.Valid = true;
+        }
+        ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                   (UrContext->ZeCommandListInit, ZeHandle,
+                    HostAllocation.ZeHandle, Size, nullptr, 0, nullptr));
+      } else {
+        // Perform P2P copy.
+        std::scoped_lock<ur_mutex> Lock(UrContext->ImmediateCommandListMutex);
+        ZE2UR_CALL(zeCommandListAppendMemoryCopy,
+                   (UrContext->ZeCommandListInit, ZeHandle, ZeHandleSrc, Size,
+                    nullptr, 0, nullptr));
+      }
+    }
+    Allocation.Valid = true;
+    LastDeviceWithValidAllocation = Device;
+  }
+
+  // Invalidate other allocations that would become not valid if
+  // this access is not read-only.
+  if (AccessMode != ur_mem_handle_t_::read_only) {
+    for (auto &Alloc : Allocations) {
+      if (Alloc.first != LastDeviceWithValidAllocation)
+        Alloc.second.Valid = false;
+    }
+  }
+
+  urPrint("getZeHandle(pi_device{%p}) = %p\n", (void *)Device,
+          (void *)Allocation.ZeHandle);
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t _ur_buffer::free() {
+  for (auto &Alloc : Allocations) {
+    auto &ZeHandle = Alloc.second.ZeHandle;
+    // It is possible that the real allocation wasn't made if the buffer
+    // wasn't really used in this location.
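+    // For reference, the ReleaseAction values handled below correspond to how
+    // the allocation was created: keep (sub-buffer or non-owned interop
+    // handle), unimport (host pointer imported in urMemBufferCreate), free
+    // (pooled USM allocation from urUSMHostAlloc/urUSMDeviceAlloc) and
+    // free_native (direct ZeHostMemAllocHelper/ZeDeviceMemAllocHelper
+    // allocation).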
+ if (!ZeHandle) + continue; + + switch (Alloc.second.ReleaseAction) { + case allocation_t::keep: + break; + case allocation_t::free: { + ur_platform_handle_t Plt = UrContext->getPlatform(); + std::scoped_lock Lock(IndirectAccessTrackingEnabled + ? Plt->ContextsMutex + : UrContext->Mutex); + + UR_CALL(USMFreeHelper(reinterpret_cast(UrContext), + ZeHandle)); + break; + } + case allocation_t::free_native: + UR_CALL(ZeMemFreeHelper(UrContext, ZeHandle)); + break; + case allocation_t::unimport: + ZeUSMImport.doZeUSMRelease(UrContext->getPlatform()->ZeDriver, ZeHandle); + break; + default: + die("_ur_buffer::free(): Unhandled release action"); + } + ZeHandle = nullptr; // don't leave hanging pointers + } + return UR_RESULT_SUCCESS; +} + +// Buffer constructor +_ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size, char *HostPtr, + bool ImportedHostPtr = false) + : ur_mem_handle_t_(Context), Size(Size), SubBuffer{nullptr, 0} { + + // We treat integrated devices (physical memory shared with the CPU) + // differently from discrete devices (those with distinct memories). + // For integrated devices, allocating the buffer in the host memory + // enables automatic access from the device, and makes copying + // unnecessary in the map/unmap operations. This improves performance. + OnHost = Context->Devices.size() == 1 && + Context->Devices[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED; + + // Fill the host allocation data. + if (HostPtr) { + MapHostPtr = HostPtr; + // If this host ptr is imported to USM then use this as a host + // allocation for this buffer. + if (ImportedHostPtr) { + Allocations[nullptr].ZeHandle = HostPtr; + Allocations[nullptr].Valid = true; + Allocations[nullptr].ReleaseAction = _ur_buffer::allocation_t::unimport; + } + } + + // This initialization does not end up with any valid allocation yet. + LastDeviceWithValidAllocation = nullptr; +} + +_ur_buffer::_ur_buffer(ur_context_handle_t Context, ur_device_handle_t Device, + size_t Size) + : ur_mem_handle_t_(Context, Device), Size(Size) {} + +// Interop-buffer constructor +_ur_buffer::_ur_buffer(ur_context_handle_t Context, size_t Size, + ur_device_handle_t Device, char *ZeMemHandle, + bool OwnZeMemHandle) + : ur_mem_handle_t_(Context, Device), Size(Size), SubBuffer{nullptr, 0} { + + // Device == nullptr means host allocation + Allocations[Device].ZeHandle = ZeMemHandle; + Allocations[Device].Valid = true; + Allocations[Device].ReleaseAction = + OwnZeMemHandle ? allocation_t::free_native : allocation_t::keep; + + // Check if this buffer can always stay on host + OnHost = false; + if (!Device) { // Host allocation + if (Context->Devices.size() == 1 && + Context->Devices[0]->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) { + OnHost = true; + MapHostPtr = ZeMemHandle; // map to this allocation + } + } + LastDeviceWithValidAllocation = Device; +} + +ur_result_t _ur_buffer::getZeHandlePtr(char **&ZeHandlePtr, + access_mode_t AccessMode, + ur_device_handle_t Device) { + char *ZeHandle; + UR_CALL(getZeHandle(ZeHandle, AccessMode, Device)); + ZeHandlePtr = &Allocations[Device].ZeHandle; + return UR_RESULT_SUCCESS; +} + +size_t _ur_buffer::getAlignment() const { + // Choose an alignment that is at most 64 and is the next power of 2 + // for sizes less than 64. 
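+  // For example, a 5-byte buffer gets an 8-byte alignment, a 100-byte buffer
+  // gets the 64-byte maximum, and a 1-byte buffer keeps alignment 1.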
+ auto Alignment = Size; + if (Alignment > 32UL) + Alignment = 64UL; + else if (Alignment > 16UL) + Alignment = 32UL; + else if (Alignment > 8UL) + Alignment = 16UL; + else if (Alignment > 4UL) + Alignment = 8UL; + else if (Alignment > 2UL) + Alignment = 4UL; + else if (Alignment > 1UL) + Alignment = 2UL; + else + Alignment = 1UL; + return Alignment; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + void *Ptr, ///< [in] pointer to USM memory object + size_t PatternSize, ///< [in] the size in bytes of the pattern. Must be a + ///< power of 2 and less than or equal to width. + const void *Pattern, ///< [in] pointer with the bytes of the pattern to set. + size_t Size, ///< [in] size in bytes to be set. Must be a multiple of + ///< patternSize. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t *Event ///< [out][optional] return an event object that + ///< identifies this particular command instance. +) { + std::ignore = Queue; + std::ignore = Ptr; + std::ignore = PatternSize; + std::ignore = Pattern; + std::ignore = Size; + std::ignore = NumEventsInWaitList; + std::ignore = EventWaitList; + std::ignore = Event; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index f8b2231909604..56b0c4a9dbaa6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -8,7 +8,296 @@ #pragma once #include "ur_level_zero_common.hpp" +#include +#include +#include +#include +#include +#include +#include -struct _ur_mem_handle_t : _ur_object { - _ur_mem_handle_t() {} +#include +#include +#include +#include +#include + +#include "ur_level_zero.hpp" + +struct ur_device_handle_t_; + +bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); + +// This is an experimental option to test performance of device to device copy +// operations on copy engines (versus compute engine) +const bool UseCopyEngineForD2DCopy = [] { + const char *CopyEngineForD2DCopy = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); +}(); + +// Shared by all memory read/write/copy PI interfaces. +// PI interfaces must have queue's and destination buffer's mutexes locked for +// exclusive use and source buffer's mutex locked for shared use on entry. 
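+//
+// Illustrative call-site pattern (a sketch only; the lock template arguments
+// assume the ur_shared_mutex type this adapter uses for object mutexes):
+//   std::scoped_lock<ur_shared_mutex, ur_shared_mutex> Lock(Queue->Mutex,
+//                                                           DstBuffer->Mutex);
+//   std::shared_lock<ur_shared_mutex> SrcLock(SrcBuffer->Mutex);
+//   UR_CALL(enqueueMemCopyHelper(CommandType, Queue, Dst,
+//                                /*BlockingWrite=*/false, Size, Src,
+//                                NumEventsInWaitList, EventWaitList, OutEvent,
+//                                PreferCopyEngine));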
+ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, + ur_queue_handle_t Queue, void *Dst, + pi_bool BlockingWrite, size_t Size, + const void *Src, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, + bool PreferCopyEngine); + +ur_result_t enqueueMemCopyRectHelper( + ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, + void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, + ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, + size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); + +ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr); + +// Exception type to pass allocation errors +class UsmAllocationException { + const ur_result_t Error; + +public: + UsmAllocationException(ur_result_t Err) : Error{Err} {} + ur_result_t getError() const { return Error; } }; + +struct ur_mem_handle_t_ : _ur_object { + // Keeps the PI context of this memory handle. + ur_context_handle_t UrContext; + + // Keeps device of this memory handle + ur_device_handle_t UrDevice; + + // Enumerates all possible types of accesses. + enum access_mode_t { unknown, read_write, read_only, write_only }; + + // Interface of the _ur_mem object + + // Get the Level Zero handle of the current memory object + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t Device = nullptr) = 0; + + // Get a pointer to the Level Zero handle of the current memory object + virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t Device = nullptr) = 0; + + // Method to get type of the derived object (image or buffer) + virtual bool isImage() const = 0; + + virtual ~ur_mem_handle_t_() = default; + +protected: + ur_mem_handle_t_(ur_context_handle_t Context) : UrContext{Context} {} + + ur_mem_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) + : UrContext{Context}, UrDevice(Device) {} +}; + +struct _ur_buffer final : ur_mem_handle_t_ { + // Buffer constructor + _ur_buffer(ur_context_handle_t Context, ur_device_handle_t UrDevice, + size_t Size); + + _ur_buffer(ur_context_handle_t Context, size_t Size, char *HostPtr, + bool ImportedHostPtr); + + // Sub-buffer constructor + _ur_buffer(_ur_buffer *Parent, size_t Origin, size_t Size) + : ur_mem_handle_t_(Parent->UrContext), Size(Size), + SubBuffer{Parent, Origin} {} + + // Interop-buffer constructor + _ur_buffer(ur_context_handle_t Context, size_t Size, + ur_device_handle_t Device, char *ZeMemHandle, bool OwnZeMemHandle); + + // Returns a pointer to the USM allocation representing this PI buffer + // on the specified Device. If Device is nullptr then the returned + // USM allocation is on the device where this buffer was used the latest. + // The returned allocation is always valid, i.e. its contents is + // up-to-date and any data copies needed for that are performed under + // the hood. + // + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t Device = nullptr) override; + virtual ur_result_t + getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t Device = nullptr) override; + + bool isImage() const override { return false; } + + bool isSubBuffer() const { return SubBuffer.Parent != nullptr; } + + // Frees all allocations made for the buffer. 
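To make the getZeHandle contract above concrete: callers name the device they are about to touch (or pass nullptr for the host allocation / last-used device), and the buffer lazily creates or refreshes the per-device USM allocation. A hedged sketch of what an in-adapter call site looks like; deviceAddressOf is a hypothetical helper, and UR_CALL is the error-propagation macro used throughout this patch. The free() declaration announced above follows right after this sketch.

// Hypothetical helper (illustration only): get the USM pointer backing
// Buffer on Device, letting the buffer migrate data if its copy there
// is stale.
ur_result_t deviceAddressOf(_ur_buffer *Buffer, ur_device_handle_t Device,
                            void **Out) {
  char *ZeHandle = nullptr;
  // read_only: contents are made current on Device before the pointer is
  // handed back; presumably a write_only access lets the buffer skip that copy.
  UR_CALL(Buffer->getZeHandle(ZeHandle, _ur_buffer::read_only, Device));
  *Out = ZeHandle; // plain USM pointer, suitable for a Level Zero copy
  return UR_RESULT_SUCCESS;
}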
+ ur_result_t free(); + + // Information about a single allocation representing this buffer. + struct allocation_t { + // Level Zero memory handle is really just a naked pointer. + // It is just convenient to have it char * to simplify offset arithmetics. + char *ZeHandle{nullptr}; + // Indicates if this allocation's data is valid. + bool Valid{false}; + // Specifies the action that needs to be taken for this + // allocation at buffer destruction. + enum { + keep, // do nothing, the allocation is not owned by us + unimport, // release of the imported allocation + free, // free from the pooling context (default) + free_native // free with a native call + } ReleaseAction{free}; + }; + + // We maintain multiple allocations on possibly all devices in the context. + // The "nullptr" device identifies a host allocation representing buffer. + // Sub-buffers don't maintain own allocations but rely on parent buffer. + std::unordered_map Allocations; + ur_device_handle_t LastDeviceWithValidAllocation{nullptr}; + + // Flag to indicate that this memory is allocated in host memory. + // Integrated device accesses this memory. + bool OnHost{false}; + + // Tells the host allocation to use for buffer map operations. + char *MapHostPtr{nullptr}; + + // Supplementary data to keep track of the mappings of this buffer + // created with piEnqueueMemBufferMap. + struct Mapping { + // The offset in the buffer giving the start of the mapped region. + size_t Offset; + // The size of the mapped region. + size_t Size; + }; + + // The key is the host pointer representing an active mapping. + // The value is the information needed to maintain/undo the mapping. + std::unordered_map Mappings; + + // The size and alignment of the buffer + size_t Size; + size_t getAlignment() const; + + struct { + _ur_buffer *Parent; + size_t Origin; // only valid if Parent != nullptr + } SubBuffer; +}; + +struct _ur_image final : ur_mem_handle_t_ { + // Image constructor + _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} + + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, + ur_device_handle_t = nullptr) override { + ZeHandle = reinterpret_cast(ZeImage); + return UR_RESULT_SUCCESS; + } + virtual ur_result_t getZeHandlePtr(char **&ZeHandlePtr, access_mode_t, + ur_device_handle_t = nullptr) override { + ZeHandlePtr = reinterpret_cast(&ZeImage); + return UR_RESULT_SUCCESS; + } + + bool isImage() const override { return true; } + +#ifndef NDEBUG + // Keep the descriptor of the image (for debugging purposes) + ZeStruct ZeImageDesc; +#endif // !NDEBUG + + // Level Zero image handle. + ze_image_handle_t ZeImage; +}; + +// Implements memory allocation via L0 RT for USM allocator interface. 
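The USMMemoryAllocBase hierarchy declared next follows a template-method shape: the base class owns the public allocate()/deallocate() interface, and each memory kind (host, device, shared, shared read-only) only supplies allocateImpl(). The corresponding definitions live in the .cpp and are not part of this hunk, so the following is only a guess at the shape, with simplified stand-in types; the real code may differ in details such as how errors are reported:

#include <cstddef>
#include <cstdint>

enum class Result { Success, OutOfMemory }; // stand-in for ur_result_t

class AllocationError { // stand-in for UsmAllocationException above
public:
  explicit AllocationError(Result Err) : Error{Err} {}
  Result getError() const { return Error; }

private:
  Result Error;
};

// Template-method pattern: the public interface is fixed here, the actual
// allocation call is supplied by each derived memory type.
class MemoryAllocBase {
protected:
  virtual Result allocateImpl(void **ResultPtr, size_t Size,
                              uint32_t Alignment) = 0;

public:
  void *allocate(size_t Size, size_t Alignment) {
    void *Ptr = nullptr;
    Result Res = allocateImpl(&Ptr, Size, static_cast<uint32_t>(Alignment));
    if (Res != Result::Success)
      throw AllocationError(Res); // assumed: failures travel as exceptions
    return Ptr;
  }
  virtual ~MemoryAllocBase() = default;
};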
+class USMMemoryAllocBase : public SystemMemory { +protected: + ur_context_handle_t Context; + ur_device_handle_t Device; + // Internal allocation routine which must be implemented for each allocation + // type + virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) = 0; + +public: + USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : Context{Ctx}, Device{Dev} {} + void *allocate(size_t Size) override final; + void *allocate(size_t Size, size_t Alignment) override final; + void deallocate(void *Ptr) override final; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for shared memory type that is only modified from host. +class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for device memory type +class USMDeviceMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for host memory type +class USMHostMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMHostMemoryAlloc(ur_context_handle_t Ctx) + : USMMemoryAllocBase(Ctx, nullptr) {} +}; + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment); + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_mem_flags_t *, + size_t Size, uint32_t Alignment); + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_mem_flags_t *Properties, size_t Size, + uint32_t Alignment); + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); + +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle = true); + +bool ShouldUseUSMAllocator(); + +extern const bool UseUSMAllocator; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp deleted file mode 100644 index 22476938ac884..0000000000000 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.cpp +++ /dev/null @@ -1,9 +0,0 @@ -//===--------- ur_level_zero_module.cpp - Level Zero Adapter ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-----------------------------------------------------------------===//
-
-#include "ur_level_zero_module.hpp"
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp
deleted file mode 100644
index 8ff81196df096..0000000000000
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_module.hpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===--------- ur_level_zero_module.hpp - Level Zero Adapter ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===-----------------------------------------------------------------===//
-#pragma once
-
-#include "ur_level_zero_common.hpp"
-
-struct _ur_module_handle_t : _ur_object {
-  _ur_module_handle_t() {}
-};
-
-struct _ur_kernel_handle_t : _ur_object {
-  _ur_kernel_handle_t() {}
-};
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
index 99fab2d48dc16..1f2430274e6f4 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp
@@ -7,3 +7,534 @@
 //===-----------------------------------------------------------------===//
 
 #include "ur_level_zero_platform.hpp"
+#include
+
+UR_APIEXPORT ur_result_t UR_APICALL urInit(
+    ur_device_init_flags_t
+        DeviceFlags ///< [in] device initialization flags.
+                    ///< must be 0 (default) or a combination of
+                    ///< ::ur_device_init_flag_t.
+) {
+  std::ignore = DeviceFlags;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urTearDown(
+    void *Params ///< [in] pointer to tear down parameters
+) {
+  // Reclaim pi_platform objects here since we don't have piPlatformRelease.
+  for (ur_platform_handle_t Platform : *PiPlatformsCache) {
+    delete Platform;
+  }
+  delete PiPlatformsCache;
+  delete PiPlatformsCacheMutex;
+
+  bool LeakFound = false;
+  // Print the balance of various create/destroy native calls.
+  // The idea is to verify that the numbers of create(+) and destroy(-) calls
+  // match.
+  if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) {
+    // clang-format off
+    //
+    // The format of this table is such that each row accounts for a
+    // specific type of objects, and all elements in the row except the last
+    // one are allocating objects of that type, while the last element is known
+    // to deallocate objects of that type.
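Read against the sample output further below: for each row the create counters are summed, the final destroy counter is subtracted, and any positive remainder is flagged as a leak. A tiny self-contained sketch of that bookkeeping for the command-list row (matching the "LEAK = 1" case in the sample):

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Counts as they might appear in ZeCallCount after a run.
  std::map<std::string, int> Calls = {{"zeCommandListCreateImmediate", 1},
                                      {"zeCommandListCreate", 1},
                                      {"zeCommandListDestroy", 1}};
  // One row of the table: every entry except the last allocates, the last
  // one deallocates.
  std::vector<std::string> Row = {"zeCommandListCreateImmediate",
                                  "zeCommandListCreate",
                                  "zeCommandListDestroy"};
  int Diff = 0;
  for (size_t I = 0; I < Row.size(); ++I) {
    if (I + 1 == Row.size())
      Diff -= Calls[Row[I]]; // the destroy call
    else
      Diff += Calls[Row[I]]; // the create calls
  }
  if (Diff)
    std::printf("LEAK = %d\n", Diff); // prints: LEAK = 1
  return 0;
}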
+ // + std::vector> CreateDestroySet = { + {"zeContextCreate", "zeContextDestroy"}, + {"zeCommandQueueCreate", "zeCommandQueueDestroy"}, + {"zeModuleCreate", "zeModuleDestroy"}, + {"zeKernelCreate", "zeKernelDestroy"}, + {"zeEventPoolCreate", "zeEventPoolDestroy"}, + {"zeCommandListCreateImmediate", "zeCommandListCreate", "zeCommandListDestroy"}, + {"zeEventCreate", "zeEventDestroy"}, + {"zeFenceCreate", "zeFenceDestroy"}, + {"zeImageCreate", "zeImageDestroy"}, + {"zeSamplerCreate", "zeSamplerDestroy"}, + {"zeMemAllocDevice", "zeMemAllocHost", "zeMemAllocShared", "zeMemFree"}, + }; + + // A sample output aimed below is this: + // ------------------------------------------------------------------------ + // zeContextCreate = 1 \---> zeContextDestroy = 1 + // zeCommandQueueCreate = 1 \---> zeCommandQueueDestroy = 1 + // zeModuleCreate = 1 \---> zeModuleDestroy = 1 + // zeKernelCreate = 1 \---> zeKernelDestroy = 1 + // zeEventPoolCreate = 1 \---> zeEventPoolDestroy = 1 + // zeCommandListCreateImmediate = 1 | + // zeCommandListCreate = 1 \---> zeCommandListDestroy = 1 ---> LEAK = 1 + // zeEventCreate = 2 \---> zeEventDestroy = 2 + // zeFenceCreate = 1 \---> zeFenceDestroy = 1 + // zeImageCreate = 0 \---> zeImageDestroy = 0 + // zeSamplerCreate = 0 \---> zeSamplerDestroy = 0 + // zeMemAllocDevice = 0 | + // zeMemAllocHost = 1 | + // zeMemAllocShared = 0 \---> zeMemFree = 1 + // + // clang-format on + + fprintf(stderr, "ZE_DEBUG=%d: check balance of create/destroy calls\n", + UR_L0_DEBUG_CALL_COUNT); + fprintf(stderr, + "----------------------------------------------------------\n"); + for (const auto &Row : CreateDestroySet) { + int diff = 0; + for (auto I = Row.begin(); I != Row.end();) { + const char *ZeName = *I; + const auto &ZeCount = (*ZeCallCount)[*I]; + + bool First = (I == Row.begin()); + bool Last = (++I == Row.end()); + + if (Last) { + fprintf(stderr, " \\--->"); + diff -= ZeCount; + } else { + diff += ZeCount; + if (!First) { + fprintf(stderr, " | \n"); + } + } + + fprintf(stderr, "%30s = %-5d", ZeName, ZeCount); + } + + if (diff) { + LeakFound = true; + fprintf(stderr, " ---> LEAK = %d", diff); + } + fprintf(stderr, "\n"); + } + + ZeCallCount->clear(); + delete ZeCallCount; + ZeCallCount = nullptr; + } + if (LeakFound) + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet( + uint32_t NumEntries, ///< [in] the number of platforms to be added to + ///< phPlatforms. If phPlatforms is not NULL, then + ///< NumEntries should be greater than zero, otherwise + ///< ::UR_RESULT_ERROR_INVALID_SIZE, will be returned. + ur_platform_handle_t + *Platforms, ///< [out][optional][range(0, NumEntries)] array of handle + ///< of platforms. If NumEntries is less than the number of + ///< platforms available, then + ///< ::urPlatformGet shall only retrieve that number of + ///< platforms. + uint32_t *NumPlatforms ///< [out][optional] returns the total number of + ///< platforms available. +) { + static std::once_flag ZeCallCountInitialized; + try { + std::call_once(ZeCallCountInitialized, []() { + if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { + ZeCallCount = new std::map; + } + }); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Setting these environment variables before running zeInit will enable the + // validation layer in the Level Zero loader. 
+ if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } + + // Enable SYSMAN support for obtaining the PCI address + // and maximum memory bandwidth. + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + setEnvVar("ZES_ENABLE_SYSMAN", "1"); + } + + // TODO: We can still safely recover if something goes wrong during the init. + // Implement handling segfault using sigaction. + + // We must only initialize the driver once, even if piPlatformsGet() is called + // multiple times. Declaring the return value as "static" ensures it's only + // called once. + static ze_result_t ZeResult = ZE_CALL_NOCHECK(zeInit, (0)); + + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + UR_ASSERT(NumEntries != 0, UR_RESULT_ERROR_INVALID_VALUE); + if (NumPlatforms) + *NumPlatforms = 0; + return UR_RESULT_SUCCESS; + } + + if (ZeResult != ZE_RESULT_SUCCESS) { + urPrint("zeInit: Level Zero initialization failure\n"); + return ze2urResult(ZeResult); + } + + // Cache pi_platforms for reuse in the future + // It solves two problems; + // 1. sycl::platform equality issue; we always return the same pi_platform. + // 2. performance; we can save time by immediately return from cache. + // + + const std::lock_guard Lock{*PiPlatformsCacheMutex}; + if (!PiPlatformCachePopulated) { + try { + // Level Zero does not have concept of Platforms, but Level Zero driver is + // the closest match. + uint32_t ZeDriverCount = 0; + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); + if (ZeDriverCount == 0) { + PiPlatformCachePopulated = true; + } else { + std::vector ZeDrivers; + ZeDrivers.resize(ZeDriverCount); + + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); + for (uint32_t I = 0; I < ZeDriverCount; ++I) { + auto Platform = new ur_platform_handle_t_(ZeDrivers[I]); + // Save a copy in the cache for future uses. + PiPlatformsCache->push_back(Platform); + + UR_CALL(Platform->initialize()); + } + PiPlatformCachePopulated = true; + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + } + + // Populate returned platforms from the cache. + if (Platforms) { + UR_ASSERT(NumEntries <= PiPlatformsCache->size(), + UR_RESULT_ERROR_INVALID_PLATFORM); + std::copy_n(PiPlatformsCache->begin(), NumEntries, Platforms); + } + + if (NumPlatforms) { + if (*NumPlatforms == 0) + *NumPlatforms = PiPlatformsCache->size(); + else + *NumPlatforms = std::min(PiPlatformsCache->size(), (size_t)NumEntries); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t Platform, ///< [in] handle of the platform + ur_platform_info_t ParamName, ///< [in] type of the info to retrieve + size_t Size, ///< [in] the number of bytes pointed to by pPlatformInfo. + void *ParamValue, ///< [out][optional] array of bytes holding the info. + ///< If Size is not equal to or greater to the real number + ///< of bytes needed to return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pPlatformInfo is not used. + size_t *SizeRet ///< [out][optional] pointer to the actual number of bytes + ///< being queried by pPlatformInfo. +) { + UrReturnHelper ReturnValue(Size, ParamValue, SizeRet); + + switch (ParamName) { + case UR_PLATFORM_INFO_NAME: + // TODO: Query Level Zero driver when relevant info is added there. 
+ return ReturnValue("Intel(R) oneAPI Unified Runtime over Level-Zero"); + case UR_PLATFORM_INFO_VENDOR_NAME: + // TODO: Query Level Zero driver when relevant info is added there. + return ReturnValue("Intel(R) Corporation"); + case UR_PLATFORM_INFO_EXTENSIONS: + // Convention adopted from OpenCL: + // "Returns a space-separated list of extension names (the extension + // names themselves do not contain any spaces) supported by the platform. + // Extensions defined here must be supported by all devices associated + // with this platform." + // + // TODO: Check the common extensions supported by all connected devices and + // return them. For now, hardcoding some extensions we know are supported by + // all Level Zero devices. + return ReturnValue(ZE_SUPPORTED_EXTENSIONS); + case UR_PLATFORM_INFO_PROFILE: + // TODO: figure out what this means and how is this used + return ReturnValue("FULL_PROFILE"); + case UR_PLATFORM_INFO_VERSION: + // TODO: this should query to zeDriverGetDriverVersion + // but we don't yet have the driver handle here. + // + // From OpenCL 2.1: "This version string has the following format: + // OpenCL. Follow the same notation here. + // + return ReturnValue(Platform->ZeDriverApiVersion.c_str()); + default: + urPrint("urPlatformGetInfo: unrecognized ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t Driver, ///< [in] handle of the platform + ur_api_version_t *Version ///< [out] api version +) { + std::ignore = Driver; + std::ignore = Version; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( + ur_platform_handle_t Platform, ///< [in] handle of the platform. + ur_native_handle_t *NativePlatform ///< [out] a pointer to the native + ///< handle of the platform. +) { + // Extract the Level Zero driver handle from the given PI platform + *NativePlatform = reinterpret_cast(Platform->ZeDriver); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( + ur_native_handle_t + NativePlatform, ///< [in] the native handle of the platform. + ur_platform_handle_t *Platform ///< [out] pointer to the handle of the + ///< platform object created. +) { + auto ZeDriver = ur_cast(NativePlatform); + + uint32_t NumPlatforms = 0; + UR_CALL(urPlatformGet(0, nullptr, &NumPlatforms)); + + if (NumPlatforms) { + std::vector Platforms(NumPlatforms); + UR_CALL(urPlatformGet(NumPlatforms, Platforms.data(), nullptr)); + + // The SYCL spec requires that the set of platforms must remain fixed for + // the duration of the application's execution. We assume that we found all + // of the Level Zero drivers when we initialized the platform cache, so the + // "NativeHandle" must already be in the cache. If it is not, this must not + // be a valid Level Zero driver. + for (const ur_platform_handle_t &CachedPlatform : Platforms) { + if (CachedPlatform->ZeDriver == ZeDriver) { + *Platform = CachedPlatform; + return UR_RESULT_SUCCESS; + } + } + } + + return UR_RESULT_ERROR_INVALID_VALUE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urGetLastResult( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance + const char **Message ///< [out] pointer to a string containing adapter + ///< specific result in string representation. 
+) { + std::ignore = Platform; + std::ignore = Message; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_platform_handle_t_::initialize() { + // Cache driver properties + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, (ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + // Intel Level-Zero GPU driver stores version as: + // | 31 - 24 | 23 - 16 | 15 - 0 | + // | Major | Minor | Build | + auto VersionMajor = std::to_string((DriverVersion & 0xFF000000) >> 24); + auto VersionMinor = std::to_string((DriverVersion & 0x00FF0000) >> 16); + auto VersionBuild = std::to_string(DriverVersion & 0x0000FFFF); + ZeDriverVersion = VersionMajor + "." + VersionMinor + "." + VersionBuild; + + ZE2UR_CALL(zeDriverGetApiVersion, (ZeDriver, &ZeApiVersion)); + ZeDriverApiVersion = std::to_string(ZE_MAJOR_VERSION(ZeApiVersion)) + "." + + std::to_string(ZE_MINOR_VERSION(ZeApiVersion)); + + // Cache driver extension properties + uint32_t Count = 0; + ZE2UR_CALL(zeDriverGetExtensionProperties, (ZeDriver, &Count, nullptr)); + + std::vector ZeExtensions(Count); + + ZE2UR_CALL(zeDriverGetExtensionProperties, + (ZeDriver, &Count, ZeExtensions.data())); + + for (auto &extension : ZeExtensions) { + // Check if global offset extension is available + if (strncmp(extension.name, ZE_GLOBAL_OFFSET_EXP_NAME, + strlen(ZE_GLOBAL_OFFSET_EXP_NAME) + 1) == 0) { + if (extension.version == ZE_GLOBAL_OFFSET_EXP_VERSION_1_0) { + ZeDriverGlobalOffsetExtensionFound = true; + } + } + // Check if extension is available for "static linking" (compiling multiple + // SPIR-V modules together into one Level Zero module). + if (strncmp(extension.name, ZE_MODULE_PROGRAM_EXP_NAME, + strlen(ZE_MODULE_PROGRAM_EXP_NAME) + 1) == 0) { + if (extension.version == ZE_MODULE_PROGRAM_EXP_VERSION_1_0) { + ZeDriverModuleProgramExtensionFound = true; + } + } + zeDriverExtensionMap[extension.name] = extension.version; + } + + // Check if import user ptr into USM feature has been requested. + // If yes, then set up L0 API pointers if the platform supports it. + ZeUSMImport.setZeUSMImport(this); + + return UR_RESULT_SUCCESS; +} + +// Get the cached PI device created for the L0 device handle. +// Return NULL if no such PI device found. +ur_device_handle_t +ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { + + ur_result_t Res = populateDeviceCacheIfNeeded(); + if (Res != UR_RESULT_SUCCESS) { + return nullptr; + } + + // TODO: our sub-sub-device representation is currently [Level-Zero device + // handle + Level-Zero compute group/engine index], so there is now no 1:1 + // mapping from L0 device handle to PI device assumed in this function. Until + // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we + // filter out PI sub-sub-devices. + std::shared_lock Lock(PiDevicesCacheMutex); + auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(), + [&](std::unique_ptr &D) { + return D.get()->ZeDevice == ZeDevice && + (D.get()->RootDevice == nullptr || + D.get()->RootDevice->RootDevice == nullptr); + }); + if (it != PiDevicesCache.end()) { + return (*it).get(); + } + return nullptr; +} + +// Check the device cache and load it if necessary. 
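The bit layout documented in initialize() above packs the driver version as major in bits 31-24, minor in bits 23-16, and build in bits 15-0. A small worked example of the same extraction (illustration only; the input value is made up):

#include <cassert>
#include <cstdint>
#include <string>

// Decode an Intel Level Zero GPU driver version:
// | 31..24 major | 23..16 minor | 15..0 build |
std::string decodeDriverVersion(uint32_t DriverVersion) {
  auto Major = std::to_string((DriverVersion & 0xFF000000u) >> 24);
  auto Minor = std::to_string((DriverVersion & 0x00FF0000u) >> 16);
  auto Build = std::to_string(DriverVersion & 0x0000FFFFu);
  return Major + "." + Minor + "." + Build;
}

int main() {
  // 0x01030ABC -> major 1, minor 3, build 0x0ABC = 2748.
  assert(decodeDriverVersion(0x01030ABCu) == "1.3.2748");
  return 0;
}

populateDeviceCacheIfNeeded(), announced above, follows.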
+ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { + std::scoped_lock Lock(PiDevicesCacheMutex); + + if (DeviceCachePopulated) { + return UR_RESULT_SUCCESS; + } + + uint32_t ZeDeviceCount = 0; + ZE2UR_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, nullptr)); + + try { + std::vector ZeDevices(ZeDeviceCount); + ZE2UR_CALL(zeDeviceGet, (ZeDriver, &ZeDeviceCount, ZeDevices.data())); + + for (uint32_t I = 0; I < ZeDeviceCount; ++I) { + std::unique_ptr Device( + new ur_device_handle_t_(ZeDevices[I], (ur_platform_handle_t)this)); + UR_CALL(Device->initialize()); + + // Additionally we need to cache all sub-devices too, such that they + // are readily visible to the piextDeviceCreateWithNativeHandle. + // + uint32_t SubDevicesCount = 0; + ZE2UR_CALL(zeDeviceGetSubDevices, + (Device->ZeDevice, &SubDevicesCount, nullptr)); + + auto ZeSubdevices = new ze_device_handle_t[SubDevicesCount]; + ZE2UR_CALL(zeDeviceGetSubDevices, + (Device->ZeDevice, &SubDevicesCount, ZeSubdevices)); + + // Wrap the Level Zero sub-devices into PI sub-devices, and add them to + // cache. + for (uint32_t I = 0; I < SubDevicesCount; ++I) { + std::unique_ptr UrSubDevice( + new ur_device_handle_t_(ZeSubdevices[I], (ur_platform_handle_t)this, + Device.get())); + auto Result = UrSubDevice->initialize(); + if (Result != UR_RESULT_SUCCESS) { + delete[] ZeSubdevices; + return Result; + } + + // collect all the ordinals for the sub-sub-devices + std::vector Ordinals; + + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (UrSubDevice->ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + std::vector QueueGroupProperties( + numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (UrSubDevice->ZeDevice, &numQueueGroups, + QueueGroupProperties.data())); + + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE && + QueueGroupProperties[i].numQueues > 1) { + Ordinals.push_back(i); + } + } + + // If isn't PVC, then submissions to different CCS can be executed on + // the same EUs still, so we cannot treat them as sub-sub-devices. + if (UrSubDevice->isPVC() || ExposeCSliceInAffinityPartitioning) { + // Create PI sub-sub-devices with the sub-device for all the ordinals. + // Each {ordinal, index} points to a specific CCS which constructs + // a sub-sub-device at this point. + // + // FIXME: Level Zero creates multiple PiDevices for a single physical + // device when sub-device is partitioned into sub-sub-devices. + // Sub-sub-device is technically a command queue and we should not + // build program for each command queue. PiDevice is probably not the + // right abstraction for a Level Zero command queue. + for (uint32_t J = 0; J < Ordinals.size(); ++J) { + for (uint32_t K = 0; + K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { + std::unique_ptr PiSubSubDevice( + new ur_device_handle_t_(ZeSubdevices[I], + (ur_platform_handle_t)this, + UrSubDevice.get())); + UR_CALL(PiSubSubDevice->initialize(Ordinals[J], K)); + + // save pointers to sub-sub-devices for quick retrieval in the + // future. + UrSubDevice->SubDevices.push_back(PiSubSubDevice.get()); + PiDevicesCache.push_back(std::move(PiSubSubDevice)); + } + } + } + + // save pointers to sub-devices for quick retrieval in the future. 
+ Device->SubDevices.push_back(UrSubDevice.get()); + PiDevicesCache.push_back(std::move(UrSubDevice)); + } + delete[] ZeSubdevices; + + // Save the root device in the cache for future uses. + PiDevicesCache.push_back(std::move(Device)); + } + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + DeviceCachePopulated = true; + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp index 40f5b961b8df0..2894de7139619 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.hpp @@ -8,3 +8,47 @@ #pragma once #include "ur_level_zero_common.hpp" + +struct ur_device_handle_t_; + +struct ur_platform_handle_t_ : public _ur_platform { + ur_platform_handle_t_(ze_driver_handle_t Driver) : ZeDriver{Driver} {} + // Performs initialization of a newly constructed PI platform. + ur_result_t initialize(); + + // Level Zero lacks the notion of a platform, but there is a driver, which is + // a pretty good fit to keep here. + ze_driver_handle_t ZeDriver; + + // Cache versions info from zeDriverGetProperties. + std::string ZeDriverVersion; + std::string ZeDriverApiVersion; + ze_api_version_t ZeApiVersion; + + // Cache driver extensions + std::unordered_map zeDriverExtensionMap; + + // Flags to tell whether various Level Zero platform extensions are available. + bool ZeDriverGlobalOffsetExtensionFound{false}; + bool ZeDriverModuleProgramExtensionFound{false}; + + // Cache UR devices for reuse + std::vector> PiDevicesCache; + ur_shared_mutex PiDevicesCacheMutex; + bool DeviceCachePopulated = false; + + // Check the device cache and load it if necessary. + ur_result_t populateDeviceCacheIfNeeded(); + + // Return the PI device from cache that represents given native device. + // If not found, then nullptr is returned. + ur_device_handle_t getDeviceFromNativeHandle(ze_device_handle_t); + + // Keep track of all contexts in the platform. This is needed to manage + // a lifetime of memory allocations in each context when there are kernels + // with indirect access. + // TODO: should be deleted when memory isolation in the context is implemented + // in the driver. + std::list Contexts; + ur_shared_mutex ContextsMutex; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index ff45091ce6795..f9e32aa395084 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -7,3 +7,761 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_program.hpp" +#include + +extern "C" { +// Check to see if a Level Zero module has any unresolved symbols. +// +// @param ZeModule The module handle to check. +// @param ZeBuildLog If there are unresolved symbols, this build log handle is +// modified to receive information telling which symbols +// are unresolved. +// +// @return ZE_RESULT_ERROR_MODULE_LINK_FAILURE indicates there are unresolved +// symbols. ZE_RESULT_SUCCESS indicates all symbols are resolved. Any other +// value indicates there was an error and we cannot tell if symbols are +// resolved. 
+static ze_result_t +checkUnresolvedSymbols(ze_module_handle_t ZeModule, + ze_module_build_log_handle_t *ZeBuildLog) { + + // First check to see if the module has any imported symbols. If there are + // no imported symbols, it's not possible to have any unresolved symbols. We + // do this check first because we assume it's faster than the call to + // zeModuleDynamicLink below. + ZeStruct ZeModuleProps; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleGetProperties, (ZeModule, &ZeModuleProps)); + if (ZeResult != ZE_RESULT_SUCCESS) + return ZeResult; + + // If there are imported symbols, attempt to "link" the module with itself. + // As a side effect, this will return the error + // ZE_RESULT_ERROR_MODULE_LINK_FAILURE if there are any unresolved symbols. + if (ZeModuleProps.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS) { + return ZE_CALL_NOCHECK(zeModuleDynamicLink, (1, &ZeModule, ZeBuildLog)); + } + return ZE_RESULT_SUCCESS; +} +} // extern "C" + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL( + ur_context_handle_t Context, ///< [in] handle of the context instance + const void *IL, ///< [in] pointer to IL binary. + size_t Length, ///< [in] length of `pIL` in bytes. + const ur_program_properties_t + *Properties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *Program ///< [out] pointer to handle of program object created. +) { + try { + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::IL, Context, IL, Length); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t Context, ///< [in] handle of the context instance + ur_device_handle_t + Device, ///< [in] handle to device associated with binary. + size_t Size, ///< [in] size in bytes. + const uint8_t *Binary, ///< [in] pointer to binary. + const ur_program_properties_t + *Properties, ///< [in][optional] pointer to program creation properties. + ur_program_handle_t + *Program ///< [out] pointer to handle of Program object created. +) { + // In OpenCL, clCreateProgramWithBinary() can be used to load any of the + // following: "program executable", "compiled program", or "library of + // compiled programs". In addition, the loaded program can be either + // IL (SPIR-v) or native device code. For now, we assume that + // piProgramCreateWithBinary() is only used to load a "program executable" + // as native device code. + // If we wanted to support all the same cases as OpenCL, we would need to + // somehow examine the binary image to distinguish the cases. Alternatively, + // we could change the PI interface and have the caller pass additional + // information to distinguish the cases. + + try { + ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( + ur_program_handle_t_::Native, Context, Binary, Size); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild( + ur_context_handle_t Context, ///< [in] handle of the context instance. + ur_program_handle_t Program, ///< [in] Handle of the program to build. + const char *Options ///< [in][optional] pointer to build options + ///< null-terminated string. 
+) { + // TODO + // Check if device belongs to associated context. + // UR_ASSERT(Program->Context, UR_RESULT_ERROR_INVALID_PROGRAM); + // UR_ASSERT(Program->Context->isValidDevice(Devices[0]), + // UR_RESULT_ERROR_INVALID_VALUE); + + // We should have either IL or native device code. + UR_ASSERT(Program->Code, UR_RESULT_ERROR_INVALID_PROGRAM); + + // It is legal to build a program created from either IL or from native + // device code. + if (Program->State != ur_program_handle_t_::IL && + Program->State != ur_program_handle_t_::Native) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + std::scoped_lock Guard(Program->Mutex); + + // Ask Level Zero to build and load the native code onto the device. + ZeStruct ZeModuleDesc; + ur_program_handle_t_::SpecConstantShim Shim(Program); + ZeModuleDesc.format = (Program->State == ur_program_handle_t_::IL) + ? ZE_MODULE_FORMAT_IL_SPIRV + : ZE_MODULE_FORMAT_NATIVE; + ZeModuleDesc.inputSize = Program->CodeLength; + ZeModuleDesc.pInputModule = Program->Code.get(); + ZeModuleDesc.pBuildFlags = Options; + ZeModuleDesc.pConstants = Shim.ze(); + + ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice; + ze_context_handle_t ZeContext = Program->Context->ZeContext; + ze_module_handle_t ZeModule = nullptr; + + ur_result_t Result = UR_RESULT_SUCCESS; + Program->State = ur_program_handle_t_::Exe; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModule, &Program->ZeBuildLog)); + if (ZeResult != ZE_RESULT_SUCCESS) { + // We adjust pi_program below to avoid attempting to release zeModule when + // RT calls piProgramRelease(). + Program->State = ur_program_handle_t_::Invalid; + Result = ze2urResult(ZeResult); + if (Program->ZeBuildLog) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); + Program->ZeBuildLog = nullptr; + } + if (ZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + ZeModule = nullptr; + } + } else { + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via a + // call to zeModuleDynamicLink. However, modules created with + // piProgramBuild are supposed to be fully linked and ready to use. + // Therefore, do an extra check now for unresolved symbols. + ZeResult = checkUnresolvedSymbols(ZeModule, &Program->ZeBuildLog); + if (ZeResult != ZE_RESULT_SUCCESS) { + Program->State = ur_program_handle_t_::Invalid; + Result = (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) + ? UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE + : ze2urResult(ZeResult); + if (ZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + ZeModule = nullptr; + } + } + } + + // We no longer need the IL / native code. + Program->Code.reset(); + Program->ZeModule = ZeModule; + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( + ur_context_handle_t Context, ///< [in] handle of the context instance. + ur_program_handle_t + Program, ///< [in][out] handle of the program to compile. + const char *Options ///< [in][optional] pointer to build options + ///< null-terminated string. +) { + + std::scoped_lock Guard(Program->Mutex); + + // It's only valid to compile a program created from IL (we don't support + // programs created from source code). + // + // The OpenCL spec says that the header parameters are ignored when compiling + // IL programs, so we don't validate them. + if (Program->State != ur_program_handle_t_::IL) + return UR_RESULT_ERROR_INVALID_OPERATION; + + // We don't compile anything now. 
Instead, we delay compilation until + // piProgramLink, where we do both compilation and linking as a single step. + // This produces better code because the driver can do cross-module + // optimizations. Therefore, we just remember the compilation flags, so we + // can use them later. + if (Options) + Program->BuildFlags = Options; + Program->State = ur_program_handle_t_::Object; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramLink( + ur_context_handle_t Context, ///< [in] handle of the context instance. + uint32_t Count, ///< [in] number of program handles in `phPrograms`. + const ur_program_handle_t *Programs, ///< [in][range(0, count)] pointer to + ///< array of program handles. + const char *Options, ///< [in][optional] pointer to linker options + ///< null-terminated string. + ur_program_handle_t + *Program ///< [out] pointer to handle of program object created. +) { + // TODO + // UR_ASSERT(Context->isValidDevice(Context->Devices[0]), + // UR_RESULT_ERROR_INVALID_DEVICE); + + // We do not support any link flags at this time because the Level Zero API + // does not have any way to pass flags that are specific to linking. + if (Options && *Options != '\0') { + std::string ErrorMessage( + "Level Zero does not support kernel link flags: \""); + ErrorMessage.append(Options); + ErrorMessage.push_back('\"'); + ur_program_handle_t_ *UrProgram = new ur_program_handle_t_( + ur_program_handle_t_::Invalid, Context, ErrorMessage); + *Program = reinterpret_cast(UrProgram); + return UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; + } + + ur_result_t UrResult = UR_RESULT_SUCCESS; + try { + // Acquire a "shared" lock on each of the input programs, and also validate + // that they are all in Object state. + // + // There is no danger of deadlock here even if two threads call + // piProgramLink simultaneously with the same input programs in a different + // order. If we were acquiring these with "exclusive" access, this could + // lead to a classic lock ordering deadlock. However, there is no such + // deadlock potential with "shared" access. There could also be a deadlock + // potential if there was some other code that holds more than one of these + // locks simultaneously with "exclusive" access. However, there is no such + // code like that, so this is also not a danger. + std::vector> Guards(Count); + for (uint32_t I = 0; I < Count; I++) { + std::shared_lock Guard(Programs[I]->Mutex); + Guards[I].swap(Guard); + if (Programs[I]->State != ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + } + + // Previous calls to piProgramCompile did not actually compile the SPIR-V. + // Instead, we postpone compilation until this point, when all the modules + // are linked together. By doing compilation and linking together, the JIT + // compiler is able see all modules and do cross-module optimizations. + // + // Construct a ze_module_program_exp_desc_t which contains information about + // all of the modules that will be linked together. 
+ ZeStruct ZeExtModuleDesc; + std::vector CodeSizes(Count); + std::vector CodeBufs(Count); + std::vector BuildFlagPtrs(Count); + std::vector SpecConstPtrs(Count); + std::vector SpecConstShims; + SpecConstShims.reserve(Count); + + for (uint32_t I = 0; I < Count; I++) { + ur_program_handle_t Program = Programs[I]; + CodeSizes[I] = Program->CodeLength; + CodeBufs[I] = Program->Code.get(); + BuildFlagPtrs[I] = Program->BuildFlags.c_str(); + SpecConstShims.emplace_back(Program); + SpecConstPtrs[I] = SpecConstShims[I].ze(); + } + + ZeExtModuleDesc.count = Count; + ZeExtModuleDesc.inputSizes = CodeSizes.data(); + ZeExtModuleDesc.pInputModules = CodeBufs.data(); + ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); + ZeExtModuleDesc.pConstants = SpecConstPtrs.data(); + + ZeStruct ZeModuleDesc; + ZeModuleDesc.pNext = &ZeExtModuleDesc; + ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; + + // This works around a bug in the Level Zero driver. When "ZE_DEBUG=-1", + // the driver does validation of the API calls, and it expects + // "pInputModule" to be non-NULL and "inputSize" to be non-zero. This + // validation is wrong when using the "ze_module_program_exp_desc_t" + // extension because those fields are supposed to be ignored. As a + // workaround, set both fields to 1. + // + // TODO: Remove this workaround when the driver is fixed. + ZeModuleDesc.pInputModule = reinterpret_cast(1); + ZeModuleDesc.inputSize = 1; + + // We need a Level Zero extension to compile multiple programs together into + // a single Level Zero module. However, we don't need that extension if + // there happens to be only one input program. + // + // The "|| (NumInputPrograms == 1)" term is a workaround for a bug in the + // Level Zero driver. The driver's "ze_module_program_exp_desc_t" + // extension should work even in the case when there is just one input + // module. However, there is currently a bug in the driver that leads to a + // crash. As a workaround, do not use the extension when there is one + // input module. + // + // TODO: Remove this workaround when the driver is fixed. + if (!Context->Devices[0]->Platform->ZeDriverModuleProgramExtensionFound || + (Count == 1)) { + if (Count == 1) { + ZeModuleDesc.pNext = nullptr; + ZeModuleDesc.inputSize = ZeExtModuleDesc.inputSizes[0]; + ZeModuleDesc.pInputModule = ZeExtModuleDesc.pInputModules[0]; + ZeModuleDesc.pBuildFlags = ZeExtModuleDesc.pBuildFlags[0]; + ZeModuleDesc.pConstants = ZeExtModuleDesc.pConstants[0]; + } else { + urPrint("urProgramLink: level_zero driver does not have static linking " + "support."); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + + // Call the Level Zero API to compile, link, and create the module. + ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice; + ze_context_handle_t ZeContext = Context->ZeContext; + ze_module_handle_t ZeModule = nullptr; + ze_module_build_log_handle_t ZeBuildLog = nullptr; + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleCreate, (ZeContext, ZeDevice, &ZeModuleDesc, + &ZeModule, &ZeBuildLog)); + + // We still create a ur_program_handle_t_ object even if there is a + // BUILD_FAILURE because we need the object to hold the ZeBuildLog. There + // is no build log created for other errors, so we don't create an object. 
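When zeModuleCreate or the follow-up link check fails, the ZeBuildLog held by the program object is the main diagnostic, and it is surfaced through urProgramGetBuildInfo (defined later in this patch). A hedged caller-side sketch; the size-query-then-fetch pattern is an assumption modeled on the usual UR/OpenCL convention, and the <ur_api.h> header name is likewise assumed:

#include <string>
#include <ur_api.h>

// Fetch the build/link log for Program on Device after a failed build
// (illustration only; error handling elided).
std::string getBuildLog(ur_program_handle_t Program,
                        ur_device_handle_t Device) {
  size_t LogSize = 0;
  // First call: query the required size.
  urProgramGetBuildInfo(Program, Device, UR_PROGRAM_BUILD_INFO_LOG, 0, nullptr,
                        &LogSize);
  std::string Log(LogSize, '\0');
  // Second call: fetch the log text itself.
  urProgramGetBuildInfo(Program, Device, UR_PROGRAM_BUILD_INFO_LOG, LogSize,
                        Log.data(), nullptr);
  return Log;
}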
+ UrResult = ze2urResult(ZeResult); + if (ZeResult != ZE_RESULT_SUCCESS && + ZeResult != ZE_RESULT_ERROR_MODULE_BUILD_FAILURE) { + return ze2urResult(ZeResult); + } + + // The call to zeModuleCreate does not report an error if there are + // unresolved symbols because it thinks these could be resolved later via a + // call to zeModuleDynamicLink. However, modules created with piProgramLink + // are supposed to be fully linked and ready to use. Therefore, do an extra + // check now for unresolved symbols. Note that we still create a + // ur_program_handle_t_ if there are unresolved symbols because the + // ZeBuildLog tells which symbols are unresolved. + if (ZeResult == ZE_RESULT_SUCCESS) { + ZeResult = checkUnresolvedSymbols(ZeModule, &ZeBuildLog); + if (ZeResult == ZE_RESULT_ERROR_MODULE_LINK_FAILURE) { + UrResult = + UR_RESULT_ERROR_UNKNOWN; // TODO: + // UR_RESULT_ERROR_PROGRAM_LINK_FAILURE; + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + + ur_program_handle_t_::state State = (UrResult == UR_RESULT_SUCCESS) + ? ur_program_handle_t_::Exe + : ur_program_handle_t_::Invalid; + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(State, Context, ZeModule, ZeBuildLog); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UrResult; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain( + ur_program_handle_t Program ///< [in] handle for the Program to retain +) { + Program->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramRelease( + ur_program_handle_t Program ///< [in] handle for the Program to release +) { + if (!Program->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + delete Program; + + return UR_RESULT_SUCCESS; +} + +// Function gets characters between delimeter's in str +// then checks if they are equal to the sub_str. +// returns true if there is at least one instance +// returns false if there are no instances of the name +static bool is_in_separated_string(const std::string &str, char delimiter, + const std::string &sub_str) { + size_t beg = 0; + size_t length = 0; + for (const auto &x : str) { + if (x == delimiter) { + if (str.substr(beg, length) == sub_str) + return true; + + beg += length + 1; + length = 0; + continue; + } + length++; + } + if (length != 0) + if (str.substr(beg, length) == sub_str) + return true; + + return false; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t + Device, ///< [in] handle of the device to retrieve pointer for. + ur_program_handle_t + Program, ///< [in] handle of the program to search for function in. + ///< The program must already be built to the specified + ///< device, or otherwise + ///< ::UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE is returned. + const char *FunctionName, ///< [in] A null-terminates string denoting the + ///< mangled function name. + void **FunctionPointerRet ///< [out] Returns the pointer to the function if + ///< it is found in the program. 
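The helper above does an exact token match between delimiters, so a substring of a token does not count as a hit. The same semantics expressed with std::getline, for a quick mental model (illustration only, not the adapter's code):

#include <cassert>
#include <sstream>
#include <string>

// Equivalent formulation of is_in_separated_string() above.
static bool containsToken(const std::string &Str, char Delimiter,
                          const std::string &Token) {
  std::istringstream Stream(Str);
  std::string Current;
  while (std::getline(Stream, Current, Delimiter))
    if (Current == Token)
      return true;
  return false;
}

int main() {
  assert(containsToken("foo;bar;baz", ';', "bar"));  // full token: match
  assert(!containsToken("foo;bar;baz", ';', "ba"));  // substring only: no match
  assert(containsToken("foo;bar;baz", ';', "baz"));  // last token, no ';'
  return 0;
}

The body of urProgramGetFunctionPointer continues below.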
+) { + std::ignore = Device; + + std::shared_lock Guard(Program->Mutex); + if (Program->State != ur_program_handle_t_::Exe) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } + + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeModuleGetFunctionPointer, + (Program->ZeModule, FunctionName, FunctionPointerRet)); + + // zeModuleGetFunctionPointer currently fails for all + // kernels regardless of if the kernel exist or not + // with ZE_RESULT_ERROR_INVALID_ARGUMENT + // TODO: remove when this is no longer the case + // If zeModuleGetFunctionPointer returns invalid argument, + // fallback to searching through kernel list and return + // PI_ERROR_FUNCTION_ADDRESS_IS_NOT_AVAILABLE if the function exists + // or PI_ERROR_INVALID_KERNEL_NAME if the function does not exist. + // FunctionPointerRet should always be 0 + if (ZeResult == ZE_RESULT_ERROR_INVALID_ARGUMENT) { + size_t Size; + *FunctionPointerRet = 0; + UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, 0, nullptr, + &Size)); + + std::string ClResult(Size, ' '); + UR_CALL(urProgramGetInfo(Program, UR_PROGRAM_INFO_KERNEL_NAMES, + ClResult.size(), &ClResult[0], nullptr)); + + // Get rid of the null terminator and search for kernel_name + // If function can be found return error code to indicate it + // exists + ClResult.pop_back(); + if (is_in_separated_string(ClResult, ';', std::string(FunctionName))) + return UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + + return UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } + + if (ZeResult == ZE_RESULT_ERROR_INVALID_FUNCTION_NAME) { + *FunctionPointerRet = 0; + return UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } + + return ze2urResult(ZeResult); +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetInfo( + ur_program_handle_t Program, ///< [in] handle of the Program object + ur_program_info_t PropName, ///< [in] name of the Program property to query + size_t PropSize, ///< [in] the size of the Program property. + void *ProgramInfo, ///< [in,out][optional] array of bytes of holding the + ///< program info property. If propSize is not equal to + ///< or greater than the real number of bytes needed to + ///< return the info then the + ///< ::UR_RESULT_ERROR_INVALID_SIZE error is returned + ///< and pProgramInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data copied to propName. +) { + UrReturnHelper ReturnValue(PropSize, ProgramInfo, PropSizeRet); + + switch (PropName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Program->RefCount.load()}); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(Program->Context); + case UR_PROGRAM_INFO_NUM_DEVICES: + // TODO: return true number of devices this program exists for. + return ReturnValue(uint32_t{1}); + case UR_PROGRAM_INFO_DEVICES: + // TODO: return all devices this program exists for. + return ReturnValue(Program->Context->Devices[0]); + case UR_PROGRAM_INFO_BINARY_SIZES: { + std::shared_lock Guard(Program->Mutex); + size_t SzBinary; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + SzBinary = Program->CodeLength; + } else if (Program->State == ur_program_handle_t_::Exe) { + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, nullptr)); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + // This is an array of 1 element, initialized as if it were scalar. 
+ return ReturnValue(size_t{SzBinary}); + } + case UR_PROGRAM_INFO_BINARIES: { + // The caller sets "ParamValue" to an array of pointers, one for each + // device. Since Level Zero supports only one device, there is only one + // pointer. If the pointer is NULL, we don't do anything. Otherwise, we + // copy the program's binary image to the buffer at that pointer. + uint8_t **PBinary = ur_cast(ProgramInfo); + if (!PBinary[0]) + break; + + std::shared_lock Guard(Program->Mutex); + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + std::memcpy(PBinary[0], Program->Code.get(), Program->CodeLength); + } else if (Program->State == ur_program_handle_t_::Exe) { + size_t SzBinary = 0; + ZE2UR_CALL(zeModuleGetNativeBinary, + (Program->ZeModule, &SzBinary, PBinary[0])); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + break; + } + case UR_PROGRAM_INFO_NUM_KERNELS: { + std::shared_lock Guard(Program->Mutex); + uint32_t NumKernels; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } else if (Program->State == ur_program_handle_t_::Exe) { + NumKernels = 0; + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &NumKernels, nullptr)); + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return ReturnValue(size_t{NumKernels}); + } + case UR_PROGRAM_INFO_KERNEL_NAMES: + try { + std::shared_lock Guard(Program->Mutex); + std::string PINames{""}; + if (Program->State == ur_program_handle_t_::IL || + Program->State == ur_program_handle_t_::Native || + Program->State == ur_program_handle_t_::Object) { + return UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE; + } else if (Program->State == ur_program_handle_t_::Exe) { + uint32_t Count = 0; + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, nullptr)); + std::unique_ptr PNames(new const char *[Count]); + ZE2UR_CALL(zeModuleGetKernelNames, + (Program->ZeModule, &Count, PNames.get())); + for (uint32_t I = 0; I < Count; ++I) { + PINames += (I > 0 ? ";" : ""); + PINames += PNames[I]; + } + } else { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return ReturnValue(PINames.c_str()); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + default: + die("urProgramGetInfo: not implemented"); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( + ur_program_handle_t Program, ///< [in] handle of the Program object + ur_device_handle_t Device, ///< [in] handle of the Device object + ur_program_build_info_t + PropName, ///< [in] name of the Program build info to query + size_t PropSize, ///< [in] size of the Program build info property. + void *PropValue, ///< [in,out][optional] value of the Program build + ///< property. If propSize is not equal to or greater than + ///< the real number of bytes needed to return the info + ///< then the ::UR_RESULT_ERROR_INVALID_SIZE error is + ///< returned and pKernelInfo is not used. + size_t *PropSizeRet ///< [out][optional] pointer to the actual size in + ///< bytes of data being queried by propName. 
+) { + std::ignore = Device; + + std::shared_lock Guard(Program->Mutex); + UrReturnHelper ReturnValue(PropSize, PropValue, PropSizeRet); + if (PropName == UR_PROGRAM_BUILD_INFO_BINARY_TYPE) { + ur_program_binary_type_t Type = UR_PROGRAM_BINARY_TYPE_NONE; + if (Program->State == ur_program_handle_t_::Object) { + Type = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; + } else if (Program->State == ur_program_handle_t_::Exe) { + Type = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; + } + return ReturnValue(ur_program_binary_type_t{Type}); + } + if (PropName == UR_PROGRAM_BUILD_INFO_OPTIONS) { + // TODO: how to get module build options out of Level Zero? + // For the programs that we compiled we can remember the options + // passed with piProgramCompile/piProgramBuild, but what can we + // return for programs that were built outside and registered + // with piProgramRegister? + return ReturnValue(""); + } else if (PropName == UR_PROGRAM_BUILD_INFO_LOG) { + // Check first to see if the plugin code recorded an error message. + if (!Program->ErrorMessage.empty()) { + return ReturnValue(Program->ErrorMessage.c_str()); + } + + // Next check if there is a Level Zero build log. + if (Program->ZeBuildLog) { + size_t LogSize = PropSize; + ZE2UR_CALL(zeModuleBuildLogGetString, + (Program->ZeBuildLog, &LogSize, ur_cast(PropValue))); + if (PropSizeRet) { + *PropSizeRet = LogSize; + } + return UR_RESULT_SUCCESS; + } + + // Otherwise, there is no error. The OpenCL spec says to return an empty + // string if there ws no previous attempt to compile, build, or link the + // program. + return ReturnValue(""); + } else { + urPrint("urProgramGetBuildInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstant( + ur_program_handle_t Program, ///< [in] handle of the Program object + uint32_t SpecId, ///< [in] specification constant Id + size_t SpecSize, ///< [in] size of the specialization constant value + const void *SpecValue ///< [in] pointer to the specialization value bytes +) { + std::ignore = Program; + std::ignore = SpecId; + std::ignore = SpecSize; + std::ignore = SpecValue; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t Program, ///< [in] handle of the program. + ur_native_handle_t *NativeProgram ///< [out] a pointer to the native + ///< handle of the program. +) { + auto ZeModule = ur_cast(NativeProgram); + + std::shared_lock Guard(Program->Mutex); + switch (Program->State) { + case ur_program_handle_t_::Exe: { + *ZeModule = Program->ZeModule; + break; + } + + default: + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t + NativeProgram, ///< [in] the native handle of the program. + ur_context_handle_t Context, ///< [in] handle of the context instance + ur_program_handle_t *Program ///< [out] pointer to the handle of the + ///< program object created. +) { + auto ZeModule = ur_cast(NativeProgram); + + // We assume here that programs created from a native handle always + // represent a fully linked executable (state Exe) and not an unlinked + // executable (state Object). 
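Mirroring the platform interop entry points earlier in this patch, the pair urProgramGetNativeHandle / urProgramCreateWithNativeHandle lets the underlying ze_module_handle_t be exported and wrapped again. A hedged usage sketch; the <ur_api.h> header name is an assumption, and per the comment above the re-created program is treated as a fully linked executable:

#include <ur_api.h>

// Export the Level Zero module behind Program and wrap it in a new UR
// program object in Context (illustration only).
ur_result_t rewrapProgram(ur_context_handle_t Context,
                          ur_program_handle_t Program,
                          ur_program_handle_t *Clone) {
  ur_native_handle_t Native{};
  // Only programs in the Exe state expose their module this way.
  ur_result_t Res = urProgramGetNativeHandle(Program, &Native);
  if (Res != UR_RESULT_SUCCESS)
    return Res;
  // The new handle wraps the same ze_module_handle_t and, per the
  // constructor used here, also takes ownership of it.
  return urProgramCreateWithNativeHandle(Native, Context, Clone);
}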
+ + try { + ur_program_handle_t_ *UrProgram = + new ur_program_handle_t_(ur_program_handle_t_::Exe, Context, ZeModule); + *Program = reinterpret_cast(UrProgram); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +ur_program_handle_t_::~ur_program_handle_t_() { + // According to Level Zero Specification, all kernels and build logs + // must be destroyed before the Module can be destroyed. So, be sure + // to destroy build log before destroying the module. + // printf("ZeBuildLog %lx\n", (unsigned long int)ZeBuildLog); + if (ZeBuildLog) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (ZeBuildLog)); + } + + // printf("ZeModule %lx OwnZeModule %d\n", (unsigned long int)ZeModule, + // OwnZeModule); + if (ZeModule && OwnZeModule) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t Program, ///< [in] handle of the Program object + uint32_t Count, ///< [in] the number of elements in the pSpecConstants array + const ur_specialization_constant_info_t + *SpecConstants ///< [in][range(0, count)] array of specialization + ///< constant value descriptions +) { + std::scoped_lock Guard(Program->Mutex); + + // Remember the value of this specialization constant until the program is + // built. Note that we only save the pointer to the buffer that contains the + // value. The caller is responsible for maintaining storage for this buffer. + // + // NOTE: SpecSize is unused in Level Zero, the size is known from SPIR-V by + // SpecID. + for (uint32_t SpecIt = 0; SpecIt < Count; SpecIt++) { + uint32_t SpecId = SpecConstants[SpecIt].id; + Program->SpecConstants[SpecId] = SpecConstants[SpecIt].pValue; + } + return UR_RESULT_SUCCESS; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp index 9a2f9604f08c5..35cd9fe93ae1d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp @@ -9,6 +9,125 @@ #include "ur_level_zero_common.hpp" -struct _ur_program_handle_t : _ur_object { - _ur_program_handle_t() {} +struct ur_program_handle_t_ : _ur_object { + // ur_program_handle_t_() {} + + typedef enum { + // The program has been created from intermediate language (SPIR-V), but it + // is not yet compiled. + IL, + + // The program has been created by loading native code, but it has not yet + // been built. This is equivalent to an OpenCL "program executable" that + // is loaded via clCreateProgramWithBinary(). + Native, + + // The program was notionally compiled from SPIR-V form. However, since we + // postpone compilation until the module is linked, the internal state + // still represents the module as SPIR-V. + Object, + + // The program has been built or linked, and it is represented as a Level + // Zero module. + Exe, + + // An error occurred during piProgramLink, but we created a _pi_program + // object anyways in order to hold the ZeBuildLog. Note that the ZeModule + // may or may not be nullptr in this state, depending on the error. + Invalid + } state; + + // A utility class that converts specialization constants into the form + // required by the Level Zero driver. 
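+  // [Illustrative sketch, not part of this patch] The shim is intended to be
+  // instantiated right before module creation so that the id/value arrays it
+  // owns outlive the zeModuleCreate() call, along the lines of:
+  //
+  //   ur_program_handle_t_::SpecConstantShim Shim(Program);
+  //   ze_module_desc_t ZeModuleDesc{ZE_STRUCTURE_TYPE_MODULE_DESC};
+  //   ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV;
+  //   ZeModuleDesc.pInputModule = Program->Code.get();
+  //   ZeModuleDesc.inputSize = Program->CodeLength;
+  //   ZeModuleDesc.pConstants = Shim.ze();
+  //   zeModuleCreate(ZeContext, ZeDevice, &ZeModuleDesc, &ZeModule,
+  //                  &ZeBuildLog);
+  //
+  // which mirrors how the build/compile paths are expected to consume the
+  // SpecConstants map stored by urProgramSetSpecializationConstants.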
+ class SpecConstantShim { + public: + SpecConstantShim(ur_program_handle_t_ *Program) { + ZeSpecConstants.numConstants = Program->SpecConstants.size(); + ZeSpecContantsIds.reserve(ZeSpecConstants.numConstants); + ZeSpecContantsValues.reserve(ZeSpecConstants.numConstants); + + for (auto &SpecConstant : Program->SpecConstants) { + ZeSpecContantsIds.push_back(SpecConstant.first); + ZeSpecContantsValues.push_back(SpecConstant.second); + } + ZeSpecConstants.pConstantIds = ZeSpecContantsIds.data(); + ZeSpecConstants.pConstantValues = ZeSpecContantsValues.data(); + } + + const ze_module_constants_t *ze() { return &ZeSpecConstants; } + + private: + std::vector ZeSpecContantsIds; + std::vector ZeSpecContantsValues; + ze_module_constants_t ZeSpecConstants; + }; + + // Construct a program in IL or Native state. + ur_program_handle_t_(state St, ur_context_handle_t Context, const void *Input, + size_t Length) + : Context{Context}, OwnZeModule{true}, State{St}, + Code{new uint8_t[Length]}, CodeLength{Length}, ZeModule{nullptr}, + ZeBuildLog{nullptr} { + std::memcpy(Code.get(), Input, Length); + } + + // Construct a program in Exe or Invalid state. + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule, + ze_module_build_log_handle_t ZeBuildLog) + : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, + ZeBuildLog{ZeBuildLog} {} + + // Construct a program in Exe state (interop). + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule, bool OwnZeModule) + : Context{Context}, OwnZeModule{OwnZeModule}, State{St}, + ZeModule{ZeModule}, ZeBuildLog{nullptr} {} + + // Construct a program from native handle + ur_program_handle_t_(state St, ur_context_handle_t Context, + ze_module_handle_t ZeModule) + : Context{Context}, OwnZeModule{true}, State{St}, ZeModule{ZeModule}, + ZeBuildLog{nullptr} {} + + // Construct a program in Invalid state with a custom error message. + ur_program_handle_t_(state St, ur_context_handle_t Context, + const std::string &ErrorMessage) + : Context{Context}, OwnZeModule{true}, ErrorMessage{ErrorMessage}, + State{St}, ZeModule{nullptr}, ZeBuildLog{nullptr} {} + + ~ur_program_handle_t_(); + + const ur_context_handle_t Context; // Context of the program. + + // Indicates if we own the ZeModule or it came from interop that + // asked to not transfer the ownership to SYCL RT. + const bool OwnZeModule; + + // This error message is used only in Invalid state to hold a custom error + // message from a call to piProgramLink. + const std::string ErrorMessage; + + state State; + + // In IL and Object states, this contains the SPIR-V representation of the + // module. In Native state, it contains the native code. + std::unique_ptr Code; // Array containing raw IL / native code. + size_t CodeLength{0}; // Size (bytes) of the array. + + // Used only in IL and Object states. Contains the SPIR-V specialization + // constants as a map from the SPIR-V "SpecID" to a buffer that contains the + // associated value. The caller of the PI layer is responsible for + // maintaining the storage of this buffer. + std::unordered_map SpecConstants; + + // Used only in Object state. Contains the build flags from the last call to + // piProgramCompile(). + std::string BuildFlags; + + // The Level Zero module handle. Used primarily in Exe state. + ze_module_handle_t ZeModule{}; + + // The Level Zero build log from the last call to zeModuleCreate(). 
+ ze_module_build_log_handle_t ZeBuildLog{}; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 8838bb74269bf..e3e21eb3e98e2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -6,4 +6,1786 @@ // //===-----------------------------------------------------------------===// +#include +#include +#include +#include + +#include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" +#include + +/// @brief Cleanup events in the immediate lists of the queue. +/// @param Queue Queue where events need to be cleaned up. +/// @param QueueLocked Indicates if the queue mutex is locked by caller. +/// @param QueueSynced 'true' if queue was synchronized before the +/// call and no other commands were submitted after synchronization, 'false' +/// otherwise. +/// @param CompletedEvent Hint providing an event which was synchronized before +/// the call, in case of in-order queue it allows to cleanup all preceding +/// events. +/// @return PI_SUCCESS if successful, PI error code otherwise. +ur_result_t CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, + bool QueueLocked, bool QueueSynced, + ur_event_handle_t CompletedEvent) { + // Handle only immediate command lists here. + if (!UrQueue || !UrQueue->Device->ImmCommandListUsed) + return UR_RESULT_SUCCESS; + + ur_event_handle_t_ *UrCompletedEvent = + reinterpret_cast(CompletedEvent); + + std::vector EventListToCleanup; + { + std::unique_lock QueueLock(UrQueue->Mutex, + std::defer_lock); + if (!QueueLocked) + QueueLock.lock(); + // If queue is locked and fully synchronized then cleanup all events. + // If queue is not locked then by this time there may be new submitted + // commands so we can't do full cleanup. + if (QueueLocked && + (QueueSynced || (UrQueue->isInOrderQueue() && + (reinterpret_cast( + UrCompletedEvent) == UrQueue->LastCommandEvent || + !UrQueue->LastCommandEvent)))) { + UrQueue->LastCommandEvent = nullptr; + for (auto &&It = UrQueue->CommandListMap.begin(); + It != UrQueue->CommandListMap.end(); ++It) { + UR_CALL(UrQueue->resetCommandList(It, true, EventListToCleanup, + false /* CheckStatus */)); + } + } else if (UrQueue->isInOrderQueue() && UrCompletedEvent) { + // If the queue is in-order and we have information about completed event + // then cleanup all events in the command list preceding to CompletedEvent + // including itself. + + // Check that the comleted event has associated command list. + if (!(UrCompletedEvent->CommandList && + UrCompletedEvent->CommandList.value() != + UrQueue->CommandListMap.end())) + return UR_RESULT_SUCCESS; + + auto &CmdListEvents = + UrCompletedEvent->CommandList.value()->second.EventList; + auto CompletedEventIt = std::find(CmdListEvents.begin(), + CmdListEvents.end(), UrCompletedEvent); + if (CompletedEventIt != CmdListEvents.end()) { + // We can cleanup all events prior to the completed event in this + // command list and completed event itself. + // TODO: we can potentially cleanup more events here by finding + // completed events on another command lists, but it is currently not + // implemented. + std::move(std::begin(CmdListEvents), CompletedEventIt + 1, + std::back_inserter(EventListToCleanup)); + CmdListEvents.erase(CmdListEvents.begin(), CompletedEventIt + 1); + } + } else { + // Fallback to resetCommandList over all command lists. 
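+      // [Editorial note, not part of this patch] This is the conservative
+      // path: with no full-sync guarantee and no usable completed-event hint,
+      // every command list is re-scanned and only events that Level Zero
+      // reports as complete are collected, i.e.
+      //
+      //   resetCommandList(It, true, EventListToCleanup, /*CheckStatus=*/true)
+      //
+      // as opposed to the fully-synchronized branch above, which passes
+      // CheckStatus=false and drains the event lists wholesale.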
+      for (auto &&It = UrQueue->CommandListMap.begin();
+           It != UrQueue->CommandListMap.end(); ++It) {
+        UR_CALL(UrQueue->resetCommandList(It, true, EventListToCleanup,
+                                          true /* CheckStatus */));
+      }
+    }
+  }
+  UR_CALL(CleanupEventListFromResetCmdList(EventListToCleanup, QueueLocked));
+  return UR_RESULT_SUCCESS;
+}
+
+/// @brief Reset signalled command lists in the queue and return them to the
+/// cache of command lists. Also cleanup events associated with the signalled
+/// command lists. The queue must be locked by the caller for modification.
+/// @param Queue Queue where we look for signalled command lists and cleanup
+/// events.
+/// @return UR_RESULT_SUCCESS if successful, a UR error code otherwise.
+ur_result_t resetCommandLists(ur_queue_handle_t Queue) {
+  // Handle immediate command lists here; they don't need to be reset and we
+  // only need to cleanup events.
+  if (Queue->Device->ImmCommandListUsed) {
+    UR_CALL(CleanupEventsInImmCmdLists(Queue, true /*locked*/));
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Events need to be cleaned up outside of the scope where the queue is
+  // locked, because event cleanup requires the event to be locked, and nested
+  // locks are hard to control and can cause deadlocks if mutexes are locked
+  // in different order.
+  std::vector<ur_event_handle_t> EventListToCleanup;
+
+  // We check for command lists that have already been signalled, but have not
+  // been added to the available list yet. Each command list has an associated
+  // fence which tracks if the command list has completed dispatch of its
+  // commands and is ready for reuse. If a command list is found to have been
+  // signalled, then the command list & fence are reset and the command list is
+  // returned to the command list cache. All events associated with the command
+  // list are cleaned up if the command list was reset.
+  for (auto &&it = Queue->CommandListMap.begin();
+       it != Queue->CommandListMap.end(); ++it) {
+    // Immediate commandlists don't use a fence and are handled separately
+    // above.
+    assert(it->second.ZeFence != nullptr);
+    // It is possible that the fence was already noted as signalled and
+    // reset. In that case the ZeFenceInUse flag will be false.
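+    // [Illustrative sketch, not part of this patch] The fence-based recycling
+    // protocol this loop relies on is, in outline:
+    //
+    //   zeCommandQueueExecuteCommandLists(ZeQueue, 1, &ZeCmdList, ZeFence);
+    //   ...
+    //   if (zeFenceQueryStatus(ZeFence) == ZE_RESULT_SUCCESS) {
+    //     zeFenceReset(ZeFence);          // fence becomes reusable
+    //     zeCommandListReset(ZeCmdList);  // command list becomes reusable
+    //     // return the command list to the context cache
+    //   }
+    //
+    // Queue->resetCommandList() below performs the reset and re-caching.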
+ if (it->second.ZeFenceInUse) { + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeFenceQueryStatus, (it->second.ZeFence)); + if (ZeResult == ZE_RESULT_SUCCESS) + UR_CALL(Queue->resetCommandList(it, true, EventListToCleanup)); + } + } + CleanupEventListFromResetCmdList(EventListToCleanup, true /*locked*/); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_queue_info_t ParamName, ///< [in] name of the queue property to query + size_t ParamValueSize, ///< [in] size in bytes of the queue property value + ///< provided + void *ParamValue, ///< [out] value of the queue property + size_t *ParamValueSizeRet ///< [out] size in bytes returned in queue + ///< property value +) { + + std::shared_lock Lock(Queue->Mutex); + UrReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)ParamName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(Queue->Context); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(Queue->Device); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{Queue->RefCount.load()}); + case UR_QUEUE_INFO_PROPERTIES: + die("UR_QUEUE_INFO_PROPERTIES in urQueueGetInfo not implemented\n"); + break; + case UR_QUEUE_INFO_SIZE: + die("UR_QUEUE_INFO_SIZE in urQueueGetInfo not implemented\n"); + break; + case UR_QUEUE_INFO_DEVICE_DEFAULT: + die("UR_QUEUE_INFO_DEVICE_DEFAULT in urQueueGetInfo not implemented\n"); + break; + case UR_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + // We can exit early if we have in-order queue. + if (Queue->isInOrderQueue()) { + if (!Queue->LastCommandEvent) + return ReturnValue(true); + + // We can check status of the event only if it isn't discarded otherwise + // it may be reset (because we are free to reuse such events) and + // zeEventQueryStatus will hang. + // TODO: use more robust way to check that ZeEvent is not owned by + // LastCommandEvent. + if (!Queue->LastCommandEvent->IsDiscarded) { + ze_result_t ZeResult = ZE_CALL_NOCHECK( + zeEventQueryStatus, (Queue->LastCommandEvent->ZeEvent)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + return ReturnValue(true); + } + // For immediate command lists we have to check status of the event + // because immediate command lists are not associated with level zero + // queue. Conservatively return false in this case because last event is + // discarded and we can't check its status. + if (Queue->Device->ImmCommandListUsed) + return ReturnValue(false); + } + + // If we have any open command list which is not empty then return false + // because it means that there are commands which are not even submitted for + // execution yet. + using IsCopy = bool; + if (Queue->hasOpenCommandList(IsCopy{true}) || + Queue->hasOpenCommandList(IsCopy{false})) + return ReturnValue(false); + + for (const auto &QueueMap : + {Queue->ComputeQueueGroupsByTID, Queue->CopyQueueGroupsByTID}) { + for (const auto &QueueGroup : QueueMap) { + if (Queue->Device->ImmCommandListUsed) { + // Immediate command lists are not associated with any Level Zero + // queue, that's why we have to check status of events in each + // immediate command list. Start checking from the end and exit early + // if some event is not completed. 
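+        // [Editorial note, not part of this patch] The per-event check below
+        // is the usual Level Zero polling idiom:
+        //
+        //   ze_result_t R = zeEventQueryStatus(Event->ZeEvent);
+        //   // ZE_RESULT_NOT_READY -> still pending, so the queue is not empty
+        //   // ZE_RESULT_SUCCESS   -> this event has signalled
+        //   // anything else       -> propagated as a UR error
+        //
+        // Iterating from crbegin() visits the most recently submitted (least
+        // likely completed) events first, so the loop can bail out early.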
+ for (const auto &ImmCmdList : QueueGroup.second.ImmCmdLists) { + if (ImmCmdList == Queue->CommandListMap.end()) + continue; + + auto EventList = ImmCmdList->second.EventList; + for (auto It = EventList.crbegin(); It != EventList.crend(); It++) { + ze_result_t ZeResult = + ZE_CALL_NOCHECK(zeEventQueryStatus, ((*It)->ZeEvent)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + } + } else { + for (const auto &ZeQueue : QueueGroup.second.ZeQueues) { + if (!ZeQueue) + continue; + // Provide 0 as the timeout parameter to immediately get the status + // of the Level Zero queue. + ze_result_t ZeResult = ZE_CALL_NOCHECK(zeCommandQueueSynchronize, + (ZeQueue, /* timeout */ 0)); + if (ZeResult == ZE_RESULT_NOT_READY) { + return ReturnValue(false); + } else if (ZeResult != ZE_RESULT_SUCCESS) { + return ze2urResult(ZeResult); + } + } + } + } + } + return ReturnValue(true); + } + default: + urPrint("Unsupported ParamName in urQueueGetInfo: ParamName=%d(0x%x)\n", + ParamName, ParamName); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +// Controls if we should choose doing eager initialization +// to make it happen on warmup paths and have the reportable +// paths be less likely affected. +// +static bool doEagerInit = [] { + const char *EagerInit = std::getenv("SYCL_EAGER_INIT"); + return EagerInit ? std::atoi(EagerInit) != 0 : false; +}(); + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_device_handle_t hDevice, ///< [in] handle of the device object + const ur_queue_property_t + *pProps, ///< [in] specifies a list of queue properties and their + ///< corresponding values. Each property name is immediately + ///< followed by the corresponding desired value. The list is + ///< terminated with a 0. If a property value is not specified, + ///< then its default value will be used. + ur_queue_handle_t + *phQueue ///< [out] pointer to handle of queue object created +) { + ur_context_handle_t Context = hContext; + ur_device_handle_t Device = hDevice; + ur_queue_handle_t_ **Queue = reinterpret_cast(phQueue); + + Context->Devices[0] = Device; + + const pi_queue_properties *Properties = + reinterpret_cast(pProps); + pi_queue_properties Flags = Properties[1]; + + auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX + ? static_cast(Properties[3]) + : -1; // Use default/round-robin. + + UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); + + // Create placeholder queues in the compute queue group. + // Actual L0 queues will be created at first use. + std::vector ZeComputeCommandQueues( + Device->QueueGroup[ur_queue_handle_t_::queue_type::Compute] + .ZeProperties.numQueues, + nullptr); + + // Create placeholder queues in the copy queue group (main and link + // native groups are combined into one group). + // Actual L0 queues will be created at first use. 
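+  // [Illustrative sketch, not part of this patch] "Created at first use"
+  // means the ze_command_queue_handle_t slots stay nullptr until a command
+  // is actually submitted, at which point something like
+  //
+  //   ze_command_queue_desc_t ZeCommandQueueDesc = {};
+  //   ZeCommandQueueDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC;
+  //   ZeCommandQueueDesc.ordinal = QueueGroupOrdinal;
+  //   ZeCommandQueueDesc.index = QueueIndex;
+  //   zeCommandQueueCreate(Context->ZeContext, Device->ZeDevice,
+  //                        &ZeCommandQueueDesc, &ZeQueue);
+  //
+  // fills in the slot (see pi_queue_group_t::getZeQueue used elsewhere in
+  // this file). The copy-engine placeholders set up below follow the same
+  // pattern.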
+ size_t NumCopyGroups = 0; + if (Device->hasMainCopyEngine()) { + NumCopyGroups += + Device->QueueGroup[ur_queue_handle_t_::queue_type::MainCopy] + .ZeProperties.numQueues; + } + if (Device->hasLinkCopyEngine()) { + NumCopyGroups += + Device->QueueGroup[ur_queue_handle_t_::queue_type::LinkCopy] + .ZeProperties.numQueues; + } + std::vector ZeCopyCommandQueues(NumCopyGroups, + nullptr); + + try { + *Queue = + new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Flags, ForceComputeIndex); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + // Do eager initialization of Level Zero handles on request. + if (doEagerInit) { + ur_queue_handle_t Q = *phQueue; + // Creates said number of command-lists. + auto warmupQueueGroup = [Q](bool UseCopyEngine, + uint32_t RepeatCount) -> ur_result_t { + ur_command_list_ptr_t CommandList; + while (RepeatCount--) { + if (Q->Device->ImmCommandListUsed) { + CommandList = Q->getQueueGroup(UseCopyEngine).getImmCmdList(); + } else { + // Heuristically create some number of regular command-list to reuse. + for (int I = 0; I < 10; ++I) { + UR_CALL(Q->createCommandList(UseCopyEngine, CommandList)); + // Immediately return them to the cache of available command-lists. + std::vector EventsUnused; + UR_CALL(Q->resetCommandList(CommandList, true /* MakeAvailable */, + EventsUnused)); + } + } + } + return UR_RESULT_SUCCESS; + }; + // Create as many command-lists as there are queues in the group. + // With this the underlying round-robin logic would initialize all + // native queues, and create command-lists and their fences. + // At this point only the thread creating the queue will have associated + // command-lists. Other threads have not accessed the queue yet. So we can + // only warmup the initial thread's command-lists. + auto QueueGroup = Q->ComputeQueueGroupsByTID.get(); + UR_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex - + QueueGroup.LowerIndex + 1)); + if (Q->useCopyEngine()) { + auto QueueGroup = Q->CopyQueueGroupsByTID.get(); + UR_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex - + QueueGroup.LowerIndex + 1)); + } + // TODO: warmup event pools. Both host-visible and device-only. + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain( + ur_queue_handle_t Queue ///< [in] handle of the queue object to get access +) { + { + std::scoped_lock Lock(Queue->Mutex); + Queue->RefCountExternal++; + } + Queue->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( + ur_queue_handle_t Queue ///< [in] handle of the queue object to release +) { + + std::vector EventListToCleanup; + { + std::scoped_lock Lock(Queue->Mutex); + + if ((--Queue->RefCountExternal) != 0) + return UR_RESULT_SUCCESS; + + // When external reference count goes to zero it is still possible + // that internal references still exists, e.g. command-lists that + // are not yet completed. So do full queue synchronization here + // and perform proper cleanup. + // + // It is possible to get to here and still have an open command list + // if no wait or finish ever occurred for this queue. + if (auto Res = Queue->executeAllOpenCommandLists()) + return Res; + + // Make sure all commands get executed. + Queue->synchronize(); + + // Destroy all the fences created associated with this queue. 
+ for (auto it = Queue->CommandListMap.begin(); + it != Queue->CommandListMap.end(); ++it) { + // This fence wasn't yet signalled when we polled it for recycling + // the command-list, so need to release the command-list too. + // For immediate commandlists we don't need to do an L0 reset of the + // commandlist but do need to do event cleanup which is also in the + // resetCommandList function. + // If the fence is a nullptr we are using immediate commandlists, + // otherwise regular commandlists which use a fence. + if (it->second.ZeFence == nullptr || it->second.ZeFenceInUse) { + Queue->resetCommandList(it, true, EventListToCleanup); + } + // TODO: remove "if" when the problem is fixed in the level zero + // runtime. Destroy only if a queue is healthy. Destroying a fence may + // cause a hang otherwise. + // If the fence is a nullptr we are using immediate commandlists. + if (Queue->Healthy && it->second.ZeFence != nullptr) { + auto ZeResult = ZE_CALL_NOCHECK(zeFenceDestroy, (it->second.ZeFence)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + Queue->CommandListMap.clear(); + } + + for (auto &Event : EventListToCleanup) { + // We don't need to synchronize the events since the queue + // synchronized above already does that. + { + std::scoped_lock EventLock(Event->Mutex); + Event->Completed = true; + } + UR_CALL(CleanupCompletedEvent(Event)); + // This event was removed from the command list, so decrement ref count + // (it was incremented when they were added to the command list). + UR_CALL(urEventReleaseInternal(reinterpret_cast(Event))); + } + UR_CALL(urQueueReleaseInternal(reinterpret_cast(Queue))); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( + ur_queue_handle_t Queue, ///< [in] handle of the queue. + ur_native_handle_t + *NativeQueue ///< [out] a pointer to the native handle of the queue. +) { + // Lock automatically releases when this goes out of scope. + std::shared_lock lock(Queue->Mutex); + + auto ZeQueue = ur_cast(NativeQueue); + + // Extract a Level Zero compute queue handle from the given PI queue + auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); + uint32_t QueueGroupOrdinalUnused; + *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_queue_handle_t + *RetQueue ///< [out] pointer to the handle of the queue object created. +) { + auto ZeQueue = ur_cast(NativeQueue); + // Assume this is the "0" index queue in the compute command-group. + std::vector ZeQueues{ZeQueue}; + + // TODO: see what we can do to correctly initialize PI queue for + // compute vs. copy Level-Zero queue. Currently we will send + // all commands to the "ZeQueue". + std::vector ZeroCopyQueues; + + // Get the device handle from first device in the platform + // Maybe this is not completely correct. 
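+  // [Illustrative sketch, not part of this patch] The interop round trip that
+  // exercises this entry point looks roughly like:
+  //
+  //   ur_native_handle_t Native{};
+  //   urQueueGetNativeHandle(UrQueue, &Native); // a ze_command_queue_handle_t
+  //   ur_queue_handle_t Wrapped{};
+  //   urQueueCreateWithNativeHandle(Native, UrContext, &Wrapped);
+  //
+  // Only the raw Level Zero queue handle is available here, so the device is
+  // recovered by taking the first GPU of the first platform below, which is
+  // why the comment above flags this as possibly incorrect on multi-device
+  // systems.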
+ uint32_t NumEntries = 1; + ur_platform_handle_t Platform{}; + UR_CALL(urPlatformGet(NumEntries, &Platform, nullptr)); + + ur_device_handle_t Device; + UR_CALL( + urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &Device, nullptr)); + + try { + ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, + Context, Device, false); + *RetQueue = reinterpret_cast(Queue); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( + ur_queue_handle_t hQueue ///< [in] handle of the queue to be finished. +) { + // _pi_queue *PiQueue = reinterpret_cast<_pi_queue *>(Queue); + // ur_queue_handle_t UrQueue = PiQueue->UrQueue; + ur_queue_handle_t_ *UrQueue = reinterpret_cast(hQueue); + + if (UrQueue->Device->ImmCommandListUsed) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock(UrQueue->Mutex); + + UrQueue->synchronize(); + } else { + std::unique_lock Lock(UrQueue->Mutex); + std::vector ZeQueues; + + // execute any command list that may still be open. + UR_CALL(UrQueue->executeAllOpenCommandLists()); + + // Make a copy of queues to sync and release the lock. + for (auto &QueueMap : + {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + std::copy(QueueGroup.second.ZeQueues.begin(), + QueueGroup.second.ZeQueues.end(), + std::back_inserter(ZeQueues)); + + // Remember the last command's event. + auto LastCommandEvent = UrQueue->LastCommandEvent; + + // Don't hold a lock to the queue's mutex while waiting. + // This allows continue working with the queue from other threads. + // TODO: this currently exhibits some issues in the driver, so + // we control this with an env var. Remove this control when + // we settle one way or the other. + static bool HoldLock = + std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr; + if (!HoldLock) { + Lock.unlock(); + } + + for (auto &ZeQueue : ZeQueues) { + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } + + // Prevent unneeded already finished events to show up in the wait list. + // We can only do so if nothing else was submitted to the queue + // while we were synchronizing it. + if (!HoldLock) { + std::scoped_lock Lock(UrQueue->Mutex); + if (LastCommandEvent == UrQueue->LastCommandEvent) { + UrQueue->LastCommandEvent = nullptr; + } + } else { + UrQueue->LastCommandEvent = nullptr; + } + } + // Reset signalled command lists and return them back to the cache of + // available command lists. Events in the immediate command lists are cleaned + // up in synchronize(). + if (!UrQueue->Device->ImmCommandListUsed) { + std::unique_lock Lock(UrQueue->Mutex); + resetCommandLists(UrQueue); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush( + ur_queue_handle_t Queue ///< [in] handle of the queue to be flushed. +) { + // Flushing cross-queue dependencies is covered by + // createAndRetainUrZeEventList, so this can be left as a no-op. + std::ignore = Queue; + return UR_RESULT_SUCCESS; +} + +// Configuration of the command-list batching. +struct zeCommandListBatchConfig { + // Default value of 0. This specifies to use dynamic batch size adjustment. + // Other values will try to collect specified amount of commands. + uint32_t Size{0}; + + // If doing dynamic batching, specifies start batch size. 
+  uint32_t DynamicSizeStart{4};
+
+  // The maximum size for dynamic batch.
+  uint32_t DynamicSizeMax{64};
+
+  // The step size for dynamic batch increases.
+  uint32_t DynamicSizeStep{1};
+
+  // Thresholds for when to increase the batch size (number of closed early is
+  // small and number of closed full is high).
+  uint32_t NumTimesClosedEarlyThreshold{3};
+  uint32_t NumTimesClosedFullThreshold{8};
+
+  // Tells the starting size of a batch.
+  uint32_t startSize() const { return Size > 0 ? Size : DynamicSizeStart; }
+  // Tells if we are doing dynamic batch size adjustment.
+  bool dynamic() const { return Size == 0; }
+};
+
+// Helper function to initialize the static variables that hold batch config
+// info for compute and copy command batching.
+static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) {
+  zeCommandListBatchConfig Config{}; // default initialize
+
+  // Default value of 0. This specifies to use dynamic batch size adjustment.
+  const auto BatchSizeStr =
+      (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE")
+               : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE");
+  if (BatchSizeStr) {
+    pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr);
+    // Level Zero may only support a limited number of commands per command
+    // list. The actual upper limit is not specified by the Level Zero
+    // Specification. For now we allow an arbitrary upper limit.
+    if (BatchSizeStrVal > 0) {
+      Config.Size = BatchSizeStrVal;
+    } else if (BatchSizeStrVal == 0) {
+      Config.Size = 0;
+      // We are requested to do dynamic batching. Collect specifics, if any.
+      // The extended format supported is ":" separated values.
+      //
+      // NOTE: these extra settings are experimental and are intended to
+      // be used only for finding a better default heuristic.
+      //
+      std::string BatchConfig(BatchSizeStr);
+      size_t Ord = 0;
+      size_t Pos = 0;
+      while (true) {
+        if (++Ord > 5)
+          break;
+
+        Pos = BatchConfig.find(":", Pos);
+        if (Pos == std::string::npos)
+          break;
+        ++Pos; // past the ":"
+
+        uint32_t Val;
+        try {
+          Val = std::stoi(BatchConfig.substr(Pos));
+        } catch (...) {
+          if (IsCopy)
+            urPrint(
+                "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n");
+          else
+            urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n");
+          break;
+        }
+        switch (Ord) {
+        case 1:
+          Config.DynamicSizeStart = Val;
+          break;
+        case 2:
+          Config.DynamicSizeMax = Val;
+          break;
+        case 3:
+          Config.DynamicSizeStep = Val;
+          break;
+        case 4:
+          Config.NumTimesClosedEarlyThreshold = Val;
+          break;
+        case 5:
+          Config.NumTimesClosedFullThreshold = Val;
+          break;
+        default:
+          die("Unexpected batch config");
+        }
+        if (IsCopy)
+          urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param "
+                  "#%d: %d\n",
+                  (int)Ord, (int)Val);
+        else
+          urPrint(
+              "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n",
+              (int)Ord, (int)Val);
+      }
+
+    } else {
+      // Negative batch sizes are silently ignored.
+      if (IsCopy)
+        urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n");
+      else
+        urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n");
+    }
+  }
+  return Config;
+}
+
+// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in
+// which case all compute commands will be submitted to the command-queue
+// with the given index in the compute command group. If it is instead set
+// to negative then all available compute engines may be used.
+//
+// The default value is "0".
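+// [Editorial note, not part of this patch] Example settings and their effect
+// as implemented below:
+//
+//   SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE=2    -> range (2, 2): CCS 2 only
+//   SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE=-1   -> range (0, INT_MAX): allow
+//                                                 round-robin over all CCSs
+//   (unset)                                    -> range (0, 0): CCS 0 only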
+// +static const std::pair getRangeOfAllowedComputeEngines() { + static const char *EnvVar = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + // If the environment variable is not set only use "0" CCS for now. + // TODO: allow all CCSs when HW support is complete. + if (!EnvVar) + return std::pair(0, 0); + + auto EnvVarValue = std::atoi(EnvVar); + if (EnvVarValue >= 0) { + return std::pair(EnvVarValue, EnvVarValue); + } + + return std::pair(0, INT_MAX); +} + +// Static variable that holds batch config info for compute command batching. +static const zeCommandListBatchConfig ZeCommandListBatchComputeConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{false}); +}(); + +// Static variable that holds batch config info for copy command batching. +static const zeCommandListBatchConfig ZeCommandListBatchCopyConfig = [] { + using IsCopy = bool; + return ZeCommandListBatchConfig(IsCopy{true}); +}(); + +ur_queue_handle_t_::ur_queue_handle_t_( + std::vector &ComputeQueues, + std::vector &CopyQueues, + ur_context_handle_t Context, ur_device_handle_t Device, + bool OwnZeCommandQueue, pi_queue_properties Properties, + int ForceComputeIndex) + : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, + Properties(Properties) { + // Compute group initialization. + // First, see if the queue's device allows for round-robin or it is + // fixed to one particular compute CCS (it is so for sub-sub-devices). + auto &ComputeQueueGroupInfo = Device->QueueGroup[queue_type::Compute]; + pi_queue_group_t ComputeQueueGroup{reinterpret_cast(this), + queue_type::Compute}; + ComputeQueueGroup.ZeQueues = ComputeQueues; + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + if (Device->ImmCommandListUsed) { + ComputeQueueGroup.ImmCmdLists = std::vector( + ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + if (ComputeQueueGroupInfo.ZeIndex >= 0) { + // Sub-sub-device + + // sycl::ext::intel::property::queue::compute_index works with any + // backend/device by allowing single zero index if multiple compute CCSes + // are not supported. Sub-sub-device falls into the same bucket. + assert(ForceComputeIndex <= 0); + ComputeQueueGroup.LowerIndex = ComputeQueueGroupInfo.ZeIndex; + ComputeQueueGroup.UpperIndex = ComputeQueueGroupInfo.ZeIndex; + ComputeQueueGroup.NextIndex = ComputeQueueGroupInfo.ZeIndex; + } else if (ForceComputeIndex >= 0) { + ComputeQueueGroup.LowerIndex = ForceComputeIndex; + ComputeQueueGroup.UpperIndex = ForceComputeIndex; + ComputeQueueGroup.NextIndex = ForceComputeIndex; + } else { + // Set-up to round-robin across allowed range of engines. + uint32_t FilterLowerIndex = getRangeOfAllowedComputeEngines().first; + uint32_t FilterUpperIndex = getRangeOfAllowedComputeEngines().second; + FilterUpperIndex = std::min((size_t)FilterUpperIndex, + FilterLowerIndex + ComputeQueues.size() - 1); + if (FilterLowerIndex <= FilterUpperIndex) { + ComputeQueueGroup.LowerIndex = FilterLowerIndex; + ComputeQueueGroup.UpperIndex = FilterUpperIndex; + ComputeQueueGroup.NextIndex = ComputeQueueGroup.LowerIndex; + } else { + die("No compute queue available/allowed."); + } + } + if (Device->ImmCommandListUsed) { + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + ComputeQueueGroup.ImmCmdLists = std::vector( + ComputeQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + + ComputeQueueGroupsByTID.set(ComputeQueueGroup); + + // Copy group initialization. 
+ pi_queue_group_t CopyQueueGroup{reinterpret_cast(this), + queue_type::MainCopy}; + const auto &Range = getRangeOfAllowedCopyEngines((ur_device_handle_t)Device); + if (Range.first < 0 || Range.second < 0) { + // We are asked not to use copy engines, just do nothing. + // Leave CopyQueueGroup.ZeQueues empty, and it won't be used. + } else { + uint32_t FilterLowerIndex = Range.first; + uint32_t FilterUpperIndex = Range.second; + FilterUpperIndex = std::min((size_t)FilterUpperIndex, + FilterLowerIndex + CopyQueues.size() - 1); + if (FilterLowerIndex <= FilterUpperIndex) { + CopyQueueGroup.ZeQueues = CopyQueues; + CopyQueueGroup.LowerIndex = FilterLowerIndex; + CopyQueueGroup.UpperIndex = FilterUpperIndex; + CopyQueueGroup.NextIndex = CopyQueueGroup.LowerIndex; + // Create space to hold immediate commandlists corresponding to the + // ZeQueues + if (Device->ImmCommandListUsed) { + CopyQueueGroup.ImmCmdLists = std::vector( + CopyQueueGroup.ZeQueues.size(), CommandListMap.end()); + } + } + } + CopyQueueGroupsByTID.set(CopyQueueGroup); + + // Initialize compute/copy command batches. + ComputeCommandBatch.OpenCommandList = CommandListMap.end(); + CopyCommandBatch.OpenCommandList = CommandListMap.end(); + ComputeCommandBatch.QueueBatchSize = + ZeCommandListBatchComputeConfig.startSize(); + CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); +} + +void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + uint32_t &QueueBatchSize = CommandBatch.QueueBatchSize; + // QueueBatchSize of 0 means never allow batching. + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) + return; + CommandBatch.NumTimesClosedFull += 1; + + // If the number of times the list has been closed early is low, and + // the number of times it has been closed full is high, then raise + // the batching size slowly. Don't raise it if it is already pretty + // high. + if (CommandBatch.NumTimesClosedEarly <= + ZeCommandListBatchConfig.NumTimesClosedEarlyThreshold && + CommandBatch.NumTimesClosedFull > + ZeCommandListBatchConfig.NumTimesClosedFullThreshold) { + if (QueueBatchSize < ZeCommandListBatchConfig.DynamicSizeMax) { + QueueBatchSize += ZeCommandListBatchConfig.DynamicSizeStep; + urPrint("Raising QueueBatchSize to %d\n", QueueBatchSize); + } + CommandBatch.NumTimesClosedEarly = 0; + CommandBatch.NumTimesClosedFull = 0; + } +} + +void ur_queue_handle_t_::adjustBatchSizeForPartialBatch(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + auto &ZeCommandListBatchConfig = + IsCopy ? ZeCommandListBatchCopyConfig : ZeCommandListBatchComputeConfig; + uint32_t &QueueBatchSize = CommandBatch.QueueBatchSize; + // QueueBatchSize of 0 means never allow batching. + if (QueueBatchSize == 0 || !ZeCommandListBatchConfig.dynamic()) + return; + CommandBatch.NumTimesClosedEarly += 1; + + // If we are closing early more than about 3x the number of times + // it is closing full, lower the batch size to the value of the + // current open command list. This is trying to quickly get to a + // batch size that will be able to be closed full at least once + // in a while. 
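+  // [Worked example, not part of this patch] With NumTimesClosedFull == 2 the
+  // early-close counter must exceed (2 + 1) * 3 == 9 before the size is
+  // lowered; if the open command list holds 5 commands at that point,
+  // QueueBatchSize becomes 5 - 1 == 4 (clamped to at least 1) and both
+  // counters are reset.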
+  if (CommandBatch.NumTimesClosedEarly >
+      (CommandBatch.NumTimesClosedFull + 1) * 3) {
+    QueueBatchSize = CommandBatch.OpenCommandList->second.size() - 1;
+    if (QueueBatchSize < 1)
+      QueueBatchSize = 1;
+    urPrint("Lowering QueueBatchSize to %d\n", QueueBatchSize);
+    CommandBatch.NumTimesClosedEarly = 0;
+    CommandBatch.NumTimesClosedFull = 0;
+  }
+}
+
+ur_result_t
+ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
+                                       bool IsBlocking, bool OKToBatchCommand) {
+  // Do nothing if command list is already closed.
+  if (CommandList->second.IsClosed)
+    return UR_RESULT_SUCCESS;
+
+  bool UseCopyEngine =
+      CommandList->second.isCopy(reinterpret_cast<ur_queue_handle_t>(this));
+
+  // If the current LastCommandEvent is nullptr, then it means either that no
+  // command has ever been issued to the queue, or that the LastCommandEvent
+  // has been signalled and therefore this Queue is idle.
+  //
+  // NOTE: this behavior adds some flakiness to the batching
+  // since last command's event may or may not be completed by the
+  // time we get here depending on timings and system/gpu load.
+  // So, disable it for modes where we print PI traces. Printing
+  // traces incurs much different timings than real execution
+  // anyway, and many regression tests use it.
+  //
+  bool CurrentlyEmpty = !PrintTrace && this->LastCommandEvent == nullptr;
+
+  // The list can be empty if the command-list only contains signals of proxy
+  // events. It is possible that executeCommandList is called twice for the
+  // same command list without a new appended command. We don't want to
+  // process the same last command event twice; that's why we additionally
+  // check that a new command was appended to the command list.
+  if (!CommandList->second.EventList.empty() &&
+      this->LastCommandEvent != CommandList->second.EventList.back()) {
+    this->LastCommandEvent = CommandList->second.EventList.back();
+    if (doReuseDiscardedEvents()) {
+      UR_CALL(resetDiscardedEvent(CommandList));
+    }
+  }
+
+  this->LastUsedCommandList = CommandList;
+
+  if (!Device->ImmCommandListUsed) {
+    // Batch if allowed to, but don't batch if we know there are no kernels
+    // from this queue that are currently executing. This is intended to get
+    // kernels started as soon as possible when there are no kernels from this
+    // queue awaiting execution, while allowing batching to occur when there
+    // are kernels already executing. Also, if we are using fixed size
+    // batching, as indicated by !ZeCommandListBatch.dynamic(), then just
+    // ignore CurrentlyEmpty as we want to strictly follow the batching the
+    // user specified.
+    auto &CommandBatch = UseCopyEngine ? CopyCommandBatch : ComputeCommandBatch;
+    auto &ZeCommandListBatchConfig = UseCopyEngine
+                                         ?
ZeCommandListBatchCopyConfig + : ZeCommandListBatchComputeConfig; + if (OKToBatchCommand && this->isBatchingAllowed(UseCopyEngine) && + (!ZeCommandListBatchConfig.dynamic() || !CurrentlyEmpty)) { + + if (hasOpenCommandList(UseCopyEngine) && + CommandBatch.OpenCommandList != CommandList) + die("executeCommandList: OpenCommandList should be equal to" + "null or CommandList"); + + if (CommandList->second.size() < CommandBatch.QueueBatchSize) { + CommandBatch.OpenCommandList = CommandList; + return UR_RESULT_SUCCESS; + } + + adjustBatchSizeForFullBatch(UseCopyEngine); + CommandBatch.OpenCommandList = CommandListMap.end(); + } + } + + auto &ZeCommandQueue = CommandList->second.ZeQueue; + // Scope of the lock must be till the end of the function, otherwise new mem + // allocs can be created between the moment when we made a snapshot and the + // moment when command list is closed and executed. But mutex is locked only + // if indirect access tracking enabled, because std::defer_lock is used. + // unique_lock destructor at the end of the function will unlock the mutex + // if it was locked (which happens only if IndirectAccessTrackingEnabled is + // true). + std::unique_lock ContextsLock( + Device->Platform->ContextsMutex, std::defer_lock); + + if (IndirectAccessTrackingEnabled) { + // We are going to submit kernels for execution. If indirect access flag is + // set for a kernel then we need to make a snapshot of existing memory + // allocations in all contexts in the platform. We need to lock the mutex + // guarding the list of contexts in the platform to prevent creation of new + // memory alocations in any context before we submit the kernel for + // execution. + ContextsLock.lock(); + CaptureIndirectAccesses(); + } + + if (!Device->ImmCommandListUsed) { + // In this mode all inner-batch events have device visibility only, + // and we want the last command in the batch to signal a host-visible + // event that anybody waiting for any event in the batch will + // really be using. + // We need to create a proxy host-visible event only if the list of events + // in the command list is not empty, otherwise we are going to just create + // and remove proxy event right away and dereference deleted object + // afterwards. + if (Device->ZeEventsScope == LastCommandInBatchHostVisible && + !CommandList->second.EventList.empty()) { + // If there are only internal events in the command list then we don't + // need to create host proxy event. + auto Result = std::find_if( + CommandList->second.EventList.begin(), + CommandList->second.EventList.end(), + [](ur_event_handle_t E) { return E->hasExternalRefs(); }); + if (Result != CommandList->second.EventList.end()) { + // Create a "proxy" host-visible event. + // + ur_event_handle_t HostVisibleEvent; + auto Res = createEventAndAssociateQueue( + reinterpret_cast(this), &HostVisibleEvent, + UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ true); + if (Res) + return Res; + + // Update each command's event in the command-list to "see" this + // proxy event as a host-visible counterpart. + for (auto &Event : CommandList->second.EventList) { + std::scoped_lock EventLock(Event->Mutex); + // Internal event doesn't need host-visible proxy. 
+ if (!Event->hasExternalRefs()) + continue; + + if (!Event->HostVisibleEvent) { + Event->HostVisibleEvent = + reinterpret_cast(HostVisibleEvent); + HostVisibleEvent->RefCount.increment(); + } + } + + // Decrement the reference count of the event such that all the + // remaining references are from the other commands in this batch and + // from the command-list itself. This host-visible event will not be + // waited/released by SYCL RT, so it must be destroyed after all events + // in the batch are gone. We know that refcount is more than 2 because + // we check that EventList of the command list is not empty above, i.e. + // after createEventAndAssociateQueue ref count is 2 and then +1 for + // each event in the EventList. + UR_CALL(urEventReleaseInternal(HostVisibleEvent)); + + if (doReuseDiscardedEvents()) { + // If we have in-order queue with discarded events then we want to + // treat this event as regular event. We insert a barrier in the next + // command list to wait for this event. + LastCommandEvent = HostVisibleEvent; + } else { + // For all other queues treat this as a special event and indicate no + // cleanup is needed. + // TODO: always treat this host event as a regular event. + UR_CALL(urEventReleaseInternal(HostVisibleEvent)); + HostVisibleEvent->CleanedUp = true; + } + + // Finally set to signal the host-visible event at the end of the + // command-list after a barrier that waits for all commands + // completion. + if (doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded) { + // If we the last event is discarded then we already have a barrier + // inserted, so just signal the event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, HostVisibleEvent->ZeEvent)); + } else { + ZE2UR_CALL( + zeCommandListAppendBarrier, + (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); + } + } else { + // If we don't have host visible proxy then signal event if needed. + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); + } + } else { + // If we don't have host visible proxy then signal event if needed. + this->signalEventFromCmdListIfLastEventDiscarded(CommandList); + } + + // Close the command list and have it ready for dispatch. + ZE2UR_CALL(zeCommandListClose, (CommandList->first)); + // Mark this command list as closed. + CommandList->second.IsClosed = true; + this->LastUsedCommandList = CommandListMap.end(); + // Offload command list to the GPU for asynchronous execution + auto ZeCommandList = CommandList->first; + auto ZeResult = ZE_CALL_NOCHECK( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &ZeCommandList, CommandList->second.ZeFence)); + if (ZeResult != ZE_RESULT_SUCCESS) { + this->Healthy = false; + if (ZeResult == ZE_RESULT_ERROR_UNKNOWN) { + // Turn into a more informative end-user error. + return UR_RESULT_ERROR_UNKNOWN; + } + return ze2urResult(ZeResult); + } + } + + // Check global control to make every command blocking for debugging. + if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { + if (Device->ImmCommandListUsed) { + synchronize(); + } else { + // Wait until command lists attached to the command queue are executed. 
+ ZE2UR_CALL(zeHostSynchronize, (ZeCommandQueue)); + } + } + return UR_RESULT_SUCCESS; +} + +bool ur_queue_handle_t_::doReuseDiscardedEvents() { + return ReuseDiscardedEvents && isInOrderQueue() && isDiscardEvents(); +} + +ur_result_t +ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { + if (LastCommandEvent && LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + ZE2UR_CALL(zeCommandListAppendEventReset, + (CommandList->first, LastCommandEvent->ZeEvent)); + + // Create new pi_event but with the same ze_event_handle_t. We are going + // to use this pi_event for the next command with discarded event. + ur_event_handle_t_ *PiEvent; + try { + PiEvent = new ur_event_handle_t_( + LastCommandEvent->ZeEvent, LastCommandEvent->ZeEventPool, + reinterpret_cast(Context), + UR_EXT_COMMAND_TYPE_USER, true); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (LastCommandEvent->isHostVisible()) + PiEvent->HostVisibleEvent = reinterpret_cast(PiEvent); + + UR_CALL(addEventToQueueCache(reinterpret_cast(PiEvent))); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { + auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; + Cache->emplace_back(Event); + return UR_RESULT_SUCCESS; +} + +void ur_queue_handle_t_::active_barriers::add(ur_event_handle_t &Event) { + Event->RefCount.increment(); + Events.push_back(Event); +} + +ur_result_t ur_queue_handle_t_::active_barriers::clear() { + for (const auto &Event : Events) + UR_CALL(urEventReleaseInternal(Event)); + Events.clear(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); + + if (!UrQueue->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + for (auto &Cache : UrQueue->EventCaches) + for (auto &Event : Cache) + UR_CALL(urEventReleaseInternal(Event)); + + if (UrQueue->OwnZeCommandQueue) { + for (auto &QueueMap : + {UrQueue->ComputeQueueGroupsByTID, UrQueue->CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) { + auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } + } + + urPrint("urQueueRelease(compute) NumTimesClosedFull %d, " + "NumTimesClosedEarly %d\n", + UrQueue->ComputeCommandBatch.NumTimesClosedFull, + UrQueue->ComputeCommandBatch.NumTimesClosedEarly); + urPrint("urQueueRelease(copy) NumTimesClosedFull %d, NumTimesClosedEarly " + "%d\n", + UrQueue->CopyCommandBatch.NumTimesClosedFull, + UrQueue->CopyCommandBatch.NumTimesClosedEarly); + + delete UrQueue; + + return UR_RESULT_SUCCESS; +} + +bool ur_queue_handle_t_::isBatchingAllowed(bool IsCopy) const { + auto &CommandBatch = IsCopy ? 
CopyCommandBatch : ComputeCommandBatch; + return (CommandBatch.QueueBatchSize > 0 && + ((UrL0Serialize & UrL0SerializeBlock) == 0)); +} + +bool ur_queue_handle_t_::isDiscardEvents() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); +} + +bool ur_queue_handle_t_::isPriorityLow() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); +} + +bool ur_queue_handle_t_::isPriorityHigh() const { + return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); +} + +bool ur_queue_handle_t_::isInOrderQueue() const { + // If out-of-order queue property is not set, then this is a in-order queue. + return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == + 0); +} + +// Helper function to perform the necessary cleanup of the events from reset cmd +// list. +ur_result_t CleanupEventListFromResetCmdList( + std::vector &EventListToCleanup, bool QueueLocked) { + for (auto &Event : EventListToCleanup) { + // We don't need to synchronize the events since the fence associated with + // the command list was synchronized. + { + std::scoped_lock EventLock(Event->Mutex); + Event->Completed = true; + } + UR_CALL(CleanupCompletedEvent(Event, QueueLocked)); + // This event was removed from the command list, so decrement ref count + // (it was incremented when they were added to the command list). + UR_CALL(urEventReleaseInternal(Event)); + } + return UR_RESULT_SUCCESS; +} + +// Wait on all operations in flight on this Queue. +// The caller is expected to hold a lock on the Queue. +// For standard commandlists sync the L0 queues directly. +// For immediate commandlists add barriers to all commandlists associated +// with the Queue. An alternative approach would be to wait on all Events +// associated with the in-flight operations. +// TODO: Event release in immediate commandlist mode is driven by the SYCL +// runtime. Need to investigate whether relase can be done earlier, at sync +// points such as this, to reduce total number of active Events. +ur_result_t ur_queue_handle_t_::synchronize() { + if (!Healthy) + return UR_RESULT_SUCCESS; + + auto syncImmCmdList = [](ur_queue_handle_t_ *Queue, + ur_command_list_ptr_t ImmCmdList) { + if (ImmCmdList == Queue->CommandListMap.end()) + return UR_RESULT_SUCCESS; + + ur_event_handle_t Event{}; + ur_result_t Res = createEventAndAssociateQueue( + reinterpret_cast(Queue), &Event, + UR_EXT_COMMAND_TYPE_USER, ImmCmdList, /* IsInternal */ false); + if (Res != UR_RESULT_SUCCESS) + return Res; + auto zeEvent = Event->ZeEvent; + ZE2UR_CALL(zeCommandListAppendBarrier, + (ImmCmdList->first, zeEvent, 0, nullptr)); + ZE2UR_CALL(zeHostSynchronize, (zeEvent)); + Event->Completed = true; + UR_CALL(urEventRelease(Event)); + // Cleanup all events from the synced command list. + auto EventListToCleanup = std::move(ImmCmdList->second.EventList); + ImmCmdList->second.EventList.clear(); + CleanupEventListFromResetCmdList(EventListToCleanup, true); + return UR_RESULT_SUCCESS; + }; + + for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } + } + LastCommandEvent = nullptr; + + // With the entire queue synchronized, the active barriers must be done so we + // can remove them. 
+ if (auto Res = ActiveBarriers.clear()) + return Res; + + return UR_RESULT_SUCCESS; +} + +ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { + auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; + + // If we don't have any events, return nullptr. + // If we have only a single event then it was used by the last command and we + // can't use it now because we have to enforce round robin between two events. + if (Cache->size() < 2) + return nullptr; + + // If there are two events then return an event from the beginning of the list + // since event of the last command is added to the end of the list. + auto It = Cache->begin(); + ur_event_handle_t RetEvent = *It; + Cache->erase(It); + return RetEvent; +} + +// This helper function creates a pi_event and associate a pi_queue. +// Note that the caller of this function must have acquired lock on the Queue +// that is passed in. +// \param Queue pi_queue to associate with a new event. +// \param Event a pointer to hold the newly created pi_event +// \param CommandType various command type determined by the caller +// \param CommandList is the command list where the event is added +// \param IsInternal tells if the event is internal, i.e. visible in the L0 +// plugin only. +// \param HostVisible tells if the event must be created in the +// host-visible pool. If not set then this function will decide. +ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, + ur_event_handle_t *Event, + ur_command_t CommandType, + ur_command_list_ptr_t CommandList, + bool IsInternal, + std::optional HostVisible) { + + if (!HostVisible.has_value()) { + // Internal/discarded events do not need host-scope visibility. + HostVisible = + IsInternal ? false : Queue->Device->ZeEventsScope == AllHostVisible; + } + + // If event is discarded then try to get event from the queue cache. + *Event = + IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; + + if (*Event == nullptr) + UR_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); + + (*Event)->UrQueue = Queue; + (*Event)->CommandType = CommandType; + (*Event)->IsDiscarded = IsInternal; + (*Event)->CommandList = CommandList; + // Discarded event doesn't own ze_event, it is used by multiple pi_event + // objects. We destroy corresponding ze_event by releasing events from the + // events cache at queue destruction. Event in the cache owns the Level Zero + // event. + if (IsInternal) + (*Event)->OwnNativeHandle = false; + + // Append this Event to the CommandList, if any + if (CommandList != Queue->CommandListMap.end()) { + CommandList->second.append(*Event); + (*Event)->RefCount.increment(); + } + + // We need to increment the reference counter here to avoid pi_queue + // being released before the associated pi_event is released because + // piEventRelease requires access to the associated pi_queue. + // In piEventRelease, the reference counter of the Queue is decremented + // to release it. + Queue->RefCount.increment(); + + // SYCL RT does not track completion of the events, so it could + // release a PI event as soon as that's not being waited in the app. + // But we have to ensure that the event is not destroyed before + // it is really signalled, so retain it explicitly here and + // release in CleanupCompletedEvent(Event). + // If the event is internal then don't increment the reference count as this + // event will not be waited/released by SYCL RT, so it must be destroyed by + // EventRelease in resetCommandList. 
+ if (!IsInternal) + UR_CALL(urEventRetain(*Event)); + + return UR_RESULT_SUCCESS; +} + +void ur_queue_handle_t_::CaptureIndirectAccesses() { + for (auto &Kernel : KernelsToBeSubmitted) { + if (!Kernel->hasIndirectAccess()) + continue; + + auto &Contexts = Device->Platform->Contexts; + for (auto &Ctx : Contexts) { + for (auto &Elem : Ctx->MemAllocs) { + const auto &Pair = Kernel->MemAllocs.insert(&Elem); + // Kernel is referencing this memory allocation from now. + // If this memory allocation was already captured for this kernel, it + // means that kernel is submitted several times. Increase reference + // count only once because we release all allocations only when + // SubmissionsCount turns to 0. We don't want to know how many times + // allocation was retained by each submission. + if (Pair.second) + Elem.second.RefCount.increment(); + } + } + Kernel->SubmissionsCount++; + } + KernelsToBeSubmitted.clear(); +} + +ur_result_t ur_queue_handle_t_::signalEventFromCmdListIfLastEventDiscarded( + ur_command_list_ptr_t CommandList) { + // We signal new event at the end of command list only if we have queue with + // discard_events property and the last command event is discarded. + if (!(doReuseDiscardedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded)) + return UR_RESULT_SUCCESS; + + // NOTE: We create this "glue" event not as internal so it is not + // participating in the discarded events reset/reuse logic, but + // with no host-visibility since it is not going to be waited + // from the host. + ur_event_handle_t Event; + UR_CALL(createEventAndAssociateQueue( + reinterpret_cast(this), &Event, + UR_EXT_COMMAND_TYPE_USER, CommandList, + /* IsInternal */ false, /* HostVisible */ false)); + UR_CALL(urEventReleaseInternal(Event)); + LastCommandEvent = Event; + + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, Event->ZeEvent)); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::executeOpenCommandList(bool IsCopy) { + auto &CommandBatch = IsCopy ? CopyCommandBatch : ComputeCommandBatch; + // If there are any commands still in the open command list for this + // queue, then close and execute that command list now. + if (hasOpenCommandList(IsCopy)) { + adjustBatchSizeForPartialBatch(IsCopy); + auto Res = executeCommandList(CommandBatch.OpenCommandList, false, false); + CommandBatch.OpenCommandList = CommandListMap.end(); + return Res; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::resetCommandList( + ur_command_list_ptr_t CommandList, bool MakeAvailable, + std::vector &EventListToCleanup, bool CheckStatus) { + bool UseCopyEngine = CommandList->second.isCopy(this); + + // Immediate commandlists do not have an associated fence. + if (CommandList->second.ZeFence != nullptr) { + // Fence had been signalled meaning the associated command-list completed. + // Reset the fence and put the command list into a cache for reuse in PI + // calls. + ZE2UR_CALL(zeFenceReset, (CommandList->second.ZeFence)); + ZE2UR_CALL(zeCommandListReset, (CommandList->first)); + CommandList->second.ZeFenceInUse = false; + CommandList->second.IsClosed = false; + } + + auto &EventList = CommandList->second.EventList; + // Check if standard commandlist or fully synced in-order queue. + // If one of those conditions is met then we are sure that all events are + // completed so we don't need to check event status. 
+ if (!CheckStatus || CommandList->second.ZeFence != nullptr || + (isInOrderQueue() && !LastCommandEvent)) { + // Remember all the events in this command list which needs to be + // released/cleaned up and clear event list associated with command list. + std::move(std::begin(EventList), std::end(EventList), + std::back_inserter(EventListToCleanup)); + EventList.clear(); + } else if (!isDiscardEvents()) { + // For immediate commandlist reset only those events that have signalled. + // If events in the queue are discarded then we can't check their status. + for (auto it = EventList.begin(); it != EventList.end();) { + std::scoped_lock EventLock((*it)->Mutex); + ze_result_t ZeResult = + (*it)->Completed + ? ZE_RESULT_SUCCESS + : ZE_CALL_NOCHECK(zeEventQueryStatus, ((*it)->ZeEvent)); + // Break early as soon as we found first incomplete event because next + // events are submitted even later. We are not trying to find all + // completed events here because it may be costly. I.e. we are checking + // only elements which are most likely completed because they were + // submitted earlier. It is guaranteed that all events will be eventually + // cleaned up at queue sync/release. + if (ZeResult == ZE_RESULT_NOT_READY) + break; + + if (ZeResult != ZE_RESULT_SUCCESS) + return ze2urResult(ZeResult); + + EventListToCleanup.push_back(std::move((*it))); + it = EventList.erase(it); + } + } + + // Standard commandlists move in and out of the cache as they are recycled. + // Immediate commandlists are always available. + if (CommandList->second.ZeFence != nullptr && MakeAvailable) { + std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex); + auto &ZeCommandListCache = + UseCopyEngine + ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] + : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; + ZeCommandListCache.push_back(CommandList->first); + } + + return UR_RESULT_SUCCESS; +} + +bool pi_command_list_info_t::isCopy(ur_queue_handle_t Queue) const { + return ZeQueueGroupOrdinal != + (uint32_t)Queue->Device + ->QueueGroup + [ur_device_handle_t_::queue_group_info_t::type::Compute] + .ZeOrdinal; +} + +ur_command_list_ptr_t +ur_queue_handle_t_::eventOpenCommandList(ur_event_handle_t Event) { + using IsCopy = bool; + + if (Device->ImmCommandListUsed) { + // When using immediate commandlists there are no open command lists. + return CommandListMap.end(); + } + + if (hasOpenCommandList(IsCopy{false})) { + const auto &ComputeEventList = + ComputeCommandBatch.OpenCommandList->second.EventList; + if (std::find(ComputeEventList.begin(), ComputeEventList.end(), Event) != + ComputeEventList.end()) + return ComputeCommandBatch.OpenCommandList; + } + if (hasOpenCommandList(IsCopy{true})) { + const auto &CopyEventList = + CopyCommandBatch.OpenCommandList->second.EventList; + if (std::find(CopyEventList.begin(), CopyEventList.end(), Event) != + CopyEventList.end()) + return CopyCommandBatch.OpenCommandList; + } + return CommandListMap.end(); +} + +ur_queue_handle_t_::pi_queue_group_t & +ur_queue_handle_t_::getQueueGroup(bool UseCopyEngine) { + auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID); + return Map.get(); +} + +// Return the index of the next queue to use based on a +// round robin strategy and the queue group ordinal. 
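getQueueIndex below advances a round-robin cursor through the inclusive [LowerIndex, UpperIndex] range before mapping it to a Level Zero ordinal/index pair. A minimal sketch of just the cursor update (hypothetical helper, illustrative only):

#include <cstdint>

// Advance a round-robin cursor within an inclusive [Lower, Upper] range.
uint32_t advanceRoundRobin(uint32_t Current, uint32_t Lower, uint32_t Upper) {
  return (Current >= Upper) ? Lower : Current + 1;
}
// With Lower = 0 and Upper = 2, repeated calls starting from 0 visit 1, 2, 0, 1, ...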
+uint32_t ur_queue_handle_t_::pi_queue_group_t::getQueueIndex( + uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, bool QueryOnly) { + auto CurrentIndex = NextIndex; + + if (!QueryOnly) { + ++NextIndex; + if (NextIndex > UpperIndex) + NextIndex = LowerIndex; + } + + // Find out the right queue group ordinal (first queue might be "main" or + // "link") + auto QueueType = Type; + if (QueueType != queue_type::Compute) + QueueType = (CurrentIndex == 0 && Queue->Device->hasMainCopyEngine()) + ? queue_type::MainCopy + : queue_type::LinkCopy; + + *QueueGroupOrdinal = Queue->Device->QueueGroup[QueueType].ZeOrdinal; + // Adjust the index to the L0 queue group since we represent "main" and + // "link" + // L0 groups with a single copy group ("main" would take "0" index). + auto ZeCommandQueueIndex = CurrentIndex; + if (QueueType == queue_type::LinkCopy && Queue->Device->hasMainCopyEngine()) { + ZeCommandQueueIndex -= 1; + } + *QueueIndex = ZeCommandQueueIndex; + + return CurrentIndex; +} + +// This function will return one of possibly multiple available native +// queues and the value of the queue group ordinal. +ze_command_queue_handle_t & +ur_queue_handle_t_::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { + + // QueueIndex is the proper L0 index. + // Index is the plugins concept of index, with main and link copy engines in + // one range. + uint32_t QueueIndex; + auto Index = getQueueIndex(QueueGroupOrdinal, &QueueIndex); + + ze_command_queue_handle_t &ZeQueue = ZeQueues[Index]; + if (ZeQueue) + return ZeQueue; + + ZeStruct ZeCommandQueueDesc; + ZeCommandQueueDesc.ordinal = *QueueGroupOrdinal; + ZeCommandQueueDesc.index = QueueIndex; + ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + const char *Priority = "Normal"; + if (Queue->isPriorityLow()) { + ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; + Priority = "Low"; + } else if (Queue->isPriorityHigh()) { + ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; + Priority = "High"; + } + + // Evaluate performance of explicit usage for "0" index. + if (QueueIndex != 0) { + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; + } + + urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " + "(round robin in [%d, %d]) priority = %s\n", + ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, + UpperIndex, Priority); + + auto ZeResult = ZE_CALL_NOCHECK( + zeCommandQueueCreate, (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandQueueDesc, &ZeQueue)); + if (ZeResult) { + die("[L0] getZeQueue: failed to create queue"); + } + + return ZeQueue; +} + +int32_t ur_queue_handle_t_::pi_queue_group_t::getCmdQueueOrdinal( + ze_command_queue_handle_t CmdQueue) { + // Find out the right queue group ordinal (first queue might be "main" or + // "link") + auto QueueType = Type; + if (QueueType != queue_type::Compute) + QueueType = (ZeQueues[0] == CmdQueue && Queue->Device->hasMainCopyEngine()) + ? queue_type::MainCopy + : queue_type::LinkCopy; + return Queue->Device->QueueGroup[QueueType].ZeOrdinal; +} + +// Helper function to create a new command-list to this queue and associated +// fence tracking its completion. This command list & fence are added to the +// map of command lists in this queue with ZeFenceInUse = false. +// The caller must hold a lock of the queue already. 
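For context on createCommandList below: every regular (non-immediate) command list is paired with a fence on its target queue so the pair can be recycled once the fence signals. A rough sketch of that lifecycle in terms of the raw Level Zero API, with error handling and caching omitted (illustrative only):

#include <level_zero/ze_api.h>

void runAndRecycle(ze_context_handle_t Ctx, ze_device_handle_t Dev,
                   ze_command_queue_handle_t Queue, uint32_t Ordinal) {
  ze_command_list_desc_t ListDesc = {ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC};
  ListDesc.commandQueueGroupOrdinal = Ordinal;
  ze_command_list_handle_t List;
  zeCommandListCreate(Ctx, Dev, &ListDesc, &List);

  ze_fence_desc_t FenceDesc = {ZE_STRUCTURE_TYPE_FENCE_DESC};
  ze_fence_handle_t Fence;
  zeFenceCreate(Queue, &FenceDesc, &Fence);

  // ... append commands to List here ...
  zeCommandListClose(List);
  zeCommandQueueExecuteCommandLists(Queue, 1, &List, Fence);

  // Later, when the fence has signalled, both objects can be reset and reused.
  if (zeFenceQueryStatus(Fence) == ZE_RESULT_SUCCESS) {
    zeFenceReset(Fence);
    zeCommandListReset(List);
  }
}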
+ur_result_t ur_queue_handle_t_::createCommandList( + bool UseCopyEngine, ur_command_list_ptr_t &CommandList, + ze_command_queue_handle_t *ForcedCmdQueue) { + + ze_fence_handle_t ZeFence; + ZeStruct ZeFenceDesc; + ze_command_list_handle_t ZeCommandList; + + uint32_t QueueGroupOrdinal; + auto &QGroup = getQueueGroup(UseCopyEngine); + auto &ZeCommandQueue = + ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); + if (ForcedCmdQueue) + QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); + + ZeStruct ZeCommandListDesc; + ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, + &ZeCommandListDesc, &ZeCommandList)); + + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + std::tie(CommandList, std::ignore) = CommandListMap.insert( + std::pair( + ZeCommandList, + {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal})); + + UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); + UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, + bool UseCopyEngine) { + // Early exit if there are no active barriers. + if (ActiveBarriers.empty()) + return UR_RESULT_SUCCESS; + + // Create a wait-list and retain events. + _ur_ze_event_list_t ActiveBarriersWaitList; + UR_CALL(ActiveBarriersWaitList.createAndRetainUrZeEventList( + ActiveBarriers.vector().size(), ActiveBarriers.vector().data(), + reinterpret_cast(this), UseCopyEngine)); + + // We can now replace active barriers with the ones in the wait list. + UR_CALL(ActiveBarriers.clear()); + + if (ActiveBarriersWaitList.Length == 0) { + return UR_RESULT_SUCCESS; + } + + for (uint32_t I = 0; I < ActiveBarriersWaitList.Length; ++I) { + auto &Event = ActiveBarriersWaitList.UrEventList[I]; + ActiveBarriers.add(Event); + } + + ur_event_handle_t Event = nullptr; + if (auto Res = createEventAndAssociateQueue( + reinterpret_cast(this), &Event, + UR_EXT_COMMAND_TYPE_USER, CmdList, + /*IsInternal*/ true)) + return Res; + + Event->WaitList = ActiveBarriersWaitList; + Event->OwnNativeHandle = true; + + // If there are more active barriers, insert a barrier on the command-list. We + // do not need an event for finishing so we pass nullptr. + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, ActiveBarriersWaitList.Length, + ActiveBarriersWaitList.ZeEventList)); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_handle_t_::insertStartBarrierIfDiscardEventsMode( + ur_command_list_ptr_t &CmdList) { + // If current command list is different from the last command list then insert + // a barrier waiting for the last command event. + if (doReuseDiscardedEvents() && CmdList != LastUsedCommandList && + LastCommandEvent) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); + LastCommandEvent = nullptr; + } + return UR_RESULT_SUCCESS; +} + +// This is an experimental option that allows the use of copy engine, if +// available in the device, in Level Zero plugin for copy operations submitted +// to an in-order queue. The default is 1. 
+static const bool UseCopyEngineForInOrderQueue = [] {
+  const char *CopyEngineForInOrderQueue =
+      std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE");
+  return (!CopyEngineForInOrderQueue ||
+          (std::stoi(CopyEngineForInOrderQueue) != 0));
+}();
+
+bool ur_queue_handle_t_::useCopyEngine(bool PreferCopyEngine) const {
+  auto InitialCopyGroup = CopyQueueGroupsByTID.begin()->second;
+  return PreferCopyEngine && InitialCopyGroup.ZeQueues.size() > 0 &&
+         (!isInOrderQueue() || UseCopyEngineForInOrderQueue);
+}
+
+// This function will return one of possibly multiple available
+// immediate commandlists associated with this Queue.
+ur_command_list_ptr_t &ur_queue_handle_t_::pi_queue_group_t::getImmCmdList() {
+
+  uint32_t QueueIndex, QueueOrdinal;
+  auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);
+
+  if (ImmCmdLists[Index] != Queue->CommandListMap.end())
+    return ImmCmdLists[Index];
+
+  ZeStruct<ze_command_queue_desc_t> ZeCommandQueueDesc;
+  ZeCommandQueueDesc.ordinal = QueueOrdinal;
+  ZeCommandQueueDesc.index = QueueIndex;
+  ZeCommandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
+  const char *Priority = "Normal";
+  if (Queue->isPriorityLow()) {
+    ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW;
+    Priority = "Low";
+  } else if (Queue->isPriorityHigh()) {
+    ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
+    Priority = "High";
+  }
+
+  // Evaluate performance of explicit usage for "0" index.
+  if (QueueIndex != 0) {
+    ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
+  }
+
+  urPrint("[getZeQueue]: create queue ordinal = %d, index = %d "
+          "(round robin in [%d, %d]) priority = %s\n",
+          ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
+          UpperIndex, Priority);
+
+  ze_command_list_handle_t ZeCommandList;
+  ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
+                  (Queue->Context->ZeContext, Queue->Device->ZeDevice,
+                   &ZeCommandQueueDesc, &ZeCommandList));
+  ImmCmdLists[Index] =
+      Queue->CommandListMap
+          .insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
+              ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}})
+          .first;
+  // Add this commandlist to the cache so it can be destroyed as part of
+  // urQueueReleaseInternal.
+  auto QueueType = Type;
+  std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
+  auto &ZeCommandListCache =
+      QueueType == queue_type::Compute
+          ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
+          : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
+  ZeCommandListCache.push_back(ZeCommandList);
+
+  return ImmCmdLists[Index];
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
index 8aa0e11a42d9a..c7b81dbf30af3 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
@@ -1,4 +1,4 @@
-//===--------- ur_level_zero_queue.hpp - Level Zero Adapter -----------===//
+//===--------- ur_level_zero.hpp - Level Zero Adapter -----------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -7,8 +7,504 @@ //===-----------------------------------------------------------------===// #pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + #include "ur_level_zero_common.hpp" +#include "ur_level_zero_device.hpp" + +extern "C" { +ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue); +} // extern "C" + +ur_result_t resetCommandLists(ur_queue_handle_t Queue); +ur_result_t +CleanupEventsInImmCmdLists(ur_queue_handle_t UrQueue, bool QueueLocked = false, + bool QueueSynced = false, + ur_event_handle_t CompletedEvent = nullptr); + +// Structure describing the specific use of a command-list in a queue. +// This is because command-lists are re-used across multiple queues +// in the same context. +struct pi_command_list_info_t { + // The Level-Zero fence that will be signalled at completion. + // Immediate commandlists do not have an associated fence. + // A nullptr for the fence indicates that this is an immediate commandlist. + ze_fence_handle_t ZeFence{nullptr}; + // Record if the fence is in use. + // This is needed to avoid leak of the tracked command-list if the fence + // was not yet signaled at the time all events in that list were already + // completed (we are polling the fence at events completion). The fence + // may be still "in-use" due to sporadic delay in HW. + bool ZeFenceInUse{false}; + + // Indicates if command list is in closed state. This is needed to avoid + // appending commands to the closed command list. + bool IsClosed{false}; + + // Record the queue to which the command list will be submitted. + ze_command_queue_handle_t ZeQueue{nullptr}; + // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr + uint32_t ZeQueueGroupOrdinal{0}; + // Helper functions to tell if this is a copy command-list. + bool isCopy(ur_queue_handle_t Queue) const; + + // Keeps events created by commands submitted into this command-list. + // TODO: use this for explicit wait/cleanup of events at command-list + // completion. + // TODO: use this for optimizing events in the same command-list, e.g. + // only have last one visible to the host. + std::vector EventList{}; + size_t size() const { return EventList.size(); } + void append(ur_event_handle_t Event) { EventList.push_back(Event); } +}; + +// The map type that would track all command-lists in a queue. +using ur_command_list_map_t = + std::unordered_map; +// The iterator pointing to a specific command-list in use. +using ur_command_list_ptr_t = ur_command_list_map_t::iterator; + +struct ur_queue_handle_t_ : _ur_object { + ur_queue_handle_t_(std::vector &ComputeQueues, + std::vector &CopyQueues, + ur_context_handle_t Context, ur_device_handle_t Device, + bool OwnZeCommandQueue, pi_queue_properties Properties = 0, + int ForceComputeIndex = -1); + + using queue_type = ur_device_handle_t_::queue_group_info_t::type; + // PI queue is in general a one to many mapping to L0 native queues. + struct pi_queue_group_t { + ur_queue_handle_t Queue; + pi_queue_group_t() = delete; + + // The Queue argument captures the enclosing PI queue. + // The Type argument specifies the type of this queue group. + // The actual ZeQueues are populated at PI queue construction. + pi_queue_group_t(ur_queue_handle_t Queue, queue_type Type) + : Queue(Queue), Type(Type) {} + + // The type of the queue group. + queue_type Type; + bool isCopy() const { return Type != queue_type::Compute; } + + // Level Zero command queue handles. 
+ std::vector ZeQueues; + + // Immediate commandlist handles, one per Level Zero command queue handle. + // These are created only once, along with the L0 queues (see above) + // and reused thereafter. + std::vector ImmCmdLists; + + // Return the index of the next queue to use based on a + // round robin strategy and the queue group ordinal. + // If QueryOnly is true then return index values but don't update internal + // index data members of the queue. + uint32_t getQueueIndex(uint32_t *QueueGroupOrdinal, uint32_t *QueueIndex, + bool QueryOnly = false); + + // Get the ordinal for a command queue handle. + int32_t getCmdQueueOrdinal(ze_command_queue_handle_t CmdQueue); + + // This function will return one of possibly multiple available native + // queues and the value of the queue group ordinal. + ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal); + + // This function returns the next immediate commandlist to use. + ur_command_list_ptr_t &getImmCmdList(); + + // These indices are to filter specific range of the queues to use, + // and to organize round-robin across them. + uint32_t UpperIndex{0}; + uint32_t LowerIndex{0}; + uint32_t NextIndex{0}; + }; + + // Helper class to facilitate per-thread queue groups + // We maintain a hashtable of queue groups if requested to do them per-thread. + // Otherwise it is just single entry used for all threads. + struct pi_queue_group_by_tid_t + : public std::unordered_map { + bool PerThread = false; + + // Returns thread id if doing per-thread, or a generic id that represents + // all the threads. + std::thread::id tid() const { + return PerThread ? std::this_thread::get_id() : std::thread::id(); + } + + // Make the specified queue group be the master + void set(const pi_queue_group_t &QueueGroup) { + const auto &Device = QueueGroup.Queue->Device; + PerThread = + Device->ImmCommandListUsed == ur_device_handle_t_::PerThreadPerQueue; + assert(empty()); + insert({tid(), QueueGroup}); + } + + // Get a queue group to use for this thread + pi_queue_group_t &get() { + assert(!empty()); + auto It = find(tid()); + if (It != end()) { + return It->second; + } + // Add new queue group for this thread initialized from a master entry. + auto QueueGroup = begin()->second; + // Create space for queues and immediate commandlists, which are created + // on demand. + QueueGroup.ZeQueues = std::vector( + QueueGroup.ZeQueues.size(), nullptr); + QueueGroup.ImmCmdLists = std::vector( + QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end()); + + std::tie(It, std::ignore) = insert({tid(), QueueGroup}); + return It->second; + } + }; + + // A map of compute groups containing compute queue handles, one per thread. + // When a queue is accessed from multiple host threads, a separate queue group + // is created for each thread. The key used for mapping is the thread ID. + pi_queue_group_by_tid_t ComputeQueueGroupsByTID; + + // A group containing copy queue handles. The main copy engine, if available, + // comes first followed by link copy engines, if available. + // When a queue is accessed from multiple host threads, a separate queue group + // is created for each thread. The key used for mapping is the thread ID. + pi_queue_group_by_tid_t CopyQueueGroupsByTID; + + // Keeps the PI context to which this queue belongs. + // This field is only set at _pi_queue creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_queue. 
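The pi_queue_group_by_tid_t helper above keys its entries by std::this_thread::get_id() and lazily clones a master entry for each new thread. A generic sketch of that per-thread lookup pattern (hypothetical names, not the adapter's code; it assumes the master entry was inserted first):

#include <thread>
#include <unordered_map>

template <typename T>
struct PerThreadMap : std::unordered_map<std::thread::id, T> {
  bool PerThread = false;

  // One shared slot when PerThread is off, otherwise one slot per thread id.
  std::thread::id key() const {
    return PerThread ? std::this_thread::get_id() : std::thread::id();
  }

  T &get() {
    auto It = this->find(key());
    if (It == this->end())
      It = this->emplace(key(), this->begin()->second).first; // clone master
    return It->second;
  }
};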
+ const ur_context_handle_t Context; + + // Keeps the PI device to which this queue belongs. + // This field is only set at _pi_queue creation time, and cannot change. + // Therefore it can be accessed without holding a lock on this _pi_queue. + const ur_device_handle_t Device; + + // Keeps track of the event associated with the last enqueued command into + // this queue. this is used to add dependency with the last command to add + // in-order semantics and updated with the latest event each time a new + // command is enqueued. + ur_event_handle_t LastCommandEvent = nullptr; + + // Indicates if we own the ZeCommandQueue or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnZeCommandQueue; + + // Keeps the properties of this queue. + pi_queue_properties Properties; + + // Map of all command lists used in this queue. + ur_command_list_map_t CommandListMap; + + // Helper data structure to hold all variables related to batching + struct command_batch { + // These two members are used to keep track of how often the + // batching closes and executes a command list before reaching the + // QueueComputeBatchSize limit, versus how often we reach the limit. + // This info might be used to vary the QueueComputeBatchSize value. + uint32_t NumTimesClosedEarly = {0}; + uint32_t NumTimesClosedFull = {0}; -struct _ur_queue_handle_t : _ur_object { - _ur_queue_handle_t() {} + // Open command list fields for batching commands into this queue. + ur_command_list_ptr_t OpenCommandList{}; + + // Approximate number of commands that are allowed to be batched for + // this queue. + // Added this member to the queue rather than using a global variable + // so that future implementation could use heuristics to change this on + // a queue specific basis. And by putting it in the queue itself, this + // is thread safe because of the locking of the queue that occurs. + uint32_t QueueBatchSize = {0}; + }; + + // ComputeCommandBatch holds data related to batching of non-copy commands. + // CopyCommandBatch holds data related to batching of copy commands. + command_batch ComputeCommandBatch, CopyCommandBatch; + + // A helper structure to keep active barriers of the queue. + // It additionally manages ref-count of events in this list. + struct active_barriers { + std::vector Events; + void add(ur_event_handle_t &Event); + ur_result_t clear(); + bool empty() { return Events.empty(); } + std::vector &vector() { return Events; } + }; + // A collection of currently active barriers. + // These should be inserted into a command list whenever an available command + // list is needed for a command. + active_barriers ActiveBarriers; + + // Besides each PI object keeping a total reference count in + // _ur_object::RefCount we keep special track of the queue *external* + // references. This way we are able to tell when the queue is being finished + // externally, and can wait for internal references to complete, and do proper + // cleanup of the queue. + // This counter doesn't track the lifetime of a queue object, it only tracks + // the number of external references. I.e. even if it reaches zero a queue + // object may not be destroyed and can be used internally in the plugin. + // That's why we intentionally don't use atomic type for this counter to + // enforce guarding with a mutex all the work involving this counter. + uint32_t RefCountExternal{1}; + + // Indicates that the queue is healthy and all operations on it are OK. 
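RefCountExternal above counts only user-visible references, separately from the object's total reference count, so the queue can be finished when the application releases it while internal references keep the object alive. A toy sketch of that split (illustrative only; the real code also synchronizes the queue and holds the appropriate locks):

#include <cstdint>

struct ToyQueue {
  uint32_t RefCountExternal = 1; // user handles only
  uint32_t RefCountTotal = 1;    // external plus internal references

  // Returns true when user-side teardown (e.g. waiting for in-flight work)
  // should run; the object itself may still be referenced internally.
  bool releaseExternal() { return --RefCountExternal == 0; }

  // Returns true when no references remain and the object can be destroyed.
  bool releaseTotal() { return --RefCountTotal == 0; }
};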
+  bool Healthy{true};
+
+  // The following data structures and methods are used only for handling
+  // in-order queues with the discard_events property. Some commands in such a
+  // queue may have a discarded event, which means the event is not visible
+  // outside of the plugin. It is possible to reset and reuse discarded events
+  // in the same in-order queue because of the dependency between commands. We
+  // don't have to wait for event completion to do this. We use the following
+  // 2-event model to reuse events inside each command list:
+  //
+  // Operation1 = zeCommandListAppendMemoryCopy (signal ze_event1)
+  // zeCommandListAppendBarrier(wait for ze_event1)
+  // zeCommandListAppendEventReset(ze_event1)
+  // # Create new pi_event using ze_event1 and append to the cache.
+  //
+  // Operation2 = zeCommandListAppendMemoryCopy (signal ze_event2)
+  // zeCommandListAppendBarrier(wait for ze_event2)
+  // zeCommandListAppendEventReset(ze_event2)
+  // # Create new pi_event using ze_event2 and append to the cache.
+  //
+  // # Get pi_event from the beginning of the cache because there are two events
+  // # there. So it is guaranteed that we do round-robin between two events -
+  // # event from the last command is appended to the cache.
+  // Operation3 = zeCommandListAppendMemoryCopy (signal ze_event1)
+  // # The same ze_event1 is used for Operation1 and Operation3.
+  //
+  // When we switch to a different command list we need to signal a new event
+  // and wait for it in the new command list using a barrier.
+  // [CmdList1]
+  // Operation1 = zeCommandListAppendMemoryCopy (signal event1)
+  // zeCommandListAppendBarrier(wait for event1)
+  // zeCommandListAppendEventReset(event1)
+  // zeCommandListAppendSignalEvent(NewEvent)
+  //
+  // [CmdList2]
+  // zeCommandListAppendBarrier(wait for NewEvent)
+  //
+  // This barrier guarantees that command list execution starts only after
+  // completion of the previous command list, which signals the aforementioned
+  // event. It allows resetting and reusing the same event handles inside all
+  // command lists in scope of the queue. It means that we need 2 reusable
+  // events of each type (host-visible and device-scope) per queue at maximum.
+
+  // This data member keeps track of the last used command list and allows
+  // handling the switch of immediate command lists, because immediate command
+  // lists are never closed unlike regular command lists.
+  ur_command_list_ptr_t LastUsedCommandList = CommandListMap.end();
+
+  // Vector of 2 lists of reusable events: host-visible and device-scope.
+  // They are separated to allow faster access to stored events depending on
+  // the requested type of event. Each list contains events which can be reused
+  // inside all command lists in the queue as described in the 2-event model.
+  // Leftover events in the cache are released at queue destruction.
+  std::vector<std::list<ur_event_handle_t>> EventCaches{2};
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with a full batch.
+  // For copy commands, IsCopy is set to 'true'.
+  // For non-copy commands, IsCopy is set to 'false'.
+  void adjustBatchSizeForFullBatch(bool IsCopy);
+
+  // adjust the queue's batch size, knowing that the current command list
+  // is being closed with only a partial batch of commands.
+  // For copy commands, IsCopy is set to 'true'.
+  // For non-copy commands, IsCopy is set to 'false'.
+  void adjustBatchSizeForPartialBatch(bool IsCopy);
+
+  // Attach a command list to this queue.
+  // For non-immediate commandlist also close and execute it.
+ // Note that this command list cannot be appended to after this. + // The "IsBlocking" tells if the wait for completion is required. + // If OKToBatchCommand is true, then this command list may be executed + // immediately, or it may be left open for other future command to be + // batched into. + // If IsBlocking is true, then batching will not be allowed regardless + // of the value of OKToBatchCommand + // + // For immediate commandlists, no close and execute is necessary. + ur_result_t executeCommandList(ur_command_list_ptr_t CommandList, + bool IsBlocking = false, + bool OKToBatchCommand = false); + + // Helper method telling whether we need to reuse discarded event in this + // queue. + bool doReuseDiscardedEvents(); + + // Append command to provided command list to wait and reset the last event if + // it is discarded and create new pi_event wrapper using the same native event + // and put it to the cache. We call this method after each command submission + // to make native event available to use by next commands. + ur_result_t resetDiscardedEvent(ur_command_list_ptr_t); + + // Put pi_event to the cache. Provided pi_event object is not used by + // any command but its ZeEvent is used by many pi_event objects. + // Commands to wait and reset ZeEvent must be submitted to the queue before + // calling this method. + ur_result_t addEventToQueueCache(ur_event_handle_t Event); + + // Returns true if any commands for this queue are allowed to + // be batched together. + // For copy commands, IsCopy is set to 'true'. + // For non-copy commands, IsCopy is set to 'false'. + bool isBatchingAllowed(bool IsCopy) const; + + // Returns true if the queue is a in-order queue. + bool isInOrderQueue() const; + + // Returns true if the queue has discard events property. + bool isDiscardEvents() const; + + // Returns true if the queue has explicit priority set by user. + bool isPriorityLow() const; + bool isPriorityHigh() const; + + // Wait for all commandlists associated with this Queue to finish operations. + ur_result_t synchronize(); + + // Get event from the queue's cache. + // Returns nullptr if the cache doesn't contain any reusable events or if the + // cache contains only one event which corresponds to the previous command and + // can't be used for the current command because we can't use the same event + // two times in a row and have to do round-robin between two events. Otherwise + // it picks an event from the beginning of the cache and returns it. Event + // from the last command is always appended to the end of the list. + ur_event_handle_t getEventFromQueueCache(bool HostVisible); + + // Returns true if an OpenCommandList has commands that need to be submitted. + // If IsCopy is 'true', then the OpenCommandList containing copy commands is + // checked. Otherwise, the OpenCommandList containing compute commands is + // checked. + bool hasOpenCommandList(bool IsCopy) const { + auto CommandBatch = (IsCopy) ? CopyCommandBatch : ComputeCommandBatch; + return CommandBatch.OpenCommandList != CommandListMap.end(); + } + + // Update map of memory references made by the kernels about to be submitted + void CaptureIndirectAccesses(); + + // Kernel is not necessarily submitted for execution during + // piEnqueueKernelLaunch, it may be batched. That's why we need to save the + // list of kernels which is going to be submitted but have not been submitted + // yet. 
This is needed to capture memory allocations for each kernel with
+  // indirect access in the list at the moment when the kernel is really
+  // submitted for execution.
+  std::vector<ur_kernel_handle_t> KernelsToBeSubmitted;
+
+  // Append command to the command list to signal a new event if the last event
+  // in the command list is discarded. While we submit commands in scope of the
+  // same command list we can reset and reuse events, but when we switch to a
+  // different command list we currently need to signal a new event and wait
+  // for it in the new command list using a barrier.
+  ur_result_t signalEventFromCmdListIfLastEventDiscarded(ur_command_list_ptr_t);
+
+  // If there is an open command list associated with this queue,
+  // close it, execute it, and reset the corresponding OpenCommandList.
+  // If IsCopy is 'true', then the OpenCommandList containing copy commands is
+  // executed. Otherwise the OpenCommandList containing compute commands is
+  // executed.
+  ur_result_t executeOpenCommandList(bool IsCopy);
+
+  // Wrapper function to execute both OpenCommandLists (Copy and Compute).
+  // This wrapper is helpful when all 'open' commands need to be executed.
+  // Call-site instances: piQueueFinish, piQueueRelease, etc.
+  ur_result_t executeAllOpenCommandLists() {
+    using IsCopy = bool;
+    if (auto Res = executeOpenCommandList(IsCopy{false}))
+      return Res;
+    if (auto Res = executeOpenCommandList(IsCopy{true}))
+      return Res;
+    return UR_RESULT_SUCCESS;
+  }
+
+  /// @brief Resets the command list and associated fence in the map and removes
+  /// events from the command list.
+  /// @param CommandList The caller must verify that this command list and fence
+  /// have been signalled.
+  /// @param MakeAvailable If the reset command list should be made available,
+  /// then MakeAvailable needs to be set to true.
+  /// @param EventListToCleanup The EventListToCleanup contains a list of
+  /// events from the command list which need to be cleaned up.
+  /// @param CheckStatus Hint informing whether we need to check status of the
+  /// events before removing them from the immediate command list. This is
+  /// needed because immediate command lists are not associated with fences and
+  /// in general the status of the event needs to be checked.
+  /// @return PI_SUCCESS if successful, PI error code otherwise.
+  ur_result_t
+  resetCommandList(ur_command_list_ptr_t CommandList, bool MakeAvailable,
+                   std::vector<ur_event_handle_t> &EventListToCleanup,
+                   bool CheckStatus = true);
+
+  // Gets the open command list containing the event, or CommandListMap.end().
+  ur_command_list_ptr_t eventOpenCommandList(ur_event_handle_t Event);
+
+  // Return the queue group to use based on standard/immediate commandlist mode,
+  // and if immediate mode, the thread-specific group.
+  pi_queue_group_t &getQueueGroup(bool UseCopyEngine);
+
+  // Helper function to create a new command-list to this queue and associated
+  // fence tracking its completion. This command list & fence are added to the
+  // map of command lists in this queue with ZeFenceInUse = false.
+  // The caller must hold a lock of the queue already.
+  ur_result_t
+  createCommandList(bool UseCopyEngine, ur_command_list_ptr_t &CommandList,
+                    ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+
+  // Inserts a barrier waiting for all unfinished events in ActiveBarriers into
+  // CmdList. Any finished events will be removed from ActiveBarriers.
+  ur_result_t insertActiveBarriers(ur_command_list_ptr_t &CmdList,
+                                   bool UseCopyEngine);
+
+  // This function considers multiple factors including copy engine
+  // availability and user preference and returns a boolean that is used to
+  // specify if the copy engine will eventually be used for a particular command.
+  bool useCopyEngine(bool PreferCopyEngine = true) const;
+
+  // Insert a barrier waiting for the last command event into the beginning of
+  // the command list. This barrier guarantees that command list execution
+  // starts only after completion of the previous command list, which signals
+  // the aforementioned event. It allows resetting and reusing the same event
+  // handles inside all command lists in the queue.
+  ur_result_t
+  insertStartBarrierIfDiscardEventsMode(ur_command_list_ptr_t &CmdList);
 };
+
+// This helper function creates a pi_event and associates it with a pi_queue.
+// Note that the caller of this function must have acquired a lock on the Queue
+// that is passed in.
+// \param Queue pi_queue to associate with a new event.
+// \param Event a pointer to hold the newly created pi_event
+// \param CommandType various command type determined by the caller
+// \param CommandList is the command list where the event is added
+// \param IsInternal tells if the event is internal, i.e. visible in the L0
+// plugin only.
+// \param ForceHostVisible tells if the event must be created in
+// the host-visible pool
+ur_result_t createEventAndAssociateQueue(
+    ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType,
+    ur_command_list_ptr_t CommandList, bool IsInternal = false,
+    std::optional<bool> HostVisible = std::nullopt);
+
+// Helper function to perform the necessary cleanup of the events from a reset
+// cmd list.
+ur_result_t CleanupEventListFromResetCmdList(
+    std::vector<ur_event_handle_t> &EventListToCleanup,
+    bool QueueLocked = false);
\ No newline at end of file
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
index 7014f92ddfb90..1b5496f5f59ed 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp
@@ -7,3 +7,206 @@
 //===-----------------------------------------------------------------===//
 
 #include "ur_level_zero_sampler.hpp"
+#include 
+
+UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate(
+    ur_context_handle_t Context, ///< [in] handle of the context object
+    const ur_sampler_property_t
+        *Props, ///< [in] specifies a list of sampler property names and their
+                ///< corresponding values.
+    ur_sampler_handle_t
+        *Sampler ///< [out] pointer to handle of sampler object created
+) {
+  std::shared_lock Lock(Context->Mutex);
+
+  // Have the "0" device in the context own the sampler. Rely on Level-Zero
+  // drivers to perform migration as necessary for sharing it across multiple
+  // devices in the context.
+  //
+  // TODO: figure out if we instead need explicit copying for accessing
+  // the sampler from other devices in the context.
+  //
+  ur_device_handle_t Device = Context->Devices[0];
+
+  ze_sampler_handle_t ZeSampler;
+  ZeStruct<ze_sampler_desc_t> ZeSamplerDesc;
+
+  // Set the default values for the ZeSamplerDesc.
+  ZeSamplerDesc.isNormalized = PI_TRUE;
+  ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP;
+  ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
+
+  // Update the values of the ZeSamplerDesc from the pi_sampler_properties list.
+  // Default values will be used if any of the following is true:
+  // a) SamplerProperties list is NULL
+  // b) SamplerProperties list is missing any properties
+
+  if (Props) {
+    const ur_sampler_property_t *CurProperty = Props;
+
+    while (*CurProperty != 0) {
+      switch (*CurProperty) {
+      case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: {
+        bool CurValueBool = ur_cast(*(++CurProperty));
+
+        if (CurValueBool == PI_TRUE)
+          ZeSamplerDesc.isNormalized = PI_TRUE;
+        else if (CurValueBool == PI_FALSE)
+          ZeSamplerDesc.isNormalized = PI_FALSE;
+        else {
+          urPrint("urSamplerCreate: unsupported "
+                  "UR_SAMPLER_INFO_NORMALIZED_COORDS value\n");
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: {
+        ur_sampler_addressing_mode_t CurValueAddressingMode =
+            ur_cast(
+                ur_cast(*(++CurProperty)));
+
+        // Level Zero runtime with API version 1.2 and lower has a bug:
+        // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to
+        // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to
+        // border", i.e. the logic is flipped. Starting from API version 1.3
+        // this problem is going to be fixed. That's why we check the API
+        // version to set the address mode.
+        ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion;
+        // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE
+        switch (CurValueAddressingMode) {
+        case UR_SAMPLER_ADDRESSING_MODE_NONE:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_REPEAT:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_CLAMP:
+          ZeSamplerDesc.addressMode =
+              ZeApiVersion < ZE_MAKE_VERSION(1, 3)
+                  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP
+                  : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
+          ZeSamplerDesc.addressMode =
+              ZeApiVersion < ZE_MAKE_VERSION(1, 3)
+                  ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER
+                  : ZE_SAMPLER_ADDRESS_MODE_CLAMP;
+          break;
+        case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
+          ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR;
+          break;
+        default:
+          urPrint("urSamplerCreate: unsupported "
+                  "UR_SAMPLER_PROPERTIES_ADDRESSING_MODE "
+                  "value\n");
+          urPrint("UR_SAMPLER_PROPERTIES_ADDRESSING_MODE=%d\n",
+                  CurValueAddressingMode);
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      case UR_SAMPLER_PROPERTIES_FILTER_MODE: {
+        ur_ext_sampler_filter_mode_t CurValueFilterMode =
+            ur_cast(
+                ur_cast(*(++CurProperty)));
+
+        if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST)
+          ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST;
+        else if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_LINEAR)
+          ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR;
+        else {
+          urPrint("UR_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode);
+          urPrint(
+              "urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n");
+          return UR_RESULT_ERROR_INVALID_VALUE;
+        }
+      } break;
+
+      default:
+        break;
+      }
+      CurProperty++;
+    }
+  }
+
+  ZE2UR_CALL(zeSamplerCreate, (Context->ZeContext, Device->ZeDevice,
+                               &ZeSamplerDesc, // TODO: translate properties
+                               &ZeSampler));
+
+  try {
+    ur_sampler_handle_t_ *UrSampler = new ur_sampler_handle_t_(ZeSampler);
+    *Sampler = reinterpret_cast<ur_sampler_handle_t>(UrSampler);
+  } catch (const std::bad_alloc &) {
+    return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+  } catch (...)
{ + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerRetain( + ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to get access +) { + Sampler->RefCount.increment(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerRelease( + ur_sampler_handle_t + Sampler ///< [in] handle of the sampler object to release +) { + if (!Sampler->RefCount.decrementAndTest()) + return UR_RESULT_SUCCESS; + + auto ZeResult = ZE_CALL_NOCHECK(zeSamplerDestroy, (Sampler->ZeSampler)); + // Gracefully handle the case that L0 was already unloaded. + if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + delete Sampler; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetInfo( + ur_sampler_handle_t Sampler, ///< [in] handle of the sampler object + ur_sampler_info_t PropName, ///< [in] name of the sampler property to query + size_t PropValueSize, ///< [in] size in bytes of the sampler property value + ///< provided + void *PropValue, ///< [out] value of the sampler property + size_t + *PropSizeRet ///< [out] size in bytes returned in sampler property value +) { + std::ignore = Sampler; + std::ignore = PropName; + std::ignore = PropValueSize; + std::ignore = PropValue; + std::ignore = PropSizeRet; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerGetNativeHandle( + ur_sampler_handle_t Sampler, ///< [in] handle of the sampler. + ur_native_handle_t *NativeSampler ///< [out] a pointer to the native + ///< handle of the sampler. +) { + std::ignore = Sampler; + std::ignore = NativeSampler; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( + ur_native_handle_t + NativeSampler, ///< [in] the native handle of the sampler. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_sampler_handle_t *Sampler ///< [out] pointer to the handle of the + ///< sampler object created. +) { + std::ignore = NativeSampler; + std::ignore = Context; + std::ignore = Sampler; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp index abbfb76c8e126..22463f76906e4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp @@ -9,6 +9,9 @@ #include "ur_level_zero_common.hpp" -struct _ur_sampler_handle_t : _ur_object { - _ur_sampler_handle_t() {} +struct ur_sampler_handle_t_ : _ur_object { + ur_sampler_handle_t_(ze_sampler_handle_t Sampler) : ZeSampler{Sampler} {} + + // Level Zero sampler handle. 
+ ze_sampler_handle_t ZeSampler; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 0a58c57319b7b..a117de71b57e6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -32,7 +32,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( } pDdiTable->pfnInit = urInit; - pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnGetLastResult = urGetLastResult; pDdiTable->pfnTearDown = urTearDown; return retVal; @@ -48,13 +48,13 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetExtendedDeleter = nullptr; + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; return retVal; } @@ -69,28 +69,29 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return retVal; } - pDdiTable->pfnKernelLaunch = nullptr; - pDdiTable->pfnEventsWait = nullptr; - pDdiTable->pfnEventsWaitWithBarrier = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; - pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWriteRect = nullptr; - pDdiTable->pfnMemBufferCopy = nullptr; - pDdiTable->pfnMemBufferCopyRect = nullptr; - pDdiTable->pfnMemBufferFill = nullptr; - pDdiTable->pfnMemImageRead = nullptr; - pDdiTable->pfnMemImageWrite = nullptr; - pDdiTable->pfnMemImageCopy = nullptr; - pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemUnmap = nullptr; - pDdiTable->pfnUSMMemcpy = nullptr; - pDdiTable->pfnUSMPrefetch = nullptr; - pDdiTable->pfnUSMAdvise = nullptr; - pDdiTable->pfnUSMFill2D = nullptr; - pDdiTable->pfnUSMMemcpy2D = nullptr; - pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; - pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnUSMMemAdvise = urEnqueueUSMMemAdvise; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + 
pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; return retVal; } @@ -104,14 +105,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetProfilingInfo = nullptr; - pDdiTable->pfnWait = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnWait = urEventWait; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnSetCallback = urEventSetCallback; return retVal; } @@ -125,20 +126,21 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetGroupInfo = nullptr; - pDdiTable->pfnGetSubGroupInfo = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnSetArgValue = nullptr; - pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgPointer = nullptr; - pDdiTable->pfnSetExecInfo = nullptr; - pDdiTable->pfnSetArgSampler = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRetain = urKernelRetain; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetArgLocal = urKernelSetArgLocal; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; + pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; return retVal; } @@ -151,15 +153,15 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnImageCreate = nullptr; - pDdiTable->pfnBufferCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnBufferPartition = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnRetain = urMemRetain; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urMemCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; return 
retVal; } @@ -175,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( } pDdiTable->pfnGet = urPlatformGet; pDdiTable->pfnGetInfo = urPlatformGetInfo; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetApiVersion = nullptr; + pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urPlatformCreateWithNativeHandle; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; return retVal; } @@ -192,14 +194,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreateWithBinary = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetFunctionPointer = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetBuildInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; return retVal; } @@ -214,14 +222,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( return retVal; } - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnFinish = nullptr; - pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnRetain = urQueueRetain; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; return retVal; } @@ -235,12 +243,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnCreate = urSamplerCreate; + pDdiTable->pfnRetain = urSamplerRetain; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnGetInfo = urSamplerGetInfo; + pDdiTable->pfnGetNativeHandle = urSamplerGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urSamplerCreateWithNativeHandle; return retVal; } @@ -254,11 +262,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable( if (UR_RESULT_SUCCESS != retVal) { return retVal; } - pDdiTable->pfnHostAlloc = nullptr; - pDdiTable->pfnDeviceAlloc = nullptr; - pDdiTable->pfnSharedAlloc = nullptr; - pDdiTable->pfnFree = nullptr; - pDdiTable->pfnGetMemAllocInfo = nullptr; + + pDdiTable->pfnHostAlloc = urUSMHostAlloc; + 
pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnPoolCreate = urUSMPoolCreate; + pDdiTable->pfnPoolDestroy = urUSMPoolDestroy; return retVal; } @@ -277,10 +288,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnPartition = urDevicePartition; - pDdiTable->pfnSelectBinary = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; + pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; return retVal; } diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 70a52aabe290c..d25e36db39bc5 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -48,6 +48,9 @@ const int UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT = UR_EXT_DEVICE_INFO_END - 15; const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; +const uint32_t UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION = + (UR_MAP_FLAG_WRITE << 1); + const int UR_EXT_RESULT_END = 0x1000; const ur_result_t UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR = ur_result_t(UR_EXT_RESULT_END - 1); @@ -57,6 +60,38 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; +const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 5; + +const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = + (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); + +const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = + (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); + +const ur_queue_info_t UR_EXT_ONEAPI_QUEUE_INFO_EMPTY = + (ur_queue_info_t)(UR_QUEUE_INFO_SIZE + 1); + +const ur_command_t UR_EXT_COMMAND_TYPE_USER = + (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); + +const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = + ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); + +typedef enum ur_ext_sampler_filter_mode_t { + UR_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, + UR_EXT_SAMPLER_FILTER_MODE_LINEAR = 1, + UR_EXT_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff +} ur_ext_sampler_filter_mode_t; + +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 2); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 3); +const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = + (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 4); + // Terminates the process with a catastrophic error message. 
[[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; @@ -228,10 +263,14 @@ struct _ur_object { // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); ur_shared_mutex Mutex; + + // Indicates if we own the native handle or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnNativeHandle = false; }; // Helper for one-liner validation -#define PI_ASSERT(condition, error) \ +#define UR_ASSERT(condition, error) \ if (!(condition)) \ return error; @@ -279,7 +318,7 @@ ur_result_t getInfo(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value) { auto assignment = [](void *param_value, T value, size_t value_size) { - (void)value_size; + std::ignore = value_size; *static_cast(param_value) = value; }; diff --git a/sycl/plugins/unified_runtime/ur_bindings.hpp b/sycl/plugins/unified_runtime/ur_bindings.hpp old mode 100755 new mode 100644 index 8597547221a88..4b58d0f73ff87 --- a/sycl/plugins/unified_runtime/ur_bindings.hpp +++ b/sycl/plugins/unified_runtime/ur_bindings.hpp @@ -9,44 +9,3 @@ #include #include - -// Make the Unified Runtime handles definition complete. -// This is used in various "create" API where new handles are allocated. -struct ur_platform_handle_t_ : public _ur_platform_handle_t { - using _ur_platform_handle_t::_ur_platform_handle_t; -}; -struct ur_device_handle_t_ : public _ur_device_handle_t { - using _ur_device_handle_t::_ur_device_handle_t; -}; - -struct ur_context_handle_t_ : public _ur_context_handle_t { - using _ur_context_handle_t::_ur_context_handle_t; -}; - -struct ur_event_handle_t_ : public _ur_event_handle_t { - using _ur_event_handle_t::_ur_event_handle_t; -}; - -struct ur_program_handle_t_ : public _ur_program_handle_t { - using _ur_program_handle_t::_ur_program_handle_t; -}; - -struct ur_module_handle_t_ : public _ur_module_handle_t { - using _ur_module_handle_t::_ur_module_handle_t; -}; - -struct ur_kernel_handle_t_ : public _ur_kernel_handle_t { - using _ur_kernel_handle_t::_ur_kernel_handle_t; -}; - -struct ur_queue_handle_t_ : public _ur_queue_handle_t { - using _ur_queue_handle_t::_ur_queue_handle_t; -}; - -struct ur_sampler_handle_t_ : public _ur_sampler_handle_t { - using _ur_sampler_handle_t::_ur_sampler_handle_t; -}; - -struct ur_mem_handle_t_ : public _ur_mem_handle_t { - using _ur_mem_handle_t::_ur_mem_handle_t; -}; From 9644ae2a2b052f5384253a22c8595eaac20bfc39 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 00:16:57 -0700 Subject: [PATCH 02/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 31 ++++++++++++++----- .../level_zero/ur_level_zero_kernel.cpp | 17 +++++----- .../adapters/level_zero/ur_level_zero_mem.cpp | 31 +++++++++++++------ 3 files changed, 54 insertions(+), 25 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 5ca4b1b9ae4f6..3c81faab879b1 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2573,6 +2573,23 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + if (Properties) { + if (Properties[0] == PI_MEM_ALLOC_FLAGS) { + if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { + USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { + USMDesc.flags |= 
UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + } + if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { + USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + } + if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { + USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + } + } + } + ur_usm_pool_handle_t Pool{}; HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, Alignment, ResultPtr)); @@ -2987,8 +3004,10 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventsWaitList, pi_event *OutEvent) { - PI_ASSERT(Ptr, PI_ERROR_INVALID_MEM_OBJECT); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); + if (!Ptr) { + return PI_ERROR_INVALID_VALUE; + } ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); @@ -2997,12 +3016,10 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - uint32_t Pattern = Value; - size_t PatternSize = sizeof(Pattern); - HANDLE_ERRORS(urEnqueueMemBufferFill( - UrQueue, UrBuffer, - const_cast(reinterpret_cast(&Pattern)), PatternSize, - 0, Count, NumEventsInWaitList, UrEventsWaitList, UrEvent)); + size_t PatternSize = 1; + HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, &Value, PatternSize, + 0, Count, NumEventsInWaitList, + UrEventsWaitList, UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 2a69a905c8e84..74571b0ef8669 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -192,13 +192,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, CommandList, IsInternal)); - - ZeEvent = (*OutEvent)->ZeEvent; - (*OutEvent)->WaitList = TmpWaitList; + ZeEvent = (*Event)->ZeEvent; + (*Event)->WaitList = TmpWaitList; // Save the kernel in the event, so that when the event is signalled // the code can do a piKernelRelease on this kernel. - (*OutEvent)->CommandData = (void *)Kernel; + (*Event)->CommandData = (void *)Kernel; // Increment the reference count of the Kernel and indicate that the Kernel is // in use. Once the event has been signalled, the code in @@ -227,8 +226,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Add the command to the command list, which implies submission. ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*OutEvent)->WaitList.Length, - (*OutEvent)->WaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } else { // Add the command to the command list for later submission. // No lock is needed here, unlike the immediate commandlist case above, @@ -236,14 +235,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // submitted only when the comamndlist is closed. Then, a lock is held. 
ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, Kernel->ZeKernel, &ZeThreadGroupDimensions, - ZeEvent, (*OutEvent)->WaitList.Length, - (*OutEvent)->WaitList.ZeEventList)); + ZeEvent, (*Event)->WaitList.Length, + (*Event)->WaitList.ZeEventList)); } urPrint("calling zeCommandListAppendLaunchKernel() with" " ZeEvent %#llx\n", ur_cast(ZeEvent)); - printZeEventList((*OutEvent)->WaitList); + printZeEventList((*Event)->WaitList); // Execute command list asynchronously, as the event will be used // to track down its completion. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index e2b0b597eb2b1..76cce8b081c34 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -753,15 +753,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. ) { - std::scoped_lock Lock(Queue->Mutex, - Buffer->Mutex); + // std::scoped_lock Lock(Queue->Mutex, + // Buffer->Mutex); + std::scoped_lock Lock(Queue->Mutex); - char *ZeHandleDst = nullptr; - UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); - return enqueueMemFillHelper(UR_COMMAND_MEM_BUFFER_FILL, Queue, - ZeHandleDst + Offset, Pattern, PatternSize, Size, - NumEventsInWaitList, EventWaitList, OutEvent); + // if Offset is not zero, then look for Ze Handle to + // determine correct dst with offset + if (Offset != 0) { + char *ZeHandleDst = nullptr; + _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); + UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, Pattern, + PatternSize, Size, NumEventsInWaitList, EventWaitList, OutEvent); + } else { + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? + UR_COMMAND_MEM_BUFFER_FILL, Queue, Buffer, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, OutEvent); + } } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( @@ -2131,7 +2144,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_usm_mem_flags_t *Properties = &USMDesc->flags; // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; + bool DeviceReadOnly = *Properties & UR_USM_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
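Editor's note, an illustrative sketch rather than part of the patch series: the pi2ur.hpp hunk in the patch above folds the zero-terminated PI_MEM_ALLOC_FLAGS property list into the UR USM descriptor before calling urUSMSharedAlloc. Pulled out into a standalone helper, the translation looks roughly like the following. The helper name is hypothetical; the flag constants and the urUSMSharedAlloc signature (still taking an explicit Alignment argument at this point in the series) are taken from the surrounding diff, and later patches in the series rename the flags to UR_EXT_USM_MEM_FLAG_* and move the alignment into the descriptor.

// Hypothetical consolidation of the translation that this patch performs
// inline in piextUSMSharedAlloc; assumes the PI and UR headers already
// included by pi2ur.hpp.
inline pi_result piextUSMSharedAllocSketch(void **ResultPtr, pi_context Context,
                                           pi_device Device,
                                           pi_usm_mem_properties *Properties,
                                           size_t Size, pi_uint32 Alignment) {
  auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
  auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);

  ur_usm_desc_t USMDesc{};
  // The property list is a {PI_MEM_ALLOC_FLAGS, <value>, 0} sequence; map each
  // recognized PI value onto the corresponding UR flag.
  if (Properties && Properties[0] == PI_MEM_ALLOC_FLAGS) {
    if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) // PI spelling as-is
      USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED;
    if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE)
      USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE;
    if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST)
      USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST;
    if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY)
      USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY;
  }

  ur_usm_pool_handle_t Pool{};
  HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size,
                                 Alignment, ResultPtr));
  return PI_SUCCESS;
}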
From b447851670ca4009cef1cb21490a59de504d5ce0 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 15:43:16 -0700 Subject: [PATCH 03/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 7 ++- .../level_zero/ur_level_zero_context.cpp | 12 ++++- .../adapters/level_zero/ur_level_zero_mem.cpp | 52 +++++++------------ 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 3c81faab879b1..6a7a0898dca99 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3010,16 +3010,15 @@ inline pi_result piextUSMEnqueueMemset(pi_queue Queue, void *Ptr, } ur_queue_handle_t UrQueue = reinterpret_cast(Queue); - ur_mem_handle_t UrBuffer = reinterpret_cast(Ptr); const ur_event_handle_t *UrEventsWaitList = reinterpret_cast(EventsWaitList); ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); size_t PatternSize = 1; - HANDLE_ERRORS(urEnqueueMemBufferFill(UrQueue, UrBuffer, &Value, PatternSize, - 0, Count, NumEventsInWaitList, - UrEventsWaitList, UrEvent)); + HANDLE_ERRORS(urEnqueueUSMFill(UrQueue, Ptr, PatternSize, &Value, Count, + NumEventsInWaitList, UrEventsWaitList, + UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 815a1a5db06cf..5f54f588febe4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -70,6 +70,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( return ContextReleaseHelper(Context); } +// Due to a bug with 2D memory copy to and from non-USM pointers, this option is +// disabled by default. +static const bool UseMemcpy2DOperations = [] { + const char *UseMemcpy2DOperationsFlag = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); + if (!UseMemcpy2DOperationsFlag) + return false; + return std::stoi(UseMemcpy2DOperationsFlag) > 0; +}(); + UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( ur_context_handle_t Context, ///< [in] handle of the context ur_context_info_t ContextInfoType, ///< [in] type of the info to retrieve @@ -95,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(pi_bool{true}); + return ReturnValue(pi_bool{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. return ReturnValue(pi_bool{false}); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 76cce8b081c34..bb146c2728e1b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -753,28 +753,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. 
) { - // std::scoped_lock Lock(Queue->Mutex, - // Buffer->Mutex); - std::scoped_lock Lock(Queue->Mutex); + std::scoped_lock Lock(Queue->Mutex, + Buffer->Mutex); - // if Offset is not zero, then look for Ze Handle to - // determine correct dst with offset - if (Offset != 0) { - char *ZeHandleDst = nullptr; - _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); - UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, - Queue->Device)); - return enqueueMemFillHelper( - UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, Pattern, - PatternSize, Size, NumEventsInWaitList, EventWaitList, OutEvent); - } else { - return enqueueMemFillHelper( - // TODO: do we need a new command type for USM memset? - UR_COMMAND_MEM_BUFFER_FILL, Queue, Buffer, - Pattern, // It will be interpreted as an 8-bit value, - PatternSize, // which is indicated with this pattern_size==1 - Size, NumEventsInWaitList, EventWaitList, OutEvent); - } + char *ZeHandleDst = nullptr; + _ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer); + UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, + Queue->Device)); + return enqueueMemFillHelper( + UR_COMMAND_MEM_BUFFER_FILL, Queue, ZeHandleDst + Offset, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, OutEvent); } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( @@ -3072,14 +3062,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_event_handle_t *Event ///< [out][optional] return an event object that ///< identifies this particular command instance. ) { - std::ignore = Queue; - std::ignore = Ptr; - std::ignore = PatternSize; - std::ignore = Pattern; - std::ignore = Size; - std::ignore = NumEventsInWaitList; - std::ignore = EventWaitList; - std::ignore = Event; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} + std::scoped_lock Lock(Queue->Mutex); + + return enqueueMemFillHelper( + // TODO: do we need a new command type for USM memset? 
+ UR_COMMAND_MEM_BUFFER_FILL, Queue, Ptr, + Pattern, // It will be interpreted as an 8-bit value, + PatternSize, // which is indicated with this pattern_size==1 + Size, NumEventsInWaitList, EventWaitList, Event); +} \ No newline at end of file From d7e55784dc198dd8b6d416e9a8f03c1d555e6b76 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 23:05:17 -0700 Subject: [PATCH 04/50] Some fixes Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 63 +++++++++++++------ .../level_zero/ur_level_zero_kernel.cpp | 5 +- .../level_zero/ur_level_zero_queue.hpp | 3 +- .../level_zero/ur_level_zero_sampler.cpp | 23 +++---- 4 files changed, 57 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 6a7a0898dca99..44aae44d4dec2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3448,26 +3448,49 @@ inline pi_result piSamplerCreate(pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_sampler_property_t UrProps[6]{}; - UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; - UrProps[1] = SamplerProperties[1]; - - UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; - if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_CLAMP) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; - else if (SamplerProperties[3] & PI_SAMPLER_ADDRESSING_MODE_NONE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; - - UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; - if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_NEAREST) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_NEAREST; - else if (SamplerProperties[4] & PI_SAMPLER_FILTER_MODE_LINEAR) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + const pi_sampler_properties *CurProperty = SamplerProperties; + while (*CurProperty != 0) { + switch (*CurProperty) { + case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { + UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; + UrProps[1] = ur_cast(*(++CurProperty)); + } break; + + case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { + UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; + pi_sampler_addressing_mode CurValueAddressingMode = + ur_cast( + ur_cast(*(++CurProperty))); + + if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_REPEAT) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + else if (CurValueAddressingMode == + PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_NONE) + UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + } break; + + case PI_SAMPLER_PROPERTIES_FILTER_MODE: { + UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; + pi_sampler_filter_mode CurValueFilterMode = + ur_cast(ur_cast(*(++CurProperty))); + + if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) + UrProps[5] = 
UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + else if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_LINEAR) + UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + } break; + + default: + break; + } + CurProperty++; + } ur_sampler_handle_t *UrSampler = reinterpret_cast(RetSampler); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 74571b0ef8669..92061bc0e91c4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -669,9 +669,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( ur_sampler_handle_t ArgValue ///< [in] handle of Sampler object. ) { std::scoped_lock Guard(Kernel->Mutex); - ZE2UR_CALL(zeKernelSetArgumentValue, - (ur_cast(Kernel->ZeKernel), ArgIndex, - sizeof(void *), &ArgValue->ZeSampler)); + ZE2UR_CALL(zeKernelSetArgumentValue, (Kernel->ZeKernel, ArgIndex, + sizeof(void *), &ArgValue->ZeSampler)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index c7b81dbf30af3..75b64638ac262 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -500,7 +501,7 @@ struct ur_queue_handle_t_ : _ur_object { // the host-visible pool ur_result_t createEventAndAssociateQueue( ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal = false, + ur_command_list_ptr_t CommandList, bool IsInternal, std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 1b5496f5f59ed..5fdeb4ca0a7af 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ZeStruct ZeSamplerDesc; // Set the default values for the ZeSamplerDesc. 
- ZeSamplerDesc.isNormalized = PI_TRUE; + ZeSamplerDesc.isNormalized = true; ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_CLAMP; ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; @@ -42,16 +42,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( // b) SamplerProperties list is missing any properties if (Props) { - const ur_sampler_property_t *CurProperty = Props; - - while (*CurProperty != 0) { - switch (*CurProperty) { + uint32_t PropCount = 0; + while (PropCount < 6) { // We expect only 3 pairs of sampler properties + switch (Props[PropCount]) { case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - bool CurValueBool = ur_cast(*(++CurProperty)); + auto CurValueBool = Props[++PropCount]; - if (CurValueBool == PI_TRUE) + if (CurValueBool == 1UL) ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == PI_FALSE) + else if (CurValueBool == 0UL) ZeSamplerDesc.isNormalized = PI_FALSE; else { urPrint("urSamplerCreate: unsupported " @@ -62,8 +61,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: { ur_sampler_addressing_mode_t CurValueAddressingMode = - ur_cast( - ur_cast(*(++CurProperty))); + static_cast(Props[++PropCount]); // Level Zero runtime with API version 1.2 and lower has a bug: // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to @@ -107,8 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( case UR_SAMPLER_PROPERTIES_FILTER_MODE: { ur_ext_sampler_filter_mode_t CurValueFilterMode = - ur_cast( - ur_cast(*(++CurProperty))); + static_cast(Props[++PropCount]); if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST) ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; @@ -125,7 +122,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( default: break; } - CurProperty++; + PropCount++; } } From b9681bbf22d1c26f66ed7a8d9681b19fe3edaab9 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 6 Apr 2023 23:55:06 -0700 Subject: [PATCH 05/50] Stubs for the make_queue interop APIs Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 22 ++++++++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 25 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 44d747c12b871..c8b823d47602e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -160,6 +160,28 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +pi_result piextQueueCreate2(pi_context Context, pi_device Device, + pi_queue_properties *Properties, pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +pi_result piextQueueGetNativeHandle2(pi_queue Queue, + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { + std::ignore = NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + std::ignore = NativeHandleDesc; + std::ignore = Properties; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t 
*ParamValueSizeRet) { diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index ba1cb72e8518f..b719273bf484e 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -100,6 +100,28 @@ __SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +__SYCL_EXPORT pi_result piextQueueCreate2(pi_context Context, pi_device Device, + pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + +__SYCL_EXPORT pi_result piextQueueGetNativeHandle2( + pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { + std::ignore = NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + +__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + std::ignore = NativeHandleDesc; + std::ignore = Properties; + return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, + OwnNativeHandle, Queue); +} + __SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { return pi2ur::piQueueRelease(Queue); } @@ -1020,6 +1042,9 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piQueueFlush) _PI_API(piextQueueGetNativeHandle) _PI_API(piextQueueCreateWithNativeHandle) + _PI_API(piextQueueCreate2) + _PI_API(piextQueueGetNativeHandle2) + _PI_API(piextQueueCreateWithNativeHandle2) _PI_API(piProgramCreate) _PI_API(piProgramBuild) From 165b2c2b7720d3f618a212da2bd02c936b02e0e6 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 7 Apr 2023 16:56:07 -0700 Subject: [PATCH 06/50] Use custom urContextCreateWithNativeHandle This requires for now using a custom loader with the proper parameters Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 4 +- sycl/plugins/unified_runtime/pi2ur.hpp | 187 +++++++++++------- .../level_zero/ur_level_zero_context.cpp | 14 +- .../level_zero/ur_level_zero_context.hpp | 9 +- .../level_zero/ur_level_zero_device.cpp | 57 +++--- .../level_zero/ur_level_zero_kernel.cpp | 5 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 71 ++++--- .../adapters/level_zero/ur_level_zero_mem.hpp | 6 +- .../level_zero/ur_level_zero_queue.cpp | 54 +++-- .../level_zero/ur_level_zero_queue.hpp | 15 +- .../level_zero/ur_level_zero_sampler.cpp | 127 +++++------- .../level_zero/ur_loader_interface.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 26 +-- 13 files changed, 299 insertions(+), 278 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index a4eee6963601e..5b709ef7adacf 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -3,8 +3,8 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 74843ea0800e6fb7ce0f82e0ef991fc258f4b9bd) + set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG b5c2119ba147306a76067e86c25e0c6c383172c6) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") 
FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 44aae44d4dec2..509448db3d3a4 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1014,9 +1014,44 @@ piextDeviceSelectBinary(pi_device Device, // TODO: does this need to be context? pi_uint32 *SelectedBinaryInd) { auto UrDevice = reinterpret_cast(Device); - const uint8_t **UrBinaries = - const_cast(reinterpret_cast(Binaries)); - HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries, NumBinaries, + std::vector UrBinaries(NumBinaries); + + for (uint32_t BinaryCount = 0; BinaryCount < NumBinaries; BinaryCount++) { + if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_UNKNOWN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_UNKNOWN; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV32; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_X86_64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_SPIRV64_FPGA; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_NVPTX64; + else if (strcmp(Binaries[BinaryCount]->DeviceTargetSpec, + __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN) == 0) + UrBinaries[BinaryCount].pDeviceTargetSpec = + UR_DEVICE_BINARY_TARGET_AMDGCN; + } + + HANDLE_ERRORS(urDeviceSelectBinary(UrDevice, UrBinaries.data(), NumBinaries, SelectedBinaryInd)); return PI_SUCCESS; } @@ -1074,10 +1109,13 @@ inline pi_result piextContextCreateWithNativeHandle( ur_native_handle_t NativeContext = reinterpret_cast(NativeHandle); + const ur_device_handle_t *UrDevices = + reinterpret_cast(Devices); ur_context_handle_t *UrContext = reinterpret_cast(RetContext); - HANDLE_ERRORS(urContextCreateWithNativeHandle(NativeContext, UrContext)); - (*UrContext)->OwnZeContext = OwnNativeHandle; + + HANDLE_ERRORS(urContextCreateWithNativeHandle( + NativeContext, NumDevices, UrDevices, OwnNativeHandle, UrContext)); return PI_SUCCESS; } @@ -1096,21 +1134,16 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, ContextInfoType = UR_CONTEXT_INFO_DEVICES; break; } - case PI_CONTEXT_INFO_PLATFORM: { - die("urGetContextInfo: unsuppported ParamName."); - } case PI_CONTEXT_INFO_NUM_DEVICES: { ContextInfoType = UR_CONTEXT_INFO_NUM_DEVICES; break; } - case PI_CONTEXT_INFO_PROPERTIES: { - die("urGetContextInfo: unsuppported ParamName."); - } case PI_CONTEXT_INFO_REFERENCE_COUNT: { ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; break; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { + case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: 
ContextInfoType = UR_CONTEXT_INFO_USM_FILL2D_SUPPORT; break; } @@ -1127,7 +1160,7 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, die("These queries should have never come here"); } default: { - die("piGetContextInfo: unsuppported ParamName."); + die("piContextGetInfo: unsuppported ParamName."); } } @@ -1155,19 +1188,6 @@ inline pi_result piContextRelease(pi_context Context) { /////////////////////////////////////////////////////////////////////////////// // Queue -inline pi_result piQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties Flags, pi_queue *Queue) { - - ur_context_handle_t UrContext = - reinterpret_cast(Context); - auto UrDevice = reinterpret_cast(Device); - ur_queue_property_t Props{}; - ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &Props, UrQueue)); - - return PI_SUCCESS; -} - inline pi_result piextQueueCreate(pi_context Context, pi_device Device, pi_queue_properties *Properties, pi_queue *Queue) { @@ -1194,38 +1214,46 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); - ur_queue_property_t props[5]{}; - props[0] = UR_QUEUE_PROPERTIES_FLAGS; + ur_queue_properties_t UrProperties{}; if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) - props[1] |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) - props[1] |= UR_QUEUE_FLAG_PROFILING_ENABLE; + UrProperties.flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) - props[1] |= UR_QUEUE_FLAG_ON_DEVICE; + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE; if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) - props[1] |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) - props[1] |= UR_QUEUE_FLAG_DISCARD_EVENTS; + UrProperties.flags |= UR_QUEUE_FLAG_DISCARD_EVENTS; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) - props[1] |= UR_QUEUE_FLAG_PRIORITY_LOW; + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_LOW; if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) - props[1] |= UR_QUEUE_FLAG_PRIORITY_HIGH; + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + ur_queue_index_properties_t IndexProperties{}; + IndexProperties.stype = UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES; if (Properties[2] != 0) { - props[2] = UR_QUEUE_PROPERTIES_COMPUTE_INDEX; - props[3] = Properties[3]; + IndexProperties.computeIndex = Properties[3]; } + UrProperties.pNext = &IndexProperties; + ur_context_handle_t UrContext = reinterpret_cast(Context); auto UrDevice = reinterpret_cast(Device); ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, props, UrQueue)); + HANDLE_ERRORS(urQueueCreate(UrContext, UrDevice, &UrProperties, UrQueue)); return PI_SUCCESS; } +inline pi_result piQueueCreate(pi_context Context, pi_device Device, + pi_queue_properties Flags, pi_queue *Queue) { + pi_queue_properties Properties[] = {PI_QUEUE_FLAGS, Flags, 0}; + return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); +} + inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_device Device, @@ -1308,7 +1336,7 @@ inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, break; 
} case PI_QUEUE_INFO_PROPERTIES: { - UrParamName = UR_QUEUE_INFO_PROPERTIES; + UrParamName = UR_QUEUE_INFO_FLAGS; break; } case PI_QUEUE_INFO_REFERENCE_COUNT: { @@ -1766,25 +1794,40 @@ inline pi_result piKernelSetExecInfo(pi_kernel Kernel, PI_ASSERT(ParamValue, PI_ERROR_INVALID_VALUE); ur_kernel_handle_t UrKernel = reinterpret_cast(Kernel); - ur_kernel_exec_info_t propName{}; + ur_kernel_exec_info_t PropName{}; + uint64_t PropValue{}; switch (ParamName) { case PI_USM_INDIRECT_ACCESS: { - propName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + PropName = UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS; + PropValue = *(static_cast(const_cast(ParamValue))); break; } case PI_USM_PTRS: { - propName = UR_KERNEL_EXEC_INFO_USM_PTRS; + PropName = UR_KERNEL_EXEC_INFO_USM_PTRS; break; } case PI_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG: { - propName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + PropName = UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG; + auto Param = (*(static_cast(ParamValue))); + if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) { + PropValue = + static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM); + } else if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) { + PropValue = + static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA); + break; + } else if (Param == PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT) { + PropValue = static_cast(UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT); + } else { + die("piKernelSetExecInfo: unsupported ParamValue\n"); + } break; } default: - return PI_ERROR_INVALID_PROPERTY; + die("piKernelSetExecInfo: unsupported ParamName\n"); } HANDLE_ERRORS( - urKernelSetExecInfo(UrKernel, propName, ParamValueSize, ParamValue)); + urKernelSetExecInfo(UrKernel, PropName, ParamValueSize, &PropValue)); return PI_SUCCESS; } @@ -2164,9 +2207,11 @@ inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, UrBufferFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; } + ur_buffer_properties_t UrProps{}; + UrProps.pHost = HostPtr; ur_mem_handle_t *UrBuffer = reinterpret_cast(RetMem); HANDLE_ERRORS( - urMemBufferCreate(UrContext, UrBufferFlags, Size, HostPtr, UrBuffer)); + urMemBufferCreate(UrContext, UrBufferFlags, Size, &UrProps, UrBuffer)); return PI_SUCCESS; } @@ -2178,9 +2223,9 @@ inline pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); ur_usm_desc_t USMDesc{}; + USMDesc.align = Alignment; ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS( - urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, Alignment, ResultPtr)); + HANDLE_ERRORS(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2551,9 +2596,10 @@ inline pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + USMDesc.align = Alignment; ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS(urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, - Alignment, ResultPtr)); + HANDLE_ERRORS( + urUSMDeviceAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2576,23 +2622,25 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, if (Properties) { if (Properties[0] == PI_MEM_ALLOC_FLAGS) { if (Properties[1] == PI_MEM_ALLOC_WRTITE_COMBINED) { - USMDesc.flags |= UR_USM_MEM_FLAG_WRITE_COMBINED; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_WRITE_COMBINED; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { - USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + USMDesc.flags |= 
UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { - USMDesc.flags |= UR_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; } if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { - USMDesc.flags |= UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + USMDesc.flags |= UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; } } } + USMDesc.align = Alignment; + ur_usm_pool_handle_t Pool{}; - HANDLE_ERRORS(urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, - Alignment, ResultPtr)); + HANDLE_ERRORS( + urUSMSharedAlloc(UrContext, UrDevice, &USMDesc, Pool, Size, ResultPtr)); return PI_SUCCESS; } @@ -2682,8 +2730,8 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, // TODO: to map from pi_mem_advice to ur_mem_advice_t // once we have those defined - ur_mem_advice_t UrAdvice{}; - HANDLE_ERRORS(urEnqueueUSMMemAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); + ur_usm_advice_flags_t UrAdvice{}; + HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); return PI_SUCCESS; } @@ -3387,7 +3435,7 @@ inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, ur_context_handle_t UrContext = reinterpret_cast(Context); - ur_event_handle_t *UrEvent = reinterpret_cast(*Event); + ur_event_handle_t *UrEvent = reinterpret_cast(Event); HANDLE_ERRORS( urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); (*UrEvent)->OwnNativeHandle = OwnNativeHandle; @@ -3447,43 +3495,40 @@ inline pi_result piSamplerCreate(pi_context Context, ur_context_handle_t UrContext = reinterpret_cast(Context); - ur_sampler_property_t UrProps[6]{}; + ur_sampler_desc_t UrProps{}; const pi_sampler_properties *CurProperty = SamplerProperties; while (*CurProperty != 0) { switch (*CurProperty) { case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - UrProps[0] = UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS; - UrProps[1] = ur_cast(*(++CurProperty)); + UrProps.normalizedCoords = ur_cast(*(++CurProperty)); } break; case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - UrProps[2] = UR_SAMPLER_PROPERTIES_ADDRESSING_MODE; pi_sampler_addressing_mode CurValueAddressingMode = ur_cast( ur_cast(*(++CurProperty))); if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_REPEAT) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_REPEAT; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_REPEAT; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_CLAMP) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_CLAMP; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_CLAMP; else if (CurValueAddressingMode == PI_SAMPLER_ADDRESSING_MODE_NONE) - UrProps[3] = UR_SAMPLER_ADDRESSING_MODE_NONE; + UrProps.addressingMode = UR_SAMPLER_ADDRESSING_MODE_NONE; } break; case PI_SAMPLER_PROPERTIES_FILTER_MODE: { - UrProps[4] = UR_SAMPLER_PROPERTIES_FILTER_MODE; pi_sampler_filter_mode CurValueFilterMode = ur_cast(ur_cast(*(++CurProperty))); if (CurValueFilterMode == PI_SAMPLER_FILTER_MODE_NEAREST) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_NEAREST; + UrProps.filterMode = UR_SAMPLER_FILTER_MODE_NEAREST; else if (CurValueFilterMode == 
PI_SAMPLER_FILTER_MODE_LINEAR) - UrProps[5] = UR_EXT_SAMPLER_FILTER_MODE_LINEAR; + UrProps.filterMode = UR_SAMPLER_FILTER_MODE_LINEAR; } break; default: @@ -3495,7 +3540,7 @@ inline pi_result piSamplerCreate(pi_context Context, ur_sampler_handle_t *UrSampler = reinterpret_cast(RetSampler); - HANDLE_ERRORS(urSamplerCreate(UrContext, UrProps, UrSampler)); + HANDLE_ERRORS(urSamplerCreate(UrContext, &UrProps, UrSampler)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 5f54f588febe4..2f29904b04563 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -137,14 +137,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( ur_native_handle_t - NativeContext, ///< [in] the native handle of the context. - ur_context_handle_t *Context ///< [out] pointer to the handle of the - ///< context object created. + NativeContext, ///< [in] the native handle of the context. + uint32_t NumDevices, const ur_device_handle_t *Devices, + bool OwnNativeHandle, + ur_context_handle_t + *Context ///< [out] pointer to the handle of the context object created. ) { try { ze_context_handle_t ZeContext = reinterpret_cast(NativeContext); - ur_context_handle_t_ *UrContext = new ur_context_handle_t_(ZeContext); + ur_context_handle_t_ *UrContext = new ur_context_handle_t_( + ZeContext, NumDevices, Devices, OwnNativeHandle); UrContext->initialize(); *Context = reinterpret_cast(UrContext); } catch (const std::bad_alloc &) { @@ -152,7 +155,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return UR_RESULT_SUCCESS; } @@ -310,7 +312,7 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) { Contexts.erase(It); } ze_context_handle_t DestroyZeContext = - Context->OwnZeContext ? Context->ZeContext : nullptr; + Context->OwnNativeHandle ? Context->ZeContext : nullptr; // Clean up any live memory associated with Context ur_result_t Result = Context->finalize(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 8cb8a94124b6a..a980a80a855f3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -29,7 +29,9 @@ struct ur_context_handle_t_ : _ur_object { ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices, const ur_device_handle_t *Devs, bool OwnZeContext) : ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices}, - OwnZeContext{OwnZeContext} {} + NumDevices{NumDevices} { + OwnNativeHandle = OwnZeContext; + } ur_context_handle_t_(ze_context_handle_t ZeContext) : ZeContext{ZeContext} {} @@ -44,10 +46,7 @@ struct ur_context_handle_t_ : _ur_object { // Therefore it can be accessed without holding a lock on this _pi_context. // const std::vector Devices; std::vector Devices; - - // Indicates if we own the ZeContext or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeContext = false; + uint32_t NumDevices{}; // Immediate Level Zero command list for the device in this context, to be // used for initializations. 
To be created as: diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 8983835ad0811..0a21858fc2842 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -406,25 +406,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeSingleFPCapabilities = Device->ZeDeviceModuleProperties->fp32flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeSingleFPCapabilities) { - SingleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + SingleFPValue |= + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{SingleFPValue}); } @@ -433,25 +434,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeHalfFPCapabilities = Device->ZeDeviceModuleProperties->fp16flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { - HalfFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{HalfFPValue}); } @@ -460,25 +461,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ze_device_fp_flags_t ZeDoubleFPCapabilities = Device->ZeDeviceModuleProperties->fp64flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { - 
DoubleFPValue |= UR_FP_CAPABILITY_FLAG_DENORM; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_DENORM; } if (ZE_DEVICE_FP_FLAG_INF_NAN & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_INF_NAN; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO; } if (ZE_DEVICE_FP_FLAG_ROUND_TO_INF & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_ROUND_TO_INF; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF; } if (ZE_DEVICE_FP_FLAG_FMA & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_FMA; + DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_FMA; } if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeDoubleFPCapabilities) { - DoubleFPValue |= UR_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + DoubleFPValue |= + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } return ReturnValue(uint64_t{DoubleFPValue}); } @@ -1138,7 +1140,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ur_device_handle_t Device, ///< [in] handle of the device to select binary for. - const uint8_t **BinaryArray, ///< [in] the array of binaries to select from. + const ur_device_binary_t + *Binaries, ///< [in] the array of binaries to select from. uint32_t NumBinaries, ///< [in] the number of binaries passed in ppBinaries. ///< Must greater than or equal to zero otherwise ///< ::UR_RESULT_ERROR_INVALID_VALUE is returned. @@ -1162,10 +1165,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // plugin for platform/device the ctx was created for. // Look for GEN binary, which we known can only be handled by Level-Zero now. - const char *BinaryTarget = __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; - - pi_device_binary *Binaries = - reinterpret_cast(const_cast(BinaryArray)); + const char *BinaryTarget = + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; @@ -1174,11 +1175,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( uint32_t Spirv = InvalidInd; for (uint32_t i = 0; i < NumBinaries; ++i) { - if (strcmp(Binaries[i]->DeviceTargetSpec, BinaryTarget) == 0) { + if (strcmp(Binaries[i].pDeviceTargetSpec, BinaryTarget) == 0) { *SelectedBinaryInd = i; return UR_RESULT_SUCCESS; } - if (strcmp(Binaries[i]->DeviceTargetSpec, + if (strcmp(Binaries[i].pDeviceTargetSpec, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) Spirv = i; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 92061bc0e91c4..336f8ea530cdb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -631,6 +631,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( const void *PropValue ///< [in][range(0, propSize)] pointer to memory ///< location holding the property value. 
) { + std::ignore = PropSize; + std::scoped_lock Guard(Kernel->Mutex); if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && *(static_cast(PropValue)) == PI_TRUE) { @@ -644,7 +646,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( ZE2UR_CALL(zeKernelSetIndirectAccess, (Kernel->ZeKernel, IndirectFlags)); } else if (PropName == UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG) { ze_cache_config_flag_t ZeCacheConfig{}; - auto CacheConfig = *(static_cast(PropValue)); + auto CacheConfig = + *(static_cast(PropValue)); if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM) ZeCacheConfig = ZE_CACHE_CONFIG_FLAG_LARGE_SLM; else if (CacheConfig == UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index bb146c2728e1b..d09f18fe76c48 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1281,11 +1281,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemAdvise( - ur_queue_handle_t Queue, ///< [in] handle of the queue object - const void *Mem, ///< [in] pointer to the USM memory object - size_t Size, ///< [in] size in bytes to be advised - ur_mem_advice_t Advice, ///< [in] USM memory advice +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + const void *Mem, ///< [in] pointer to the USM memory object + size_t Size, ///< [in] size in bytes to be advised + ur_usm_advice_flags_t Advice, ///< [in] USM memory advice ur_event_handle_t *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular command instance. @@ -1636,7 +1636,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags size_t Size, ///< [in] size in bytes of the memory object to be allocated - void *Host, ///< [in][optional] pointer to the buffer data + const ur_buffer_properties_t *Properties, ur_mem_handle_t *RetBuffer ///< [out] pointer to handle of the memory buffer created ) { @@ -1649,6 +1649,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // } + void *Host = Properties->pHost; + // If USM Import feature is enabled and hostptr is supplied, // import the hostptr if not already imported into USM. 
// Data transfer rate is maximized when both source and destination @@ -1755,7 +1757,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( Buffer, ///< [in] handle of the buffer object to allocate from ur_mem_flags_t Flags, ///< [in] allocation and usage information flags ur_buffer_create_type_t BufferCreateType, ///< [in] buffer creation type - ur_buffer_region_t + const ur_buffer_region_t *BufferCreateInfo, ///< [in] pointer to buffer create region information ur_mem_handle_t *RetMem ///< [out] pointer to the handle of sub buffer created @@ -1957,22 +1959,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Align, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM host memory object + void **RetMem ///< [out] pointer to USM host memory object ) { std::ignore = Pool; + uint32_t Align = USMDesc->align; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - ur_usm_mem_flags_t *USMFlag = &USMDesc->flags; + const ur_usm_flags_t *USMFlag = &USMDesc->flags; std::ignore = USMFlag; ur_platform_handle_t Plt = Context->getPlatform(); @@ -2002,7 +2005,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Align & (Align - 1)) != 0)) { - ur_usm_mem_flags_t Properties{}; + ur_usm_flags_t Properties{}; ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); if (IndirectAccessTrackingEnabled) { @@ -2038,22 +2041,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Alignment, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM device memory object + void **RetMem ///< [out] pointer to USM device memory object ) { std::ignore = Pool; + uint32_t Alignment = USMDesc->align; + // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. 
if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - ur_usm_mem_flags_t *USMProp = &USMDesc->flags; + const ur_usm_flags_t *USMProp = &USMDesc->flags; std::ignore = USMProp; ur_platform_handle_t Plt = Device->Platform; @@ -2121,20 +2126,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, ///< [in] handle of the device object - ur_usm_desc_t *USMDesc, ///< [in][optional] USM memory allocation descriptor + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created ///< using urUSMPoolCreate size_t Size, ///< [in] size in bytes of the USM memory object to be allocated - uint32_t Alignment, ///< [in] alignment of the USM memory object - void **RetMem ///< [out] pointer to USM shared memory object + void **RetMem ///< [out] pointer to USM shared memory object ) { std::ignore = Pool; - ur_usm_mem_flags_t *Properties = &USMDesc->flags; + const ur_usm_flags_t *Properties = &USMDesc->flags; + uint32_t Alignment = USMDesc->align; // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = *Properties & UR_USM_MEM_FLAG_DEVICE_READ_ONLY; + bool DeviceReadOnly = *Properties & UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. @@ -2165,8 +2171,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, Properties, - Size, Alignment); + ur_result_t Res = USMSharedAllocImpl( + RetMem, Context, Device, const_cast(Properties), Size, + Alignment); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2323,7 +2330,7 @@ ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - ur_usm_mem_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, Alignment); } @@ -2429,7 +2436,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeDesc; @@ -2455,7 +2462,7 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_mem_flags_t *, + ur_device_handle_t Device, ur_usm_flags_t *, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags @@ -2486,7 +2493,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment) { // TODO: 
translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -2752,8 +2759,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { HostAllocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, getAlignment(), + UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, reinterpret_cast(&ZeHandle))); } else { HostAllocation.ReleaseAction = allocation_t::free_native; @@ -2807,9 +2815,9 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { Allocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; UR_CALL(urUSMDeviceAlloc(UrContext, Device, &USMDesc, Pool, Size, - getAlignment(), reinterpret_cast(&ZeHandle))); } else { Allocation.ReleaseAction = allocation_t::free_native; @@ -2871,9 +2879,10 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, if (USMAllocatorConfigInstance.EnableBuffers) { HostAllocation.ReleaseAction = allocation_t::free; ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); ur_usm_pool_handle_t Pool{}; - UR_CALL(urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, - getAlignment(), &ZeHandleHost)); + UR_CALL( + urUSMHostAlloc(UrContext, &USMDesc, Pool, Size, &ZeHandleHost)); } else { HostAllocation.ReleaseAction = allocation_t::free_native; UR_CALL(ZeHostMemAllocHelper(&ZeHandleHost, UrContext, Size)); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 56b0c4a9dbaa6..575ab61959184 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -279,15 +279,15 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_mem_flags_t *, + ur_device_handle_t Device, ur_usm_flags_t *, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_mem_flags_t *Properties, size_t Size, + ur_usm_flags_t *Properties, size_t Size, uint32_t Alignment); // If indirect access tracking is not enabled then this functions just performs diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index e3e21eb3e98e2..941804b535b3c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -158,8 +158,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( return ReturnValue(Queue->Device); case UR_QUEUE_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{Queue->RefCount.load()}); - case UR_QUEUE_INFO_PROPERTIES: - die("UR_QUEUE_INFO_PROPERTIES in urQueueGetInfo not implemented\n"); + case UR_QUEUE_INFO_FLAGS: + die("UR_QUEUE_INFO_FLAGS in urQueueGetInfo not implemented\n"); break; case 
UR_QUEUE_INFO_SIZE: die("UR_QUEUE_INFO_SIZE in urQueueGetInfo not implemented\n"); @@ -265,30 +265,29 @@ static bool doEagerInit = [] { }(); UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( - ur_context_handle_t hContext, ///< [in] handle of the context object - ur_device_handle_t hDevice, ///< [in] handle of the device object - const ur_queue_property_t - *pProps, ///< [in] specifies a list of queue properties and their - ///< corresponding values. Each property name is immediately - ///< followed by the corresponding desired value. The list is - ///< terminated with a 0. If a property value is not specified, - ///< then its default value will be used. + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_queue_properties_t + *Props, ///< [in] specifies a list of queue properties and their + ///< corresponding values. Each property name is immediately + ///< followed by the corresponding desired value. The list is + ///< terminated with a 0. If a property value is not specified, + ///< then its default value will be used. ur_queue_handle_t - *phQueue ///< [out] pointer to handle of queue object created + *Queue ///< [out] pointer to handle of queue object created ) { - ur_context_handle_t Context = hContext; - ur_device_handle_t Device = hDevice; - ur_queue_handle_t_ **Queue = reinterpret_cast(phQueue); - Context->Devices[0] = Device; - const pi_queue_properties *Properties = - reinterpret_cast(pProps); - pi_queue_properties Flags = Properties[1]; - - auto ForceComputeIndex = Properties[2] == PI_QUEUE_COMPUTE_INDEX - ? static_cast(Properties[3]) - : -1; // Use default/round-robin. + int ForceComputeIndex = -1; // Use default/round-robin. + if (Props->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(Props->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *IndexProperties = + reinterpret_cast(extendedDesc); + ForceComputeIndex = IndexProperties->computeIndex; + } + } UR_ASSERT(Context->isValidDevice(Device), UR_RESULT_ERROR_INVALID_DEVICE); @@ -317,9 +316,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( nullptr); try { - *Queue = - new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, - Context, Device, true, Flags, ForceComputeIndex); + *Queue = new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Props->flags, + ForceComputeIndex); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -328,7 +327,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( // Do eager initialization of Level Zero handles on request. if (doEagerInit) { - ur_queue_handle_t Q = *phQueue; + ur_queue_handle_t Q = *Queue; // Creates said number of command-lists. auto warmupQueueGroup = [Q](bool UseCopyEngine, uint32_t RepeatCount) -> ur_result_t { @@ -732,8 +731,7 @@ ur_queue_handle_t_::ur_queue_handle_t_( std::vector &ComputeQueues, std::vector &CopyQueues, ur_context_handle_t Context, ur_device_handle_t Device, - bool OwnZeCommandQueue, pi_queue_properties Properties, - int ForceComputeIndex) + bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex) : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue}, Properties(Properties) { // Compute group initialization. 
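For reference, a minimal caller-side sketch (illustration only, not part of the patch) of how the compute-index extension handled in urQueueCreate above is expected to be supplied: the index travels in a ur_queue_index_properties_t chained through ur_queue_properties_t::pNext. Field and enum names are taken from the hunks above; the UR_STRUCTURE_TYPE_QUEUE_PROPERTIES value for the outer struct, and the Context/Device handles, are assumptions.

    // Request a specific compute queue index at queue-creation time.
    ur_queue_index_properties_t IndexProps{};
    IndexProps.stype = UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES;
    IndexProps.pNext = nullptr;
    IndexProps.computeIndex = 1; // ask for the second compute index

    ur_queue_properties_t Props{};
    Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; // assumed stype value
    Props.pNext = &IndexProps;                        // extension chained via pNext
    Props.flags = 0;

    ur_queue_handle_t Queue = nullptr;
    urQueueCreate(Context, Device, &Props, &Queue);
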
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 75b64638ac262..76cfda295f2f8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -10,11 +10,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -81,7 +81,7 @@ struct ur_queue_handle_t_ : _ur_object { ur_queue_handle_t_(std::vector &ComputeQueues, std::vector &CopyQueues, ur_context_handle_t Context, ur_device_handle_t Device, - bool OwnZeCommandQueue, pi_queue_properties Properties = 0, + bool OwnZeCommandQueue, ur_queue_flags_t Properties = 0, int ForceComputeIndex = -1); using queue_type = ur_device_handle_t_::queue_group_info_t::type; @@ -207,7 +207,7 @@ struct ur_queue_handle_t_ : _ur_object { bool OwnZeCommandQueue; // Keeps the properties of this queue. - pi_queue_properties Properties; + ur_queue_flags_t Properties; // Map of all command lists used in this queue. ur_command_list_map_t CommandListMap; @@ -499,10 +499,11 @@ struct ur_queue_handle_t_ : _ur_object { // plugin only. // \param ForceHostVisible tells if the event must be created in // the host-visible pool -ur_result_t createEventAndAssociateQueue( - ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt); +ur_result_t +createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, + ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, + std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd // list. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 5fdeb4ca0a7af..42c431ec94632 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -11,7 +11,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object - const ur_sampler_property_t + const ur_sampler_desc_t *Props, ///< [in] specifies a list of sampler property names and their ///< corresponding values. 
ur_sampler_handle_t @@ -42,87 +42,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( // b) SamplerProperties list is missing any properties if (Props) { - uint32_t PropCount = 0; - while (PropCount < 6) { // We expect only 3 pairs of sampler properties - switch (Props[PropCount]) { - case UR_SAMPLER_PROPERTIES_NORMALIZED_COORDS: { - auto CurValueBool = Props[++PropCount]; - - if (CurValueBool == 1UL) - ZeSamplerDesc.isNormalized = PI_TRUE; - else if (CurValueBool == 0UL) - ZeSamplerDesc.isNormalized = PI_FALSE; - else { - urPrint("urSamplerCreate: unsupported " - "UR_SAMPLER_INFO_NORMALIZED_COORDS value\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - case UR_SAMPLER_PROPERTIES_ADDRESSING_MODE: { - ur_sampler_addressing_mode_t CurValueAddressingMode = - static_cast(Props[++PropCount]); - - // Level Zero runtime with API version 1.2 and lower has a bug: - // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to - // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to - // border", i.e. logic is flipped. Starting from API version 1.3 this - // problem is going to be fixed. That's why check for API version to set - // an address mode. - ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; - // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE - switch (CurValueAddressingMode) { - case UR_SAMPLER_ADDRESSING_MODE_NONE: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; - break; - case UR_SAMPLER_ADDRESSING_MODE_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; - break; - case UR_SAMPLER_ADDRESSING_MODE_CLAMP: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP - : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; - break; - case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: - ZeSamplerDesc.addressMode = - ZeApiVersion < ZE_MAKE_VERSION(1, 3) - ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER - : ZE_SAMPLER_ADDRESS_MODE_CLAMP; - break; - case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: - ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; - break; - default: - urPrint("urSamplerCreate: unsupported " - "UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE " - "value\n"); - urPrint("UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE=%d\n", - CurValueAddressingMode); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - case UR_SAMPLER_PROPERTIES_FILTER_MODE: { - ur_ext_sampler_filter_mode_t CurValueFilterMode = - static_cast(Props[++PropCount]); - - if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_NEAREST) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; - else if (CurValueFilterMode == UR_EXT_SAMPLER_FILTER_MODE_LINEAR) - ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; - else { - urPrint("UR_SAMPLER_FILTER_MODE=%d\n", CurValueFilterMode); - urPrint( - "urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - } break; - - default: - break; - } - PropCount++; + ZeSamplerDesc.isNormalized = Props->normalizedCoords; + + // Level Zero runtime with API version 1.2 and lower has a bug: + // ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER is implemented as "clamp to + // edge" and ZE_SAMPLER_ADDRESS_MODE_CLAMP is implemented as "clamp to + // border", i.e. logic is flipped. Starting from API version 1.3 this + // problem is going to be fixed. That's why check for API version to set + // an address mode. 
+ ze_api_version_t ZeApiVersion = Context->getPlatform()->ZeApiVersion; + // TODO: add support for PI_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE + switch (Props->addressingMode) { + case UR_SAMPLER_ADDRESSING_MODE_NONE: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_NONE; + break; + case UR_SAMPLER_ADDRESSING_MODE_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_REPEAT; + break; + case UR_SAMPLER_ADDRESSING_MODE_CLAMP: + ZeSamplerDesc.addressMode = ZeApiVersion < ZE_MAKE_VERSION(1, 3) + ? ZE_SAMPLER_ADDRESS_MODE_CLAMP + : ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER; + break; + case UR_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: + ZeSamplerDesc.addressMode = ZeApiVersion < ZE_MAKE_VERSION(1, 3) + ? ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER + : ZE_SAMPLER_ADDRESS_MODE_CLAMP; + break; + case UR_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: + ZeSamplerDesc.addressMode = ZE_SAMPLER_ADDRESS_MODE_MIRROR; + break; + default: + urPrint("urSamplerCreate: unsupported " + "UR_SAMPLER_PROPERTIES_ADDRESSING_MODEE " + "value\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + if (Props->filterMode == UR_SAMPLER_FILTER_MODE_NEAREST) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_NEAREST; + else if (Props->filterMode == UR_SAMPLER_FILTER_MODE_LINEAR) + ZeSamplerDesc.filterMode = ZE_SAMPLER_FILTER_MODE_LINEAR; + else { + urPrint("urSamplerCreate: unsupported UR_SAMPLER_FILTER_MODE value\n"); + return UR_RESULT_ERROR_INVALID_VALUE; } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index a117de71b57e6..e6164fe6519af 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -87,7 +87,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnUSMFill = urEnqueueUSMFill; pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; - pDdiTable->pfnUSMMemAdvise = urEnqueueUSMMemAdvise; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d25e36db39bc5..c1d7464387922 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -60,7 +60,10 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; -const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 5; +const int UR_EXT_USM_MEM_FLAG_WRITE_COMBINED = 1 << 27; +const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; +const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; +const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); @@ -77,20 +80,17 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER = const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); -typedef enum ur_ext_sampler_filter_mode_t { - UR_EXT_SAMPLER_FILTER_MODE_NEAREST = 0, - UR_EXT_SAMPLER_FILTER_MODE_LINEAR = 1, - UR_EXT_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff -} 
ur_ext_sampler_filter_mode_t; - const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 2); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 3); -const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = - (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 4); + +typedef enum { + // No preference for SLM or data cache. + UR_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = 0x0, + // Large SLM size. + UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_SLM = 0x1, + // Large General Data size. + UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = 0x2 +} ur_kernel_cache_config; // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { From 0535b22cd0ed3cd140c6159c7f4eb1b3d5f86abc Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 9 Apr 2023 21:30:49 -0700 Subject: [PATCH 07/50] Some fixes to interop and other tests Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 15 +++++++++------ .../level_zero/ur_level_zero_kernel.cpp | 3 +++ .../adapters/level_zero/ur_level_zero_mem.cpp | 17 ++++++----------- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 5b709ef7adacf..059990a7906e3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG b5c2119ba147306a76067e86c25e0c6c383172c6) + set(UNIFIED_RUNTIME_TAG 6bcd2a224d717cf904568d7311e84e2d057fcbef) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 509448db3d3a4..4cac17faf43d9 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1724,7 +1724,9 @@ inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL); - ur_mem_handle_t UrMemory = reinterpret_cast(*ArgValue); + ur_mem_handle_t UrMemory{}; + if (ArgValue) + UrMemory = reinterpret_cast(*ArgValue); // We don't yet know the device where this kernel will next be run on. // Thus we can't know the actual memory allocation that needs to be used. 
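The pi2ur.hpp hunks above all follow the same thin-shim pattern: PI handles are reinterpreted as the corresponding UR handles and the UR call's result is folded back into a pi_result through the HANDLE_ERRORS macro used throughout this file. A condensed sketch of that pattern follows; piextExampleCall and urExampleCall are hypothetical names standing in for any of the entry points touched above.

    // Hypothetical PI entry point showing the pi2ur shim pattern (sketch only).
    inline pi_result piextExampleCall(pi_context Context, pi_kernel Kernel) {
      auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
      auto UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
      // HANDLE_ERRORS maps a failing ur_result_t to the matching pi_result
      // and returns it from the enclosing PI function.
      HANDLE_ERRORS(urExampleCall(UrContext, UrKernel));
      return PI_SUCCESS;
    }
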
@@ -1765,10 +1767,11 @@ piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(NativeHandle); ur_context_handle_t UrContext = reinterpret_cast(Context); - std::ignore = Program; + ur_program_handle_t UrProgram = + reinterpret_cast(Program); ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); - HANDLE_ERRORS( - urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, UrKernel)); + HANDLE_ERRORS(urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, + UrProgram, UrKernel)); (*UrKernel)->OwnNativeHandle = OwnNativeHandle; return PI_SUCCESS; @@ -2580,8 +2583,8 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ur_mem_handle_t *UrMem = reinterpret_cast(Mem); // TODO: Pass OwnNativeHandle to the output parameter // while we get it in interface - (*UrMem)->OwnNativeHandle = OwnNativeHandle; - HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, UrMem)); + HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, + OwnNativeHandle, UrMem)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 336f8ea530cdb..d5cc3f3894abf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -710,6 +710,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. ur_context_handle_t Context, ///< [in] handle of the context object + ur_program_handle_t Program, ur_kernel_handle_t * RetKernel ///< [out] pointer to the handle of the kernel object created. ) { @@ -726,6 +727,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return UR_RESULT_ERROR_UNKNOWN; } + Kernel->Program = Program; + UR_CALL(Kernel->initialize()); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index d09f18fe76c48..2a96cf1c9a8c9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1803,15 +1803,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. ur_context_handle_t Context, ///< [in] handle of the context object + bool OwnNativeHandle, ur_mem_handle_t *Mem ///< [out] pointer to the handle of the mem object created. 
) { std::shared_lock Lock(Context->Mutex); - // TODO: Get OwnNativeHandle from the output parameter while we get it in - // interface - bool OwnNativeHandle = (*Mem)->OwnNativeHandle; - // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -1845,7 +1842,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( _ur_buffer *Buffer = nullptr; try { - Buffer = new _ur_buffer(Context, Device, Size); + Buffer = new _ur_buffer(Context, Size, Device, ur_cast(NativeMem), + OwnNativeHandle); *Mem = reinterpret_cast(Buffer); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; @@ -1867,12 +1865,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( // allocations in this context are released. UR_CALL(urContextRetain(Context)); - Context->MemAllocs.emplace( - std::piecewise_construct, std::forward_as_tuple(Ptr), - std::forward_as_tuple(Context, - true /*ownNativeHandle, how do we pass it here? or - do we move all this logic to pi2ur? */ - )); + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(Ptr), + std::forward_as_tuple(Context, OwnNativeHandle)); } if (Device) { From e079d6aa03de99a16e71a1dacc85353179d2c12c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 09:43:18 -0700 Subject: [PATCH 08/50] Implement urPlatformGetApiVersion Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 1f2430274e6f4..c247b4d854047 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -288,10 +288,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( ur_platform_handle_t Driver, ///< [in] handle of the platform ur_api_version_t *Version ///< [out] api version ) { - std::ignore = Driver; - std::ignore = Version; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + *Version = UR_API_VERSION_0_6; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( From baefa3a32691fd327707498e7d9ba0a3536bb5fc Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 10:47:37 -0700 Subject: [PATCH 09/50] Add UR_CONTEXT_INFO_REFERENCE_COUNT Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 4cac17faf43d9..5a0d83352b146 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1139,7 +1139,7 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, break; } case PI_CONTEXT_INFO_REFERENCE_COUNT: { - ContextInfoType = UR_EXT_CONTEXT_INFO_REFERENCE_COUNT; + ContextInfoType = UR_CONTEXT_INFO_REFERENCE_COUNT; break; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 2f29904b04563..6dc5ad362d6ed 
100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -101,7 +101,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(&Context->Devices[0], Context->Devices.size()); case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(uint32_t(Context->Devices.size())); - case UR_EXT_CONTEXT_INFO_REFERENCE_COUNT: + case UR_CONTEXT_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c1d7464387922..f8578cafbb3b8 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -65,9 +65,6 @@ const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; -const ur_context_info_t UR_EXT_CONTEXT_INFO_REFERENCE_COUNT = - (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 2); - const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); From 7f8a6c527115289d04a443db1687957c8a7b737d Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 10 Apr 2023 11:07:12 -0700 Subject: [PATCH 10/50] Port Optimize sync of an in-order queue https://github.com/intel/llvm/pull/8601 https://github.com/intel/llvm/pull/8993 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_queue.cpp | 32 +++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 941804b535b3c..b4bdf3347096f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1248,6 +1248,7 @@ ur_result_t ur_queue_handle_t_::synchronize() { ZE2UR_CALL(zeHostSynchronize, (zeEvent)); Event->Completed = true; UR_CALL(urEventRelease(Event)); + // Cleanup all events from the synced command list. auto EventListToCleanup = std::move(ImmCmdList->second.EventList); ImmCmdList->second.EventList.clear(); @@ -1255,17 +1256,30 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; }; - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) - for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + // Do nothing if the queue is empty + if (!LastCommandEvent) + return UR_RESULT_SUCCESS; + + // For in-order queue just wait for the last command. + // If event is discarded then it can be in reset state or underlying level + // zero handle can have device scope, so we can't synchronize the last event. + if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + } else { + // Otherwise sync all L0 queues/immediate command-lists. 
+ for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } } } + } LastCommandEvent = nullptr; // With the entire queue synchronized, the active barriers must be done so we From 28d7280b73bda08a3509a8183efd79c66b8304e7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 12 Apr 2023 07:47:06 -0700 Subject: [PATCH 11/50] Port Do not use piGetDeviceAndHostTimer for only host time query https://github.com/intel/llvm/pull/8996 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 318a931d608f3..f1eba37f331ec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -426,9 +426,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: // Note: No users for this case - // TODO: Implement commmand submission time when needed, - // by recording device timestamp (using zeDeviceGetGlobalTimestamps) - // before submitting command to device + // The "command_submit" time is implemented by recording submission + // timestamp with a call to piGetDeviceAndHostTimer before command enqueue. + // return ReturnValue(uint64_t{0}); default: urPrint("urEventGetProfilingInfo: not supported ParamName\n"); From 77af5383da98519c52f06fbf0834cec3e920b3ff Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:31:40 -0700 Subject: [PATCH 12/50] Port Fix PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE queries https://github.com/intel/llvm/pull/8769 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_kernel.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index d5cc3f3894abf..7a523e561967a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -500,10 +500,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( // TODO: To revisit after level_zero/issues/262 is resolved struct { size_t Arr[3]; - } WorkSize = {{Device->ZeDeviceComputeProperties->maxGroupSizeX, - Device->ZeDeviceComputeProperties->maxGroupSizeY, - Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; - return ReturnValue(WorkSize); + } GlobalWorkSize = {{(Device->ZeDeviceComputeProperties->maxGroupSizeX * + Device->ZeDeviceComputeProperties->maxGroupCountX), + (Device->ZeDeviceComputeProperties->maxGroupSizeY * + Device->ZeDeviceComputeProperties->maxGroupCountY), + (Device->ZeDeviceComputeProperties->maxGroupSizeZ * + Device->ZeDeviceComputeProperties->maxGroupCountZ)}}; + return ReturnValue(GlobalWorkSize); } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { // As of right now, L0 is missing API to query kernel and device specific From 
d39759aeb2f3daa32fab08f407d051acba7ab435 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:34:59 -0700 Subject: [PATCH 13/50] Port Retain build-log when program build failed https://github.com/intel/llvm/pull/8848 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_program.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index f9e32aa395084..0b4d07b0366a3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -654,6 +654,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetBuildInfo( if (PropSizeRet) { *PropSizeRet = LogSize; } + if (PropValue) { + // When the program build fails in piProgramBuild(), we delayed cleaning + // up the build log because RT later calls this routine to get the + // failed build log. + // To avoid memory leaks, we should clean up the failed build log here + // because RT does not create sycl::program when piProgramBuild() fails, + // thus it won't call piProgramRelease() to clean up the build log. + if (Program->State == ur_program_handle_t_::Invalid) { + ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (Program->ZeBuildLog)); + Program->ZeBuildLog = nullptr; + } + } return UR_RESULT_SUCCESS; } From a02b1a5322ae297335949ed9602845c4064f6411 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:40:02 -0700 Subject: [PATCH 14/50] Port Heuristically reduce overhead from immediate command-list cleanup https://github.com/intel/llvm/pull/9052 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 2 +- .../level_zero/ur_level_zero_queue.cpp | 41 +++++++++++++++---- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 6dc5ad362d6ed..55354358124bd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -518,7 +518,7 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { static const size_t ImmCmdListsEventCleanupThreshold = [] { const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); - static constexpr int Default = 20; + static constexpr int Default = 1000; if (!ImmCmdListsEventCleanupThresholdStr) return Default; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index b4bdf3347096f..298b9d65467fb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1467,26 +1467,49 @@ ur_result_t ur_queue_handle_t_::resetCommandList( std::back_inserter(EventListToCleanup)); EventList.clear(); } else if (!isDiscardEvents()) { - // For immediate commandlist reset only those events that have signalled. // If events in the queue are discarded then we can't check their status. 
- for (auto it = EventList.begin(); it != EventList.end();) { - std::scoped_lock EventLock((*it)->Mutex); + // Helper for checking of event completion + auto EventCompleted = [](ur_event_handle_t Event) -> bool { + std::scoped_lock EventLock(Event->Mutex); ze_result_t ZeResult = - (*it)->Completed + Event->Completed ? ZE_RESULT_SUCCESS - : ZE_CALL_NOCHECK(zeEventQueryStatus, ((*it)->ZeEvent)); + : ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent)); + return ZeResult == ZE_RESULT_SUCCESS; + }; + // Handle in-order specially as we can just in few checks (with binary + // search) a completed event and then all events before it are also + // done. + if (isInOrderQueue()) { + size_t Bisect = EventList.size(); + size_t Iter = 0; + for (auto it = EventList.rbegin(); it != EventList.rend(); ++Iter) { + if (!EventCompleted(*it)) { + if (Bisect > 1 && Iter < 3) { // Heuristically limit by 3 checks + Bisect >>= 1; + it += Bisect; + continue; + } + break; + } + // Bulk move of event up to "it" to the list ready for cleanup + std::move(it, EventList.rend(), std::back_inserter(EventListToCleanup)); + EventList.erase(EventList.begin(), it.base()); + break; + } + return UR_RESULT_SUCCESS; + } + // For immediate commandlist reset only those events that have signalled. + for (auto it = EventList.begin(); it != EventList.end();) { // Break early as soon as we found first incomplete event because next // events are submitted even later. We are not trying to find all // completed events here because it may be costly. I.e. we are checking // only elements which are most likely completed because they were // submitted earlier. It is guaranteed that all events will be eventually // cleaned up at queue sync/release. - if (ZeResult == ZE_RESULT_NOT_READY) + if (!EventCompleted(*it)) break; - if (ZeResult != ZE_RESULT_SUCCESS) - return ze2urResult(ZeResult); - EventListToCleanup.push_back(std::move((*it))); it = EventList.erase(it); } From 7250d4610102365d04443d9e8f92d30ec596682c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 13 Apr 2023 22:49:52 -0700 Subject: [PATCH 15/50] Port Add support to propagate compile flags to device backend compiler https://github.com/intel/llvm/pull/8763 Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 19 ++++++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 16 ++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 8 +++++ .../level_zero/ur_level_zero_platform.cpp | 30 +++++++++++++++++++ .../level_zero/ur_loader_interface.cpp | 1 + 6 files changed, 75 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c8b823d47602e..ee8b124a19d86 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -53,6 +53,25 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, } pi_result piPluginGetLastError(char **message) { + return pi2ur::piPluginGetLastError(message); +} + +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return '-ze-opt-disable' for frontend_option = -O0. +// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. +// Return '-ze-opt-level=2' for frontend_option = -O3. 
+pi_result piPluginGetBackendOption(pi_platform platform, + const char *frontend_option, + const char **backend_option) { + return pi2ur::piPluginGetBackendOption(platform, frontend_option, + backend_option); +} + +pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType, + pi_uint32 NumEntries, pi_device *Devices, + pi_uint32 *NumDevices) { + return pi2ur::piDevicesGet(Platform, DeviceType, NumEntries, Devices, NumDevices); } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 059990a7906e3..3af474c33af6c 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 6bcd2a224d717cf904568d7311e84e2d057fcbef) + set(UNIFIED_RUNTIME_TAG b674dc2b59997d5b6cff462f8c33ee05a2ce0450) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 5a0d83352b146..8aef9ef74f0b6 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -506,6 +506,22 @@ inline pi_result piextPluginGetOpaqueData(void *opaque_data_param, return PI_ERROR_UNKNOWN; } +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return '-ze-opt-disable' for frontend_option = -O0. +// Return '-ze-opt-level=1' for frontend_option = -O1 or -O2. +// Return '-ze-opt-level=2' for frontend_option = -O3. +inline pi_result piPluginGetBackendOption(pi_platform Platform, + const char *FrontendOption, + const char **PlatformOption) { + + auto UrPlatform = reinterpret_cast(Platform); + HANDLE_ERRORS( + urPlatformGetBackendOption(UrPlatform, FrontendOption, PlatformOption)); + + return PI_SUCCESS; +} + // Platform /////////////////////////////////////////////////////////////////////////////// diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index b719273bf484e..3aa24712478ea 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -981,6 +981,13 @@ __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } +__SYCL_EXPORT pi_result piPluginGetBackendOption(pi_platform platform, + const char *frontend_option, + const char **backend_option) { + return pi2ur::piPluginGetBackendOption(platform, frontend_option, + backend_option); +} + // This interface is not in Unified Runtime currently __SYCL_EXPORT pi_result piTearDown(void *PluginParameter) { return pi2ur::piTearDown(PluginParameter); @@ -1025,6 +1032,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piextPlatformCreateWithNativeHandle) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) + _PI_API(piPluginGetBackendOption) _PI_API(piContextCreate) _PI_API(piContextRelease) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index c247b4d854047..61c021472bf3f 100644 --- 
a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -536,3 +536,33 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { DeviceCachePopulated = true; return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t Platform, ///< [in] handle of the platform instance. + const char *FrontendOption, ///< [in] string containing the frontend option. + const char * + *PlatformOption ///< [out] returns the correct platform specific + ///< compiler option based on the frontend option. +) { + using namespace std::literals; + if (FrontendOption == nullptr) { + return UR_RESULT_SUCCESS; + } + if (FrontendOption == ""sv) { + *PlatformOption = ""; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O0"sv) { + *PlatformOption = "-ze-opt-disable"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O1"sv || FrontendOption == "-O2"sv) { + *PlatformOption = "-ze-opt-level=1"; + return UR_RESULT_SUCCESS; + } + if (FrontendOption == "-O3"sv) { + *PlatformOption = "-ze-opt-level=2"; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index e6164fe6519af..01b174aa93774 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -180,6 +180,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( pDdiTable->pfnGetNativeHandle = urPlatformGetNativeHandle; pDdiTable->pfnCreateWithNativeHandle = urPlatformCreateWithNativeHandle; pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; return retVal; } From f569e92029a1f50313e2a2ce5be216fa8cfa671c Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 17 Apr 2023 09:55:03 -0700 Subject: [PATCH 16/50] Update loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 3af474c33af6c..770b3d360f1f3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG b674dc2b59997d5b6cff462f8c33ee05a2ce0450) + set(UNIFIED_RUNTIME_TAG 91d194234710f40c7d4dc3670cca7abc2020682f) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime From 3d18689d7af9021d87a064103d36e2c2a25c4028 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:28:40 -0700 Subject: [PATCH 17/50] Some fixes for test_queue, and rebase loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- .../level_zero/ur_level_zero_device.cpp | 5 ++++ .../adapters/level_zero/ur_level_zero_mem.cpp | 5 +++- .../level_zero/ur_level_zero_queue.cpp | 27 ++++++++++++------- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 770b3d360f1f3..d66bfaba17bf1 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 91d194234710f40c7d4dc3670cca7abc2020682f) + set(UNIFIED_RUNTIME_TAG 586cc2d9a9612ad6886704aba7b38f1cd8ae610e) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 0a21858fc2842..28b91a729e328 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -764,6 +764,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + return ReturnValue(0); + } + default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 2a96cf1c9a8c9..73ac57cf9ec93 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1649,7 +1649,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // } - void *Host = Properties->pHost; + void *Host = nullptr; + if (Properties) { + Host = Properties->pHost; + } // If USM Import feature is enabled and hostptr is supplied, // import the hostptr if not already imported into USM. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 298b9d65467fb..df7d39be50cb5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -278,14 +278,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( ) { Context->Devices[0] = Device; + ur_queue_flags_t Flags{}; + if (Props) { + Flags = Props->flags; + } + int ForceComputeIndex = -1; // Use default/round-robin. 
- if (Props->pNext) { - const ur_base_properties_t *extendedDesc = - reinterpret_cast(Props->pNext); - if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { - const ur_queue_index_properties_t *IndexProperties = - reinterpret_cast(extendedDesc); - ForceComputeIndex = IndexProperties->computeIndex; + if (Props) { + if (Props->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(Props->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *IndexProperties = + reinterpret_cast(extendedDesc); + ForceComputeIndex = IndexProperties->computeIndex; + } } } @@ -316,9 +323,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( nullptr); try { - *Queue = new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, - Context, Device, true, Props->flags, - ForceComputeIndex); + *Queue = + new ur_queue_handle_t_(ZeComputeCommandQueues, ZeCopyCommandQueues, + Context, Device, true, Flags, ForceComputeIndex); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { From 4c6b101202659e023574e4b595c88f4bc9021003 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:36:19 -0700 Subject: [PATCH 18/50] Partially port native image handle support for LevelZero https://github.com/intel/llvm/pull/8603/files Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 44 +++--------- sycl/plugins/unified_runtime/pi2ur.hpp | 23 ++++++ .../unified_runtime/pi_unified_runtime.cpp | 8 +++ .../adapters/level_zero/ur_level_zero_mem.cpp | 72 ++++++++++++++++--- .../adapters/level_zero/ur_level_zero_mem.hpp | 7 ++ 5 files changed, 108 insertions(+), 46 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ee8b124a19d86..bc55890ada108 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -271,42 +271,6 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ownNativeHandle, Mem); } -pi_result piextMemImageCreateWithNativeHandle( - pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, - const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, - pi_mem *RetImage) { - - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - - std::shared_lock Lock(Context->Mutex); - - ze_image_handle_t ZeHImage = pi_cast(NativeHandle); - - try { - auto ZePIImage = new _pi_image(Context, ZeHImage, OwnNativeHandle); - *RetImage = ZePIImage; - -#ifndef NDEBUG - ZeStruct ZeImageDesc; - pi_result DescriptionResult = - pi2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); - if (DescriptionResult != PI_SUCCESS) - return DescriptionResult; - - ZePIImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - } catch (const std::bad_alloc &) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return PI_SUCCESS; -} - pi_result piProgramCreate(pi_context Context, const void *ILBytes, size_t Length, pi_program *Program) { return pi2ur::piProgramCreate(Context, ILBytes, Length, Program); @@ -323,6 +287,14 @@ pi_result piProgramCreateWithBinary( Metadata, BinaryStatus, Program); } +pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *Img) { + return pi2ur::piextMemImageCreateWithNativeHandle( + NativeHandle, Context, OwnNativeHandle, ImageFormat, ImageDesc, Img); +} + pi_result piclProgramCreateWithSource(pi_context Context, pi_uint32 Count, const char **Strings, const size_t *Lengths, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 8aef9ef74f0b6..55dd4258e6f33 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2492,6 +2492,29 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, return PI_SUCCESS; } +inline pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *RetImage) { + + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + + std::ignore = NativeHandle; + std::ignore = Context; + std::ignore = OwnNativeHandle; + std::ignore = ImageFormat; + std::ignore = ImageDesc; + std::ignore = RetImage; + + // ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + // HANDLE_ERRORS(urMemImageCreateWithNativeHandle(UrContext, OwnNativeHandle, + // HostPtr, UrMem)); + + return PI_SUCCESS; +} + inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, pi_buffer_create_type BufferCreateType, void *BufferCreateInfo, pi_mem *RetMem) { diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 3aa24712478ea..3cf3e10a21676 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -975,6 +975,14 @@ pi_result piextEnqueueDeviceGlobalVariableRead( NumEventsInWaitList, EventsWaitList, Event); } +pi_result piextMemImageCreateWithNativeHandle( + pi_native_handle NativeHandle, pi_context Context, bool OwnNativeHandle, + const pi_image_format *ImageFormat, const pi_image_desc *ImageDesc, + pi_mem *Img) { + return pi2ur::piextMemImageCreateWithNativeHandle( + NativeHandle, Context, OwnNativeHandle, ImageFormat, ImageDesc, Img); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 73ac57cf9ec93..5cf5cedc43a00 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1449,15 +1449,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_mem_flags_t Flags, ///< [in] allocation and 
usage information flags - const ur_image_format_t - *ImageFormat, ///< [in] pointer to image format specification - const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description - void *Host, ///< [in] pointer to the buffer data - ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created -) { +static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, + const ur_image_desc_t *ImageDesc, + ZeStruct &ZeImageDesc) { + ze_image_format_type_t ZeImageFormatType; size_t ZeImageFormatTypeSize; switch (ImageFormat->channelType) { @@ -1581,7 +1576,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_ERROR_INVALID_VALUE; } - ZeStruct ZeImageDesc; ZeImageDesc.arraylevels = ZeImageDesc.flags = 0; ZeImageDesc.type = ZeImageType; ZeImageDesc.format = ZeFormatDesc; @@ -1591,8 +1585,66 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( ZeImageDesc.arraylevels = ur_cast(ImageDesc->arraySize); ZeImageDesc.miplevels = ImageDesc->numMipLevel; + return UR_RESULT_SUCCESS; +} + +#if 0 +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. + ur_context_handle_t Context, ///< [in] handle of the context object + bool OwnNativeHandle, +/* + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description +*/ + ur_mem_handle_t + *Mem ///< [out] pointer to the handle of the mem object created. +) { + + std::shared_lock Lock(Context->Mutex); + + ze_image_handle_t ZeImage = ur_cast(NativeMem); + +try { + auto UrImage = + new _ur_image(ur_cast(Context), ZeImage, OwnNativeHandle); + *Mem = reinterpret_cast(UrImage); + +/* +#ifndef NDEBUG + ZeStruct ZeImageDesc; + UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); + + UrImage->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG +*/ + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; + +} +#endif + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_mem_flags_t Flags, ///< [in] allocation and usage information flags + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description + void *Host, ///< [in] pointer to the buffer data + ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created +) { std::shared_lock Lock(Context->Mutex); + ZeStruct ZeImageDesc; + UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); + // Currently we have the "0" device in context with mutliple root devices to // own the image. 
// TODO: Implement explicit copying for acessing the image from other devices diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 575ab61959184..ecae9a0c1b11b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -193,6 +193,11 @@ struct _ur_image final : ur_mem_handle_t_ { _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage) : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} + _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage, + bool OwnNativeHandle) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage}, + OwnZeMemHandle{OwnNativeHandle} {} + virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, ur_device_handle_t = nullptr) override { ZeHandle = reinterpret_cast(ZeImage); @@ -213,6 +218,8 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; + + bool OwnZeMemHandle = true; }; // Implements memory allocation via L0 RT for USM allocator interface. From 65cd8b2b80ba4bdb1c5ed82c5253357796db43fa Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 18 Apr 2023 16:40:07 -0700 Subject: [PATCH 19/50] Port Report events as submitted, not running, until they are completed https://github.com/intel/llvm/pull/9094 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index f1eba37f331ec..237d27cf130a9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -337,8 +337,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( // Level Zero has a much more explicit notion of command submission than // OpenCL. It doesn't happen unless the user submits a command list. We've - // done it just above so the status is at least PI_EVENT_RUNNING. - uint32_t Result = ur_cast(UR_EVENT_STATUS_RUNNING); + // done it just above so the status is at least PI_EVENT_SUBMITTED. + // + // NOTE: We currently cannot tell if command is currently running, so + // it will always show up "submitted" before it is finally "completed". + // + uint32_t Result = ur_cast(UR_EVENT_STATUS_SUBMITTED); // Make sure that we query a host-visible event only. 
// If one wasn't yet created then don't create it here as well, and From d9da97b54ab588b3a95cc0dde930f6af870e3d67 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:15:31 -0700 Subject: [PATCH 20/50] Remove not needed code Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_common.hpp | 122 ------------------ .../level_zero/ur_level_zero_kernel.cpp | 4 +- 2 files changed, 2 insertions(+), 124 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index a26e3412fadca..599527ae34a2d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -362,128 +362,6 @@ const bool IndirectAccessTrackingEnabled = [] { extern const bool UseUSMAllocator; -// The getInfo*/ReturnHelper facilities provide shortcut way of -// writing return bytes for the various getInfo APIs. -template -ur_result_t urL0getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, - size_t value_size, Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return UR_RESULT_SUCCESS; -} - -template -ur_result_t urL0getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - std::ignore = value_size; - *static_cast(param_value) = value; - }; - - return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, - value, sizeof(T), assignment); -} - -template -ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - return urL0getInfoImpl(param_value_size, param_value, param_value_size_ret, - value, array_length * sizeof(T), memcpy); -} - -template -ur_result_t urL0getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - if (param_value) { - memset(param_value, 0, param_value_size); - for (uint32_t I = 0; I < array_length; I++) - ((RetType *)param_value)[I] = (RetType)value[I]; - } - if (param_value_size_ret) - *param_value_size_ret = array_length * sizeof(RetType); - return UR_RESULT_SUCCESS; -} - -template <> -inline ur_result_t -urL0getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, const char *value) { - return urL0getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - -class UrL0ReturnHelperBase { -public: - UrL0ReturnHelperBase(size_t param_value_size, void *param_value, - size_t *param_value_size_ret) - : param_value_size(param_value_size), param_value(param_value), - param_value_size_ret(param_value_size_ret) {} - - // A version where in/out info size is represented by a single pointer - // to a value which is updated on return - UrL0ReturnHelperBase(size_t *param_value_size, void *param_value) - : param_value_size(*param_value_size), param_value(param_value), - param_value_size_ret(param_value_size) {} - - // Scalar return value - template ur_result_t operator()(const T &t) { - return getInfo(param_value_size, param_value, 
param_value_size_ret, t); - } - - // Array return value - template ur_result_t operator()(const T *t, size_t s) { - return urL0getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - - // Array return value where element type is differrent from T - template - ur_result_t operator()(const T *t, size_t s) { - return urL0getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - -protected: - size_t param_value_size; - void *param_value; - size_t *param_value_size_ret; -}; - -// A version of return helper that returns pi_result and not ur_result_t -class UrL0ReturnHelper : public UrL0ReturnHelperBase { -public: - using UrL0ReturnHelperBase::UrL0ReturnHelperBase; - - template ur_result_t operator()(const T &t) { - return UrL0ReturnHelperBase::operator()(t); - } - // Array return value - template ur_result_t operator()(const T *t, size_t s) { - return UrL0ReturnHelperBase::operator()(t, s); - } - // Array return value where element type is differrent from T - template - ur_result_t operator()(const T *t, size_t s) { - return UrL0ReturnHelperBase::operator()(t, s); - } -}; - const bool ExposeCSliceInAffinityPartitioning = [] { char *UrRet = std::getenv("UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); char *PiRet = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 7a523e561967a..fae4734b031b0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -434,7 +434,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo( ///< bytes of data being queried by propName. ) { - UrL0ReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); + UrReturnHelper ReturnValue(PropSize, KernelInfo, PropSizeRet); std::shared_lock Guard(Kernel->Mutex); switch (ParamName) { @@ -492,7 +492,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( size_t *ParamValueSizeRet ///< [out][optional] pointer to the actual size in ///< bytes of data being queried by propName. ) { - UrL0ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + UrReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); std::shared_lock Guard(Kernel->Mutex); switch (ParamName) { From eb9b4b7a81c2f3b026c884daa6bbf23451a882b6 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:19:13 -0700 Subject: [PATCH 21/50] Port Change the default to SYCL_PI_LEVEL_ZERO_USM_RESIDENT=2 https://github.com/intel/llvm/pull/9109 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 5cf5cedc43a00..7bdbe320bd0d1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2397,12 +2397,12 @@ ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, } enum class USMAllocationForceResidencyType { - // [Default] Do not force memory residency at allocation time. + // Do not force memory residency at allocation time. None = 0, // Force memory resident on the device of allocation at allocation time. 
// For host allocation force residency on all devices in a context. Device = 1, - // Force memory resident on all devices in the context with P2P + // [Default] Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 @@ -2412,7 +2412,7 @@ enum class USMAllocationForceResidencyType { static USMAllocationForceResidencyType USMAllocationForceResidency = [] { const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); if (!Str) - return USMAllocationForceResidencyType::None; + return USMAllocationForceResidencyType::P2PDevices; switch (std::atoi(Str)) { case 1: return USMAllocationForceResidencyType::Device; From df8ca5880b4b079ed783f6d0beb2150506026d4b Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 20 Apr 2023 17:25:33 -0700 Subject: [PATCH 22/50] Port Add infrastructure to know backend of a platform https://github.com/intel/llvm/pull/9067 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 55dd4258e6f33..fd58feca8be29 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -491,11 +491,10 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, size_t SizeInOut = ParamValueSize; auto UrPlatform = reinterpret_cast(Platform); - HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, ParamValueSize, + HANDLE_ERRORS(urPlatformGetInfo(UrPlatform, UrParamName, SizeInOut, ParamValue, ParamValueSizeRet)); ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); - return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 61c021472bf3f..7d0bef4cb84f5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -276,6 +276,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( // information>. Follow the same notation here. 
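// Illustrative sketch of the idiom behind the SYCL_PI_LEVEL_ZERO_USM_RESIDENT
// default change above: parse the environment variable once, at namespace
// scope, into an enum and fall back to the new P2PDevices default. Only the
// getenv() string matches the patch; the other names are placeholders.
#include <cstdlib>

namespace example {

enum class ForceResidency { None = 0, Device = 1, P2PDevices = 2 };

inline const ForceResidency UsmResidencyDefault = [] {
  const char *Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
  if (!Str)
    return ForceResidency::P2PDevices; // new default introduced by the patch
  switch (std::atoi(Str)) {
  case 1:
    return ForceResidency::Device;
  case 2:
    return ForceResidency::P2PDevices;
  default:
    return ForceResidency::None;       // any other value opts out
  }
}();

} // namespace example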
// return ReturnValue(Platform->ZeDriverApiVersion.c_str()); + case UR_PLATFORM_INFO_BACKEND: + return ReturnValue(UR_PLATFORM_BACKEND_LEVEL_ZERO); default: urPrint("urPlatformGetInfo: unrecognized ParamName\n"); return UR_RESULT_ERROR_INVALID_VALUE; From f8884d2452dcc07b9f771d8fbf976e52b3003e89 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 23 Apr 2023 23:07:54 -0700 Subject: [PATCH 23/50] Make USMFreeImpl static Signed-off-by: Jaime Arteaga --- .../adapters/level_zero/ur_level_zero_context.cpp | 13 +++++++++++++ .../ur/adapters/level_zero/ur_level_zero_event.hpp | 13 ------------- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 2 -- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 55354358124bd..b9184cb2555cc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -403,6 +403,19 @@ ur_result_t ur_context_handle_t_::finalize() { return UR_RESULT_SUCCESS; } +// Maximum number of events that can be present in an event ZePool is captured +// here. Setting it to 256 gave best possible performance for several +// benchmarks. +static const pi_uint32 MaxNumEventsPerPool = [] { + const auto MaxNumEventsPerPoolEnv = + std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + uint32_t Result = + MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; + if (Result <= 0) + Result = 256; + return Result; +}(); + ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, bool ProfilingEnabled) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 6acbd7459ef83..fcb3b156af0db 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -52,19 +52,6 @@ const bool ReuseDiscardedEvents = [] { return std::stoi(ReuseDiscardedEventsFlag) > 0; }(); -// Maximum number of events that can be present in an event ZePool is captured -// here. Setting it to 256 gave best possible performance for several -// benchmarks. -const uint32_t MaxNumEventsPerPool = [] { - const auto MaxNumEventsPerPoolEnv = - std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); - uint32_t Result = - MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; - if (Result <= 0) - Result = 256; - return Result; -}(); - const bool FilterEventWaitList = [] { const char *Ret = std::getenv("SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST"); const bool RetVal = Ret ? 
std::stoi(Ret) : 1; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 7bdbe320bd0d1..369a0eef74d9e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2338,7 +2338,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { +static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index ecae9a0c1b11b..9661063f0e5f2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -55,8 +55,6 @@ ur_result_t enqueueMemCopyRectHelper( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr); - // Exception type to pass allocation errors class UsmAllocationException { const ur_result_t Error; From 7f20421f6aa4908cee615ad66b5c938e1518e7de Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Sun, 23 Apr 2023 23:17:59 -0700 Subject: [PATCH 24/50] Port Enable immediate command lists by default https://github.com/intel/llvm/pull/8982 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_device.cpp | 148 +++++++++--------- 1 file changed, 76 insertions(+), 72 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 28b91a729e328..d32eb5bd03308 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -794,7 +794,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { // used. if (!EnvVar) { if (Device->ImmCommandListUsed) - return std::pair(-1, -1); // No copy engines can be used. + return std::pair(0, 0); // Only main copy engine will be used. return std::pair(0, INT_MAX); // All copy engines will be used. } std::string CopyEngineRange = EnvVar; @@ -845,8 +845,12 @@ ur_device_handle_t_::useImmediateCommandLists() { }(); if (ImmediateCommandlistsSetting == -1) - // Change this to PerQueue as default after more testing. + // Change this to PerQueue as default after more testing. +#ifdef _WIN32 return NotUsed; +#else + return isPVC() ? 
PerQueue : NotUsed; +#endif switch (ImmediateCommandlistsSetting) { case 0: return NotUsed; @@ -861,76 +865,6 @@ ur_device_handle_t_::useImmediateCommandLists() { ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { - uint32_t numQueueGroups = 0; - ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, nullptr)); - if (numQueueGroups == 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); - std::vector> - QueueGroupProperties(numQueueGroups); - ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, - (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); - - // Initialize ordinal and compute queue group properties - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - i; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties = QueueGroupProperties[i]; - break; - } - } - - // Reinitialize a sub-sub-device with its own ordinal, index. - // Our sub-sub-device representation is currently [Level-Zero sub-device - // handle + Level-Zero compute group/engine index]. Only the specified - // index queue will be used to submit work to the sub-sub-device. - if (SubSubDeviceOrdinal >= 0) { - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = - SubSubDeviceOrdinal; - QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = - SubSubDeviceIndex; - } else { // Proceed with initialization for root and sub-device - // How is it possible that there are no "compute" capabilities? - if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < - 0) { - return UR_RESULT_ERROR_UNKNOWN; - } - - if (CopyEngineRequested((ur_device_handle_t)this)) { - for (uint32_t i = 0; i < numQueueGroups; i++) { - if (((QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && - (QueueGroupProperties[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { - if (QueueGroupProperties[i].numQueues == 1) { - QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::MainCopy].ZeProperties = - QueueGroupProperties[i]; - } else { - QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; - QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = - QueueGroupProperties[i]; - break; - } - } - } - if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) - urPrint("NOTE: main blitter/copy engine is not available\n"); - else - urPrint("NOTE: main blitter/copy engine is available\n"); - - if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) - urPrint("NOTE: link blitter/copy engines are not available\n"); - else - urPrint("NOTE: link blitter/copy engines are available\n"); - } - } - // Maintain various device properties cache. // Note that we just describe here how to compute the data. // The real initialization is upon first access. 
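// Illustrative sketch of the lazy device-property caching the comment above
// refers to: initialize() only records how to compute each property, and the
// Level Zero query runs on first access. The adapter's own ZeCache type
// differs in detail; LazyCache below is just a placeholder for the idea.
#include <functional>
#include <mutex>

namespace example {

template <class T> struct LazyCache {
  // Filled in during initialization with "how to compute the data".
  std::function<void(T &)> Compute;

  // First call runs Compute exactly once; later calls return the cached value.
  T &get() {
    std::call_once(Flag, [this] { Compute(Value); });
    return Value;
  }

private:
  std::once_flag Flag;
  T Value{};
};

} // namespace example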
@@ -1002,6 +936,76 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, ZeEventsScope = DeviceEventsSetting; } + uint32_t numQueueGroups = 0; + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, nullptr)); + if (numQueueGroups == 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + urPrint("NOTE: Number of queue groups = %d\n", numQueueGroups); + std::vector> + QueueGroupProperties(numQueueGroups); + ZE2UR_CALL(zeDeviceGetCommandQueueGroupProperties, + (ZeDevice, &numQueueGroups, QueueGroupProperties.data())); + + // Initialize ordinal and compute queue group properties + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + i; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties = QueueGroupProperties[i]; + break; + } + } + + // Reinitialize a sub-sub-device with its own ordinal, index. + // Our sub-sub-device representation is currently [Level-Zero sub-device + // handle + Level-Zero compute group/engine index]. Only the specified + // index queue will be used to submit work to the sub-sub-device. + if (SubSubDeviceOrdinal >= 0) { + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal = + SubSubDeviceOrdinal; + QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeIndex = + SubSubDeviceIndex; + } else { // Proceed with initialization for root and sub-device + // How is it possible that there are no "compute" capabilities? + if (QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute].ZeOrdinal < + 0) { + return UR_RESULT_ERROR_UNKNOWN; + } + + if (CopyEngineRequested((ur_device_handle_t)this)) { + for (uint32_t i = 0; i < numQueueGroups; i++) { + if (((QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) == 0) && + (QueueGroupProperties[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY)) { + if (QueueGroupProperties[i].numQueues == 1) { + QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::MainCopy].ZeProperties = + QueueGroupProperties[i]; + } else { + QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal = i; + QueueGroup[queue_group_info_t::LinkCopy].ZeProperties = + QueueGroupProperties[i]; + break; + } + } + } + if (QueueGroup[queue_group_info_t::MainCopy].ZeOrdinal < 0) + urPrint("NOTE: main blitter/copy engine is not available\n"); + else + urPrint("NOTE: main blitter/copy engine is available\n"); + + if (QueueGroup[queue_group_info_t::LinkCopy].ZeOrdinal < 0) + urPrint("NOTE: link blitter/copy engines are not available\n"); + else + urPrint("NOTE: link blitter/copy engines are available\n"); + } + } + return UR_RESULT_SUCCESS; } From caf288876f3aa2d89bdfd74321ae4be5ab4361a4 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 25 Apr 2023 20:49:31 +0000 Subject: [PATCH 25/50] Fix failing device CTS (#2) * Fix failing device CTS Signed-off-by: Brandon Yates --- sycl/plugins/unified_runtime/pi2ur.hpp | 6 +- .../level_zero/ur_level_zero_device.cpp | 59 +++++++++++-------- sycl/plugins/unified_runtime/ur/ur.hpp | 3 - 3 files changed, 36 insertions(+), 32 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index fd58feca8be29..033920f73321f 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -858,7 +858,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, 
pi_device_info ParamName, InfoType = (ur_device_info_t)UR_DEVICE_INFO_DEVICE_ID; break; case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_FREE_MEMORY; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_GLOBAL_MEM_FREE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: InfoType = (ur_device_info_t)UR_DEVICE_INFO_MEMORY_CLOCK_RATE; @@ -870,7 +870,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; break; case PI_DEVICE_INFO_GPU_SLICES: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_SLICES; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_GPU_EU_SLICES; break; case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; @@ -879,7 +879,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU; break; case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH; + InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; break; case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: InfoType = (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index d32eb5bd03308..02c3232176177 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -89,7 +89,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( ur_device_handle_t Device, ///< [in] handle of the device instance ur_device_info_t ParamName, ///< [in] type of the info to retrieve - size_t propSize, ///< [in] the number of bytes pointed to by pDeviceInfo. + size_t propSize, ///< [in] the number of bytes pointed to by ParamValue. void *ParamValue, ///< [out][optional] array of bytes holding the info. 
///< If propSize is not equal to or greater than the real ///< number of bytes needed to return the info then the @@ -130,8 +130,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(UUID, sizeof(UUID)); } case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(uint32_t{Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS}); + return ReturnValue(static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -195,9 +195,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(uint32_t{1}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(uint32_t{1}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { uint32_t MaxComputeUnits = Device->ZeDeviceProperties->numEUsPerSubslice * @@ -255,17 +255,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); case UR_DEVICE_INFO_IMAGE_SUPPORTED: return ReturnValue( - uint32_t{Device->ZeDeviceImageProperties->maxImageDims1D > 0}); + static_cast(Device->ZeDeviceImageProperties->maxImageDims1D > 0)); case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(uint32_t{(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0}); + return ReturnValue(static_cast((Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); case UR_DEVICE_INFO_AVAILABLE: - return ReturnValue(uint32_t{ZeDevice ? true : false}); + return ReturnValue(static_cast(ZeDevice ? true : false)); case UR_DEVICE_INFO_VENDOR: // TODO: Level-Zero does not return vendor's name at the moment // only the ID. 
return ReturnValue("Intel(R) Corporation"); case UR_DEVICE_INFO_DRIVER_VERSION: + case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); case UR_DEVICE_INFO_VERSION: return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); @@ -346,7 +347,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: return ReturnValue(""); case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(uint32_t{true}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: return ReturnValue( size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); @@ -363,12 +364,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(ur_device_exec_capability_flag_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(uint32_t{true}); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC}); + return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_ECC)); case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(size_t{Device->ZeDeviceProperties->timerResolution}); + return ReturnValue(static_cast(Device->ZeDeviceProperties->timerResolution)); case UR_DEVICE_INFO_LOCAL_MEM_TYPE: return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: @@ -402,7 +403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue( uint32_t{Device->ZeDeviceImageProperties->maxWriteImageArgs}); case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { - uint64_t SingleFPValue = 0; + ur_device_fp_capability_flags_t SingleFPValue = 0; ze_device_fp_flags_t ZeSingleFPCapabilities = Device->ZeDeviceModuleProperties->fp32flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeSingleFPCapabilities) { @@ -427,10 +428,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( SingleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{SingleFPValue}); + return ReturnValue(SingleFPValue); } case UR_DEVICE_INFO_HALF_FP_CONFIG: { - uint64_t HalfFPValue = 0; + ur_device_fp_capability_flags_t HalfFPValue = 0; ze_device_fp_flags_t ZeHalfFPCapabilities = Device->ZeDeviceModuleProperties->fp16flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeHalfFPCapabilities) { @@ -454,10 +455,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( if (ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT & ZeHalfFPCapabilities) { HalfFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{HalfFPValue}); + return ReturnValue(HalfFPValue); } case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { - uint64_t DoubleFPValue = 0; + ur_device_fp_capability_flags_t DoubleFPValue = 0; ze_device_fp_flags_t ZeDoubleFPCapabilities = Device->ZeDeviceModuleProperties->fp64flags; if (ZE_DEVICE_FP_FLAG_DENORM & ZeDoubleFPCapabilities) { @@ -482,7 +483,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( DoubleFPValue |= UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; } - return ReturnValue(uint64_t{DoubleFPValue}); + return ReturnValue(DoubleFPValue); } case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: return ReturnValue(size_t{Device->ZeDeviceImageProperties->maxImageDims2D}); @@ -537,7 +538,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported 
yet. Needs to be updated after support is added. - return ReturnValue(uint32_t{false}); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the @@ -617,7 +618,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(AddressBuffer); } - case UR_EXT_DEVICE_INFO_FREE_MEMORY: { + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", UR_RESULT_SUCCESS); @@ -698,11 +699,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceProperties->numSlices; return ReturnValue(uint32_t{count}); } + case UR_DEVICE_INFO_GPU_EU_SLICES: { + return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); + } case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: return ReturnValue( uint32_t{Device->ZeDeviceProperties->physicalEUSimdWidth}); - case UR_EXT_DEVICE_INFO_GPU_SLICES: - return ReturnValue(uint32_t{Device->ZeDeviceProperties->numSlices}); case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: return ReturnValue( uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); @@ -710,7 +712,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); - case UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH: + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: // currently not supported in level zero runtime return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_BFLOAT16: { @@ -766,8 +768,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - return ReturnValue(0); + ur_queue_flags_t queue_flags = 0; + return ReturnValue(queue_flags); } + case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: { + return ReturnValue(static_cast(0)); //__read_write attribute currently undefinde in opencl + } + default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index f8578cafbb3b8..c03ba316860f7 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -29,17 +29,14 @@ const int UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D = UR_EXT_DEVICE_INFO_END - 2; // UR_EXT_DEVICE_INFO_END - 3; // const int ZER_EXT_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = // UR_EXT_DEVICE_INFO_END - 4; -const int UR_EXT_DEVICE_INFO_MAX_MEM_BANDWIDTH = UR_EXT_DEVICE_INFO_END - 6; const int UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU = UR_EXT_DEVICE_INFO_END - 7; const int UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = UR_EXT_DEVICE_INFO_END - 8; -const int UR_EXT_DEVICE_INFO_GPU_SLICES = UR_EXT_DEVICE_INFO_END - 9; // const int UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = // UR_EXT_DEVICE_INFO_END - 10; const int UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH = UR_EXT_DEVICE_INFO_END - 11; // const int ZER_EXT_DEVICE_INFO_MEMORY_CLOCK_RATE = UR_EXT_DEVICE_INFO_END - // 12; -const int UR_EXT_DEVICE_INFO_FREE_MEMORY = UR_EXT_DEVICE_INFO_END - 13; // const int ZER_EXT_DEVICE_INFO_DEVICE_ID = UR_EXT_DEVICE_INFO_END - 14; // const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = // UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; From 4ae54329c2ac86217b6e53c29526e2b25a044342 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 25 Apr 2023 19:00:48 -0700 Subject: [PATCH 
26/50] Fix formatting Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_device.cpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 02c3232176177..4c15b61e19a6f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -130,8 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(UUID, sizeof(UUID)); } case UR_DEVICE_INFO_ATOMIC_64: - return ReturnValue(static_cast(Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); + return ReturnValue( + static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -254,11 +255,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue( uint64_t{Device->ZeDeviceComputeProperties->maxSharedLocalMemory}); case UR_DEVICE_INFO_IMAGE_SUPPORTED: - return ReturnValue( - static_cast(Device->ZeDeviceImageProperties->maxImageDims1D > 0)); + return ReturnValue(static_cast( + Device->ZeDeviceImageProperties->maxImageDims1D > 0)); case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: - return ReturnValue(static_cast((Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); + return ReturnValue( + static_cast((Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); case UR_DEVICE_INFO_AVAILABLE: return ReturnValue(static_cast(ZeDevice ? 
true : false)); case UR_DEVICE_INFO_VENDOR: @@ -366,10 +368,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_ENDIAN_LITTLE: return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC)); + return ReturnValue(static_cast( + Device->ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_ECC)); case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: - return ReturnValue(static_cast(Device->ZeDeviceProperties->timerResolution)); + return ReturnValue( + static_cast(Device->ZeDeviceProperties->timerResolution)); case UR_DEVICE_INFO_LOCAL_MEM_TYPE: return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: @@ -772,10 +775,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(queue_flags); } case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: { - return ReturnValue(static_cast(0)); //__read_write attribute currently undefinde in opencl + return ReturnValue(static_cast( + 0)); //__read_write attribute currently undefinde in opencl } - default: urPrint("Unsupported ParamName in urGetDeviceInfo\n"); urPrint("ParamName=%d(0x%x)\n", ParamName, ParamName); From 4aedd0fa072aa4ca6514962bf9864227a16d15ed Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 26 Apr 2023 16:38:00 -0700 Subject: [PATCH 27/50] Update loader with CreateWithNativeHandle updates Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 37 ++++++++++++------- .../level_zero/ur_level_zero_context.cpp | 3 +- .../level_zero/ur_level_zero_event.cpp | 4 +- .../level_zero/ur_level_zero_kernel.cpp | 4 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 4 +- .../level_zero/ur_level_zero_queue.cpp | 15 +++++--- 7 files changed, 44 insertions(+), 25 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index d66bfaba17bf1..2cda6e083f6c4 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 586cc2d9a9612ad6886704aba7b38f1cd8ae610e) + set(UNIFIED_RUNTIME_TAG af603dbef47adb62aafbf067931f0c9358a4cac6) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 033920f73321f..ebca37978a696 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1129,8 +1129,10 @@ inline pi_result piextContextCreateWithNativeHandle( ur_context_handle_t *UrContext = reinterpret_cast(RetContext); + ur_context_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; HANDLE_ERRORS(urContextCreateWithNativeHandle( - NativeContext, NumDevices, UrDevices, OwnNativeHandle, UrContext)); + NativeContext, NumDevices, UrDevices, &Properties, UrContext)); return PI_SUCCESS; } @@ -1281,13 +1283,14 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, ur_context_handle_t UrContext = reinterpret_cast(Context); - + ur_device_handle_t UrDevice = reinterpret_cast(Device); ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); 
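// Illustrative sketch of the ownership plumbing used by the
// CreateWithNativeHandle thunks in this patch: the OwnNativeHandle boolean now
// travels into UR inside a small properties struct (isNativeHandleOwned) rather
// than being written onto the returned handle afterwards. The types below are
// placeholders, not the real ur_*_native_properties_t definitions.
#include <cstdint>

namespace example {

struct NativeProperties {
  bool isNativeHandleOwned = false; // should the adapter destroy the native handle?
};

struct InteropHandle {
  std::uintptr_t Native = 0;
  bool OwnsNative = false;
};

// Create-with-native-handle entry point: ownership travels with the request,
// so the callee can record it at construction time.
inline InteropHandle createWithNative(std::uintptr_t Native,
                                      const NativeProperties *Props) {
  InteropHandle H;
  H.Native = Native;
  H.OwnsNative = Props ? Props->isNativeHandleOwned : false;
  return H;
}

} // namespace example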
ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - HANDLE_ERRORS( - urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, UrQueue)); - (*UrQueue)->OwnNativeHandle = OwnNativeHandle; + ur_queue_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, + UrDevice, &Properties, UrQueue)); return PI_SUCCESS; } @@ -1785,9 +1788,10 @@ piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, ur_program_handle_t UrProgram = reinterpret_cast(Program); ur_kernel_handle_t *UrKernel = reinterpret_cast(Kernel); - HANDLE_ERRORS(urKernelCreateWithNativeHandle(UrNativeKernel, UrContext, - UrProgram, UrKernel)); - (*UrKernel)->OwnNativeHandle = OwnNativeHandle; + ur_kernel_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urKernelCreateWithNativeHandle( + UrNativeKernel, UrContext, UrProgram, &Properties, UrKernel)); return PI_SUCCESS; } @@ -2621,8 +2625,10 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, ur_mem_handle_t *UrMem = reinterpret_cast(Mem); // TODO: Pass OwnNativeHandle to the output parameter // while we get it in interface - HANDLE_ERRORS(urMemCreateWithNativeHandle(UrNativeMem, UrContext, - OwnNativeHandle, UrMem)); + ur_mem_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS( + urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); return PI_SUCCESS; } @@ -3456,7 +3462,9 @@ inline pi_result piEventCreate(pi_context Context, pi_event *RetEvent) { ur_event_handle_t *UrEvent = reinterpret_cast(RetEvent); // pass null for the hNativeHandle to use urEventCreateWithNativeHandle // as urEventCreate - HANDLE_ERRORS(urEventCreateWithNativeHandle(nullptr, UrContext, UrEvent)); + ur_event_native_properties_t Properties{}; + HANDLE_ERRORS( + urEventCreateWithNativeHandle(nullptr, UrContext, &Properties, UrEvent)); return PI_SUCCESS; } @@ -3477,9 +3485,10 @@ inline pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(Context); ur_event_handle_t *UrEvent = reinterpret_cast(Event); - HANDLE_ERRORS( - urEventCreateWithNativeHandle(UrNativeKernel, UrContext, UrEvent)); - (*UrEvent)->OwnNativeHandle = OwnNativeHandle; + ur_event_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urEventCreateWithNativeHandle(UrNativeKernel, UrContext, + &Properties, UrEvent)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index b9184cb2555cc..c0c4f9958aaf0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -139,10 +139,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( ur_native_handle_t NativeContext, ///< [in] the native handle of the context. uint32_t NumDevices, const ur_device_handle_t *Devices, - bool OwnNativeHandle, + const ur_context_native_properties_t *Properties, ur_context_handle_t *Context ///< [out] pointer to the handle of the context object created. 
) { + bool OwnNativeHandle = Properties->isNativeHandleOwned; try { ze_context_handle_t ZeContext = reinterpret_cast(NativeContext); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 237d27cf130a9..197ec1e8f70aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -625,6 +625,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_native_handle_t NativeEvent, ///< [in] the native handle of the event. ur_context_handle_t Context, ///< [in] handle of the context object + const ur_event_native_properties_t *Properties, ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. ) { @@ -643,7 +644,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_event_handle_t_ *UrEvent{}; try { UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */, - Context, UR_EXT_COMMAND_TYPE_USER, true); + Context, UR_EXT_COMMAND_TYPE_USER, + Properties->isNativeHandleOwned); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index fae4734b031b0..be7e88ddb6923 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -714,14 +714,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t NativeKernel, ///< [in] the native handle of the kernel. ur_context_handle_t Context, ///< [in] handle of the context object ur_program_handle_t Program, + const ur_kernel_native_properties_t *Properties, ur_kernel_handle_t * RetKernel ///< [out] pointer to the handle of the kernel object created. ) { ze_kernel_handle_t ZeKernel = ur_cast(NativeKernel); ur_kernel_handle_t_ *Kernel = nullptr; try { - Kernel = new ur_kernel_handle_t_(ZeKernel, - false, // OwnZeKernel + Kernel = new ur_kernel_handle_t_(ZeKernel, Properties->isNativeHandleOwned, Context); *RetKernel = reinterpret_cast(Kernel); } catch (const std::bad_alloc &) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 369a0eef74d9e..62f1bf19bf24e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1858,10 +1858,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. ur_context_handle_t Context, ///< [in] handle of the context object - bool OwnNativeHandle, + const ur_mem_native_properties_t *Properties, ur_mem_handle_t *Mem ///< [out] pointer to the handle of the mem object created. 
) { + bool OwnNativeHandle = Properties->isNativeHandleOwned; + std::shared_lock Lock(Context->Mutex); // Get base of the allocation diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index df7d39be50cb5..dc4801f6628b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -474,6 +474,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, /// + const ur_queue_native_properties_t *Properties, /// ur_queue_handle_t *RetQueue ///< [out] pointer to the handle of the queue object created. ) { @@ -492,13 +494,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_platform_handle_t Platform{}; UR_CALL(urPlatformGet(NumEntries, &Platform, nullptr)); - ur_device_handle_t Device; - UR_CALL( - urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &Device, nullptr)); + ur_device_handle_t UrDevice = Device; + if (UrDevice == nullptr) { + UR_CALL(urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, NumEntries, &UrDevice, + nullptr)); + } try { - ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, - Context, Device, false); + ur_queue_handle_t_ *Queue = + new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, Context, UrDevice, + Properties->isNativeHandleOwned); *RetQueue = reinterpret_cast(Queue); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; From ff370d2cd2190ad2752629dc7f442afde27c2b04 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 26 Apr 2023 17:36:48 -0700 Subject: [PATCH 28/50] Move some code to L0 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 19 ---- .../level_zero/ur_level_zero_common.cpp | 17 ++++ .../level_zero/ur_level_zero_common.hpp | 92 +++++++++++++++--- sycl/plugins/unified_runtime/ur/ur.cpp | 17 ---- sycl/plugins/unified_runtime/ur/ur.hpp | 95 +++---------------- 5 files changed, 111 insertions(+), 129 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index ebca37978a696..138ab10e06ff2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -7,7 +7,6 @@ //===------------------------------------------------------------------===// #pragma once -#include "ur/adapters/level_zero/ur_level_zero.hpp" #include "ur_api.h" #include #include @@ -382,24 +381,6 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } -struct _pi_context : ur_context_handle_t_ {}; - -struct _pi_queue : ur_context_handle_t_ {}; - -struct _pi_program : ur_program_handle_t_ {}; - -struct _pi_kernel : ur_kernel_handle_t_ {}; - -struct _pi_mem : ur_mem_handle_t_ {}; - -struct _pi_buffer : ur_mem_handle_t_ {}; - -struct _pi_image : ur_mem_handle_t_ {}; - -struct _pi_sampler : ur_sampler_handle_t_ {}; - -struct _pi_event : ur_event_handle_t_ {}; - namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp index ca5259a80abcd..4603fbe741354 100644 
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp @@ -264,3 +264,20 @@ template <> zes_structure_type_t getZesStructureType() { template <> zes_structure_type_t getZesStructureType() { return ZES_STRUCTURE_TYPE_MEM_PROPERTIES; } + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code) { + assert(strlen(message) <= MaxMessageSize); + strcpy(ErrorMessage, message); + ErrorMessageCode = error_code; +} + +ur_result_t zerPluginGetLastError(char **message) { + *message = &ErrorMessage[0]; + return ErrorMessageCode; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 599527ae34a2d..491c58e668763 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -26,19 +26,6 @@ struct _ur_platform_handle_t; -template To ur_cast(From Value) { - // TODO: see if more sanity checks are possible. - assert(sizeof(From) == sizeof(To)); - return (To)(Value); -} - -template <> uint32_t inline ur_cast(uint64_t Value) { - // Cast value and check that we don't lose any information. - uint32_t CastedValue = (uint32_t)(Value); - assert((uint64_t)CastedValue == Value); - return CastedValue; -} - static auto getUrResultString = [](ur_result_t Result) { switch (Result) { case UR_RESULT_SUCCESS: @@ -332,6 +319,76 @@ ur_result_t ze2urResult(ze_result_t ZeResult); #define ZE_CALL_NOCHECK(ZeName, ZeArgs) \ ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false) +// This wrapper around std::atomic is created to limit operations with reference +// counter and to make allowed operations more transparent in terms of +// thread-safety in the plugin. increment() and load() operations do not need a +// mutex guard around them since the underlying data is already atomic. +// decrementAndTest() method is used to guard a code which needs to be +// executed when object's ref count becomes zero after release. This method also +// doesn't need a mutex guard because decrement operation is atomic and only one +// thread can reach ref count equal to zero, i.e. only a single thread can pass +// through this check. +struct ReferenceCounter { + ReferenceCounter() : RefCount{1} {} + + // Reset the counter to the initial value. + void reset() { RefCount = 1; } + + // Used when retaining an object. + void increment() { RefCount++; } + + // Supposed to be used in pi*GetInfo* methods where ref count value is + // requested. + uint32_t load() { return RefCount.load(); } + + // This method allows to guard a code which needs to be executed when object's + // ref count becomes zero after release. It is important to notice that only a + // single thread can pass through this check. This is true because of several + // reasons: + // 1. Decrement operation is executed atomically. + // 2. It is not allowed to retain an object after its refcount reaches zero. + // 3. It is not allowed to release an object more times than the value of + // the ref count. + // 2. and 3. 
basically means that we can't use an object at all as soon as its + // refcount reaches zero. Using this check guarantees that code for deleting + // an object and releasing its resources is executed once by a single thread + // and we don't need to use any mutexes to guard access to this object in the + // scope after this check. Of course if we access another objects in this code + // (not the one which is being deleted) then access to these objects must be + // guarded, for example with a mutex. + bool decrementAndTest() { return --RefCount == 0; } + +private: + std::atomic RefCount; +}; + +// Base class to store common data +struct _ur_object { + _ur_object() : RefCount{} {} + + // Must be atomic to prevent data race when incrementing/decrementing. + ReferenceCounter RefCount; + + // This mutex protects accesses to all the non-const member variables. + // Exclusive access is required to modify any of these members. + // + // To get shared access to the object in a scope use std::shared_lock: + // std::shared_lock Lock(Obj->Mutex); + // To get exclusive access to the object in a scope use std::scoped_lock: + // std::scoped_lock Lock(Obj->Mutex); + // + // If several pi objects are accessed in a scope then each object's mutex must + // be locked. For example, to get write access to Obj1 and Obj2 and read + // access to Obj3 in a scope use the following approach: + // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); + // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); + ur_shared_mutex Mutex; + + // Indicates if we own the native handle or it came from interop that + // asked to not transfer the ownership to SYCL RT. + bool OwnNativeHandle = false; +}; + // Record for a memory allocation. This structure is used to keep information // for each memory allocation. 
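Editor's aside (not part of the patch): the decrementAndTest() contract described in the comment block above is easiest to see in a minimal release-path sketch. releaseHelper below is a hypothetical name used only for illustration; real adapters release concrete handle types, and error handling is elided.

// Illustrative only, assuming the ReferenceCounter/_ur_object definitions above.
static ur_result_t releaseHelper(_ur_object *Obj) { // hypothetical helper
  if (!Obj->RefCount.decrementAndTest())
    return UR_RESULT_SUCCESS; // other references are still alive
  // Exactly one thread can reach this point, so no mutex is needed to guard
  // the teardown of Obj itself (other objects touched here still need locks).
  delete Obj;
  return UR_RESULT_SUCCESS;
}

(End of aside; the memory-allocation record introduced by the comment above follows.)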
struct MemAllocRecord : _ur_object { @@ -403,3 +460,12 @@ extern std::map *ZeCallCount; constexpr char ZE_SUPPORTED_EXTENSIONS[] = "cl_khr_il_program cl_khr_subgroups cl_intel_subgroups " "cl_intel_subgroups_short cl_intel_required_subgroup_size "; + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code); diff --git a/sycl/plugins/unified_runtime/ur/ur.cpp b/sycl/plugins/unified_runtime/ur/ur.cpp index 67a6ac4bb391d..0db860fbd0daa 100644 --- a/sycl/plugins/unified_runtime/ur/ur.cpp +++ b/sycl/plugins/unified_runtime/ur/ur.cpp @@ -28,20 +28,3 @@ std::vector *PiPlatformsCache = new std::vector; SpinLock *PiPlatformsCacheMutex = new SpinLock; bool PiPlatformCachePopulated = false; - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -ur_result_t zerPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c03ba316860f7..790c2fd39bd00 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -20,6 +21,19 @@ #include +template To ur_cast(From Value) { + // TODO: see if more sanity checks are possible. + assert(sizeof(From) == sizeof(To)); + return (To)(Value); +} + +template <> uint32_t inline ur_cast(uint64_t Value) { + // Cast value and check that we don't lose any information. + uint32_t CastedValue = (uint32_t)(Value); + assert((uint64_t)CastedValue == Value); + return CastedValue; +} + // TODO: promote all of the below extensions to the Unified Runtime // and get rid of these ZER_EXT constants. const int UR_EXT_DEVICE_INFO_END = UR_DEVICE_INFO_FORCE_UINT32; @@ -193,76 +207,6 @@ template struct ZeCache : private T { } }; -// This wrapper around std::atomic is created to limit operations with reference -// counter and to make allowed operations more transparent in terms of -// thread-safety in the plugin. increment() and load() operations do not need a -// mutex guard around them since the underlying data is already atomic. -// decrementAndTest() method is used to guard a code which needs to be -// executed when object's ref count becomes zero after release. This method also -// doesn't need a mutex guard because decrement operation is atomic and only one -// thread can reach ref count equal to zero, i.e. only a single thread can pass -// through this check. -struct ReferenceCounter { - ReferenceCounter() : RefCount{1} {} - - // Reset the counter to the initial value. - void reset() { RefCount = 1; } - - // Used when retaining an object. - void increment() { RefCount++; } - - // Supposed to be used in pi*GetInfo* methods where ref count value is - // requested. 
- uint32_t load() { return RefCount.load(); } - - // This method allows to guard a code which needs to be executed when object's - // ref count becomes zero after release. It is important to notice that only a - // single thread can pass through this check. This is true because of several - // reasons: - // 1. Decrement operation is executed atomically. - // 2. It is not allowed to retain an object after its refcount reaches zero. - // 3. It is not allowed to release an object more times than the value of - // the ref count. - // 2. and 3. basically means that we can't use an object at all as soon as its - // refcount reaches zero. Using this check guarantees that code for deleting - // an object and releasing its resources is executed once by a single thread - // and we don't need to use any mutexes to guard access to this object in the - // scope after this check. Of course if we access another objects in this code - // (not the one which is being deleted) then access to these objects must be - // guarded, for example with a mutex. - bool decrementAndTest() { return --RefCount == 0; } - -private: - std::atomic RefCount; -}; - -// Base class to store common data -struct _ur_object { - _ur_object() : RefCount{} {} - - // Must be atomic to prevent data race when incrementing/decrementing. - ReferenceCounter RefCount; - - // This mutex protects accesses to all the non-const member variables. - // Exclusive access is required to modify any of these members. - // - // To get shared access to the object in a scope use std::shared_lock: - // std::shared_lock Lock(Obj->Mutex); - // To get exclusive access to the object in a scope use std::scoped_lock: - // std::scoped_lock Lock(Obj->Mutex); - // - // If several pi objects are accessed in a scope then each object's mutex must - // be locked. For example, to get write access to Obj1 and Obj2 and read - // access to Obj3 in a scope use the following approach: - // std::shared_lock Obj3Lock(Obj3->Mutex, std::defer_lock); - // std::scoped_lock LockAll(Obj1->Mutex, Obj2->Mutex, Obj3Lock); - ur_shared_mutex Mutex; - - // Indicates if we own the native handle or it came from interop that - // asked to not transfer the ownership to SYCL RT. 
- bool OwnNativeHandle = false; -}; - // Helper for one-liner validation #define UR_ASSERT(condition, error) \ if (!(condition)) \ @@ -385,13 +329,4 @@ class UrReturnHelper { size_t param_value_size; void *param_value; size_t *param_value_size_ret; -}; - -// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -extern thread_local ur_result_t ErrorMessageCode; -extern thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); +}; \ No newline at end of file From 151d5096802446dfe46987526c58139a8661b9b3 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 27 Apr 2023 01:08:41 -0700 Subject: [PATCH 29/50] Add more code for piextMemImageCreateWithNativeHandle Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 206 ++++++++++-------- .../adapters/level_zero/ur_level_zero_mem.cpp | 152 ++++++++----- 2 files changed, 214 insertions(+), 144 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 138ab10e06ff2..602b23329ec4d 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2258,100 +2258,70 @@ inline pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, return PI_SUCCESS; } -inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, - const pi_image_format *ImageFormat, - const pi_image_desc *ImageDesc, void *HostPtr, - pi_mem *RetImage) { - - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); - PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - - ur_context_handle_t UrContext = - reinterpret_cast(Context); - - ur_mem_flags_t UrFlags{}; - if (Flags & PI_MEM_FLAGS_ACCESS_RW) { - UrFlags |= UR_MEM_FLAG_READ_WRITE; - } - if (Flags & PI_MEM_ACCESS_READ_ONLY) { - UrFlags |= UR_MEM_FLAG_READ_ONLY; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { - UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; - } - if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; - } +static void pi2urImageDesc(const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, + ur_image_format_t *UrFormat, + ur_image_desc_t *UrDesc) { - ur_image_format_t UrFormat{}; switch (ImageFormat->image_channel_data_type) { case PI_IMAGE_CHANNEL_TYPE_SNORM_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_SNORM_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SNORM_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; + UrFormat->channelType = 
UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555; break; } case PI_IMAGE_CHANNEL_TYPE_UNORM_INT_101010: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_INT_101010; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16; break; } case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32; break; } case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT; break; } case PI_IMAGE_CHANNEL_TYPE_FLOAT: { - UrFormat.channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; + UrFormat->channelType = UR_IMAGE_CHANNEL_TYPE_FLOAT; break; } default: { @@ -2360,113 +2330,153 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, } switch (ImageFormat->image_channel_order) { case PI_IMAGE_CHANNEL_ORDER_A: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_A; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_A; break; } case PI_IMAGE_CHANNEL_ORDER_R: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_R; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_R; break; } case PI_IMAGE_CHANNEL_ORDER_RG: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RG; break; } case PI_IMAGE_CHANNEL_ORDER_RA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RA; break; } case PI_IMAGE_CHANNEL_ORDER_RGB: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGB; break; } case PI_IMAGE_CHANNEL_ORDER_RGBA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBA; break; } case PI_IMAGE_CHANNEL_ORDER_BGRA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_BGRA; break; } case PI_IMAGE_CHANNEL_ORDER_ARGB: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_ARGB; break; } case PI_IMAGE_CHANNEL_ORDER_ABGR: { - UrFormat.channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + UrFormat->channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; break; } case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_INTENSITY; break; } case PI_IMAGE_CHANNEL_ORDER_LUMINANCE: { - 
UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_LUMINANCE; break; } case PI_IMAGE_CHANNEL_ORDER_Rx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RX; break; } case PI_IMAGE_CHANNEL_ORDER_RGx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGX; break; } case PI_IMAGE_CHANNEL_ORDER_RGBx: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_RGBX; break; } case PI_IMAGE_CHANNEL_ORDER_sRGBA: { - UrFormat.channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_SRGBA; break; } default: { die("piMemImageCreate: unsuppported image_channel_data_type."); } } - ur_image_desc_t UrDesc{}; - UrDesc.arraySize = ImageDesc->image_array_size; - UrDesc.depth = ImageDesc->image_depth; - UrDesc.height = ImageDesc->image_height; - UrDesc.numMipLevel = ImageDesc->num_mip_levels; - UrDesc.numSamples = ImageDesc->num_samples; - UrDesc.rowPitch = ImageDesc->image_row_pitch; - UrDesc.slicePitch = ImageDesc->image_slice_pitch; + + UrDesc->arraySize = ImageDesc->image_array_size; + UrDesc->depth = ImageDesc->image_depth; + UrDesc->height = ImageDesc->image_height; + UrDesc->numMipLevel = ImageDesc->num_mip_levels; + UrDesc->numSamples = ImageDesc->num_samples; + UrDesc->rowPitch = ImageDesc->image_row_pitch; + UrDesc->slicePitch = ImageDesc->image_slice_pitch; switch (ImageDesc->image_type) { case PI_MEM_TYPE_BUFFER: { - UrDesc.type = UR_MEM_TYPE_BUFFER; + UrDesc->type = UR_MEM_TYPE_BUFFER; break; } case PI_MEM_TYPE_IMAGE2D: { - UrDesc.type = UR_MEM_TYPE_IMAGE2D; + UrDesc->type = UR_MEM_TYPE_IMAGE2D; break; } case PI_MEM_TYPE_IMAGE3D: { - UrDesc.type = UR_MEM_TYPE_IMAGE3D; + UrDesc->type = UR_MEM_TYPE_IMAGE3D; break; } case PI_MEM_TYPE_IMAGE2D_ARRAY: { - UrDesc.type = UR_MEM_TYPE_IMAGE2D_ARRAY; + UrDesc->type = UR_MEM_TYPE_IMAGE2D_ARRAY; break; } case PI_MEM_TYPE_IMAGE1D: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D; + UrDesc->type = UR_MEM_TYPE_IMAGE1D; break; } case PI_MEM_TYPE_IMAGE1D_ARRAY: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D_ARRAY; + UrDesc->type = UR_MEM_TYPE_IMAGE1D_ARRAY; break; } case PI_MEM_TYPE_IMAGE1D_BUFFER: { - UrDesc.type = UR_MEM_TYPE_IMAGE1D_BUFFER; + UrDesc->type = UR_MEM_TYPE_IMAGE1D_BUFFER; break; } default: { die("piMemImageCreate: unsuppported image_type."); } } - UrDesc.width = ImageDesc->image_width; - UrDesc.arraySize = ImageDesc->image_array_size; - UrDesc.arraySize = ImageDesc->image_array_size; + UrDesc->width = ImageDesc->image_width; + UrDesc->arraySize = ImageDesc->image_array_size; + UrDesc->arraySize = ImageDesc->image_array_size; +} + +inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, + const pi_image_format *ImageFormat, + const pi_image_desc *ImageDesc, void *HostPtr, + pi_mem *RetImage) { + + // TODO: implement read-only, write-only + if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { + die("piMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); + PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_flags_t UrFlags{}; + if (Flags & PI_MEM_FLAGS_ACCESS_RW) { + UrFlags |= UR_MEM_FLAG_READ_WRITE; + } + if (Flags & PI_MEM_ACCESS_READ_ONLY) { + UrFlags |= UR_MEM_FLAG_READ_ONLY; 
+ } + if (Flags & PI_MEM_FLAGS_HOST_PTR_USE) { + UrFlags |= UR_MEM_FLAG_USE_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) { + UrFlags |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } + if (Flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { + UrFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; + } + + ur_image_format_t UrFormat{}; + ur_image_desc_t UrDesc{}; + pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); + // TODO: UrDesc doesn't have something for ImageDesc->buffer ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); @@ -2485,16 +2495,28 @@ inline pi_result piextMemImageCreateWithNativeHandle( PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); - std::ignore = NativeHandle; - std::ignore = Context; - std::ignore = OwnNativeHandle; - std::ignore = ImageFormat; - std::ignore = ImageDesc; - std::ignore = RetImage; - - // ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); - // HANDLE_ERRORS(urMemImageCreateWithNativeHandle(UrContext, OwnNativeHandle, - // HostPtr, UrMem)); + ur_native_handle_t UrNativeMem = + reinterpret_cast(NativeHandle); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_mem_handle_t *UrMem = reinterpret_cast(RetImage); + ur_mem_native_properties_t Properties{}; + Properties.isNativeHandleOwned = OwnNativeHandle; + + ur_image_format_t UrFormat{}; + ur_image_desc_t UrDesc{}; + pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); + + ur_mem_image_native_properties_t ImageProperties{}; + ImageProperties.stype = UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES; + ImageProperties.pImageFormat = &UrFormat; + ImageProperties.pImageDesc = &UrDesc; + Properties.pNext = &ImageProperties; + + HANDLE_ERRORS( + urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 62f1bf19bf24e..6dc21eab41d4f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1588,49 +1588,6 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, return UR_RESULT_SUCCESS; } -#if 0 -UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( - ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. - ur_context_handle_t Context, ///< [in] handle of the context object - bool OwnNativeHandle, -/* - const ur_image_format_t - *ImageFormat, ///< [in] pointer to image format specification - const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description -*/ - ur_mem_handle_t - *Mem ///< [out] pointer to the handle of the mem object created. -) { - - std::shared_lock Lock(Context->Mutex); - - ze_image_handle_t ZeImage = ur_cast(NativeMem); - -try { - auto UrImage = - new _ur_image(ur_cast(Context), ZeImage, OwnNativeHandle); - *Mem = reinterpret_cast(UrImage); - -/* -#ifndef NDEBUG - ZeStruct ZeImageDesc; - UR_CALL(ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc)); - - UrImage->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG -*/ - - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) 
{ - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; - -} -#endif - UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags @@ -1657,8 +1614,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( (Context->ZeContext, Device->ZeDevice, &ZeImageDesc, &ZeImage)); try { - auto UrImage = - new _ur_image(ur_cast(Context), ZeImage); + auto UrImage = new _ur_image(Context, ZeImage); *Mem = reinterpret_cast(UrImage); #ifndef NDEBUG @@ -1684,6 +1640,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_SUCCESS; } +#if 0 +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. + ur_context_handle_t Context, ///< [in] handle of the context object. + const ur_mem_native_properties_t * + Properties, ///< [in][optional] pointer to native memory creation properties. + ur_mem_handle_t + *Mem ///< [out] pointer to handle of memory object created. +) { + std::shared_lock Lock(Context->Mutex); + + ze_image_handle_t ZeHImage = ur_cast(NativeMem); + + _ur_image *Image = nullptr; + try { + Image = new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); + *Mem = reinterpret_cast(Image); + +#ifndef NDEBUG + ZeStruct ZeImageDesc; + if (Properties->pNext != nullptr) { + ur_base_desc_t *BaseDesc = reinterpret_cast(Properties->pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { + ur_mem_image_native_properties_t *ImageProperties = reinterpret_cast(Properties->pNext); + ur_result_t Res = ur2zeImageDesc(ImageProperties->pImageFormat, + ImageProperties->pImageDesc, + ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; + } + } + } + Image->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} +#endif + UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_mem_flags_t Flags, ///< [in] allocation and usage information flags @@ -1792,12 +1795,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease( if (Mem->isImage()) { char *ZeHandleImage; - UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); - auto ZeResult = ZE_CALL_NOCHECK( - zeImageDestroy, (ur_cast(ZeHandleImage))); - // Gracefully handle the case that L0 was already unloaded. - if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) - return ze2urResult(ZeResult); + auto Image = static_cast<_ur_image *>(Mem); + if (Image->OwnNativeHandle) { + UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only)); + auto ZeResult = ZE_CALL_NOCHECK( + zeImageDestroy, (ur_cast(ZeHandleImage))); + // Gracefully handle the case that L0 was already unloaded. 
+ if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) + return ze2urResult(ZeResult); + } } else { auto Buffer = reinterpret_cast<_ur_buffer *>(Mem); Buffer->free(); @@ -1866,6 +1872,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( std::shared_lock Lock(Context->Mutex); + // Check if this is an image + { + if (Properties->pNext != nullptr) { + ur_base_desc_t *BaseDesc = + reinterpret_cast(Properties->pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { + ur_mem_image_native_properties_t *ImageProperties = + reinterpret_cast( + Properties->pNext); + + ze_image_handle_t ZeHImage = ur_cast(NativeMem); + + _ur_image *Image = nullptr; + try { + Image = + new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); + *Mem = reinterpret_cast(Image); + +#ifndef NDEBUG + ZeStruct ZeImageDesc; + ur_result_t Res = + ur2zeImageDesc(ImageProperties->pImageFormat, + ImageProperties->pImageDesc, ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; + } + Image->ZeImageDesc = ZeImageDesc; +#endif // !NDEBUG + + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + return UR_RESULT_SUCCESS; + } + } + } + // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -1965,7 +2012,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo( size_t *PropSizeRet ///< [out][optional] pointer to the actual size in ///< bytes of data queried by pMemInfo. ) { - UR_ASSERT(!Memory->isImage(), UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(MemInfoType == UR_MEM_INFO_CONTEXT || !Memory->isImage(), + UR_RESULT_ERROR_INVALID_VALUE); auto Buffer = reinterpret_cast<_ur_buffer *>(Memory); std::shared_lock Lock(Buffer->Mutex); From 68413643bea41bfeb081d23a79f6884340bf68e1 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 27 Apr 2023 11:58:29 -0700 Subject: [PATCH 30/50] Port Fix handling of mem_channel buffer property https://github.com/intel/llvm/pull/9203 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index c0c4f9958aaf0..cd4513011565d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -118,6 +118,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(Capabilities); } + case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(pi_bool{false}); default: // TODO: implement other parameters die("urGetContextInfo: unsuppported ParamName."); From a928c7508161cbf37dc598d7ba7092597b451984 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 1 May 2023 18:25:56 -0700 Subject: [PATCH 31/50] Port Implement proper queries for aspect::ext_oneapi_srgb https://github.com/intel/llvm/pull/9243 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_device.cpp | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 
cd4513011565d..01f36c1814d66 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -118,8 +118,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(Capabilities); } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); + default: // TODO: implement other parameters die("urGetContextInfo: unsuppported ParamName."); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 4c15b61e19a6f..8e522602146d8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -768,6 +768,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(capabilities); } + case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(pi_bool{false}); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(pi_bool{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { From 5d6d006eafb008eaf6eeac8d7dddb19773b96d34 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 1 May 2023 20:59:34 -0700 Subject: [PATCH 32/50] Port Avoid leak of active barriers' events https://github.com/intel/llvm/pull/9275 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_queue.cpp | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index dc4801f6628b8..d91c1fd414181 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -1268,31 +1268,30 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; }; - // Do nothing if the queue is empty - if (!LastCommandEvent) - return UR_RESULT_SUCCESS; - - // For in-order queue just wait for the last command. - // If event is discarded then it can be in reset state or underlying level - // zero handle can have device scope, so we can't synchronize the last event. - if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); - } else { - // Otherwise sync all L0 queues/immediate command-lists. - for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { - for (auto &QueueGroup : QueueMap) { - if (Device->ImmCommandListUsed) { - for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); - } else { - for (auto &ZeQueue : QueueGroup.second.ZeQueues) - if (ZeQueue) - ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + if (LastCommandEvent) { + // For in-order queue just wait for the last command. + // If event is discarded then it can be in reset state or underlying level + // zero handle can have device scope, so we can't synchronize the last + // event. + if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); + } else { + // Otherwise sync all L0 queues/immediate command-lists. 
+ for (auto &QueueMap : {ComputeQueueGroupsByTID, CopyQueueGroupsByTID}) { + for (auto &QueueGroup : QueueMap) { + if (Device->ImmCommandListUsed) { + for (auto ImmCmdList : QueueGroup.second.ImmCmdLists) + syncImmCmdList(this, ImmCmdList); + } else { + for (auto &ZeQueue : QueueGroup.second.ZeQueues) + if (ZeQueue) + ZE2UR_CALL(zeHostSynchronize, (ZeQueue)); + } } } } + LastCommandEvent = nullptr; } - LastCommandEvent = nullptr; // With the entire queue synchronized, the active barriers must be done so we // can remove them. From 943afe7e7f3a97a1e9d719b034f737cf5e9b0bef Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 3 May 2023 13:54:53 -0700 Subject: [PATCH 33/50] Rebase loader Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 28 ++-- .../adapters/level_zero/ur_level_zero_mem.cpp | 149 ++++++++---------- .../adapters/level_zero/ur_level_zero_mem.hpp | 10 +- .../level_zero/ur_loader_interface.cpp | 4 +- sycl/plugins/unified_runtime/ur/ur.hpp | 5 - 6 files changed, 86 insertions(+), 112 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2cda6e083f6c4..e9dfeaa6e6fc9 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG af603dbef47adb62aafbf067931f0c9358a4cac6) + set(UNIFIED_RUNTIME_TAG 8cb3cb2891148a14ef84e840398a1ae8cd84cd6f) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 602b23329ec4d..7444981e836d0 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2509,14 +2509,8 @@ inline pi_result piextMemImageCreateWithNativeHandle( ur_image_desc_t UrDesc{}; pi2urImageDesc(ImageFormat, ImageDesc, &UrFormat, &UrDesc); - ur_mem_image_native_properties_t ImageProperties{}; - ImageProperties.stype = UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES; - ImageProperties.pImageFormat = &UrFormat; - ImageProperties.pImageDesc = &UrDesc; - Properties.pNext = &ImageProperties; - - HANDLE_ERRORS( - urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); + HANDLE_ERRORS(urMemImageCreateWithNativeHandle( + UrNativeMem, UrContext, &UrFormat, &UrDesc, &Properties, UrMem)); return PI_SUCCESS; } @@ -2630,8 +2624,8 @@ inline pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, // while we get it in interface ur_mem_native_properties_t Properties{}; Properties.isNativeHandleOwned = OwnNativeHandle; - HANDLE_ERRORS( - urMemCreateWithNativeHandle(UrNativeMem, UrContext, &Properties, UrMem)); + HANDLE_ERRORS(urMemBufferCreateWithNativeHandle(UrNativeMem, UrContext, + &Properties, UrMem)); return PI_SUCCESS; } @@ -2669,22 +2663,28 @@ inline pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context, auto UrDevice = reinterpret_cast(Device); ur_usm_desc_t USMDesc{}; + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC; + ur_usm_host_desc_t UsmHostDesc{}; + UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC; if (Properties) { if (Properties[0] == PI_MEM_ALLOC_FLAGS) { if (Properties[1] == 
PI_MEM_ALLOC_WRTITE_COMBINED) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_WRITE_COMBINED; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_WRITE_COMBINED; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_DEVICE) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_INITIAL_PLACEMENT; } if (Properties[1] == PI_MEM_ALLOC_INITIAL_PLACEMENT_HOST) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST; + UsmHostDesc.flags |= UR_USM_HOST_MEM_FLAG_INITIAL_PLACEMENT; } if (Properties[1] == PI_MEM_ALLOC_DEVICE_READ_ONLY) { - USMDesc.flags |= UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + UsmDeviceDesc.flags |= UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; } } } + UsmDeviceDesc.pNext = &UsmHostDesc; + USMDesc.pNext = &UsmDeviceDesc; USMDesc.align = Alignment; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 6dc21eab41d4f..ed30bf3c9e69b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -1640,15 +1640,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_SUCCESS; } -#if 0 UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. ur_context_handle_t Context, ///< [in] handle of the context object. - const ur_mem_native_properties_t * - Properties, ///< [in][optional] pointer to native memory creation properties. - ur_mem_handle_t - *Mem ///< [out] pointer to handle of memory object created. -) { + const ur_image_format_t + *ImageFormat, ///< [in] pointer to image format specification. + const ur_image_desc_t *ImageDesc, ///< [in] pointer to image description. + const ur_mem_native_properties_t + *Properties, ///< [in][optional] pointer to native memory creation + ///< properties. + ur_mem_handle_t *Mem) { std::shared_lock Lock(Context->Mutex); ze_image_handle_t ZeHImage = ur_cast(NativeMem); @@ -1660,19 +1661,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( #ifndef NDEBUG ZeStruct ZeImageDesc; - if (Properties->pNext != nullptr) { - ur_base_desc_t *BaseDesc = reinterpret_cast(Properties->pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { - ur_mem_image_native_properties_t *ImageProperties = reinterpret_cast(Properties->pNext); - ur_result_t Res = ur2zeImageDesc(ImageProperties->pImageFormat, - ImageProperties->pImageDesc, - ZeImageDesc); - if (Res != UR_RESULT_SUCCESS) { - delete Image; - *Mem = nullptr; - return Res; - } - } + ur_result_t Res = ur2zeImageDesc(ImageFormat, ImageDesc, ZeImageDesc); + if (Res != UR_RESULT_SUCCESS) { + delete Image; + *Mem = nullptr; + return Res; } Image->ZeImageDesc = ZeImageDesc; #endif // !NDEBUG @@ -1682,10 +1675,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - + return UR_RESULT_SUCCESS; } -#endif UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t Context, ///< [in] handle of the context object @@ -1861,58 +1853,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urMemCreateWithNativeHandle( - ur_native_handle_t NativeMem, ///< [in] the native handle of the mem. 
- ur_context_handle_t Context, ///< [in] handle of the context object - const ur_mem_native_properties_t *Properties, +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t NativeMem, ///< [in] the native handle to the memory. + ur_context_handle_t Context, ///< [in] handle of the context object. + const ur_mem_native_properties_t + *Properties, ///< [in][optional] pointer to native memory creation + ///< properties. ur_mem_handle_t - *Mem ///< [out] pointer to the handle of the mem object created. + *Mem ///< [out] pointer to handle of buffer memory object created. ) { bool OwnNativeHandle = Properties->isNativeHandleOwned; std::shared_lock Lock(Context->Mutex); - // Check if this is an image - { - if (Properties->pNext != nullptr) { - ur_base_desc_t *BaseDesc = - reinterpret_cast(Properties->pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_MEM_IMAGE_NATIVE_PROPERTIES) { - ur_mem_image_native_properties_t *ImageProperties = - reinterpret_cast( - Properties->pNext); - - ze_image_handle_t ZeHImage = ur_cast(NativeMem); - - _ur_image *Image = nullptr; - try { - Image = - new _ur_image(Context, ZeHImage, Properties->isNativeHandleOwned); - *Mem = reinterpret_cast(Image); - -#ifndef NDEBUG - ZeStruct ZeImageDesc; - ur_result_t Res = - ur2zeImageDesc(ImageProperties->pImageFormat, - ImageProperties->pImageDesc, ZeImageDesc); - if (Res != UR_RESULT_SUCCESS) { - delete Image; - *Mem = nullptr; - return Res; - } - Image->ZeImageDesc = ZeImageDesc; -#endif // !NDEBUG - - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - return UR_RESULT_SUCCESS; - } - } - } - // Get base of the allocation void *Base = nullptr; size_t Size = 0; @@ -2075,8 +2028,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_flags_t *USMFlag = &USMDesc->flags; - std::ignore = USMFlag; + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is @@ -2105,9 +2058,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Align & (Align - 1)) != 0)) { - ur_usm_flags_t Properties{}; - ur_result_t Res = - USMHostAllocImpl(RetMem, Context, &Properties, Size, Align); + ur_usm_host_mem_flags_t Flags{}; + ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2158,8 +2110,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_flags_t *USMProp = &USMDesc->flags; - std::ignore = USMProp; + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; ur_platform_handle_t Plt = Device->Platform; @@ -2236,11 +2188,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ) { std::ignore = Pool; - const ur_usm_flags_t *Properties = &USMDesc->flags; uint32_t Alignment = USMDesc->align; + ur_usm_host_mem_flags_t UsmHostFlags{}; + // See if the memory is going to be read-only on the device. 
- bool DeviceReadOnly = *Properties & UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; + bool DeviceReadOnly = false; + ur_usm_device_mem_flags_t UsmDeviceFlags{}; + + void *pNext = const_cast(USMDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { + const ur_usm_device_desc_t *UsmDeviceDesc = + reinterpret_cast(pNext); + UsmDeviceFlags = UsmDeviceDesc->flags; + } + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { + const ur_usm_host_desc_t *UsmHostDesc = + reinterpret_cast(pNext); + UsmHostFlags = UsmHostDesc->flags; + } + pNext = const_cast(BaseDesc->pNext); + } + DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. @@ -2271,9 +2243,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // keep the same behavior for the allocator, just call L0 API directly and // return the error code. ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl( - RetMem, Context, Device, const_cast(Properties), Size, - Alignment); + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, + &UsmDeviceFlags, Size, Alignment); if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -2423,16 +2394,18 @@ void USMMemoryAllocBase::deallocate(void *Ptr) { ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, Alignment); } ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - ur_usm_flags_t Props = UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY; - return USMSharedAllocImpl(ResultPtr, Context, Device, &Props, Size, - Alignment); + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_host_desc_t UsmHostDesc{}; + return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, + &UsmHostDesc.flags, Size, Alignment); } ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, @@ -2536,7 +2509,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_device_mem_flags_t *Flags, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeDesc; @@ -2562,8 +2535,10 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_flags_t *, - size_t Size, uint32_t Alignment) { + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -2593,7 +2568,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, } ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_host_mem_flags_t *Flags, size_t Size, uint32_t Alignment) { // TODO: translate PI properties 
to Level Zero flags ZeStruct ZeHostDesc; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 9661063f0e5f2..fa0aa966688d5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -284,15 +284,17 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_device_mem_flags_t *Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, ur_usm_flags_t *, - size_t Size, uint32_t Alignment); + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_flags_t *Properties, size_t Size, + ur_usm_host_mem_flags_t *Flags, size_t Size, uint32_t Alignment); // If indirect access tracking is not enabled then this functions just performs diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 01b174aa93774..0d37c805bfb2b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -159,7 +159,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetMemProcAddrTable( pDdiTable->pfnRelease = urMemRelease; pDdiTable->pfnBufferPartition = urMemBufferPartition; pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; - pDdiTable->pfnCreateWithNativeHandle = urMemCreateWithNativeHandle; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnImageGetInfo = urMemImageGetInfo; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 790c2fd39bd00..e5bd87108e824 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -71,11 +71,6 @@ const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; -const int UR_EXT_USM_MEM_FLAG_WRITE_COMBINED = 1 << 27; -const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_DEVICE = 1 << 28; -const int UR_EXT_USM_MEM_FLAG_INITIAL_PLACEMENT_HOST = 1 << 29; -const int UR_EXT_USM_MEM_FLAG_DEVICE_READ_ONLY = 1 << 30; - const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); From e1e5f631d3f08db43ead7a6c2463730e9f0e52d2 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Wed, 3 May 2023 20:04:45 -0700 Subject: [PATCH 34/50] fix interop image Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_common.hpp | 8 +++----- .../ur/adapters/level_zero/ur_level_zero_event.cpp | 2 +- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 9 ++++----- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 491c58e668763..f3a8ba48b2eba 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -393,17 +393,15 @@ struct _ur_object { // for each memory allocation. struct MemAllocRecord : _ur_object { MemAllocRecord(ur_context_handle_t Context, bool OwnZeMemHandle = true) - : Context(Context), OwnZeMemHandle(OwnZeMemHandle) {} + : Context(Context) { + OwnNativeHandle = OwnZeMemHandle; + } // Currently kernel can reference memory allocations from different contexts // and we need to know the context of a memory allocation when we release it // in piKernelRelease. // TODO: this should go away when memory isolation issue is fixed in the Level // Zero runtime. ur_context_handle_t Context; - - // Indicates if we own the native memory handle or it came from interop that - // asked to not transfer the ownership to SYCL RT. - bool OwnZeMemHandle; }; extern usm_settings::USMAllocatorConfig USMAllocatorConfigInstance; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 197ec1e8f70aa..446caee4a7e51 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -813,7 +813,7 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked) { for (auto &MemAlloc : Kernel->MemAllocs) { // std::pair *, Hash USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, - MemAlloc->second.OwnZeMemHandle); + MemAlloc->second.OwnNativeHandle); } Kernel->MemAllocs.clear(); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index fa0aa966688d5..12f782d862ac7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -192,9 +192,10 @@ struct _ur_image final : ur_mem_handle_t_ { : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} {} _ur_image(ur_context_handle_t UrContext, ze_image_handle_t ZeImage, - bool OwnNativeHandle) - : ur_mem_handle_t_(UrContext), ZeImage{ZeImage}, - OwnZeMemHandle{OwnNativeHandle} {} + bool OwnZeMemHandle) + : ur_mem_handle_t_(UrContext), ZeImage{ZeImage} { + OwnNativeHandle = OwnZeMemHandle; + } virtual ur_result_t getZeHandle(char *&ZeHandle, access_mode_t, ur_device_handle_t = nullptr) override { @@ -216,8 +217,6 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; - - bool OwnZeMemHandle = true; }; // Implements memory allocation via L0 RT for USM allocator interface. 
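Editor's note between patches 34 and 35: the interop-ownership convention that the preceding hunks converge on (record Properties->isNativeHandleOwned at creation, destroy the Level Zero handle on release only when owned) can be summarized in a short sketch. adoptNativeImage and releaseImage are hypothetical helpers used only for illustration; the real code paths are urMemImageCreateWithNativeHandle and urMemRelease shown above, and error handling is elided here.

// Sketch only, assuming the _ur_image constructor and the OwnNativeHandle
// member introduced by the patches above.
static _ur_image *adoptNativeImage(ur_context_handle_t Context,
                                   ze_image_handle_t ZeImage,
                                   const ur_mem_native_properties_t *Properties) {
  bool Owned = Properties ? Properties->isNativeHandleOwned : false;
  return new _ur_image(Context, ZeImage, Owned);
}

static void releaseImage(_ur_image *Image) {
  if (Image->OwnNativeHandle)
    zeImageDestroy(Image->ZeImage); // skip destroy for non-owned interop handles
  delete Image;
}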
From 5eaf7f37112244f2b86bd38dbfcb378f99eb9fb7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 8 May 2023 18:47:20 -0700 Subject: [PATCH 35/50] Port Port PI L0 environment variables to UR L0 https://github.com/intel/llvm/pull/9300 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 21 ++++++-- .../level_zero/ur_level_zero_device.cpp | 18 ++++--- .../level_zero/ur_level_zero_device.hpp | 2 +- .../level_zero/ur_level_zero_event.cpp | 7 ++- .../level_zero/ur_level_zero_event.hpp | 12 +++-- .../adapters/level_zero/ur_level_zero_mem.cpp | 15 ++++-- .../adapters/level_zero/ur_level_zero_mem.hpp | 4 +- .../level_zero/ur_level_zero_queue.cpp | 50 ++++++++++++------- sycl/plugins/unified_runtime/ur/ur.cpp | 1 - .../ur/usm_allocator_config.cpp | 2 - 10 files changed, 89 insertions(+), 43 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 01f36c1814d66..4c998fb6294ea 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -73,8 +73,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextRelease( // Due to a bug with 2D memory copy to and from non-USM pointers, this option is // disabled by default. static const bool UseMemcpy2DOperations = [] { + const char *UrRet = std::getenv("UR_L0_USE_NATIVE_USM_MEMCPY2D"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); const char *UseMemcpy2DOperationsFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!UseMemcpy2DOperationsFlag) return false; return std::stoi(UseMemcpy2DOperationsFlag) > 0; @@ -409,8 +411,10 @@ ur_result_t ur_context_handle_t_::finalize() { // here. Setting it to 256 gave best possible performance for several // benchmarks. static const pi_uint32 MaxNumEventsPerPool = [] { - const auto MaxNumEventsPerPoolEnv = - std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); + const char *MaxNumEventsPerPoolEnv = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); uint32_t Result = MaxNumEventsPerPoolEnv ? std::atoi(MaxNumEventsPerPoolEnv) : 256; if (Result <= 0) @@ -531,8 +535,12 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { // If number of events in the immediate command list exceeds this threshold then // cleanup process for those events is executed. static const size_t ImmCmdListsEventCleanupThreshold = [] { - const char *ImmCmdListsEventCleanupThresholdStr = std::getenv( + const char *UrRet = + std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + const char *PiRet = std::getenv( "SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD"); + const char *ImmCmdListsEventCleanupThresholdStr = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); static constexpr int Default = 1000; if (!ImmCmdListsEventCleanupThresholdStr) return Default; @@ -549,8 +557,11 @@ static const size_t ImmCmdListsEventCleanupThreshold = [] { // Get value of the threshold for number of active command lists allowed before // we start heuristically cleaning them up. 
static const size_t CmdListsCleanupThreshold = [] { - const char *CmdListsCleanupThresholdStr = + const char *UrRet = std::getenv("UR_L0_COMMANDLISTS_CLEANUP_THRESHOLD"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD"); + const char *CmdListsCleanupThresholdStr = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); static constexpr int Default = 20; if (!CmdListsCleanupThresholdStr) return Default; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 8e522602146d8..5fac6f1e4d77a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -792,7 +792,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_SUCCESS; } -// SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE can be set to an integer value, or +// UR_L0_USE_COPY_ENGINE can be set to an integer value, or // a pair of integer values of the form "lower_index:upper_index". // Here, the indices point to copy engines in a list of all available copy // engines. @@ -802,7 +802,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // available copy engines can be used. const std::pair getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { - static const char *EnvVar = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE"); + static const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); // If the environment variable is not set, no copy engines are used when // immediate commandlists are being used. For standard commandlists all are // used. @@ -825,7 +827,7 @@ getRangeOfAllowedCopyEngines(const ur_device_handle_t &Device) { int UpperCopyEngineIndex = std::stoi(CopyEngineRange.substr(pos + 1)); if ((LowerCopyEngineIndex > UpperCopyEngineIndex) || (LowerCopyEngineIndex < -1) || (UpperCopyEngineIndex < -1)) { - urPrint("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " + urPrint("UR_L0_LEVEL_ZERO_USE_COPY_ENGINE: invalid value provided, " "default set.\n"); LowerCopyEngineIndex = 0; UpperCopyEngineIndex = INT_MAX; @@ -843,16 +845,20 @@ bool CopyEngineRequested(const ur_device_handle_t &Device) { // The default is standard commandlists. Setting 1 or 2 specifies use of // immediate commandlists. Note: when immediate commandlists are used then // device-only events must be either AllHostVisible or OnDemandHostVisibleProxy. -// (See env var SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS). +// (See env var UR_L0_DEVICE_SCOPE_EVENTS). // Get value of immediate commandlists env var setting or -1 if unset ur_device_handle_t_::ImmCmdlistMode ur_device_handle_t_::useImmediateCommandLists() { // If immediate commandlist setting is not explicitly set, then use the device // default. + // TODO: confirm this is good once make_queue revert is added static const int ImmediateCommandlistsSetting = [] { - const char *ImmediateCommandlistsSettingStr = + const char *UrRet = std::getenv("UR_L0_USE_IMMEDIATE_COMMANDLISTS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS"); + const char *ImmediateCommandlistsSettingStr = + UrRet ? UrRet : (PiRet ? 
PiRet : nullptr); if (!ImmediateCommandlistsSettingStr) return -1; return std::stoi(ImmediateCommandlistsSettingStr); @@ -1122,7 +1128,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDevicePartition( // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. // However, if - // SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // UR_L0_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that // still expose CSlices in partitioning by affinity domain for compatibility // reasons. if (Properties[0] == UR_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 09e942a6441b8..8aff6f170127f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -137,7 +137,7 @@ struct ur_device_handle_t_ : _ur_object { ImmCmdlistMode ImmCommandListUsed{}; // Scope of events used for events on the device - // Can be adjusted with SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS + // Can be adjusted with UR_L0_DEVICE_SCOPE_EVENTS // for non-immediate command lists EventsScope ZeEventsScope = AllHostVisible; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 446caee4a7e51..d39c40982bd6f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -28,8 +28,11 @@ void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { // This is an experimental option that allows the use of multiple command lists // when submitting barriers. The default is 0. static const bool UseMultipleCmdlistBarriers = [] { - const char *UseMultipleCmdlistBarriersFlag = + const char *UrRet = std::getenv("UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS"); + const char *UseMultipleCmdlistBarriersFlag = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!UseMultipleCmdlistBarriersFlag) return true; return std::stoi(UseMultipleCmdlistBarriersFlag) > 0; @@ -162,7 +165,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // If we have a list of events to make the barrier from, then we can create a // barrier on these and use the resulting event as our future barrier. // We use the same approach if - // SYCL_PI_LEVEL_ZERO_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a + // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a // positive value. // We use the same approach if we have in-order queue because every command // depends on previous one, so we don't need to insert barrier to multiple diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index fcb3b156af0db..9e129adb0fb7e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -35,8 +35,10 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, // This is an experimental option that allows to disable caching of events in // the context. 
const bool DisableEventsCaching = [] { + const char *UrRet = std::getenv("UR_L0_DISABLE_EVENTS_CACHING"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); const char *DisableEventsCachingFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!DisableEventsCachingFlag) return false; return std::stoi(DisableEventsCachingFlag) != 0; @@ -45,8 +47,10 @@ const bool DisableEventsCaching = [] { // This is an experimental option that allows reset and reuse of uncompleted // events in the in-order queue with discard_events property. const bool ReuseDiscardedEvents = [] { + const char *UrRet = std::getenv("UR_L0_REUSE_DISCARDED_EVENTS"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); const char *ReuseDiscardedEventsFlag = - std::getenv("SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!ReuseDiscardedEventsFlag) return true; return std::stoi(ReuseDiscardedEventsFlag) > 0; @@ -236,8 +240,10 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, // Get value of device scope events env var setting or default setting static const EventsScope DeviceEventsSetting = [] { + char *UrRet = std::getenv("UR_L0_DEVICE_SCOPE_EVENTS"); + char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); const char *DeviceEventsSettingStr = - std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS"); + UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (DeviceEventsSettingStr) { // Override the default if user has explicitly chosen the events scope. switch (std::stoi(DeviceEventsSettingStr)) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index ed30bf3c9e69b..43c0d691f5ad0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -18,8 +18,10 @@ // Default to using compute engine for fill operation, but allow to // override this with an environment variable. static bool PreferCopyEngine = [] { - const char *Env = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); - return Env ? std::stoi(Env) != 0 : false; + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return (UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0)); }(); // Helper function to check if a pointer is a device pointer. @@ -2433,7 +2435,9 @@ enum class USMAllocationForceResidencyType { // Returns the desired USM residency setting static USMAllocationForceResidencyType USMAllocationForceResidency = [] { - const auto Str = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (!Str) return USMAllocationForceResidencyType::P2PDevices; switch (std::atoi(Str)) { @@ -2861,8 +2865,11 @@ ur_result_t _ur_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode, // cross-tile traffic. // static const bool SingleRootDeviceBufferMigration = [] { - const char *EnvStr = + const char *UrRet = + std::getenv("UR_L0_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION"); + const char *EnvStr = UrRet ? 
UrRet : (PiRet ? PiRet : nullptr); if (EnvStr) return (std::stoi(EnvStr) != 0); // The default is to migrate normally, which may not always be the diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 12f782d862ac7..e9ad0d49bbdbb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -31,8 +31,10 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) const bool UseCopyEngineForD2DCopy = [] { - const char *CopyEngineForD2DCopy = + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_D2D_COPY"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY"); + const char *CopyEngineForD2DCopy = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return (CopyEngineForD2DCopy && (std::stoi(CopyEngineForD2DCopy) != 0)); }(); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index d91c1fd414181..fe81cd1e2a3a0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -260,7 +260,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( // paths be less likely affected. // static bool doEagerInit = [] { - const char *EagerInit = std::getenv("SYCL_EAGER_INIT"); + const char *UrRet = std::getenv("UR_L0_EAGER_INIT"); + const char *PiRet = std::getenv("SYCL_EAGER_INIT"); + const char *EagerInit = UrRet ? UrRet : (PiRet ? PiRet : nullptr); return EagerInit ? std::atoi(EagerInit) != 0 : false; }(); @@ -549,8 +551,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( // TODO: this currently exhibits some issues in the driver, so // we control this with an env var. Remove this control when // we settle one way or the other. + const char *UrRet = std::getenv("UR_L0_QUEUE_FINISH_HOLD_LOCK"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK"); static bool HoldLock = - std::getenv("SYCL_PI_LEVEL_ZERO_QUEUE_FINISH_HOLD_LOCK") != nullptr; + UrRet ? std::stoi(UrRet) : (PiRet ? std::stoi(PiRet) : 0); if (!HoldLock) { Lock.unlock(); } @@ -623,9 +628,16 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { zeCommandListBatchConfig Config{}; // default initialize // Default value of 0. This specifies to use dynamic batch size adjustment. - const auto BatchSizeStr = - (IsCopy) ? std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE") - : std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + const char *UrRet = nullptr; + const char *PiRet = nullptr; + if (IsCopy) { + UrRet = std::getenv("UR_L0_COPY_BATCH_SIZE"); + PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE"); + } else { + UrRet = std::getenv("UR_L0_BATCH_SIZE"); + PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_BATCH_SIZE"); + } + const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (BatchSizeStr) { pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); // Level Zero may only support a limted number of commands per command @@ -658,10 +670,9 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { Val = std::stoi(BatchConfig.substr(Pos)); } catch (...) 
{ if (IsCopy) - urPrint( - "SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: failed to parse value\n"); + urPrint("UR_L0_COPY_BATCH_SIZE: failed to parse value\n"); else - urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: failed to parse value\n"); + urPrint("UR_L0_BATCH_SIZE: failed to parse value\n"); break; } switch (Ord) { @@ -684,27 +695,26 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { die("Unexpected batch config"); } if (IsCopy) - urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: dynamic batch param " + urPrint("UR_L0_COPY_BATCH_SIZE: dynamic batch param " "#%d: %d\n", (int)Ord, (int)Val); else - urPrint( - "SYCL_PI_LEVEL_ZERO_BATCH_SIZE: dynamic batch param #%d: %d\n", - (int)Ord, (int)Val); + urPrint("UR_L0_BATCH_SIZE: dynamic batch param #%d: %d\n", (int)Ord, + (int)Val); }; } else { // Negative batch sizes are silently ignored. if (IsCopy) - urPrint("SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE: ignored negative value\n"); + urPrint("UR_L0_COPY_BATCH_SIZE: ignored negative value\n"); else - urPrint("SYCL_PI_LEVEL_ZERO_BATCH_SIZE: ignored negative value\n"); + urPrint("UR_L0_BATCH_SIZE: ignored negative value\n"); } } return Config; } -// SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in +// UR_L0_LEVEL_ZERO_USE_COMPUTE_ENGINE can be set to an integer (>=0) in // which case all compute commands will be submitted to the command-queue // with the given index in the compute command group. If it is instead set // to negative then all available compute engines may be used. @@ -712,8 +722,9 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { // The default value is "0". // static const std::pair getRangeOfAllowedComputeEngines() { - static const char *EnvVar = - std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + const char *UrRet = std::getenv("UR_L0_USE_COMPUTE_ENGINE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COMPUTE_ENGINE"); + const char *EnvVar = UrRet ? UrRet : (PiRet ? PiRet : nullptr); // If the environment variable is not set only use "0" CCS for now. // TODO: allow all CCSs when HW support is complete. if (!EnvVar) @@ -1769,8 +1780,11 @@ ur_result_t ur_queue_handle_t_::insertStartBarrierIfDiscardEventsMode( // available in the device, in Level Zero plugin for copy operations submitted // to an in-order queue. The default is 1. static const bool UseCopyEngineForInOrderQueue = [] { - const char *CopyEngineForInOrderQueue = + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE"); + const char *CopyEngineForInOrderQueue = + UrRet ? UrRet : (PiRet ? PiRet : nullptr); return (!CopyEngineForInOrderQueue || (std::stoi(CopyEngineForInOrderQueue) != 0)); }(); diff --git a/sycl/plugins/unified_runtime/ur/ur.cpp b/sycl/plugins/unified_runtime/ur/ur.cpp index 0db860fbd0daa..319e95bde6e72 100644 --- a/sycl/plugins/unified_runtime/ur/ur.cpp +++ b/sycl/plugins/unified_runtime/ur/ur.cpp @@ -15,7 +15,6 @@ bool PrintTrace = [] { const char *UrRet = std::getenv("UR_L0_TRACE"); const char *PiRet = std::getenv("SYCL_PI_TRACE"); const char *Trace = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - const int TraceValue = Trace ? 
std::stoi(Trace) : 0; if (TraceValue == -1 || TraceValue == 2) { // Means print all traces return true; diff --git a/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp b/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp index 30b67945ad28a..8d77a67b1a6e4 100644 --- a/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp +++ b/sycl/plugins/unified_runtime/ur/usm_allocator_config.cpp @@ -184,7 +184,6 @@ USMAllocatorConfig::USMAllocatorConfig() { const char *UrRet = std::getenv("UR_L0_USM_ALLOCATOR"); const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR"); const char *PoolParams = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (PoolParams != nullptr) { std::string Params(PoolParams); size_t Pos = Params.find(';'); @@ -229,7 +228,6 @@ USMAllocatorConfig::USMAllocatorConfig() { const char *PoolTraceVal = UrRetUsmAllocator ? UrRetUsmAllocator : (PiRetUsmAllocator ? PiRetUsmAllocator : nullptr); - int PoolTrace = 0; if (PoolTraceVal != nullptr) { PoolTrace = std::atoi(PoolTraceVal); From a47af47361a879c9dd58f94910a6913070f446d7 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 11 May 2023 15:25:15 -0700 Subject: [PATCH 36/50] fix implementation of urKernelSetArgPointer Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_kernel.cpp | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index be7e88ddb6923..da98f12f2580a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -608,22 +608,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ///< holding the argument value. If null then argument ///< value is considered null. ) { - std::ignore = Kernel; - std::ignore = ArgIndex; - std::ignore = ArgValue; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( - ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object - uint32_t ArgIndex, ///< [in] argument index in range [0, num args - 1] - size_t ArgSize, ///< [in] size of argument type - const void *ArgValue ///< [in][optional] SVM pointer to memory location - ///< holding the argument value. If null then argument - ///< value is considered null. 
-) { - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, ArgSize, ArgValue)); + UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); return UR_RESULT_SUCCESS; } From ce3aeb176734593d551eea905359080b8a407733 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 12 May 2023 15:38:19 +0100 Subject: [PATCH 37/50] Bump UR + various pi2ur fixes to allow rebasing cuda adapter (#8) --- sycl/plugins/unified_runtime/CMakeLists.txt | 6 +- sycl/plugins/unified_runtime/pi2ur.hpp | 362 ++++++++++++++---- .../level_zero/ur_level_zero_context.cpp | 2 +- .../level_zero/ur_level_zero_device.cpp | 23 +- .../adapters/level_zero/ur_level_zero_mem.cpp | 41 +- .../level_zero/ur_level_zero_queue.cpp | 2 +- sycl/plugins/unified_runtime/ur/ur.hpp | 57 +-- 7 files changed, 354 insertions(+), 139 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index e9dfeaa6e6fc9..0b4bcef273b73 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -3,8 +3,8 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_DIR) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/jandres742/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 8cb3cb2891148a14ef84e840398a1ae8cd84cd6f) + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_TAG 0125b2b42aea73c350f7961cd68e0f1f94cc1238) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime @@ -37,7 +37,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D # Restore original flags set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}") - add_library(UnifiedRuntimeLoader ALIAS loader) + add_library(UnifiedRuntimeLoader ALIAS ur_loader) set(UNIFIED_RUNTIME_SOURCE_DIR ${unified-runtime_SOURCE_DIR} CACHE PATH "Path to Unified Runtime Headers") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 7444981e836d0..2f3b6211cb46b 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -381,6 +381,76 @@ inline pi_result ur2piDeviceInfoValue(ur_device_info_t ParamName, return PI_SUCCESS; } +// Translate UR device info values to PI info values +inline pi_result ur2piUSMAllocInfoValue(ur_usm_alloc_info_t ParamName, + size_t ParamValueSizePI, + size_t *ParamValueSizeUR, + void *ParamValue) { + ConvertHelper Value(ParamValueSizePI, ParamValue, ParamValueSizeUR); + + if (ParamName == UR_USM_ALLOC_INFO_TYPE) { + auto ConvertFunc = [](ur_usm_type_t UrValue) { + switch (UrValue) { + case UR_USM_TYPE_UNKNOWN: + return PI_MEM_TYPE_UNKNOWN; + case UR_USM_TYPE_HOST: + return PI_MEM_TYPE_HOST; + case UR_USM_TYPE_DEVICE: + return PI_MEM_TYPE_DEVICE; + case UR_USM_TYPE_SHARED: + return PI_MEM_TYPE_SHARED; + default: + die("UR_USM_ALLOC_INFO_TYPE: unhandled value"); + } + }; + return Value.convert(ConvertFunc); + } + + return PI_SUCCESS; +} + +// Handle mismatched PI and UR type return sizes for info queries +inline pi_result fixupInfoValueTypes(size_t ParamValueSizeUR, + size_t *ParamValueSizeRetPI, + void *ParamValue) { + if (ParamValueSizeUR == 1) { + // extend bool to pi_bool (uint32_t) + auto *ValIn = static_cast(ParamValue); + auto *ValOut = static_cast(ParamValue); + *ValOut = static_cast(*ValIn); + if (ParamValueSizeRetPI) { + *ParamValueSizeRetPI = sizeof(pi_bool); + } + } + + return PI_SUCCESS; +} + + +inline 
ur_result_t +mapPIMetadataToUR(const pi_device_binary_property *pi_metadata, + ur_program_metadata_t *ur_metadata) { + ur_metadata->pName = (*pi_metadata)->Name; + ur_metadata->size = (*pi_metadata)->ValSize; + switch ((*pi_metadata)->Type) { + case PI_PROPERTY_TYPE_UINT32: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_UINT32; + ur_metadata->value.data32 = (*pi_metadata)->ValSize; + return UR_RESULT_SUCCESS; + case PI_PROPERTY_TYPE_BYTE_ARRAY: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_BYTE_ARRAY; + ur_metadata->value.pData = (*pi_metadata)->ValAddr; + return UR_RESULT_SUCCESS; + case PI_PROPERTY_TYPE_STRING: + ur_metadata->type = UR_PROGRAM_METADATA_TYPE_STRING; + ur_metadata->value.pString = + reinterpret_cast((*pi_metadata)->ValAddr); + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_VALUE; + } +} + namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { @@ -476,6 +546,8 @@ inline pi_result piPlatformGetInfo(pi_platform Platform, ParamValue, ParamValueSizeRet)); ur2piPlatformInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); + return PI_SUCCESS; } @@ -827,68 +899,65 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE; break; case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE; + InfoType = UR_DEVICE_INFO_BUILD_ON_SUBDEVICE; break; case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D; + InfoType = UR_DEVICE_INFO_MAX_WORK_GROUPS_3D; break; case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; + InfoType = UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; break; case PI_DEVICE_INFO_DEVICE_ID: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_DEVICE_ID; + InfoType = UR_DEVICE_INFO_DEVICE_ID; break; case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_GLOBAL_MEM_FREE; + InfoType = UR_DEVICE_INFO_GLOBAL_MEM_FREE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MEMORY_CLOCK_RATE; + InfoType = UR_DEVICE_INFO_MEMORY_CLOCK_RATE; break; case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH; + InfoType = UR_DEVICE_INFO_MEMORY_BUS_WIDTH; break; case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; + InfoType = UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES; break; case PI_DEVICE_INFO_GPU_SLICES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_GPU_EU_SLICES; + InfoType = UR_DEVICE_INFO_GPU_EU_SLICES; break; case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; + InfoType = UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE; break; case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU; + InfoType = UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU; break; case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; + InfoType = UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH; break; case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_BFLOAT16; + InfoType = UR_DEVICE_INFO_BFLOAT16; break; case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: 
- InfoType = - (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - InfoType = - (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES; break; case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES; + InfoType = UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES; break; case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - InfoType = (ur_device_info_t)UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT; + InfoType = UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT; break; case PI_DEVICE_INFO_IMAGE_SRGB: - InfoType = (ur_device_info_t)UR_DEVICE_INFO_IMAGE_SRGB; + InfoType = UR_DEVICE_INFO_IMAGE_SRGB; break; case PI_DEVICE_INFO_BACKEND_VERSION: { - // TODO: return some meaningful for backend_version below - ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); - return ReturnValue(""); + InfoType = UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION; + break; } default: return PI_ERROR_UNKNOWN; @@ -903,6 +972,7 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, ParamValueSizeRet)); ur2piDeviceInfoValue(InfoType, ParamValueSize, &SizeInOut, ParamValue); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); return PI_SUCCESS; } @@ -1074,13 +1144,12 @@ inline pi_result piContextCreate(const pi_context_properties *Properties, return PI_SUCCESS; } -// FIXME: Dummy implementation to prevent link fail inline pi_result piextContextSetExtendedDeleter( pi_context Context, pi_context_extended_deleter Function, void *UserData) { - std::ignore = Context; - std::ignore = Function; - std::ignore = UserData; - die("piextContextSetExtendedDeleter: not supported"); + auto hContext = reinterpret_cast(Context); + + HANDLE_ERRORS(urContextSetExtendedDeleter(hContext, Function, UserData)); + return PI_SUCCESS; } @@ -1164,6 +1233,8 @@ inline pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName, HANDLE_ERRORS(urContextGetInfo(hContext, ContextInfoType, ParamValueSize, ParamValue, ParamValueSizeRet)); + fixupInfoValueTypes(ParamValueSize, ParamValueSizeRet, ParamValue); + return PI_SUCCESS; } @@ -1213,6 +1284,7 @@ inline pi_result piextQueueCreate(pi_context Context, pi_device Device, PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); ur_queue_properties_t UrProperties{}; + UrProperties.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) @@ -1252,6 +1324,12 @@ inline pi_result piQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } +inline pi_result piextQueueCreate2(pi_context context, pi_device device, + pi_queue_properties *properties, + pi_queue *queue) { + return pi2ur::piextQueueCreate(context, device, properties, queue); +} + inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_device Device, @@ -1275,6 +1353,16 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, return 
PI_SUCCESS; } +inline pi_result piextQueueCreateWithNativeHandle2( + pi_native_handle nativeHandle, int32_t nativeHandleDesc, pi_context context, + pi_device device, bool pluginOwnsNativeHandle, + pi_queue_properties *Properties, pi_queue *queue) { + (void)nativeHandleDesc; + (void)Properties; + return pi2ur::piextQueueCreateWithNativeHandle(nativeHandle, context, device, + pluginOwnsNativeHandle, queue); +} + inline pi_result piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { @@ -1291,6 +1379,16 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } + +inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { + + (void)NativeHandleDesc; + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +} + + inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -1347,7 +1445,7 @@ inline pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, break; } case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - UrParamName = UR_EXT_ONEAPI_QUEUE_INFO_EMPTY; + UrParamName = UR_QUEUE_INFO_EMPTY; break; } default: { @@ -1414,9 +1512,6 @@ inline pi_result piProgramCreateWithBinary( const size_t *Lengths, const unsigned char **Binaries, size_t NumMetadataEntries, const pi_device_binary_property *Metadata, pi_int32 *BinaryStatus, pi_program *Program) { - std::ignore = Metadata; - std::ignore = NumMetadataEntries; - PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(DeviceList && NumDevices, PI_ERROR_INVALID_VALUE); PI_ASSERT(Binaries && Lengths, PI_ERROR_INVALID_VALUE); @@ -1437,8 +1532,18 @@ inline pi_result piProgramCreateWithBinary( reinterpret_cast(Context); auto UrDevice = reinterpret_cast(DeviceList[0]); - // TODO: Translate Metadata into Properties? 
- ur_program_properties_t Properties{}; + std::unique_ptr pMetadatas( + new ur_program_metadata_t[NumMetadataEntries]); + for (unsigned i = 0; i < NumMetadataEntries; i++) { + HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); + } + + ur_program_properties_t Properties; + Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; + Properties.pNext = nullptr; + Properties.count = NumMetadataEntries; + Properties.pMetadatas = pMetadatas.get(); + ur_program_handle_t *UrProgram = reinterpret_cast(Program); HANDLE_ERRORS(urProgramCreateWithBinary(UrContext, UrDevice, Lengths[0], @@ -1753,6 +1858,15 @@ inline pi_result piKernelSetArg(pi_kernel Kernel, pi_uint32 ArgIndex, return PI_SUCCESS; } +inline pi_result piKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, + size_t arg_size, const void *arg_value) { + (void)arg_size; + auto hKernel = reinterpret_cast(kernel); + HANDLE_ERRORS(urKernelSetArgPointer(hKernel, arg_index, arg_value)); + + return PI_SUCCESS; +} + inline pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, @@ -2178,14 +2292,6 @@ inline pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(RetMem, PI_ERROR_INVALID_VALUE); - // TODO: implement support for more access modes - if (!((Flags & PI_MEM_FLAGS_ACCESS_RW) || - (Flags & PI_MEM_ACCESS_READ_ONLY))) { - die("piMemBufferCreate: Level-Zero supports read-write and read-only " - "buffer," - "but not other accesses (such as write-only) yet."); - } - if (properties != nullptr) { die("piMemBufferCreate: no mem properties goes to Level-Zero RT yet"); } @@ -2362,7 +2468,7 @@ static void pi2urImageDesc(const pi_image_format *ImageFormat, break; } case PI_IMAGE_CHANNEL_ORDER_ABGR: { - UrFormat->channelOrder = UR_EXT_IMAGE_CHANNEL_ORDER_ABGR; + UrFormat->channelOrder = UR_IMAGE_CHANNEL_ORDER_ABGR; break; } case PI_IMAGE_CHANNEL_ORDER_INTENSITY: { @@ -2444,11 +2550,6 @@ inline pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, const pi_image_desc *ImageDesc, void *HostPtr, pi_mem *RetImage) { - // TODO: implement read-only, write-only - if ((Flags & PI_MEM_FLAGS_ACCESS_RW) == 0) { - die("piMemImageCreate: Level-Zero implements only read-write buffer," - "no read-only or write-only yet."); - } PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(RetImage, PI_ERROR_INVALID_VALUE); PI_ASSERT(ImageFormat, PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); @@ -2778,9 +2879,23 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr, ur_event_handle_t *UrEvent = reinterpret_cast(OutEvent); - // TODO: to map from pi_mem_advice to ur_mem_advice_t - // once we have those defined ur_usm_advice_flags_t UrAdvice{}; + if (Advice & PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_SET_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY; + } + if (Advice & PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION) { + UrAdvice |= UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION) { + UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION; + } + if (Advice & PI_MEM_ADVICE_RESET) { + UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT; + } + HANDLE_ERRORS(urEnqueueUSMAdvise(UrQueue, Ptr, Length, UrAdvice, UrEvent)); return PI_SUCCESS; @@ -2805,18 +2920,18 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, const pi_event 
*EventsWaitList, pi_event *Event) { - std::ignore = Queue; - std::ignore = Ptr; - std::ignore = Pitch; - std::ignore = PatternSize; - std::ignore = Pattern; - std::ignore = Width; - std::ignore = Height; - std::ignore = NumEventsWaitList; - std::ignore = EventsWaitList; - std::ignore = Event; - die("piextUSMEnqueueFill2D: not implemented"); - return {}; + + auto hQueue = reinterpret_cast(Queue); + auto phEventWaitList = + reinterpret_cast(EventsWaitList); + auto phEvent = reinterpret_cast(Event); + + HANDLE_ERRORS(urEnqueueUSMFill2D(hQueue, Ptr, Pitch, PatternSize, Pattern, + Width, Height, NumEventsWaitList, + phEventWaitList, phEvent)); + + return PI_SUCCESS; + } inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void *Ptr, @@ -2872,25 +2987,57 @@ inline pi_result piextUSMGetMemAllocInfo(pi_context Context, const void *Ptr, } } + size_t SizeInOut = ParamValueSize; HANDLE_ERRORS(urUSMGetMemAllocInfo(UrContext, Ptr, UrParamName, ParamValueSize, ParamValue, ParamValueSizeRet)) + ur2piUSMAllocInfoValue(UrParamName, ParamValueSize, &SizeInOut, ParamValue); return PI_SUCCESS; } inline pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, size_t ParamValueSize, void *ParamValue, - size_t *ParamValueSizeRet) { // missing - std::ignore = Image; - std::ignore = ParamName; - std::ignore = ParamValueSize; - std::ignore = ParamValue; - std::ignore = ParamValueSizeRet; + size_t *ParamValueSizeRet) { + + auto hMem = reinterpret_cast(Image); - // TODO: use urMemImageGetInfo + ur_image_info_t UrParamName{}; + switch (ParamName) { + case PI_IMAGE_INFO_FORMAT: { + UrParamName = UR_IMAGE_INFO_FORMAT; + break; + } + case PI_IMAGE_INFO_ELEMENT_SIZE: { + UrParamName = UR_IMAGE_INFO_ELEMENT_SIZE; + break; + } + case PI_IMAGE_INFO_ROW_PITCH: { + UrParamName = UR_IMAGE_INFO_ROW_PITCH; + break; + } + case PI_IMAGE_INFO_SLICE_PITCH: { + UrParamName = UR_IMAGE_INFO_SLICE_PITCH; + break; + } + case PI_IMAGE_INFO_WIDTH: { + UrParamName = UR_IMAGE_INFO_WIDTH; + break; + } + case PI_IMAGE_INFO_HEIGHT: { + UrParamName = UR_IMAGE_INFO_HEIGHT; + break; + } + case PI_IMAGE_INFO_DEPTH: { + UrParamName = UR_IMAGE_INFO_DEPTH; + break; + } + default: + return PI_ERROR_UNKNOWN; + } - die("piMemImageGetInfo: not implemented"); - return {}; + HANDLE_ERRORS(urMemImageGetInfo(hMem, UrParamName, ParamValueSize, ParamValue, + ParamValueSizeRet)); + return PI_SUCCESS; } /// USM 2D Memcpy API @@ -3039,7 +3186,7 @@ inline pi_result piEnqueueMemBufferMap( if (MapFlags & PI_MAP_WRITE) UrMapFlags |= UR_MAP_FLAG_WRITE; if (MapFlags & PI_MAP_WRITE_INVALIDATE_REGION) - UrMapFlags |= UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION; + UrMapFlags |= UR_MAP_FLAG_WRITE_INVALIDATE_REGION; const ur_event_handle_t *UrEventsWaitList = reinterpret_cast(EventsWaitList); @@ -3356,6 +3503,43 @@ inline pi_result piEnqueueEventsWait(pi_queue Queue, return PI_SUCCESS; } + + +inline pi_result +piextEnqueueReadHostPipe(pi_queue queue, pi_program program, + const char *pipe_symbol, pi_bool blocking, void *ptr, + size_t size, pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, pi_event *event) { + auto hQueue = reinterpret_cast(queue); + auto hProgram = reinterpret_cast(program); + auto phEventWaitList = + reinterpret_cast(events_waitlist); + auto phEvent = reinterpret_cast(event); + + HANDLE_ERRORS(urEnqueueReadHostPipe(hQueue, hProgram, pipe_symbol, blocking, + ptr, size, num_events_in_waitlist, + phEventWaitList, phEvent)); + + return PI_SUCCESS; +} + +inline pi_result +piextEnqueueWriteHostPipe(pi_queue queue, pi_program program, + 
const char *pipe_symbol, pi_bool blocking, void *ptr, + size_t size, pi_uint32 num_events_in_waitlist, + const pi_event *events_waitlist, pi_event *event) { + auto hQueue = reinterpret_cast(queue); + auto hProgram = reinterpret_cast(program); + auto phEventWaitList = + reinterpret_cast(events_waitlist); + auto phEvent = reinterpret_cast(event); + + HANDLE_ERRORS(urEnqueueWriteHostPipe(hQueue, hProgram, pipe_symbol, blocking, + ptr, size, num_events_in_waitlist, + phEventWaitList, phEvent)); + + return PI_SUCCESS; +} // Enqueue /////////////////////////////////////////////////////////////////////////////// @@ -3601,13 +3785,33 @@ inline pi_result piSamplerCreate(pi_context Context, inline pi_result piSamplerGetInfo(pi_sampler Sampler, pi_sampler_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { - std::ignore = Sampler; - std::ignore = ParamName; - std::ignore = ParamValueSize; - std::ignore = ParamValue; - std::ignore = ParamValueSizeRet; + ur_sampler_info_t InfoType{}; + switch (ParamName) { + case PI_SAMPLER_INFO_REFERENCE_COUNT: + InfoType = UR_SAMPLER_INFO_REFERENCE_COUNT; + break; + case PI_SAMPLER_INFO_CONTEXT: + InfoType = UR_SAMPLER_INFO_CONTEXT; + break; + case PI_SAMPLER_INFO_NORMALIZED_COORDS: + InfoType = UR_SAMPLER_INFO_NORMALIZED_COORDS; + break; + case PI_SAMPLER_INFO_ADDRESSING_MODE: + InfoType = UR_SAMPLER_INFO_ADDRESSING_MODE; + break; + case PI_SAMPLER_INFO_FILTER_MODE: + InfoType = UR_SAMPLER_INFO_FILTER_MODE; + break; + default: + return PI_ERROR_UNKNOWN; + } + + size_t SizeInOut = ParamValueSize; + auto hSampler = reinterpret_cast(Sampler); + HANDLE_ERRORS(urSamplerGetInfo(hSampler, InfoType, SizeInOut, ParamValue, + ParamValueSizeRet)); + fixupInfoValueTypes(SizeInOut, ParamValueSizeRet, ParamValue); - die("piSamplerGetInfo: not implemented"); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 4c998fb6294ea..9b61460205087 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -111,7 +111,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. return ReturnValue(pi_bool{false}); - case UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 5fac6f1e4d77a..f1e9ee46ea76b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -193,7 +193,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // zeModuleCreate allows using root device module for sub-devices: // > The application must only use the module for the device, or its // > sub-devices, which was provided during creation. 
- case UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE: + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(static_cast(true)); @@ -227,7 +227,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceComputeProperties->maxGroupSizeZ}}; return ReturnValue(MaxGroupSize); } - case UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { struct { size_t Arr[3]; } MaxGroupCounts = {{Device->ZeDeviceComputeProperties->maxGroupCountX, @@ -575,13 +575,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { uint64_t Capabilities = 0; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) - Capabilities |= UR_EXT_USM_CAPS_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) - Capabilities |= UR_EXT_USM_CAPS_ATOMIC_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT) - Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ACCESS; + Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC) - Capabilities |= UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS; + Capabilities |= + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; return Capabilities; }; auto &Props = Device->ZeDeviceMemoryAccessProperties; @@ -625,7 +626,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( if (getenv("ZES_ENABLE_SYSMAN") == nullptr) { setErrorMessage("Set ZES_ENABLE_SYSMAN=1 to obtain free memory", UR_RESULT_SUCCESS); - return UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR; + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } // Only report device memory which zeMemAllocDevice can allocate from. // Currently this is only the one enumerated with ordinal 0. @@ -669,7 +670,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( Device->ZeDeviceMemoryProperties->first.end(), Comp); return ReturnValue(uint32_t{MinIt->maxClockRate}); } - case UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH: { + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { // If there are not any memory modules then return 0. 
if (Device->ZeDeviceMemoryProperties->first.empty()) return ReturnValue(uint32_t{0}); @@ -711,9 +712,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: return ReturnValue( uint32_t{Device->ZeDeviceProperties->numSubslicesPerSlice}); - case UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numEUsPerSubslice}); - case UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: return ReturnValue(uint32_t{Device->ZeDeviceProperties->numThreadsPerEU}); case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: // currently not supported in level zero runtime @@ -768,7 +769,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; return ReturnValue(capabilities); } - case UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: return ReturnValue(pi_bool{false}); case UR_DEVICE_INFO_IMAGE_SRGB: return ReturnValue(pi_bool{false}); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 43c0d691f5ad0..133306c910ce4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -921,7 +921,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( // Translate the host access mode info. ur_mem_handle_t_::access_mode_t AccessMode = ur_mem_handle_t_::unknown; - if (MapFlags & UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION) + if (MapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) AccessMode = ur_mem_handle_t_::write_only; else { if (MapFlags & UR_MAP_FLAG_READ) { @@ -3161,4 +3161,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 Size, NumEventsInWaitList, EventWaitList, Event); -} \ No newline at end of file +} + +/// Host Pipes +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pDst; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = hProgram; + std::ignore = pipe_symbol; + std::ignore = blocking; + std::ignore = pSrc; + std::ignore = size; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index fe81cd1e2a3a0..3ca6ecad4c994 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -167,7 +167,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo( case UR_QUEUE_INFO_DEVICE_DEFAULT: die("UR_QUEUE_INFO_DEVICE_DEFAULT in urQueueGetInfo not implemented\n"); break; - case UR_EXT_ONEAPI_QUEUE_INFO_EMPTY: { + case UR_QUEUE_INFO_EMPTY: { // We can exit early if we have in-order queue. if (Queue->isInOrderQueue()) { if (!Queue->LastCommandEvent) diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index e5bd87108e824..24a38ab318751 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -36,53 +36,12 @@ template <> uint32_t inline ur_cast(uint64_t Value) { // TODO: promote all of the below extensions to the Unified Runtime // and get rid of these ZER_EXT constants. -const int UR_EXT_DEVICE_INFO_END = UR_DEVICE_INFO_FORCE_UINT32; -const int UR_EXT_DEVICE_INFO_BUILD_ON_SUBDEVICE = UR_EXT_DEVICE_INFO_END - 1; -const int UR_EXT_DEVICE_INFO_MAX_WORK_GROUPS_3D = UR_EXT_DEVICE_INFO_END - 2; -// const int UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = -// UR_EXT_DEVICE_INFO_END - 3; -// const int ZER_EXT_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS = -// UR_EXT_DEVICE_INFO_END - 4; -const int UR_EXT_DEVICE_INFO_GPU_HW_THREADS_PER_EU = UR_EXT_DEVICE_INFO_END - 7; -const int UR_EXT_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = - UR_EXT_DEVICE_INFO_END - 8; -// const int UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = -// UR_EXT_DEVICE_INFO_END - 10; -const int UR_EXT_DEVICE_INFO_MEMORY_BUS_WIDTH = UR_EXT_DEVICE_INFO_END - 11; -// const int ZER_EXT_DEVICE_INFO_MEMORY_CLOCK_RATE = UR_EXT_DEVICE_INFO_END - -// 12; -// const int ZER_EXT_DEVICE_INFO_DEVICE_ID = UR_EXT_DEVICE_INFO_END - 14; -// const int ZER_EXT_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = -// UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE; -const int UR_EXT_DEVICE_INFO_MEM_CHANNEL_SUPPORT = UR_EXT_DEVICE_INFO_END - 15; - const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; -const uint32_t UR_EXT_MAP_FLAG_WRITE_INVALIDATE_REGION = - (UR_MAP_FLAG_WRITE << 1); - -const int UR_EXT_RESULT_END = 0x1000; -const ur_result_t UR_EXT_RESULT_ADAPTER_SPECIFIC_ERROR = - ur_result_t(UR_EXT_RESULT_END - 1); - -const int UR_EXT_USM_CAPS_ACCESS = 1 << 0; -const int UR_EXT_USM_CAPS_ATOMIC_ACCESS = 1 << 1; -const int UR_EXT_USM_CAPS_CONCURRENT_ACCESS = 1 << 2; -const int UR_EXT_USM_CAPS_CONCURRENT_ATOMIC_ACCESS = 1 << 3; - -const ur_context_info_t UR_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = - (ur_context_info_t)(UR_CONTEXT_INFO_FORCE_UINT32 - 1); - -const ur_queue_info_t UR_EXT_ONEAPI_QUEUE_INFO_EMPTY = - (ur_queue_info_t)(UR_QUEUE_INFO_SIZE + 1); - const ur_command_t UR_EXT_COMMAND_TYPE_USER = (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); -const ur_image_channel_order_t UR_EXT_IMAGE_CHANNEL_ORDER_ABGR = - ur_image_channel_order_t(UR_IMAGE_CHANNEL_ORDER_FORCE_UINT32 - 1); - const ur_kernel_exec_info_t UR_EXT_KERNEL_EXEC_INFO_CACHE_CONFIG = (ur_kernel_exec_info_t)(UR_KERNEL_EXEC_INFO_FORCE_UINT32 - 1); @@ -95,6 +54,20 @@ typedef enum { UR_EXT_KERNEL_EXEC_INFO_CACHE_LARGE_DATA = 0x2 } ur_kernel_cache_config; +// TODO(ur): These CUDA specific queue properties should live in the UR spec. In +// the mean time just use the PI values. 
+// PI Command Queue using Default stream +#define __SYCL_UR_CUDA_USE_DEFAULT_STREAM (0xFF03) +// PI Command queue will sync with default stream +#define __SYCL_UR_CUDA_SYNC_WITH_DEFAULT (0xFF04) + +/// Program metadata tags recognized by the UR adapters. For kernels the tag +/// must appear after the kernel name. +#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ + "@reqd_work_group_size" +#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" + + // Terminates the process with a catastrophic error message. [[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; @@ -324,4 +297,4 @@ class UrReturnHelper { size_t param_value_size; void *param_value; size_t *param_value_size_ret; -}; \ No newline at end of file +}; From a73bc2030cc7bd23bd910dce113c7dba1a5a5f3a Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Fri, 12 May 2023 20:35:50 +0000 Subject: [PATCH 38/50] Fixes after reg (#9) * Fixes for porting to UR repo (#4) * Fixes for porting to UR repo Signed-off-by: Brandon Yates --- .../ur/adapters/level_zero/ur_level_zero.cpp | 2 +- .../level_zero/ur_level_zero_common.hpp | 4 +-- .../level_zero/ur_level_zero_context.cpp | 10 +++++--- .../level_zero/ur_level_zero_context.hpp | 2 +- .../level_zero/ur_level_zero_device.cpp | 11 ++++---- .../level_zero/ur_level_zero_device.hpp | 2 +- .../level_zero/ur_level_zero_event.cpp | 10 +++++--- .../level_zero/ur_level_zero_event.hpp | 2 +- .../level_zero/ur_level_zero_kernel.cpp | 7 +++--- .../adapters/level_zero/ur_level_zero_mem.cpp | 25 +++++++++++-------- .../adapters/level_zero/ur_level_zero_mem.hpp | 6 ++--- .../level_zero/ur_level_zero_platform.cpp | 3 ++- .../level_zero/ur_level_zero_program.cpp | 3 ++- .../level_zero/ur_level_zero_queue.cpp | 13 +++++----- .../level_zero/ur_level_zero_queue.hpp | 2 +- .../level_zero/ur_level_zero_sampler.cpp | 3 ++- 16 files changed, 60 insertions(+), 45 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 51fe4cf9c475b..92ada96340bd9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -11,7 +11,7 @@ #include #include "ur_level_zero.hpp" -#include + // Define the static class field std::mutex ZeCall::GlobalLock; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index f3a8ba48b2eba..9d375bb8e2fab 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -16,13 +16,13 @@ #include #include -#include + #include #include #include #include -#include "ur/usm_allocator_config.hpp" +#include struct _ur_platform_handle_t; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index 9b61460205087..e6cd4ff02f981 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( uint32_t 
DeviceCount, ///< [in] the number of devices given in phDevices @@ -107,11 +108,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(pi_bool{UseMemcpy2DOperations}); + return ReturnValue(ur_bool_t{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | @@ -410,7 +412,7 @@ ur_result_t ur_context_handle_t_::finalize() { // Maximum number of events that can be present in an event ZePool is captured // here. Setting it to 256 gave best possible performance for several // benchmarks. -static const pi_uint32 MaxNumEventsPerPool = [] { +static const uint32_t MaxNumEventsPerPool = [] { const char *UrRet = std::getenv("UR_L0_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); const char *PiRet = std::getenv("ZE_MAX_NUMBER_OF_EVENTS_PER_EVENT_POOL"); const char *MaxNumEventsPerPoolEnv = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index a980a80a855f3..2a9d2f97e84f9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -15,7 +15,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index f1e9ee46ea76b..83a5a33abda51 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -7,9 +7,10 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_device.hpp" +#include "ur_level_zero.hpp" #include #include -#include + UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( ur_platform_handle_t Platform, ///< [in] handle of the platform instance @@ -770,9 +771,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(pi_bool{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -1196,7 +1197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Look for GEN binary, which we known can only be handled by Level-Zero now. 
const char *BinaryTarget = - UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //__SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN; + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; @@ -1210,7 +1211,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( return UR_RESULT_SUCCESS; } if (strcmp(Binaries[i].pDeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) + UR_DEVICE_BINARY_TARGET_SPIRV64) == 0) Spirv = i; } // Points to a spirv image, if such indeed was found diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 8aff6f170127f..e8514ce569f45 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -15,7 +15,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index d39c40982bd6f..72cfcbed5bbbc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" -#include +#include "ur_level_zero.hpp" + void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); @@ -389,7 +390,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); if (Event->UrQueue && - (Event->UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -649,6 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( UrEvent = new ur_event_handle_t_(ZeEvent, nullptr /* ZeEventPool */, Context, UR_EXT_COMMAND_TYPE_USER, Properties->isNativeHandleOwned); + } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -902,7 +904,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool HostVisible, ur_event_handle_t *RetEvent) { bool ProfilingEnabled = - !Queue || (Queue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + !Queue || (Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; if (auto CachedEvent = Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { @@ -1181,5 +1183,5 @@ ur_result_t _ur_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList( // Tells if this event is with profiling capabilities. 
bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled - (UrQueue->Properties & PI_QUEUE_FLAG_PROFILING_ENABLE) != 0; + (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 9e129adb0fb7e..42c9468ec2ef0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -17,7 +17,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index da98f12f2580a..38df90c31ee72 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_kernel.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object @@ -512,7 +513,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( // As of right now, L0 is missing API to query kernel and device specific // max work group size. return ReturnValue( - pi_uint64{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { struct { @@ -623,7 +624,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo( std::scoped_lock Guard(Kernel->Mutex); if (PropName == UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS && - *(static_cast(PropValue)) == PI_TRUE) { + *(static_cast(PropValue)) == true) { // The whole point for users really was to not need to know anything // about the types of allocations kernel uses. So in DPC++ we always // just set all 3 modes for each kernel. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index 133306c910ce4..d61b93d581f5e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" #include "ur_level_zero_event.hpp" -#include +#include "ur_level_zero.hpp" + // Default to using compute engine for fill operation, but allow to // override this with an environment variable. @@ -42,7 +43,7 @@ bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr) { // exclusive use and source buffer's mutex locked for shared use on entry. 
ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_queue_handle_t Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, + ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -94,7 +95,7 @@ ur_result_t enqueueMemCopyRectHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, - size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine) { bool UseCopyEngine = Queue->useCopyEngine(PreferCopyEngine); @@ -298,10 +299,10 @@ static ur_result_t getImageRegionHelper(_ur_image *Mem, UR_ASSERT(Mem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(Origin, UR_RESULT_ERROR_INVALID_VALUE); +#ifndef NDEBUG auto UrImage = static_cast<_ur_image *>(Mem); ze_image_desc_t &ZeImageDesc = UrImage->ZeImageDesc; -#ifndef NDEBUG UR_ASSERT(Mem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && Origin->z == 0) || @@ -341,7 +342,7 @@ static ur_result_t enqueueMemImageCommandHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *Src, // image or ptr void *Dst, // image or ptr - pi_bool IsBlocking, ur_rect_offset_t *SrcOrigin, + ur_bool_t IsBlocking, ur_rect_offset_t *SrcOrigin, ur_rect_offset_t *DstOrigin, ur_rect_region_t *Region, size_t RowPitch, size_t SlicePitch, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -384,6 +385,7 @@ static ur_result_t enqueueMemImageCommandHelper( std::ignore = SlicePitch; UR_ASSERT(SrcMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifndef NDEBUG auto SrcImage = SrcMem; const ze_image_desc_t &ZeImageDesc = SrcImage->ZeImageDesc; UR_ASSERT( @@ -396,6 +398,7 @@ static ur_result_t enqueueMemImageCommandHelper( (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && RowPitch == 4 * ZeSrcRegion.width), UR_RESULT_ERROR_INVALID_IMAGE_SIZE); +#endif UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeSrcRegion.height, UR_RESULT_ERROR_INVALID_IMAGE_SIZE); @@ -414,6 +417,7 @@ static ur_result_t enqueueMemImageCommandHelper( // Check that SYCL RT did not want pitch larger than default. 
UR_ASSERT(DstMem->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); +#ifndef NDEBUG auto DstImage = static_cast<_ur_image *>(DstMem); const ze_image_desc_t &ZeImageDesc = DstImage->ZeImageDesc; UR_ASSERT( @@ -426,6 +430,7 @@ static ur_result_t enqueueMemImageCommandHelper( (ZeImageDesc.format.layout == ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 && RowPitch == 4 * ZeDstRegion.width), UR_RESULT_ERROR_INVALID_IMAGE_SIZE); +#endif UR_ASSERT(SlicePitch == 0 || SlicePitch == RowPitch * ZeDstRegion.height, UR_RESULT_ERROR_INVALID_IMAGE_SIZE); @@ -2316,19 +2321,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); switch (PropName) { case UR_USM_ALLOC_INFO_TYPE: { - pi_usm_type MemAllocaType; + ur_usm_type_t MemAllocaType; switch (ZeMemoryAllocationProperties.type) { case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = PI_MEM_TYPE_UNKNOWN; + MemAllocaType = UR_USM_TYPE_UNKNOWN; break; case ZE_MEMORY_TYPE_HOST: - MemAllocaType = PI_MEM_TYPE_HOST; + MemAllocaType = UR_USM_TYPE_HOST; break; case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = PI_MEM_TYPE_DEVICE; + MemAllocaType = UR_USM_TYPE_DEVICE; break; case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = PI_MEM_TYPE_SHARED; + MemAllocaType = UR_USM_TYPE_SHARED; break; default: urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index e9ad0d49bbdbb..74850eb78f08e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -16,7 +16,7 @@ #include #include -#include + #include #include #include @@ -43,7 +43,7 @@ const bool UseCopyEngineForD2DCopy = [] { // exclusive use and source buffer's mutex locked for shared use on entry. 
ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_queue_handle_t Queue, void *Dst, - pi_bool BlockingWrite, size_t Size, + ur_bool_t BlockingWrite, size_t Size, const void *Src, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, @@ -53,7 +53,7 @@ ur_result_t enqueueMemCopyRectHelper( ur_command_t CommandType, ur_queue_handle_t Queue, const void *SrcBuffer, void *DstBuffer, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, - size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, + size_t SrcSlicePitch, size_t DstSlicePitch, ur_bool_t Blocking, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 7d0bef4cb84f5..71469d4e68020 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_platform.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urInit( ur_device_init_flags_t diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 0b4d07b0366a3..281eacdd11509 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_program.hpp" -#include +#include "ur_level_zero.hpp" + extern "C" { // Check to see if a Level Zero module has any unresolved symbols. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 3ca6ecad4c994..15b6ab451ac3d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -13,7 +13,8 @@ #include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" -#include +#include "ur_level_zero.hpp" + /// @brief Cleanup events in the immediate lists of the queue. /// @param Queue Queue where events need to be cleaned up. @@ -639,7 +640,7 @@ static const zeCommandListBatchConfig ZeCommandListBatchConfig(bool IsCopy) { } const char *BatchSizeStr = UrRet ? UrRet : (PiRet ? PiRet : nullptr); if (BatchSizeStr) { - pi_int32 BatchSizeStrVal = std::atoi(BatchSizeStr); + int32_t BatchSizeStrVal = std::atoi(BatchSizeStr); // Level Zero may only support a limted number of commands per command // list. The actual upper limit is not specified by the Level Zero // Specification. For now we allow an arbitrary upper limit. 
@@ -1205,20 +1206,20 @@ bool ur_queue_handle_t_::isBatchingAllowed(bool IsCopy) const { } bool ur_queue_handle_t_::isDiscardEvents() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) != 0); + return ((this->Properties & UR_QUEUE_FLAG_DISCARD_EVENTS) != 0); } bool ur_queue_handle_t_::isPriorityLow() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) != 0); + return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_LOW) != 0); } bool ur_queue_handle_t_::isPriorityHigh() const { - return ((this->Properties & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) != 0); + return ((this->Properties & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0); } bool ur_queue_handle_t_::isInOrderQueue() const { // If out-of-order queue property is not set, then this is a in-order queue. - return ((this->Properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == + return ((this->Properties & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 76cfda295f2f8..707463ecf55cc 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -16,7 +16,7 @@ #include #include -#include + #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index 42c431ec94632..c8cedf4b74f88 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -7,7 +7,8 @@ //===-----------------------------------------------------------------===// #include "ur_level_zero_sampler.hpp" -#include +#include "ur_level_zero.hpp" + UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object From 04fc86b37a43a273bcfd2c8498d2e9141adbbc24 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 12 May 2023 18:33:15 -0700 Subject: [PATCH 39/50] Fix casting for srgba-read.cpp test pi_bool is uint32_t and ur_bool_t is uint8_t, so to make sure correct functionality is maintain, use uint32_t as replacement for pi_bool, instead of ur_bool_t. Also, add back check for urMemImageCreate that was before in piMemImageCreate. 
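[Editor's note: a minimal, self-contained C++ sketch illustrating the size mismatch this commit describes. It is not the adapter's actual UrReturnHelper; the type aliases and the returnInfo helper below are hypothetical stand-ins. The point is that an info query copies sizeof(T) bytes into the caller's buffer, so writing a 1-byte ur_bool_t into a slot the caller sized for a 4-byte pi_bool leaves the remaining bytes stale, while returning a uint32_t overwrites the whole value.]

#include <cassert>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the real typedefs: pi_bool is 32-bit, ur_bool_t is 8-bit.
using pi_bool = std::uint32_t;
using ur_bool_t = std::uint8_t;

// Minimal stand-in for an info-query return helper: copies exactly sizeof(T)
// bytes of the result into the caller-provided buffer.
template <typename T> void returnInfo(void *ParamValue, T Value) {
  std::memcpy(ParamValue, &Value, sizeof(T));
}

int main() {
  pi_bool Result = 0xDEADBEEF;            // caller's 4-byte slot, holding stale data
  returnInfo(&Result, ur_bool_t{0});      // writes only 1 byte; 3 stale bytes remain
  assert(Result != 0);                    // caller still reads "true"
  returnInfo(&Result, std::uint32_t{0});  // writes all 4 bytes
  assert(Result == 0);                    // caller now reads "false" as intended
  return 0;
}
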
Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_context.cpp | 6 ++---- .../ur/adapters/level_zero/ur_level_zero_device.cpp | 7 +++---- .../ur/adapters/level_zero/ur_level_zero_mem.cpp | 8 ++++++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index e6cd4ff02f981..c177926c24c30 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -13,8 +13,6 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" -#include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( uint32_t DeviceCount, ///< [in] the number of devices given in phDevices @@ -108,10 +106,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return ReturnValue(uint32_t{Context->RefCount.load()}); case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. - return ReturnValue(ur_bool_t{UseMemcpy2DOperations}); + return ReturnValue(uint32_t{UseMemcpy2DOperations}); case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: // 2D USM fill is not supported. - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index 83a5a33abda51..f3d242f7f4e5d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -10,7 +10,6 @@ #include "ur_level_zero.hpp" #include #include - UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet( ur_platform_handle_t Platform, ///< [in] handle of the platform instance @@ -771,9 +770,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(ur_bool_t{false}); + return ReturnValue(uint32_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -1197,7 +1196,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( // Look for GEN binary, which we known can only be handled by Level-Zero now. const char *BinaryTarget = - UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; //UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; + UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; // UR_DEVICE_BINARY_TARGET_SPIRV64_GEN; uint32_t *SelectedBinaryInd = SelectedBinary; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index d61b93d581f5e..ba4e36aaeb21d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -13,8 +13,6 @@ #include "ur_level_zero.hpp" #include "ur_level_zero_context.hpp" #include "ur_level_zero_event.hpp" -#include "ur_level_zero.hpp" - // Default to using compute engine for fill operation, but allow to // override this with an environment variable. 
@@ -1604,6 +1602,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( void *Host, ///< [in] pointer to the buffer data ur_mem_handle_t *Mem ///< [out] pointer to handle of image object created ) { + // TODO: implement read-only, write-only + if ((Flags & UR_MEM_FLAG_READ_WRITE) == 0) { + die("urMemImageCreate: Level-Zero implements only read-write buffer," + "no read-only or write-only yet."); + } + std::shared_lock Lock(Context->Mutex); ZeStruct ZeImageDesc; From f35fef4d84dadad0a292cf2e2c46053f98a3637d Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 15 May 2023 19:45:08 -0700 Subject: [PATCH 40/50] Rebase and fix format - Add changes to fix tests after [SYCL] Add Unified Runtime plugin and route to it with SYCL_PREFER_UR https://github.com/intel/llvm/pull/9232 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/pi2ur.hpp | 11 ++++------- sycl/plugins/unified_runtime/pi_unified_runtime.cpp | 10 ++++++++++ .../ur/adapters/level_zero/ur_level_zero.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_common.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_context.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_device.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_event.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_event.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_kernel.cpp | 4 ++-- .../ur/adapters/level_zero/ur_level_zero_mem.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_program.cpp | 1 - .../ur/adapters/level_zero/ur_level_zero_queue.cpp | 3 +-- .../ur/adapters/level_zero/ur_level_zero_queue.hpp | 1 - .../ur/adapters/level_zero/ur_level_zero_sampler.cpp | 1 - sycl/plugins/unified_runtime/ur/ur.hpp | 1 - 16 files changed, 18 insertions(+), 24 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2f3b6211cb46b..f36ce228d48a2 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -426,7 +426,6 @@ inline pi_result fixupInfoValueTypes(size_t ParamValueSizeUR, return PI_SUCCESS; } - inline ur_result_t mapPIMetadataToUR(const pi_device_binary_property *pi_metadata, ur_program_metadata_t *ur_metadata) { @@ -455,7 +454,10 @@ namespace pi2ur { inline pi_result piTearDown(void *PluginParameter) { std::ignore = PluginParameter; - HANDLE_ERRORS(urTearDown(nullptr)); + // TODO: Dont check for errors in urTearDown, since + // when using Level Zero plugin, the second urTearDown + // will fail as ur_loader.so has already been unloaded, + urTearDown(nullptr); return PI_SUCCESS; } @@ -1379,7 +1381,6 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } - inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { @@ -1388,7 +1389,6 @@ inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); } - inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -2920,7 +2920,6 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, const pi_event *EventsWaitList, pi_event *Event) { - auto hQueue = reinterpret_cast(Queue); auto phEventWaitList = reinterpret_cast(EventsWaitList); @@ -2931,7 +2930,6 @@ inline pi_result piextUSMEnqueueFill2D(pi_queue Queue, void *Ptr, size_t Pitch, phEventWaitList, phEvent)); return PI_SUCCESS; - } inline pi_result piextUSMEnqueueMemset2D(pi_queue Queue, void 
*Ptr, @@ -3504,7 +3502,6 @@ inline pi_result piEnqueueEventsWait(pi_queue Queue, return PI_SUCCESS; } - inline pi_result piextEnqueueReadHostPipe(pi_queue queue, pi_program program, const char *pipe_symbol, pi_bool blocking, void *ptr, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 3cf3e10a21676..acff4810f9dc9 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -84,6 +84,15 @@ piContextCreate(const pi_context_properties *Properties, pi_uint32 NumDevices, UserData, RetContext); } +__SYCL_EXPORT pi_result piContextGetInfo(pi_context Context, + pi_context_info ParamName, + size_t ParamValueSize, + void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piContextGetInfo(Context, ParamName, ParamValueSize, ParamValue, + ParamValueSizeRet); +} + __SYCL_EXPORT pi_result piContextRelease(pi_context Context) { return pi2ur::piContextRelease(Context); } @@ -1045,6 +1054,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piContextCreate) _PI_API(piContextRelease) _PI_API(piContextRetain) + _PI_API(piContextGetInfo) _PI_API(piextContextSetExtendedDeleter) _PI_API(piextContextGetNativeHandle) _PI_API(piextContextCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp index 92ada96340bd9..c0a873025e8b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp @@ -11,7 +11,6 @@ #include #include "ur_level_zero.hpp" - // Define the static class field std::mutex ZeCall::GlobalLock; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp index 9d375bb8e2fab..ed269665cd99b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_common.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index 2a9d2f97e84f9..cc1775d87f3c9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -15,7 +15,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index e8514ce569f45..4bc56c6fc5108 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -15,7 +15,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 72cfcbed5bbbc..6d14ae2176681 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -11,10 +11,9 @@ #include #include +#include 
"ur_level_zero.hpp" #include "ur_level_zero_common.hpp" #include "ur_level_zero_event.hpp" -#include "ur_level_zero.hpp" - void printZeEventList(const _ur_ze_event_list_t &UrZeEventList) { urPrint(" NumEventsInWaitList %d:", UrZeEventList.Length); diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp index 42c9468ec2ef0..9922742c7776d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.hpp @@ -17,7 +17,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp index 38df90c31ee72..73111abeb475a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_kernel.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_kernel.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object @@ -609,7 +608,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ///< holding the argument value. If null then argument ///< value is considered null. ) { - UR_CALL(urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); + UR_CALL( + urKernelSetArgValue(Kernel, ArgIndex, sizeof(const void *), ArgValue)); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 74850eb78f08e..0d658342fb0b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 71469d4e68020..469c39d3e668c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_platform.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urInit( ur_device_init_flags_t diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 281eacdd11509..5519f7e2254bd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_program.hpp" #include "ur_level_zero.hpp" - extern "C" { // Check to see if a Level Zero module has any unresolved symbols. 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 15b6ab451ac3d..efd3538887f93 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -11,10 +11,9 @@ #include #include +#include "ur_level_zero.hpp" #include "ur_level_zero_common.hpp" #include "ur_level_zero_queue.hpp" -#include "ur_level_zero.hpp" - /// @brief Cleanup events in the immediate lists of the queue. /// @param Queue Queue where events need to be cleaned up. diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 707463ecf55cc..4a5a6fe8b731d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -16,7 +16,6 @@ #include #include - #include #include #include diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index c8cedf4b74f88..bf32fdd9367d0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -8,7 +8,6 @@ #include "ur_level_zero_sampler.hpp" #include "ur_level_zero.hpp" - UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreate( ur_context_handle_t Context, ///< [in] handle of the context object diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index 24a38ab318751..d0d1fb8f46912 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -67,7 +67,6 @@ typedef enum { "@reqd_work_group_size" #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" - // Terminates the process with a catastrophic error message. 
[[noreturn]] inline void die(const char *Message) { std::cerr << "die: " << Message << std::endl; From 5935f8b25d50c8bd3abc01fa7c5837e73b9bdbf3 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 16 May 2023 19:46:04 -0400 Subject: [PATCH 41/50] Move urUSM into new file (#10) Signed-off-by: Brandon Yates --- sycl/plugins/level_zero/CMakeLists.txt | 2 + sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/level_zero/ur_level_zero.hpp | 1 + .../adapters/level_zero/ur_level_zero_mem.cpp | 749 ----------------- .../adapters/level_zero/ur_level_zero_mem.hpp | 98 --- .../adapters/level_zero/ur_level_zero_usm.cpp | 764 ++++++++++++++++++ .../adapters/level_zero/ur_level_zero_usm.hpp | 108 +++ 7 files changed, 877 insertions(+), 847 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 3cd25f2dc6826..916680cdf3959 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -111,6 +111,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.hpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_common.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -122,6 +123,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp" "../unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp" + "../unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp" # Following are the PI Level-Zero Plugin only codes. 
"pi_level_zero.cpp" "pi_level_zero.hpp" diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 0b4bcef273b73..177537363380e 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -96,6 +96,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_program.hpp" "ur/adapters/level_zero/ur_level_zero_queue.hpp" "ur/adapters/level_zero/ur_level_zero_sampler.hpp" + "ur/adapters/level_zero/ur_level_zero_usm.hpp" "ur/adapters/level_zero/ur_level_zero.cpp" "ur/adapters/level_zero/ur_level_zero_common.cpp" "ur/adapters/level_zero/ur_level_zero_context.cpp" @@ -107,6 +108,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/ur_level_zero_program.cpp" "ur/adapters/level_zero/ur_level_zero_queue.cpp" "ur/adapters/level_zero/ur_level_zero_sampler.cpp" + "ur/adapters/level_zero/ur_level_zero_usm.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp index 5095e168a4a3e..0da70b073ab1e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero.hpp @@ -30,3 +30,4 @@ #include "ur_level_zero_program.hpp" #include "ur_level_zero_queue.hpp" #include "ur_level_zero_sampler.hpp" +#include "ur_level_zero_usm.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp index ba4e36aaeb21d..1974f6052ff04 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.cpp @@ -2021,755 +2021,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM host memory object -) { - std::ignore = Pool; - - uint32_t Align = USMDesc->align; - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Align > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - - ur_platform_handle_t Plt = Context->getPlatform(); - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Align & (Align - 1)) != 0)) { - ur_usm_host_mem_flags_t Flags{}; - ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - // There is a single allocator for Host USM allocations, so we don't need to - // find the allocator depending on context as we do for Shared and Device - // allocations. - try { - *RetMem = Context->HostMemAllocContext->allocate(Size, Align); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_device_handle_t Device, ///< [in] handle of the device object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM device memory object -) { - std::ignore = Pool; - - uint32_t Alignment = USMDesc->align; - - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - - ur_platform_handle_t Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. 
- std::shared_lock ContextLock(Context->Mutex, - std::defer_lock); - std::unique_lock IndirectAccessTrackingLock( - Plt->ContextsMutex, std::defer_lock); - if (IndirectAccessTrackingEnabled) { - IndirectAccessTrackingLock.lock(); - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } else { - ContextLock.lock(); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = - USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_device_handle_t Device, ///< [in] handle of the device object - const ur_usm_desc_t - *USMDesc, ///< [in][optional] USM memory allocation descriptor - ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created - ///< using urUSMPoolCreate - size_t - Size, ///< [in] size in bytes of the USM memory object to be allocated - void **RetMem ///< [out] pointer to USM shared memory object -) { - std::ignore = Pool; - - uint32_t Alignment = USMDesc->align; - - ur_usm_host_mem_flags_t UsmHostFlags{}; - - // See if the memory is going to be read-only on the device. - bool DeviceReadOnly = false; - ur_usm_device_mem_flags_t UsmDeviceFlags{}; - - void *pNext = const_cast(USMDesc->pNext); - while (pNext != nullptr) { - const ur_base_desc_t *BaseDesc = - reinterpret_cast(pNext); - if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { - const ur_usm_device_desc_t *UsmDeviceDesc = - reinterpret_cast(pNext); - UsmDeviceFlags = UsmDeviceDesc->flags; - } - if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { - const ur_usm_host_desc_t *UsmHostDesc = - reinterpret_cast(pNext); - UsmHostFlags = UsmHostDesc->flags; - } - pNext = const_cast(BaseDesc->pNext); - } - DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - - // L0 supports alignment up to 64KB and silently ignores higher values. - // We flag alignment > 64KB as an invalid value. - if (Alignment > 65536) - return UR_RESULT_ERROR_INVALID_VALUE; - - ur_platform_handle_t Plt = Device->Platform; - - // If indirect access tracking is enabled then lock the mutex which is - // guarding contexts container in the platform. 
This prevents new kernels from - // being submitted in any context while we are in the process of allocating a - // memory, this is needed to properly capture allocations by kernels with - // indirect access. This lock also protects access to the context's data - // structures. If indirect access tracking is not enabled then lock context - // mutex to protect access to context's data structures. - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); - - if (IndirectAccessTrackingEnabled) { - // We are going to defer memory release if there are kernels with indirect - // access, that is why explicitly retain context to be sure that it is - // released after all memory allocations in this context are released. - UR_CALL(urContextRetain(Context)); - } - - if (!UseUSMAllocator || - // L0 spec says that allocation fails if Alignment != 2^n, in order to - // keep the same behavior for the allocator, just call L0 API directly and - // return the error code. - ((Alignment & (Alignment - 1)) != 0)) { - ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, - &UsmDeviceFlags, Size, Alignment); - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - return Res; - } - - try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); - if (DeviceReadOnly) { - Context->SharedReadOnlyAllocs.insert(*RetMem); - } - if (IndirectAccessTrackingEnabled) { - // Keep track of all memory allocations in the context - Context->MemAllocs.emplace(std::piecewise_construct, - std::forward_as_tuple(*RetMem), - std::forward_as_tuple(Context)); - } - } catch (const UsmAllocationException &Ex) { - *RetMem = nullptr; - return Ex.getError(); - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( - ur_context_handle_t Context, ///< [in] handle of the context object - void *Mem ///< [in] pointer to USM memory object -) { - ur_platform_handle_t Plt = Context->getPlatform(); - - std::scoped_lock Lock( - IndirectAccessTrackingEnabled ? 
Plt->ContextsMutex : Context->Mutex); - - return USMFreeHelper(Context, Mem); -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( - ur_context_handle_t Context, ///< [in] handle of the context object - const void *Ptr, ///< [in] pointer to USM memory object - ur_usm_alloc_info_t - PropName, ///< [in] the name of the USM allocation property to query - size_t PropValueSize, ///< [in] size in bytes of the USM allocation property - ///< value - void *PropValue, ///< [out][optional] value of the USM allocation property - size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM - ///< allocation property -) { - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - ZE2UR_CALL(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); - switch (PropName) { - case UR_USM_ALLOC_INFO_TYPE: { - ur_usm_type_t MemAllocaType; - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_UNKNOWN: - MemAllocaType = UR_USM_TYPE_UNKNOWN; - break; - case ZE_MEMORY_TYPE_HOST: - MemAllocaType = UR_USM_TYPE_HOST; - break; - case ZE_MEMORY_TYPE_DEVICE: - MemAllocaType = UR_USM_TYPE_DEVICE; - break; - case ZE_MEMORY_TYPE_SHARED: - MemAllocaType = UR_USM_TYPE_SHARED; - break; - default: - urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - return ReturnValue(MemAllocaType); - } - case UR_USM_ALLOC_INFO_DEVICE: - if (ZeDeviceHandle) { - auto Platform = Context->getPlatform(); - auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - return Device ? ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; - } else { - return UR_RESULT_ERROR_INVALID_VALUE; - } - case UR_USM_ALLOC_INFO_BASE_PTR: { - void *Base; - ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); - return ReturnValue(Base); - } - case UR_USM_ALLOC_INFO_SIZE: { - size_t Size; - ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); - return ReturnValue(Size); - } - default: - urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - return UR_RESULT_SUCCESS; -} - -static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { - ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); - return UR_RESULT_SUCCESS; -} - -void *USMMemoryAllocBase::allocate(size_t Size) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } - - return Ptr; -} - -void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { - void *Ptr = nullptr; - - auto Res = allocateImpl(&Ptr, Size, Alignment); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } - return Ptr; -} - -void USMMemoryAllocBase::deallocate(void *Ptr) { - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - throw UsmAllocationException(Res); - } -} - -ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); -} - -ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, - size_t Size, - uint32_t Alignment) { - ur_usm_device_desc_t UsmDeviceDesc{}; - UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - ur_usm_host_desc_t UsmHostDesc{}; - return 
USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, - &UsmHostDesc.flags, Size, Alignment); -} - -ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, - Alignment); -} - -ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); -} - -enum class USMAllocationForceResidencyType { - // Do not force memory residency at allocation time. - None = 0, - // Force memory resident on the device of allocation at allocation time. - // For host allocation force residency on all devices in a context. - Device = 1, - // [Default] Force memory resident on all devices in the context with P2P - // access to the device of allocation. - // For host allocation force residency on all devices in a context. - P2PDevices = 2 -}; - -// Returns the desired USM residency setting -static USMAllocationForceResidencyType USMAllocationForceResidency = [] { - const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); - const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - if (!Str) - return USMAllocationForceResidencyType::P2PDevices; - switch (std::atoi(Str)) { - case 1: - return USMAllocationForceResidencyType::Device; - case 2: - return USMAllocationForceResidencyType::P2PDevices; - default: - return USMAllocationForceResidencyType::None; - }; -}(); - -// Make USM allocation resident as requested -static ur_result_t USMAllocationMakeResident( - ur_context_handle_t Context, - ur_device_handle_t Device, // nullptr for host allocation - void *Ptr, size_t Size) { - - std::list Devices; - - if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) - return UR_RESULT_SUCCESS; - else if (!Device) { - // Host allocation, make it resident on all devices in the context - Devices.insert(Devices.end(), Context->Devices.begin(), - Context->Devices.end()); - } else { - Devices.push_back(Device); - if (USMAllocationForceResidency == - USMAllocationForceResidencyType::P2PDevices) { - ze_bool_t P2P; - for (const auto &D : Context->Devices) { - if (D == Device) - continue; - // TODO: Cache P2P devices for a context - ZE2UR_CALL(zeDeviceCanAccessPeer, - (D->ZeDevice, Device->ZeDevice, &P2P)); - if (P2P) - Devices.push_back(D); - } - } - } - for (const auto &D : Devices) { - ZE2UR_CALL(zeContextMakeMemoryResident, - (Context->ZeContext, D->ZeDevice, Ptr, Size)); - } - return UR_RESULT_SUCCESS; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_desc_t - *PoolDesc, ///< [in] pointer to USM pool descriptor. 
Can be chained with - ///< ::ur_usm_pool_limits_desc_t - ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool -) { - std::ignore = Context; - std::ignore = PoolDesc; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool -) { - std::ignore = Context; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, - uint32_t Alignment) { - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeDesc; - ZeDesc.flags = 0; - ZeDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDesc.pNext = &RelaxedDesc; - } - - ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, - Device->ZeDevice, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); - return UR_RESULT_SUCCESS; -} - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { - - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZeStruct ZeDevDesc; - ZeDevDesc.flags = 0; - ZeDevDesc.ordinal = 0; - - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize - RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; - ZeDevDesc.pNext = &RelaxedDesc; - } - - ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, - Size, Alignment, Device->ZeDevice, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); - - // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. - return UR_RESULT_SUCCESS; -} - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, - uint32_t Alignment) { - // TODO: translate PI properties to Level Zero flags - ZeStruct ZeHostDesc; - ZeHostDesc.flags = 0; - ZE2UR_CALL(zeMemAllocHost, - (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); - - UR_ASSERT(Alignment == 0 || - reinterpret_cast(*ResultPtr) % Alignment == 0, - UR_RESULT_ERROR_INVALID_VALUE); - - USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); - - return UR_RESULT_SUCCESS; -} - -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. 
-ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { - ur_platform_handle_t Plt = Context->getPlatform(); - std::unique_lock ContextsLock(Plt->ContextsMutex, - std::defer_lock); - if (IndirectAccessTrackingEnabled) { - ContextsLock.lock(); - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return UR_RESULT_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); - - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - - return UR_RESULT_SUCCESS; -} - -bool ShouldUseUSMAllocator() { - // Enable allocator by default if it's not explicitly disabled - const char *UrRet = std::getenv("UR_L0_DISABLE_USM_ALLOCATOR"); - const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR"); - const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - return Ret == nullptr; -} - -const bool UseUSMAllocator = ShouldUseUSMAllocator(); - -// Helper function to deallocate USM memory, if indirect access support is -// enabled then a caller must lock the platform-level mutex guarding the -// container with contexts because deallocating the memory can turn RefCount of -// a context to 0 and as a result the context being removed from the list of -// tracked contexts. -// If indirect access tracking is not enabled then caller must lock Context -// mutex. -ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, - bool OwnZeMemHandle) { - if (!OwnZeMemHandle) { - // Memory should not be freed - return UR_RESULT_SUCCESS; - } - - if (IndirectAccessTrackingEnabled) { - auto It = Context->MemAllocs.find(Ptr); - if (It == std::end(Context->MemAllocs)) { - die("All memory allocations must be tracked!"); - } - if (!It->second.RefCount.decrementAndTest()) { - // Memory can't be deallocated yet. - return UR_RESULT_SUCCESS; - } - - // Reference count is zero, it is ok to free memory. - // We don't need to track this allocation anymore. - Context->MemAllocs.erase(It); - } - - if (!UseUSMAllocator) { - ur_result_t Res = USMFreeImpl(Context, Ptr); - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return Res; - } - - // Query the device of the allocation to determine the right allocator context - ze_device_handle_t ZeDeviceHandle; - ZeStruct ZeMemoryAllocationProperties; - - // Query memory type of the pointer we're freeing to determine the correct - // way to do it(directly or via an allocator) - auto ZeResult = - ZE_CALL_NOCHECK(zeMemGetAllocProperties, - (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, - &ZeDeviceHandle)); - - // Handle the case that L0 RT was already unloaded - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - } else if (ZeResult) { - return ze2urResult(ZeResult); - } - - // If memory type is host release from host pool - if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { - try { - Context->HostMemAllocContext->deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } catch (...) 
{ - return UR_RESULT_ERROR_UNKNOWN; - } - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - } - - // Points out an allocation in SharedReadOnlyMemAllocContexts - auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); - - if (!ZeDeviceHandle) { - // The only case where it is OK not have device identified is - // if the memory is not known to the driver. We should not ever get - // this either, probably. - UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, - UR_RESULT_ERROR_INVALID_DEVICE); - } else { - ur_device_handle_t Device; - // All context member devices or their descendants are of the same platform. - auto Platform = Context->getPlatform(); - Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); - UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); - - auto DeallocationHelper = - [Context, Device, - Ptr](std::unordered_map - &AllocContextMap) { - try { - auto It = AllocContextMap.find(Device->ZeDevice); - if (It == AllocContextMap.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - // The right context is found, deallocate the pointer - It->second.deallocate(Ptr); - } catch (const UsmAllocationException &Ex) { - return Ex.getError(); - } - - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return UR_RESULT_SUCCESS; - }; - - switch (ZeMemoryAllocationProperties.type) { - case ZE_MEMORY_TYPE_SHARED: - // Distinguish device_read_only allocations since they have own pool. - SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); - return DeallocationHelper(SharedReadOnlyAllocsIterator != - Context->SharedReadOnlyAllocs.end() - ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - case ZE_MEMORY_TYPE_DEVICE: - return DeallocationHelper(Context->DeviceMemAllocContexts); - default: - // Handled below - break; - } - } - - ur_result_t Res = USMFreeImpl(Context, Ptr); - if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { - Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); - } - if (IndirectAccessTrackingEnabled) - UR_CALL(ContextReleaseHelper(Context)); - return Res; -} - // If indirect access tracking is enabled then performs reference counting, // otherwise just calls zeMemAllocDevice. static ur_result_t ZeDeviceMemAllocHelper(void **ResultPtr, diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp index 0d658342fb0b1..d07d929a59867 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_mem.hpp @@ -56,15 +56,6 @@ ur_result_t enqueueMemCopyRectHelper( uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_event_handle_t *OutEvent, bool PreferCopyEngine = false); -// Exception type to pass allocation errors -class UsmAllocationException { - const ur_result_t Error; - -public: - UsmAllocationException(ur_result_t Err) : Error{Err} {} - ur_result_t getError() const { return Error; } -}; - struct ur_mem_handle_t_ : _ur_object { // Keeps the PI context of this memory handle. ur_context_handle_t UrContext; @@ -219,92 +210,3 @@ struct _ur_image final : ur_mem_handle_t_ { // Level Zero image handle. ze_image_handle_t ZeImage; }; - -// Implements memory allocation via L0 RT for USM allocator interface. 
-class USMMemoryAllocBase : public SystemMemory { -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : Context{Ctx}, Device{Dev} {} - void *allocate(size_t Size) override final; - void *allocate(size_t Size, size_t Alignment) override final; - void deallocate(void *Ptr) override final; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for shared memory type that is only modified from host. -class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for device memory type -class USMDeviceMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) - : USMMemoryAllocBase(Ctx, Dev) {} -}; - -// Allocation routines for host memory type -class USMHostMemoryAlloc : public USMMemoryAllocBase { -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; - -public: - USMHostMemoryAlloc(ur_context_handle_t Ctx) - : USMMemoryAllocBase(Ctx, nullptr) {} -}; - -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); - -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment); - -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, - uint32_t Alignment); - -// If indirect access tracking is not enabled then this functions just performs -// zeMemFree. If indirect access tracking is enabled then reference counting is -// performed. -ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); - -ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, - bool OwnZeMemHandle = true); - -bool ShouldUseUSMAllocator(); - -extern const bool UseUSMAllocator; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp new file mode 100644 index 0000000000000..b6236e388a913 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -0,0 +1,764 @@ +//===--------- ur_level_zero_usm.cpp - Level Zero Adapter -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include +#include + +#include "ur_level_zero.hpp" +#include "ur_level_zero_context.hpp" +#include "ur_level_zero_event.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM host memory object +) { + std::ignore = Pool; + + uint32_t Align = USMDesc->align; + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Align > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; + + ur_platform_handle_t Plt = Context->getPlatform(); + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Align & (Align - 1)) != 0)) { + ur_usm_host_mem_flags_t Flags{}; + ur_result_t Res = USMHostAllocImpl(RetMem, Context, &Flags, Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + // There is a single allocator for Host USM allocations, so we don't need to + // find the allocator depending on context as we do for Shared and Device + // allocations. + try { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM device memory object +) { + std::ignore = Pool; + + uint32_t Alignment = USMDesc->align; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; + std::ignore = USMHintFlags; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::shared_lock ContextLock(Context->Mutex, + std::defer_lock); + std::unique_lock IndirectAccessTrackingLock( + Plt->ContextsMutex, std::defer_lock); + if (IndirectAccessTrackingEnabled) { + IndirectAccessTrackingLock.lock(); + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } else { + ContextLock.lock(); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = + USMDeviceAllocImpl(RetMem, Context, Device, nullptr, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_device_handle_t Device, ///< [in] handle of the device object + const ur_usm_desc_t + *USMDesc, ///< [in][optional] USM memory allocation descriptor + ur_usm_pool_handle_t Pool, ///< [in][optional] Pointer to a pool created + ///< using urUSMPoolCreate + size_t + Size, ///< [in] size in bytes of the USM memory object to be allocated + void **RetMem ///< [out] pointer to USM shared memory object +) { + std::ignore = Pool; + + uint32_t Alignment = USMDesc->align; + + ur_usm_host_mem_flags_t UsmHostFlags{}; + + // See if the memory is going to be read-only on the device. + bool DeviceReadOnly = false; + ur_usm_device_mem_flags_t UsmDeviceFlags{}; + + void *pNext = const_cast(USMDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_DEVICE_DESC) { + const ur_usm_device_desc_t *UsmDeviceDesc = + reinterpret_cast(pNext); + UsmDeviceFlags = UsmDeviceDesc->flags; + } + if (BaseDesc->stype == UR_STRUCTURE_TYPE_USM_HOST_DESC) { + const ur_usm_host_desc_t *UsmHostDesc = + reinterpret_cast(pNext); + UsmHostFlags = UsmHostDesc->flags; + } + pNext = const_cast(BaseDesc->pNext); + } + DeviceReadOnly = UsmDeviceFlags & UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + + // L0 supports alignment up to 64KB and silently ignores higher values. + // We flag alignment > 64KB as an invalid value. + if (Alignment > 65536) + return UR_RESULT_ERROR_INVALID_VALUE; + + ur_platform_handle_t Plt = Device->Platform; + + // If indirect access tracking is enabled then lock the mutex which is + // guarding contexts container in the platform. This prevents new kernels from + // being submitted in any context while we are in the process of allocating a + // memory, this is needed to properly capture allocations by kernels with + // indirect access. This lock also protects access to the context's data + // structures. If indirect access tracking is not enabled then lock context + // mutex to protect access to context's data structures. + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + if (IndirectAccessTrackingEnabled) { + // We are going to defer memory release if there are kernels with indirect + // access, that is why explicitly retain context to be sure that it is + // released after all memory allocations in this context are released. + UR_CALL(urContextRetain(Context)); + } + + if (!UseUSMAllocator || + // L0 spec says that allocation fails if Alignment != 2^n, in order to + // keep the same behavior for the allocator, just call L0 API directly and + // return the error code. + ((Alignment & (Alignment - 1)) != 0)) { + ur_result_t Res = USMSharedAllocImpl(RetMem, Context, Device, &UsmHostFlags, + &UsmDeviceFlags, Size, Alignment); + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + return Res; + } + + try { + auto &Allocator = (DeviceReadOnly ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + if (DeviceReadOnly) { + Context->SharedReadOnlyAllocs.insert(*RetMem); + } + if (IndirectAccessTrackingEnabled) { + // Keep track of all memory allocations in the context + Context->MemAllocs.emplace(std::piecewise_construct, + std::forward_as_tuple(*RetMem), + std::forward_as_tuple(Context)); + } + } catch (const UsmAllocationException &Ex) { + *RetMem = nullptr; + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree( + ur_context_handle_t Context, ///< [in] handle of the context object + void *Mem ///< [in] pointer to USM memory object +) { + ur_platform_handle_t Plt = Context->getPlatform(); + + std::scoped_lock Lock( + IndirectAccessTrackingEnabled ? Plt->ContextsMutex : Context->Mutex); + + return USMFreeHelper(Context, Mem); +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo( + ur_context_handle_t Context, ///< [in] handle of the context object + const void *Ptr, ///< [in] pointer to USM memory object + ur_usm_alloc_info_t + PropName, ///< [in] the name of the USM allocation property to query + size_t PropValueSize, ///< [in] size in bytes of the USM allocation property + ///< value + void *PropValue, ///< [out][optional] value of the USM allocation property + size_t *PropValueSizeRet ///< [out][optional] bytes returned in USM + ///< allocation property +) { + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + ZE2UR_CALL(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + switch (PropName) { + case UR_USM_ALLOC_INFO_TYPE: { + ur_usm_type_t MemAllocaType; + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_UNKNOWN: + MemAllocaType = UR_USM_TYPE_UNKNOWN; + break; + case ZE_MEMORY_TYPE_HOST: + MemAllocaType = UR_USM_TYPE_HOST; + break; + case ZE_MEMORY_TYPE_DEVICE: + MemAllocaType = UR_USM_TYPE_DEVICE; + break; + case ZE_MEMORY_TYPE_SHARED: + MemAllocaType = UR_USM_TYPE_SHARED; + break; + default: + urPrint("urUSMGetMemAllocInfo: unexpected usm memory type\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return ReturnValue(MemAllocaType); + } + case UR_USM_ALLOC_INFO_DEVICE: + if (ZeDeviceHandle) { + auto Platform = Context->getPlatform(); + auto Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + return Device ? 
ReturnValue(Device) : UR_RESULT_ERROR_INVALID_VALUE; + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + case UR_USM_ALLOC_INFO_BASE_PTR: { + void *Base; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, &Base, nullptr)); + return ReturnValue(Base); + } + case UR_USM_ALLOC_INFO_SIZE: { + size_t Size; + ZE2UR_CALL(zeMemGetAddressRange, (Context->ZeContext, Ptr, nullptr, &Size)); + return ReturnValue(Size); + } + default: + urPrint("urUSMGetMemAllocInfo: unsupported ParamName\n"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) { + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + return UR_RESULT_SUCCESS; +} + +void *USMMemoryAllocBase::allocate(size_t Size) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, sizeof(void *)); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + + return Ptr; +} + +void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) { + void *Ptr = nullptr; + + auto Res = allocateImpl(&Ptr, Size, Alignment); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } + return Ptr; +} + +void USMMemoryAllocBase::deallocate(void *Ptr) { + auto Res = USMFreeImpl(Context, Ptr); + if (Res != UR_RESULT_SUCCESS) { + throw UsmAllocationException(Res); + } +} + +ur_result_t USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, + Alignment); +} + +ur_result_t USMSharedReadOnlyMemoryAlloc::allocateImpl(void **ResultPtr, + size_t Size, + uint32_t Alignment) { + ur_usm_device_desc_t UsmDeviceDesc{}; + UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; + ur_usm_host_desc_t UsmHostDesc{}; + return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, + &UsmHostDesc.flags, Size, Alignment); +} + +ur_result_t USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + Alignment); +} + +ur_result_t USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) { + return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); +} + +enum class USMAllocationForceResidencyType { + // Do not force memory residency at allocation time. + None = 0, + // Force memory resident on the device of allocation at allocation time. + // For host allocation force residency on all devices in a context. + Device = 1, + // [Default] Force memory resident on all devices in the context with P2P + // access to the device of allocation. + // For host allocation force residency on all devices in a context. + P2PDevices = 2 +}; + +// Returns the desired USM residency setting +static USMAllocationForceResidencyType USMAllocationForceResidency = [] { + const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); + const char *Str = UrRet ? UrRet : (PiRet ? 
PiRet : nullptr); + if (!Str) + return USMAllocationForceResidencyType::P2PDevices; + switch (std::atoi(Str)) { + case 1: + return USMAllocationForceResidencyType::Device; + case 2: + return USMAllocationForceResidencyType::P2PDevices; + default: + return USMAllocationForceResidencyType::None; + }; +}(); + +// Make USM allocation resident as requested +static ur_result_t USMAllocationMakeResident( + ur_context_handle_t Context, + ur_device_handle_t Device, // nullptr for host allocation + void *Ptr, size_t Size) { + + std::list Devices; + + if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) + return UR_RESULT_SUCCESS; + else if (!Device) { + // Host allocation, make it resident on all devices in the context + Devices.insert(Devices.end(), Context->Devices.begin(), + Context->Devices.end()); + } else { + Devices.push_back(Device); + if (USMAllocationForceResidency == + USMAllocationForceResidencyType::P2PDevices) { + ze_bool_t P2P; + for (const auto &D : Context->Devices) { + if (D == Device) + continue; + // TODO: Cache P2P devices for a context + ZE2UR_CALL(zeDeviceCanAccessPeer, + (D->ZeDevice, Device->ZeDevice, &P2P)); + if (P2P) + Devices.push_back(D); + } + } + } + for (const auto &D : Devices) { + ZE2UR_CALL(zeContextMakeMemoryResident, + (Context->ZeContext, D->ZeDevice, Ptr, Size)); + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_desc_t + *PoolDesc, ///< [in] pointer to USM pool descriptor. Can be chained with + ///< ::ur_usm_pool_limits_desc_t + ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = PoolDesc; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( + ur_context_handle_t Context, ///< [in] handle of the context object + ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Context; + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t *Flags, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeDesc; + ZeDesc.flags = 0; + ZeDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocDevice, (Context->ZeContext, &ZeDesc, Size, Alignment, + Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + return UR_RESULT_SUCCESS; +} + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment) { + + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZeStruct ZeDevDesc; + ZeDevDesc.flags = 0; + ZeDevDesc.ordinal = 0; + + ZeStruct RelaxedDesc; + if (Size > 
Device->ZeDeviceProperties->maxMemAllocSize) { + // Tell Level-Zero to accept Size > maxMemAllocSize + RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; + ZeDevDesc.pNext = &RelaxedDesc; + } + + ZE2UR_CALL(zeMemAllocShared, (Context->ZeContext, &ZeDevDesc, &ZeHostDesc, + Size, Alignment, Device->ZeDevice, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + + // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. + return UR_RESULT_SUCCESS; +} + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_host_mem_flags_t *Flags, size_t Size, + uint32_t Alignment) { + // TODO: translate PI properties to Level Zero flags + ZeStruct ZeHostDesc; + ZeHostDesc.flags = 0; + ZE2UR_CALL(zeMemAllocHost, + (Context->ZeContext, &ZeHostDesc, Size, Alignment, ResultPtr)); + + UR_ASSERT(Alignment == 0 || + reinterpret_cast(*ResultPtr) % Alignment == 0, + UR_RESULT_ERROR_INVALID_VALUE); + + USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + + return UR_RESULT_SUCCESS; +} + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr) { + ur_platform_handle_t Plt = Context->getPlatform(); + std::unique_lock ContextsLock(Plt->ContextsMutex, + std::defer_lock); + if (IndirectAccessTrackingEnabled) { + ContextsLock.lock(); + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. + // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + ZE2UR_CALL(zeMemFree, (Context->ZeContext, Ptr)); + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + + return UR_RESULT_SUCCESS; +} + +bool ShouldUseUSMAllocator() { + // Enable allocator by default if it's not explicitly disabled + const char *UrRet = std::getenv("UR_L0_DISABLE_USM_ALLOCATOR"); + const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR"); + const char *Ret = UrRet ? UrRet : (PiRet ? PiRet : nullptr); + return Ret == nullptr; +} + +const bool UseUSMAllocator = ShouldUseUSMAllocator(); + +// Helper function to deallocate USM memory, if indirect access support is +// enabled then a caller must lock the platform-level mutex guarding the +// container with contexts because deallocating the memory can turn RefCount of +// a context to 0 and as a result the context being removed from the list of +// tracked contexts. +// If indirect access tracking is not enabled then caller must lock Context +// mutex. +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle) { + if (!OwnZeMemHandle) { + // Memory should not be freed + return UR_RESULT_SUCCESS; + } + + if (IndirectAccessTrackingEnabled) { + auto It = Context->MemAllocs.find(Ptr); + if (It == std::end(Context->MemAllocs)) { + die("All memory allocations must be tracked!"); + } + if (!It->second.RefCount.decrementAndTest()) { + // Memory can't be deallocated yet. + return UR_RESULT_SUCCESS; + } + + // Reference count is zero, it is ok to free memory. 
+ // We don't need to track this allocation anymore. + Context->MemAllocs.erase(It); + } + + if (!UseUSMAllocator) { + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; + } + + // Query the device of the allocation to determine the right allocator context + ze_device_handle_t ZeDeviceHandle; + ZeStruct ZeMemoryAllocationProperties; + + // Query memory type of the pointer we're freeing to determine the correct + // way to do it(directly or via an allocator) + auto ZeResult = + ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, + &ZeDeviceHandle)); + + // Handle the case that L0 RT was already unloaded + if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } else if (ZeResult) { + return ze2urResult(ZeResult); + } + + // If memory type is host release from host pool + if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST) { + try { + Context->HostMemAllocContext->deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + } + + // Points out an allocation in SharedReadOnlyMemAllocContexts + auto SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.end(); + + if (!ZeDeviceHandle) { + // The only case where it is OK not have device identified is + // if the memory is not known to the driver. We should not ever get + // this either, probably. + UR_ASSERT(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN, + UR_RESULT_ERROR_INVALID_DEVICE); + } else { + ur_device_handle_t Device; + // All context member devices or their descendants are of the same platform. + auto Platform = Context->getPlatform(); + Device = Platform->getDeviceFromNativeHandle(ZeDeviceHandle); + UR_ASSERT(Device, UR_RESULT_ERROR_INVALID_DEVICE); + + auto DeallocationHelper = + [Context, Device, + Ptr](std::unordered_map + &AllocContextMap) { + try { + auto It = AllocContextMap.find(Device->ZeDevice); + if (It == AllocContextMap.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + // The right context is found, deallocate the pointer + It->second.deallocate(Ptr); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return UR_RESULT_SUCCESS; + }; + + switch (ZeMemoryAllocationProperties.type) { + case ZE_MEMORY_TYPE_SHARED: + // Distinguish device_read_only allocations since they have own pool. + SharedReadOnlyAllocsIterator = Context->SharedReadOnlyAllocs.find(Ptr); + return DeallocationHelper(SharedReadOnlyAllocsIterator != + Context->SharedReadOnlyAllocs.end() + ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + case ZE_MEMORY_TYPE_DEVICE: + return DeallocationHelper(Context->DeviceMemAllocContexts); + default: + // Handled below + break; + } + } + + ur_result_t Res = USMFreeImpl(Context, Ptr); + if (SharedReadOnlyAllocsIterator != Context->SharedReadOnlyAllocs.end()) { + Context->SharedReadOnlyAllocs.erase(SharedReadOnlyAllocsIterator); + } + if (IndirectAccessTrackingEnabled) + UR_CALL(ContextReleaseHelper(Context)); + return Res; +} \ No newline at end of file diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp new file mode 100644 index 0000000000000..ba0130089906e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp @@ -0,0 +1,108 @@ +//===--------- ur_level_zero_usm.hpp - Level Zero Adapter -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include "ur_level_zero_common.hpp" + +// Exception type to pass allocation errors +class UsmAllocationException { + const ur_result_t Error; + +public: + UsmAllocationException(ur_result_t Err) : Error{Err} {} + ur_result_t getError() const { return Error; } +}; + +// Implements memory allocation via L0 RT for USM allocator interface. +class USMMemoryAllocBase : public SystemMemory { +protected: + ur_context_handle_t Context; + ur_device_handle_t Device; + // Internal allocation routine which must be implemented for each allocation + // type + virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) = 0; + +public: + USMMemoryAllocBase(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : Context{Ctx}, Device{Dev} {} + void *allocate(size_t Size) override final; + void *allocate(size_t Size, size_t Alignment) override final; + void deallocate(void *Ptr) override final; +}; + +// Allocation routines for shared memory type +class USMSharedMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for shared memory type that is only modified from host. 
+class USMSharedReadOnlyMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMSharedReadOnlyMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for device memory type +class USMDeviceMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMDeviceMemoryAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev) + : USMMemoryAllocBase(Ctx, Dev) {} +}; + +// Allocation routines for host memory type +class USMHostMemoryAlloc : public USMMemoryAllocBase { +protected: + ur_result_t allocateImpl(void **ResultPtr, size_t Size, + uint32_t Alignment) override; + +public: + USMHostMemoryAlloc(ur_context_handle_t Ctx) + : USMMemoryAllocBase(Ctx, nullptr) {} +}; + +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); + +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t *, + ur_usm_device_mem_flags_t *, size_t Size, + uint32_t Alignment); + +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_usm_host_mem_flags_t *Flags, size_t Size, + uint32_t Alignment); + +// If indirect access tracking is not enabled then this functions just performs +// zeMemFree. If indirect access tracking is enabled then reference counting is +// performed. +ur_result_t ZeMemFreeHelper(ur_context_handle_t Context, void *Ptr); + +ur_result_t USMFreeHelper(ur_context_handle_t Context, void *Ptr, + bool OwnZeMemHandle = true); + +bool ShouldUseUSMAllocator(); + +extern const bool UseUSMAllocator; From cd11f354be2ac4b3b2e36c8578270053b275eb56 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 16 May 2023 17:13:55 -0700 Subject: [PATCH 42/50] Port [SYCL[L0] Change the SYCL_PI_LEVEL_ZERO_USM_RESIDENT default https://github.com/intel/llvm/pull/9442 Signed-off-by: Jaime Arteaga --- .../adapters/level_zero/ur_level_zero_usm.cpp | 64 ++++++++++++++----- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index b6236e388a913..2a5effb541ad7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -429,20 +429,37 @@ enum class USMAllocationForceResidencyType { // Force memory resident on the device of allocation at allocation time. // For host allocation force residency on all devices in a context. Device = 1, - // [Default] Force memory resident on all devices in the context with P2P + // Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 }; -// Returns the desired USM residency setting -static USMAllocationForceResidencyType USMAllocationForceResidency = [] { +// Input value is of the form 0xHSD, where: +// 4-bits of D control device allocations +// 4-bits of S control shared allocations +// 4-bits of H control host allocations +// Each 4-bit value is holding a USMAllocationForceResidencyType enum value. 
+// The default is 0x2, i.e. force full residency for device allocations only.
+//
+static uint32_t USMAllocationForceResidency = [] {
   const char *UrRet = std::getenv("UR_L0_USM_RESIDENT");
   const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT");
   const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr);
-  if (!Str)
-    return USMAllocationForceResidencyType::P2PDevices;
-  switch (std::atoi(Str)) {
+  try {
+    if (Str) {
+      // Auto-detect radix to allow more convenient hex base
+      return std::stoi(Str, nullptr, 0);
+    }
+  } catch (...) {
+  }
+  return 0x2;
+}();
+
+// Convert from an integer value to USMAllocationForceResidencyType enum value
+static USMAllocationForceResidencyType
+USMAllocationForceResidencyConvert(uint32_t Val) {
+  switch (Val) {
   case 1:
     return USMAllocationForceResidencyType::Device;
   case 2:
@@ -450,26 +467,38 @@ static USMAllocationForceResidencyType USMAllocationForceResidency = [] {
   default:
     return USMAllocationForceResidencyType::None;
   };
+}
+
+static USMAllocationForceResidencyType USMHostAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0xf00) >> 8);
+}();
+static USMAllocationForceResidencyType USMSharedAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0x0f0) >> 4);
+}();
+static USMAllocationForceResidencyType USMDeviceAllocationForceResidency = [] {
+  return USMAllocationForceResidencyConvert(
+      (USMAllocationForceResidency & 0x00f));
 }();
 
 // Make USM allocation resident as requested
 static ur_result_t USMAllocationMakeResident(
-    ur_context_handle_t Context,
+    USMAllocationForceResidencyType ForceResidency, ur_context_handle_t Context,
     ur_device_handle_t Device, // nullptr for host allocation
     void *Ptr, size_t Size) {
 
-  std::list<ur_device_handle_t> Devices;
-
-  if (USMAllocationForceResidency == USMAllocationForceResidencyType::None)
+  if (ForceResidency == USMAllocationForceResidencyType::None)
     return UR_RESULT_SUCCESS;
-  else if (!Device) {
+
+  std::list<ur_device_handle_t> Devices;
+  if (!Device) {
     // Host allocation, make it resident on all devices in the context
     Devices.insert(Devices.end(), Context->Devices.begin(),
                    Context->Devices.end());
   } else {
     Devices.push_back(Device);
-    if (USMAllocationForceResidency ==
-        USMAllocationForceResidencyType::P2PDevices) {
+    if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) {
       ze_bool_t P2P;
       for (const auto &D : Context->Devices) {
         if (D == Device)
@@ -536,7 +565,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
                 reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  USMAllocationMakeResident(Context, Device, *ResultPtr, Size);
+  USMAllocationMakeResident(USMDeviceAllocationForceResidency, Context, Device,
+                            *ResultPtr, Size);
   return UR_RESULT_SUCCESS;
 }
 
@@ -567,7 +597,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
                 reinterpret_cast<std::uintptr_t>(*ResultPtr) % Alignment == 0,
             UR_RESULT_ERROR_INVALID_VALUE);
 
-  USMAllocationMakeResident(Context, Device, *ResultPtr, Size);
+  USMAllocationMakeResident(USMSharedAllocationForceResidency, Context, Device,
+                            *ResultPtr, Size);
 
   // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY.
return UR_RESULT_SUCCESS; @@ -586,7 +617,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, reinterpret_cast(*ResultPtr) % Alignment == 0, UR_RESULT_ERROR_INVALID_VALUE); - USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + USMAllocationMakeResident(USMHostAllocationForceResidency, Context, nullptr, + *ResultPtr, Size); return UR_RESULT_SUCCESS; } From 2922ae7f37de79cb250a6a6c077660a8404e7622 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 18 May 2023 09:20:59 -0700 Subject: [PATCH 43/50] Port [SYCL] [L0] Remove unneeded backwards compatibility of 2023.2 make_queue and get_native https://github.com/intel/llvm/pull/8871 Signed-off-by: Jaime Arteaga --- sycl/plugins/level_zero/pi_level_zero.cpp | 35 ++---- sycl/plugins/unified_runtime/pi2ur.hpp | 69 +++++------ .../unified_runtime/pi_unified_runtime.cpp | 42 ++----- .../level_zero/ur_level_zero_queue.cpp | 111 ++++++++++++++---- .../level_zero/ur_level_zero_queue.hpp | 9 ++ 5 files changed, 150 insertions(+), 116 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index bc55890ada108..cd5ed69889253 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -179,28 +179,6 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -pi_result piextQueueCreate2(pi_context Context, pi_device Device, - pi_queue_properties *Properties, pi_queue *Queue) { - return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); -} - -pi_result piextQueueGetNativeHandle2(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - std::ignore = NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - -pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, - pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, - pi_queue *Queue) { - std::ignore = NativeHandleDesc; - std::ignore = Properties; - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); -} - pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName, size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { @@ -220,18 +198,23 @@ pi_result piQueueFinish(pi_queue Queue) { return pi2ur::piQueueFinish(Queue); } pi_result piQueueFlush(pi_queue Queue) { return pi2ur::piQueueFlush(Queue); } pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle) { + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle, + NativeHandleDesc); } pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, + int32_t NativeHandleDesc, pi_context Context, pi_device Device, bool OwnNativeHandle, + pi_queue_properties *Properties, pi_queue *Queue) { - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); + return pi2ur::piextQueueCreateWithNativeHandle( + NativeHandle, NativeHandleDesc, Context, Device, OwnNativeHandle, + Properties, Queue); } pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index f36ce228d48a2..f21484d657595 100644 --- 
a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1326,17 +1326,10 @@ inline pi_result piQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -inline pi_result piextQueueCreate2(pi_context context, pi_device device, - pi_queue_properties *properties, - pi_queue *queue) { - return pi2ur::piextQueueCreate(context, device, properties, queue); -} - -inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, - pi_context Context, - pi_device Device, - bool OwnNativeHandle, - pi_queue *Queue) { +inline pi_result piextQueueCreateWithNativeHandle( + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); @@ -1348,29 +1341,45 @@ inline pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle, ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); ur_queue_handle_t *UrQueue = reinterpret_cast(Queue); - ur_queue_native_properties_t Properties{}; - Properties.isNativeHandleOwned = OwnNativeHandle; - HANDLE_ERRORS(urQueueCreateWithNativeHandle(UrNativeHandle, UrContext, - UrDevice, &Properties, UrQueue)); - return PI_SUCCESS; -} + ur_queue_native_properties_t UrNativeProperties{}; + UrNativeProperties.isNativeHandleOwned = OwnNativeHandle; -inline pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle nativeHandle, int32_t nativeHandleDesc, pi_context context, - pi_device device, bool pluginOwnsNativeHandle, - pi_queue_properties *Properties, pi_queue *queue) { - (void)nativeHandleDesc; - (void)Properties; - return pi2ur::piextQueueCreateWithNativeHandle(nativeHandle, context, device, - pluginOwnsNativeHandle, queue); + ur_queue_properties_t UrProperties{}; + UrProperties.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES; + if (Properties[1] & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) + UrProperties.flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_PROFILING_ENABLE) + UrProperties.flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE) + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE; + if (Properties[1] & PI_QUEUE_FLAG_ON_DEVICE_DEFAULT) + UrProperties.flags |= UR_QUEUE_FLAG_ON_DEVICE_DEFAULT; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_DISCARD_EVENTS) + UrProperties.flags |= UR_QUEUE_FLAG_DISCARD_EVENTS; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_LOW) + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_LOW; + if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) + UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + + UrNativeProperties.pNext = &UrProperties; + + // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? + std::ignore = NativeHandleDesc; + HANDLE_ERRORS(urQueueCreateWithNativeHandle( + UrNativeHandle, UrContext, UrDevice, &UrNativeProperties, UrQueue)); + return PI_SUCCESS; } inline pi_result piextQueueGetNativeHandle(pi_queue Queue, - pi_native_handle *NativeHandle) { + pi_native_handle *NativeHandle, + int32_t *NativeHandleDesc) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); + // TODO: How to pass this up in the urQueueGetNativeHandle interface? 
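+  // Per the Level Zero adapter side (urQueueGetNativeHandle), this descriptor
+  // is meant to report whether the returned native handle is an immediate
+  // command list (1) or a regular ze_command_queue_handle_t (0); until the UR
+  // entry point can carry that information, it is simply ignored here.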
+ std::ignore = NativeHandleDesc; + ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_native_handle_t UrNativeQueue{}; @@ -1381,14 +1390,6 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, return PI_SUCCESS; } -inline pi_result piextQueueGetNativeHandle2(pi_queue Queue, - pi_native_handle *NativeHandle, - int32_t *NativeHandleDesc) { - - (void)NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - inline pi_result piQueueRelease(pi_queue Queue) { PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index acff4810f9dc9..20fe7384a9c63 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -109,28 +109,6 @@ __SYCL_EXPORT pi_result piextQueueCreate(pi_context Context, pi_device Device, return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); } -__SYCL_EXPORT pi_result piextQueueCreate2(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - return pi2ur::piextQueueCreate(Context, Device, Properties, Queue); -} - -__SYCL_EXPORT pi_result piextQueueGetNativeHandle2( - pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { - std::ignore = NativeHandleDesc; - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); -} - -__SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle2( - pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, - pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, - pi_queue *Queue) { - std::ignore = NativeHandleDesc; - std::ignore = Properties; - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); -} - __SYCL_EXPORT pi_result piQueueRelease(pi_queue Queue) { return pi2ur::piQueueRelease(Queue); } @@ -724,16 +702,19 @@ __SYCL_EXPORT pi_result piextContextCreateWithNativeHandle( NativeHandle, NumDevices, Devices, OwnNativeHandle, RetContext); } -__SYCL_EXPORT pi_result -piextQueueGetNativeHandle(pi_queue Queue, pi_native_handle *NativeHandle) { - return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle); +__SYCL_EXPORT pi_result piextQueueGetNativeHandle( + pi_queue Queue, pi_native_handle *NativeHandle, int32_t *NativeHandleDesc) { + return pi2ur::piextQueueGetNativeHandle(Queue, NativeHandle, + NativeHandleDesc); } __SYCL_EXPORT pi_result piextQueueCreateWithNativeHandle( - pi_native_handle NativeHandle, pi_context Context, pi_device Device, - bool OwnNativeHandle, pi_queue *Queue) { - return pi2ur::piextQueueCreateWithNativeHandle(NativeHandle, Context, Device, - OwnNativeHandle, Queue); + pi_native_handle NativeHandle, int32_t NativeHandleDesc, pi_context Context, + pi_device Device, bool OwnNativeHandle, pi_queue_properties *Properties, + pi_queue *Queue) { + return pi2ur::piextQueueCreateWithNativeHandle( + NativeHandle, NativeHandleDesc, Context, Device, OwnNativeHandle, + Properties, Queue); } __SYCL_EXPORT pi_result piMemRelease(pi_mem Mem) { @@ -1068,9 +1049,6 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piQueueFlush) _PI_API(piextQueueGetNativeHandle) _PI_API(piextQueueCreateWithNativeHandle) - _PI_API(piextQueueCreate2) - _PI_API(piextQueueGetNativeHandle2) - _PI_API(piextQueueCreateWithNativeHandle2) _PI_API(piProgramCreate) _PI_API(piProgramBuild) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp 
b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index efd3538887f93..7137bf9e3c11b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -463,32 +463,66 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( // Lock automatically releases when this goes out of scope. std::shared_lock lock(Queue->Mutex); - auto ZeQueue = ur_cast(NativeQueue); - - // Extract a Level Zero compute queue handle from the given PI queue + // Get handle to this thread's queue group. auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); - uint32_t QueueGroupOrdinalUnused; - *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + + if (Queue->UsingImmCmdLists) { + auto ZeCmdList = ur_cast(NativeQueue); + // Extract the Level Zero command list handle from the given PI queue + *ZeCmdList = QueueGroup.getImmCmdList()->first; + // TODO: How to pass this up in the urQueueGetNativeHandle interface? + // *NativeHandleDesc = true; + } else { + auto ZeQueue = ur_cast(NativeQueue); + + // Extract a Level Zero compute queue handle from the given PI queue + auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); + uint32_t QueueGroupOrdinalUnused; + *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); + // TODO: How to pass this up in the urQueueGetNativeHandle interface? + // *NativeHandleDesc = false; + } return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::pi_queue_group_t::setImmCmdList( + ze_command_list_handle_t ZeCommandList) { + ImmCmdLists = std::vector( + 1, + Queue->CommandListMap + .insert(std::pair{ + ZeCommandList, {nullptr, true, false, nullptr, 0}}) + .first); +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_native_handle_t NativeQueue, ///< [in] the native handle of the queue. ur_context_handle_t Context, ///< [in] handle of the context object ur_device_handle_t Device, /// - const ur_queue_native_properties_t *Properties, /// + const ur_queue_native_properties_t *NativeProperties, /// ur_queue_handle_t *RetQueue ///< [out] pointer to the handle of the queue object created. ) { - auto ZeQueue = ur_cast(NativeQueue); - // Assume this is the "0" index queue in the compute command-group. - std::vector ZeQueues{ZeQueue}; + bool OwnNativeHandle = false; + ur_queue_flags_t Flags{}; - // TODO: see what we can do to correctly initialize PI queue for - // compute vs. copy Level-Zero queue. Currently we will send - // all commands to the "ZeQueue". - std::vector ZeroCopyQueues; + if (NativeProperties) { + OwnNativeHandle = NativeProperties->isNativeHandleOwned; + if (NativeProperties->pNext) { + const ur_base_properties_t *extendedProperties = + reinterpret_cast( + NativeProperties->pNext); + if (extendedProperties->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + const ur_queue_properties_t *UrProperties = + reinterpret_cast(extendedProperties); + Flags = UrProperties->flags; + } + } + } + + // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? + int32_t NativeHandleDesc = 0; // Get the device handle from first device in the platform // Maybe this is not completely correct. 
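For reference, the native-handle interop path that these urQueueCreateWithNativeHandle hunks implement can be exercised from client code roughly as in the following sketch, which wraps an existing Level Zero command queue into a UR queue. This is an illustrative example only, not code from this patch: the context, device, and ZeQueue handles are assumed to be created elsewhere, and the ur_api.h / ze_api.h header names are assumptions.

    #include <level_zero/ze_api.h>
    #include <ur_api.h>

    // Wrap an already-created Level Zero queue into a UR queue. The caller
    // keeps ownership of the native handle (isNativeHandleOwned = false), so
    // the adapter will not destroy ZeQueue when the UR queue is released.
    ur_result_t WrapZeQueue(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                            ze_command_queue_handle_t ZeQueue,
                            ur_queue_handle_t *OutQueue) {
      // Optional queue flags, chained through pNext in the same way the
      // pi2ur layer above does when translating PI queue properties.
      ur_queue_properties_t Props{};
      Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
      Props.flags = UR_QUEUE_FLAG_PROFILING_ENABLE;

      ur_queue_native_properties_t NativeProps{};
      NativeProps.isNativeHandleOwned = false;
      NativeProps.pNext = &Props;

      return urQueueCreateWithNativeHandle(
          reinterpret_cast<ur_native_handle_t>(ZeQueue), Ctx, Dev, &NativeProps,
          OutQueue);
    }

Passing isNativeHandleOwned = true instead should hand ownership to the adapter, which then destroys the Level Zero queue when the UR queue is released.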
@@ -502,15 +536,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
                                     nullptr));
   }
 
-  try {
-    ur_queue_handle_t_ *Queue =
-        new ur_queue_handle_t_(ZeQueues, ZeroCopyQueues, Context, UrDevice,
-                               Properties->isNativeHandleOwned);
-    *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
-  } catch (const std::bad_alloc &) {
-    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
-  } catch (...) {
-    return UR_RESULT_ERROR_UNKNOWN;
+  // The NativeHandleDesc has value 1 if the native handle is an immediate
+  // command list.
+  if (NativeHandleDesc == 1) {
+    std::vector<ze_command_queue_handle_t> ComputeQueues{nullptr};
+    std::vector<ze_command_queue_handle_t> CopyQueues;
+
+    try {
+      ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
+          ComputeQueues, CopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
+      *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+    } catch (const std::bad_alloc &) {
+      return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    } catch (...) {
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
+    auto &InitialGroup = (*RetQueue)->ComputeQueueGroupsByTID.begin()->second;
+    InitialGroup.setImmCmdList(ur_cast<ze_command_list_handle_t>(NativeQueue));
+  } else {
+    auto ZeQueue = ur_cast<ze_command_queue_handle_t>(NativeQueue);
+    // Assume this is the "0" index queue in the compute command-group.
+    std::vector<ze_command_queue_handle_t> ZeQueues{ZeQueue};
+
+    // TODO: see what we can do to correctly initialize PI queue for
+    // compute vs. copy Level-Zero queue. Currently we will send
+    // all commands to the "ZeQueue".
+    std::vector<ze_command_queue_handle_t> ZeroCopyQueues;
+
+    try {
+      ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
+          ZeQueues, ZeroCopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
+      *RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+    } catch (const std::bad_alloc &) {
+      return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    } catch (...) {
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
   }
 
   return UR_RESULT_SUCCESS;
@@ -757,6 +818,8 @@ ur_queue_handle_t_::ur_queue_handle_t_(
     bool OwnZeCommandQueue, ur_queue_flags_t Properties, int ForceComputeIndex)
     : Context{Context}, Device{Device}, OwnZeCommandQueue{OwnZeCommandQueue},
       Properties(Properties) {
+  // Set the type of commandlists the queue will use.
+  UsingImmCmdLists = Device->useImmediateCommandLists();
   // Compute group initialization.
   // First, see if the queue's device allows for round-robin or it is
   // fixed to one particular compute CCS (it is so for sub-sub-devices).
@@ -766,7 +829,7 @@ ur_queue_handle_t_::ur_queue_handle_t_(
     ComputeQueueGroup.ZeQueues = ComputeQueues;
     // Create space to hold immediate commandlists corresponding to the
     // ZeQueues
-    if (Device->ImmCommandListUsed) {
+    if (UsingImmCmdLists) {
       ComputeQueueGroup.ImmCmdLists = std::vector<ur_command_list_ptr_t>(
           ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
     }
@@ -798,7 +861,7 @@ ur_queue_handle_t_::ur_queue_handle_t_(
       die("No compute queue available/allowed.");
     }
   }
-  if (Device->ImmCommandListUsed) {
+  if (UsingImmCmdLists) {
     // Create space to hold immediate commandlists corresponding to the
     // ZeQueues
     ComputeQueueGroup.ImmCmdLists = std::vector<ur_command_list_ptr_t>(
diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
index 4a5a6fe8b731d..81b02825ecff9 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp
@@ -121,6 +121,9 @@ struct ur_queue_handle_t_ : _ur_object {
   // queues and the value of the queue group ordinal.
   ze_command_queue_handle_t &getZeQueue(uint32_t *QueueGroupOrdinal);
 
+  // This function sets an immediate commandlist from the interop interface.
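+  // The given handle is recorded in the queue's CommandListMap and becomes
+  // the compute group's single immediate commandlist entry.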
+ void setImmCmdList(ze_command_list_handle_t); + // This function returns the next immediate commandlist to use. ur_command_list_ptr_t &getImmCmdList(); @@ -195,6 +198,12 @@ struct ur_queue_handle_t_ : _ur_object { // Therefore it can be accessed without holding a lock on this _pi_queue. const ur_device_handle_t Device; + // A queue may use either standard or immediate commandlists. At queue + // construction time this is set based on the device and any env var settings + // that change the default for the device type. When an interop queue is + // constructed, the caller chooses the type of commandlists to use. + bool UsingImmCmdLists = false; + // Keeps track of the event associated with the last enqueued command into // this queue. this is used to add dependency with the last command to add // in-order semantics and updated with the latest event each time a new From 3cd033c726e0389cfcfc77a837444f20b007af21 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Thu, 18 May 2023 09:25:00 -0700 Subject: [PATCH 44/50] Port [SYCL] [L0] Correct the device id check for PVC https://github.com/intel/llvm/pull/9503 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_device.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp index 4bc56c6fc5108..ca010ef3e0b06 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.hpp @@ -142,8 +142,14 @@ struct ur_device_handle_t_ : _ur_object { bool isSubDevice() { return RootDevice != nullptr; } - // Is this a Data Center GPU Max series (aka PVC). - bool isPVC() { return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; } + // Is this a Data Center GPU Max series (aka PVC)? + // TODO: change to use + // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t + // when that is stable. + bool isPVC() { + return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0 || + (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; + } // Does this device represent a single compute slice? bool isCCS() const { From b17b2d44bc72cae393a1f88b5332ec73b91bf684 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 22 May 2023 16:59:37 -0700 Subject: [PATCH 45/50] Port [SYCL][L0] Optimize barrier for in-order queue https://github.com/intel/llvm/pull/9446 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_event.cpp | 50 +++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp index 6d14ae2176681..0710ef349a519 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_event.cpp @@ -122,6 +122,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( return UR_RESULT_SUCCESS; } +// Control if wait with barrier is implemented by signal of an event +// as opposed by true barrier command for in-order queue. +static const bool InOrderBarrierBySignal = [] { + const char *UrRet = std::getenv("UR_L0_IN_ORDER_BARRIER_BY_SIGNAL"); + return (UrRet ? 
std::atoi(UrRet) : true); +}(); + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_queue_handle_t Queue, ///< [in] handle of the queue object uint32_t NumEventsInWaitList, ///< [in] size of the event wait list @@ -144,16 +151,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( [&Queue](ur_command_list_ptr_t CmdList, const _ur_ze_event_list_t &EventWaitList, ur_event_handle_t &Event, bool IsInternal) { + // For in-order queue and empty wait-list just use the last command + // event as the barrier event. + if (Queue->isInOrderQueue() && !EventWaitList.Length && + Queue->LastCommandEvent && !Queue->LastCommandEvent->IsDiscarded) { + UR_CALL(urEventRetain(Queue->LastCommandEvent)); + Event = Queue->LastCommandEvent; + return UR_RESULT_SUCCESS; + } + UR_CALL(createEventAndAssociateQueue( Queue, &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, IsInternal)); Event->WaitList = EventWaitList; - ZE2UR_CALL(zeCommandListAppendBarrier, - (CmdList->first, Event->ZeEvent, EventWaitList.Length, - EventWaitList.ZeEventList)); + + // For in-order queue we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. + // + // TODO: this and other special handling of in-order queues to be + // updated when/if Level Zero adds native support for in-order queues. + // + if (Queue->isInOrderQueue() && InOrderBarrierBySignal) { + if (EventWaitList.Length) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CmdList->first, EventWaitList.Length, + EventWaitList.ZeEventList)); + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CmdList->first, Event->ZeEvent)); + } else { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CmdList->first, Event->ZeEvent, EventWaitList.Length, + EventWaitList.ZeEventList)); + } return UR_RESULT_SUCCESS; }; + // If the queue is in-order then each command in it effectively acts as a + // barrier, so we don't need to do anything except if we were requested + // a "barrier" event to be created. Or if we need to wait for events in + // potentially different queues. + // + if (Queue->isInOrderQueue() && NumEventsInWaitList == 0 && !OutEvent) { + return UR_RESULT_SUCCESS; + } + ur_event_handle_t InternalEvent; bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; From 438221084e63d2c05ec1a09a0135340a90072f13 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Mon, 22 May 2023 17:34:47 -0700 Subject: [PATCH 46/50] Port [SYCL] [L0] Recycle immediate command lists for queues in a context https://github.com/intel/llvm/pull/9409 Signed-off-by: Jaime Arteaga --- .../level_zero/ur_level_zero_context.cpp | 22 +++-- .../level_zero/ur_level_zero_context.hpp | 8 +- .../level_zero/ur_level_zero_queue.cpp | 96 ++++++++++++++----- .../level_zero/ur_level_zero_queue.hpp | 12 ++- 4 files changed, 102 insertions(+), 36 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp index c177926c24c30..34b4e5ceb7229 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.cpp @@ -383,7 +383,8 @@ ur_result_t ur_context_handle_t_::finalize() { std::scoped_lock Lock(ZeCommandListCacheMutex); for (auto &List : ZeComputeCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) if (ZeCommandList) { auto ZeResult = @@ -395,7 +396,8 @@ ur_result_t ur_context_handle_t_::finalize() { } } for (auto &List : ZeCopyCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) { auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); // Gracefully handle the case that L0 was already unloaded. @@ -647,7 +649,7 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( for (auto ZeCommandListIt = ZeCommandListCache.begin(); ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - auto &ZeCommandList = *ZeCommandListIt; + auto &ZeCommandList = ZeCommandListIt->first; auto it = Queue->CommandListMap.find(ZeCommandList); if (it != Queue->CommandListMap.end()) { if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) @@ -671,12 +673,14 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( ze_fence_handle_t ZeFence; ZeStruct ZeFenceDesc; ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandList = Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, false, - ZeCommandQueue, - QueueGroupOrdinal}) - .first; + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; + CommandList = + Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, false, + ZeCommandQueue, ZeQueueDesc}) + .first; } ZeCommandListCache.erase(ZeCommandListIt); if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp index cc1775d87f3c9..a945826d8fb8c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_context.hpp @@ -80,9 +80,13 @@ struct ur_context_handle_t_ : _ur_object { // application must only use the command list for the device, or its // sub-devices, which was provided during creation." 
// - std::unordered_map> + std::unordered_map>>> ZeComputeCommandListCache; - std::unordered_map> + std::unordered_map>>> ZeCopyCommandListCache; // Store USM allocator context(internal allocator structures) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 7137bf9e3c11b..5ee786a1fb3b7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -435,6 +435,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return ze2urResult(ZeResult); } + if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) { + std::scoped_lock Lock( + Queue->Context->ZeCommandListCacheMutex); + const pi_command_list_info_t &MapEntry = it->second; + if (MapEntry.CanReuse) { + // Add commandlist to the cache for future use. + // It will be deleted when the context is destroyed. + auto &ZeCommandListCache = + MapEntry.isCopy(Queue) + ? Queue->Context + ->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc}); + } else { + // A non-reusable comamnd list that came from a make_queue call is + // destroyed since it cannot be recycled. + ze_command_list_handle_t ZeCommandList = it->first; + if (ZeCommandList) { + ZE2UR_CALL(zeCommandListDestroy, (ZeCommandList)); + } + } + } } Queue->CommandListMap.clear(); } @@ -488,11 +511,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( void ur_queue_handle_t_::pi_queue_group_t::setImmCmdList( ze_command_list_handle_t ZeCommandList) { + // An immediate command list was given to us but we don't have the queue + // descriptor information. Create a dummy and note that it is not recycleable. + ZeStruct ZeQueueDesc; ImmCmdLists = std::vector( 1, Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, 0}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeQueueDesc, false}}) .first); } @@ -1608,14 +1635,15 @@ ur_result_t ur_queue_handle_t_::resetCommandList( UseCopyEngine ? 
this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; - ZeCommandListCache.push_back(CommandList->first); + ZeCommandListCache.push_back( + {CommandList->first, CommandList->second.ZeQueueDesc}); } return UR_RESULT_SUCCESS; } bool pi_command_list_info_t::isCopy(ur_queue_handle_t Queue) const { - return ZeQueueGroupOrdinal != + return ZeQueueDesc.ordinal != (uint32_t)Queue->Device ->QueueGroup [ur_device_handle_t_::queue_group_info_t::type::Compute] @@ -1773,10 +1801,11 @@ ur_result_t ur_queue_handle_t_::createCommandList( &ZeCommandListDesc, &ZeCommandList)); ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( - ZeCommandList, - {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal})); + ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); @@ -1886,29 +1915,50 @@ ur_command_list_ptr_t &ur_queue_handle_t_::pi_queue_group_t::getImmCmdList() { ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; } - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); + // Check if context's command list cache has an immediate command list with + // matching index. + ze_command_list_handle_t ZeCommandList = nullptr; + { + // Acquire lock to avoid race conditions. + std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for every + // unique ZeDevice. + auto &ZeCommandListCache = + isCopy() + ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + for (auto ZeCommandListIt = ZeCommandListCache.begin(); + ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { + const auto &Desc = (*ZeCommandListIt).second; + if (Desc.index == ZeCommandQueueDesc.index && + Desc.flags == ZeCommandQueueDesc.flags && + Desc.mode == ZeCommandQueueDesc.mode && + Desc.priority == ZeCommandQueueDesc.priority) { + ZeCommandList = (*ZeCommandListIt).first; + ZeCommandListCache.erase(ZeCommandListIt); + break; + } + } + } - ze_command_list_handle_t ZeCommandList; - ZE_CALL_NOCHECK(zeCommandListCreateImmediate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeCommandList)); + // If cache didn't contain a command list, create one. 
+ if (!ZeCommandList) { + urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " + "(round robin in [%d, %d]) priority = %s\n", + ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, + UpperIndex, Priority); + + ZE_CALL_NOCHECK(zeCommandListCreateImmediate, + (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandQueueDesc, &ZeCommandList)); + } ImmCmdLists[Index] = Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeCommandQueueDesc}}) .first; - // Add this commandlist to the cache so it can be destroyed as part of - // urQueueReleaseInternal - auto QueueType = Type; - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - auto &ZeCommandListCache = - QueueType == queue_type::Compute - ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice] - : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]; - ZeCommandListCache.push_back(ZeCommandList); return ImmCmdLists[Index]; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp index 81b02825ecff9..4f8e47f0ab5df 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.hpp @@ -55,8 +55,16 @@ struct pi_command_list_info_t { // Record the queue to which the command list will be submitted. ze_command_queue_handle_t ZeQueue{nullptr}; - // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr - uint32_t ZeQueueGroupOrdinal{0}; + + // Record the queue descriptor fields used when creating the command list + // because we cannot recover these fields from the command list. Immediate + // command lists are recycled across queues and then all fields are used. For + // standard command lists only the ordinal is used. For queues created through + // the make_queue API the descriptor is unavailable so a dummy descriptor is + // used and then this entry is marked as not eligible for recycling. + ZeStruct ZeQueueDesc; + bool CanReuse{true}; + // Helper functions to tell if this is a copy command-list. 
bool isCopy(ur_queue_handle_t Queue) const; From 3e9fb4eef5cc469f387eea5633a42666d7a1f3b2 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 23 May 2023 10:14:43 -0700 Subject: [PATCH 47/50] Rebase ur_loader and headers this to absorb latest changes in queue native handle APIs Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- sycl/plugins/unified_runtime/pi2ur.hpp | 29 ++++++++++------- .../level_zero/ur_level_zero_device.cpp | 4 +++ .../level_zero/ur_level_zero_platform.cpp | 6 +++- .../level_zero/ur_level_zero_program.cpp | 4 +++ .../level_zero/ur_level_zero_queue.cpp | 29 ++++++++++++----- .../level_zero/ur_level_zero_sampler.cpp | 4 +++ .../adapters/level_zero/ur_level_zero_usm.cpp | 31 ++++++++++++++++--- .../level_zero/ur_loader_interface.cpp | 4 ++- 9 files changed, 87 insertions(+), 26 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 177537363380e..c912025f2991b 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -4,7 +4,7 @@ if (NOT DEFINED UNIFIED_RUNTIME_LIBRARY OR NOT DEFINED UNIFIED_RUNTIME_INCLUDE_D include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - set(UNIFIED_RUNTIME_TAG 0125b2b42aea73c350f7961cd68e0f1f94cc1238) + set(UNIFIED_RUNTIME_TAG 620ddb1e8bb1f5ef6cc775edf79ba4674057fe2e) message(STATUS "Will fetch Unified Runtime from ${UNIFIED_RUNTIME_REPO}") FetchContent_Declare(unified-runtime diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index f21484d657595..4a1ca333e0977 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -498,7 +498,8 @@ piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, ur_platform_handle_t UrPlatform{}; ur_native_handle_t UrNativeHandle = reinterpret_cast(NativeHandle); - urPlatformCreateWithNativeHandle(UrNativeHandle, &UrPlatform); + ur_platform_native_properties_t UrProperties{}; + urPlatformCreateWithNativeHandle(UrNativeHandle, &UrProperties, &UrPlatform); *Platform = reinterpret_cast(UrPlatform); @@ -1004,8 +1005,9 @@ piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle, ur_platform_handle_t UrPlatform = reinterpret_cast(Platform); auto UrDevice = reinterpret_cast(Device); - HANDLE_ERRORS( - urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, UrDevice)); + ur_device_native_properties_t UrProperties{}; + HANDLE_ERRORS(urDeviceCreateWithNativeHandle(UrNativeDevice, UrPlatform, + &UrProperties, UrDevice)); return PI_SUCCESS; } @@ -1361,10 +1363,13 @@ inline pi_result piextQueueCreateWithNativeHandle( if (Properties[1] & PI_EXT_ONEAPI_QUEUE_FLAG_PRIORITY_HIGH) UrProperties.flags |= UR_QUEUE_FLAG_PRIORITY_HIGH; + ur_queue_native_desc_t UrNativeDesc{}; + UrNativeDesc.stype = UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC; + UrNativeDesc.pNativeData = &NativeHandleDesc; + + UrProperties.pNext = &UrNativeDesc; UrNativeProperties.pNext = &UrProperties; - // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? 
- std::ignore = NativeHandleDesc; HANDLE_ERRORS(urQueueCreateWithNativeHandle( UrNativeHandle, UrContext, UrDevice, &UrNativeProperties, UrQueue)); return PI_SUCCESS; @@ -1377,13 +1382,13 @@ inline pi_result piextQueueGetNativeHandle(pi_queue Queue, PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); - // TODO: How to pass this up in the urQueueGetNativeHandle interface? - std::ignore = NativeHandleDesc; + ur_queue_native_desc_t UrNativeDesc{}; + UrNativeDesc.pNativeData = NativeHandleDesc; ur_queue_handle_t UrQueue = reinterpret_cast(Queue); ur_native_handle_t UrNativeQueue{}; - HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeQueue)); + HANDLE_ERRORS(urQueueGetNativeHandle(UrQueue, &UrNativeDesc, &UrNativeQueue)); *NativeHandle = reinterpret_cast(UrNativeQueue); @@ -1967,7 +1972,7 @@ inline pi_result piextProgramGetNativeHandle(pi_program Program, inline pi_result piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, - pi_context Context, bool ownNativeHandle, + pi_context Context, bool OwnNativeHandle, pi_program *Program) { PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE); @@ -1979,8 +1984,10 @@ piextProgramCreateWithNativeHandle(pi_native_handle NativeHandle, reinterpret_cast(Context); ur_program_handle_t *UrProgram = reinterpret_cast(Program); - HANDLE_ERRORS( - urProgramCreateWithNativeHandle(NativeProgram, UrContext, UrProgram)); + ur_program_native_properties_t UrProperties{}; + UrProperties.isNativeHandleOwned = OwnNativeHandle; + HANDLE_ERRORS(urProgramCreateWithNativeHandle(NativeProgram, UrContext, + &UrProperties, UrProgram)); return PI_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index f3d242f7f4e5d..dc9f6a9f7069d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -1233,9 +1233,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t NativeDevice, ///< [in] the native handle of the device. ur_platform_handle_t Platform, ///< [in] handle of the platform instance + const ur_device_native_properties_t + *Properties, ///< [in][optional] pointer to native device properties + ///< struct. ur_device_handle_t *Device ///< [out] pointer to the handle of the device object created. ) { + std::ignore = Properties; auto ZeDevice = ur_cast(NativeDevice); // The SYCL spec requires that the set of devices must remain fixed for the diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 469c39d3e668c..61ef0f98b5683 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -306,10 +306,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t - NativePlatform, ///< [in] the native handle of the platform. + NativePlatform, ///< [in] the native handle of the platform. + const ur_platform_native_properties_t + *Properties, ///< [in][optional] pointer to native platform properties + ///< struct. 
ur_platform_handle_t *Platform ///< [out] pointer to the handle of the ///< platform object created. ) { + std::ignore = Properties; auto ZeDriver = ur_cast(NativePlatform); uint32_t NumPlatforms = 0; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp index 5519f7e2254bd..6604ca073bc6a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_program.cpp @@ -719,9 +719,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t NativeProgram, ///< [in] the native handle of the program. ur_context_handle_t Context, ///< [in] handle of the context instance + const ur_program_native_properties_t + *Properties, ///< [in][optional] pointer to native program properties + ///< struct. ur_program_handle_t *Program ///< [out] pointer to the handle of the ///< program object created. ) { + std::ignore = Properties; auto ZeModule = ur_cast(NativeProgram); // We assume here that programs created from a native handle always diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp index 5ee786a1fb3b7..730bb6542e7b7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_queue.cpp @@ -480,12 +480,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t Queue, ///< [in] handle of the queue. + ur_queue_native_desc_t *Desc, ur_native_handle_t *NativeQueue ///< [out] a pointer to the native handle of the queue. ) { // Lock automatically releases when this goes out of scope. std::shared_lock lock(Queue->Mutex); + int32_t NativeHandleDesc{}; + // Get handle to this thread's queue group. auto &QueueGroup = Queue->getQueueGroup(false /*compute*/); @@ -494,7 +497,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( // Extract the Level Zero command list handle from the given PI queue *ZeCmdList = QueueGroup.getImmCmdList()->first; // TODO: How to pass this up in the urQueueGetNativeHandle interface? - // *NativeHandleDesc = true; + NativeHandleDesc = true; } else { auto ZeQueue = ur_cast(NativeQueue); @@ -503,9 +506,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( uint32_t QueueGroupOrdinalUnused; *ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused); // TODO: How to pass this up in the urQueueGetNativeHandle interface? 
- // *NativeHandleDesc = false; + NativeHandleDesc = false; } + if (Desc && Desc->pNativeData) + *(reinterpret_cast((Desc->pNativeData))) = NativeHandleDesc; + return UR_RESULT_SUCCESS; } @@ -533,24 +539,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ) { bool OwnNativeHandle = false; ur_queue_flags_t Flags{}; + int32_t NativeHandleDesc{}; if (NativeProperties) { OwnNativeHandle = NativeProperties->isNativeHandleOwned; - if (NativeProperties->pNext) { + void *pNext = NativeProperties->pNext; + while (pNext) { const ur_base_properties_t *extendedProperties = - reinterpret_cast( - NativeProperties->pNext); + reinterpret_cast(pNext); if (extendedProperties->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { const ur_queue_properties_t *UrProperties = reinterpret_cast(extendedProperties); Flags = UrProperties->flags; + } else if (extendedProperties->stype == + UR_STRUCTURE_TYPE_QUEUE_NATIVE_DESC) { + const ur_queue_native_desc_t *UrNativeDesc = + reinterpret_cast( + extendedProperties); + if (UrNativeDesc->pNativeData) + NativeHandleDesc = + *(reinterpret_cast((UrNativeDesc->pNativeData))); } + pNext = extendedProperties->pNext; } } - // TODO: How to pass this up in the urQueueCreateWithNativeHandle interface? - int32_t NativeHandleDesc = 0; - // Get the device handle from first device in the platform // Maybe this is not completely correct. uint32_t NumEntries = 1; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp index bf32fdd9367d0..e7330bd5078b8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_sampler.cpp @@ -161,11 +161,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( ur_native_handle_t NativeSampler, ///< [in] the native handle of the sampler. ur_context_handle_t Context, ///< [in] handle of the context object + const ur_sampler_native_properties_t + *Properties, ///< [in][optional] pointer to native sampler properties + ///< struct. ur_sampler_handle_t *Sampler ///< [out] pointer to the handle of the ///< sampler object created. 
) { std::ignore = NativeSampler; std::ignore = Context; + std::ignore = Properties; std::ignore = Sampler; urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index 2a5effb541ad7..0b0cc51c845d9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -532,12 +532,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolDestroy( - ur_context_handle_t Context, ///< [in] handle of the context object - ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +ur_result_t +urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool +) { + std::ignore = Pool; + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t urUSMPoolGetInfo( + ur_usm_pool_handle_t Pool, ///< [in] handle of the USM memory pool + ur_usm_pool_info_t PropName, ///< [in] name of the pool property to query + size_t PropSize, ///< [in] size in bytes of the pool property value provided + void *PropValue, ///< [out][typename(propName, propSize)] value of the pool + ///< property + size_t *PropSizeRet ///< [out] size in bytes returned in pool property value ) { - std::ignore = Context; std::ignore = Pool; + std::ignore = PropName; + std::ignore = PropSize; + std::ignore = PropValue; + std::ignore = PropSizeRet; urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp index 0d37c805bfb2b..0e2c5bc85bf71 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_loader_interface.cpp @@ -272,7 +272,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMProcAddrTable( pDdiTable->pfnFree = urUSMFree; pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; pDdiTable->pfnPoolCreate = urUSMPoolCreate; - pDdiTable->pfnPoolDestroy = urUSMPoolDestroy; + pDdiTable->pfnPoolRetain = urUSMPoolRetain; + pDdiTable->pfnPoolRelease = urUSMPoolRelease; + pDdiTable->pfnPoolGetInfo = urUSMPoolGetInfo; return retVal; } From 7ce01a783dc05bff1bbf96112e4ce8b33cac8189 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Tue, 23 May 2023 10:42:58 -0700 Subject: [PATCH 48/50] Port [SYCL] Properly install UR libraries https://github.com/intel/llvm/pull/9555 Signed-off-by: Jaime Arteaga --- sycl/plugins/unified_runtime/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c912025f2991b..a7b8a1e8e8b31 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -130,7 +130,7 @@ if (TARGET UnifiedRuntimeLoader) # TODO: this is piggy-backing 
on the existing target component level-zero-sycl-dev # When UR is moved to its separate repo perhaps we should introduce new component, # e.g. unified-runtime-sycl-dev. - install(TARGETS loader + install(TARGETS ur_loader LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT level-zero-sycl-dev RUNTIME DESTINATION "bin" COMPONENT level-zero-sycl-dev From 7a3deca2db1cf96538ba67db53ebc5a1221b5828 Mon Sep 17 00:00:00 2001 From: Brandon Yates Date: Tue, 23 May 2023 18:07:45 -0400 Subject: [PATCH 49/50] Add implementation of USM pools (#11) Signed-off-by: Brandon Yates --- .../level_zero/ur_level_zero_device.cpp | 2 +- .../adapters/level_zero/ur_level_zero_usm.cpp | 135 +++++++++++++----- .../adapters/level_zero/ur_level_zero_usm.hpp | 17 +++ 3 files changed, 117 insertions(+), 37 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp index dc9f6a9f7069d..7b95bb9bf5b1a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_device.cpp @@ -573,7 +573,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { auto MapCaps = [](const ze_memory_access_cap_flags_t &ZeCapabilities) { - uint64_t Capabilities = 0; + ur_device_usm_access_capability_flags_t Capabilities = 0; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_RW) Capabilities |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; if (ZeCapabilities & ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp index 0b0cc51c845d9..9f215d06d85a8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.cpp @@ -24,17 +24,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM host memory object ) { - std::ignore = Pool; - uint32_t Align = USMDesc->align; + uint32_t Align = USMDesc ? USMDesc->align : 0; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Align > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - ur_platform_handle_t Plt = Context->getPlatform(); // If indirect access tracking is enabled then lock the mutex which is // guarding contexts container in the platform. This prevents new kernels from @@ -77,7 +73,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( // find the allocator depending on context as we do for Shared and Device // allocations. 
try { - *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + if (Pool) { + *RetMem = Pool->HostMemPool->allocate(Size, Align); + } else { + *RetMem = Context->HostMemAllocContext->allocate(Size, Align); + } if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -105,18 +105,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM device memory object ) { - std::ignore = Pool; - uint32_t Alignment = USMDesc->align; + uint32_t Alignment = USMDesc ? USMDesc->align : 0; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. if (Alignment > 65536) return UR_RESULT_ERROR_INVALID_VALUE; - const ur_usm_advice_flags_t *USMHintFlags = &USMDesc->hints; - std::ignore = USMHintFlags; - ur_platform_handle_t Plt = Device->Platform; // If indirect access tracking is enabled then lock the mutex which is @@ -157,11 +153,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( } try { - auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); - if (It == Context->DeviceMemAllocContexts.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - *RetMem = It->second.allocate(Size, Alignment); + if (Pool) { + *RetMem = Pool->DeviceMemPools[Device]->allocate(Size, Alignment); + } else { + auto It = Context->DeviceMemAllocContexts.find(Device->ZeDevice); + if (It == Context->DeviceMemAllocContexts.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + } if (IndirectAccessTrackingEnabled) { // Keep track of all memory allocations in the context Context->MemAllocs.emplace(std::piecewise_construct, @@ -190,9 +191,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( Size, ///< [in] size in bytes of the USM memory object to be allocated void **RetMem ///< [out] pointer to USM shared memory object ) { - std::ignore = Pool; - uint32_t Alignment = USMDesc->align; + uint32_t Alignment = USMDesc ? USMDesc->align : 0; ur_usm_host_mem_flags_t UsmHostFlags{}; @@ -200,7 +200,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( bool DeviceReadOnly = false; ur_usm_device_mem_flags_t UsmDeviceFlags{}; - void *pNext = const_cast(USMDesc->pNext); + void *pNext = USMDesc ? const_cast(USMDesc->pNext) : nullptr; while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = reinterpret_cast(pNext); @@ -259,13 +259,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( } try { - auto &Allocator = (DeviceReadOnly ? Context->SharedReadOnlyMemAllocContexts - : Context->SharedMemAllocContexts); - auto It = Allocator.find(Device->ZeDevice); - if (It == Allocator.end()) - return UR_RESULT_ERROR_INVALID_VALUE; - - *RetMem = It->second.allocate(Size, Alignment); + if (Pool) { + if (DeviceReadOnly) { + *RetMem = + Pool->SharedMemReadOnlyPools[Device]->allocate(Size, Alignment); + } else { + *RetMem = Pool->SharedMemPools[Device]->allocate(Size, Alignment); + } + } else { + auto &Allocator = + (DeviceReadOnly ? 
Context->SharedReadOnlyMemAllocContexts + : Context->SharedMemAllocContexts); + auto It = Allocator.find(Device->ZeDevice); + if (It == Allocator.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + + *RetMem = It->second.allocate(Size, Alignment); + } if (DeviceReadOnly) { Context->SharedReadOnlyAllocs.insert(*RetMem); } @@ -518,6 +528,56 @@ static ur_result_t USMAllocationMakeResident( return UR_RESULT_SUCCESS; } +ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, + ur_usm_pool_desc_t *PoolDesc) { + + zeroInit = static_cast(PoolDesc->flags & + UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK); + + void *pNext = const_cast(PoolDesc->pNext); + while (pNext != nullptr) { + const ur_base_desc_t *BaseDesc = + reinterpret_cast(pNext); + switch (BaseDesc->stype) { + case UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC: { + const ur_usm_pool_limits_desc_t *Limits = + reinterpret_cast(BaseDesc); + for (auto &config : USMAllocatorConfigs.Configs) { + config.MaxPoolableSize = Limits->maxPoolableSize; + config.SlabMinSize = Limits->minDriverAllocSize; + } + break; + } + default: { + urPrint("urUSMPoolCreate: unexpected chained stype\n"); + throw UsmAllocationException(UR_RESULT_ERROR_INVALID_ARGUMENT); + } + } + pNext = const_cast(BaseDesc->pNext); + } + + HostMemPool = std::make_unique( + std::unique_ptr(new USMHostMemoryAlloc(Context)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Host]); + + for (auto device : Context->Devices) { + DeviceMemPools[device] = std::make_unique( + std::unique_ptr( + new USMDeviceMemoryAlloc(Context, device)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Device]); + + SharedMemPools[device] = std::make_unique( + std::unique_ptr( + new USMSharedMemoryAlloc(Context, device)), + this->USMAllocatorConfigs.Configs[usm_settings::MemType::Shared]); + SharedMemReadOnlyPools[device] = std::make_unique( + std::unique_ptr( + new USMSharedMemoryAlloc(Context, device)), + this->USMAllocatorConfigs + .Configs[usm_settings::MemType::SharedReadOnly]); + } +} + UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( ur_context_handle_t Context, ///< [in] handle of the context object ur_usm_pool_desc_t @@ -525,27 +585,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPoolCreate( ///< ::ur_usm_pool_limits_desc_t ur_usm_pool_handle_t *Pool ///< [out] pointer to USM memory pool ) { - std::ignore = Context; - std::ignore = PoolDesc; - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + try { + *Pool = reinterpret_cast( + new ur_usm_pool_handle_t_(Context, PoolDesc)); + } catch (const UsmAllocationException &Ex) { + return Ex.getError(); + } + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolRetain(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool ) { - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + Pool->RefCount.increment(); + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolRelease(ur_usm_pool_handle_t Pool ///< [in] pointer to USM memory pool ) { - std::ignore = Pool; - urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + if (Pool->RefCount.decrementAndTest()) { + delete Pool; + } + return UR_RESULT_SUCCESS; } ur_result_t urUSMPoolGetInfo( diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp index 
ba0130089906e..a53b6d35712f9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_usm.hpp @@ -9,6 +9,23 @@ #include "ur_level_zero_common.hpp" +struct ur_usm_pool_handle_t_ : _ur_object { + bool zeroInit; + + usm_settings::USMAllocatorConfig USMAllocatorConfigs; + + std::unique_ptr HostMemPool; + std::unordered_map> + SharedMemPools; + std::unordered_map> + SharedMemReadOnlyPools; + std::unordered_map> + DeviceMemPools; + + ur_usm_pool_handle_t_(ur_context_handle_t Context, + ur_usm_pool_desc_t *PoolDesc); +}; + // Exception type to pass allocation errors class UsmAllocationException { const ur_result_t Error; From 8b2170d5847a6ab2c958216832abb8869e2184d8 Mon Sep 17 00:00:00 2001 From: Jaime Arteaga Date: Fri, 26 May 2023 07:03:01 -0700 Subject: [PATCH 50/50] Port [SYCL][L0] Check if ZE call count had started https://github.com/intel/llvm/pull/9610 Signed-off-by: Jaime Arteaga --- .../ur/adapters/level_zero/ur_level_zero_platform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp index 61ef0f98b5683..db7570d795b3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_level_zero_platform.cpp @@ -34,7 +34,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urTearDown( // Print the balance of various create/destroy native calls. // The idea is to verify if the number of create(+) and destroy(-) calls are // matched. - if (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) { + if (ZeCallCount && (UrL0Debug & UR_L0_DEBUG_CALL_COUNT) != 0) { // clang-format off // // The format of this table is such that each row accounts for a
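To show how the pool support implemented in the last two patches is meant to be driven from the UR API, here is a minimal usage sketch, not part of the patch. It assumes the entry points defined above (urUSMPoolCreate, urUSMDeviceAlloc with a pool argument, urUSMPoolRelease, urUSMFree) plus existing Context and Device handles; the limit values and 64-byte alignment are arbitrary illustrative numbers.

#include <ur_api.h>

// Sketch only: create a USM pool with limits and allocate/free device memory
// through it.
ur_result_t PoolExample(ur_context_handle_t Context, ur_device_handle_t Device) {
  // Optional limits, consumed by the ur_usm_pool_handle_t_ constructor above.
  ur_usm_pool_limits_desc_t Limits{};
  Limits.stype = UR_STRUCTURE_TYPE_USM_POOL_LIMITS_DESC;
  Limits.maxPoolableSize = 2 * 1024 * 1024; // pool requests up to 2 MiB
  Limits.minDriverAllocSize = 64 * 1024;    // back the pool with >= 64 KiB slabs

  ur_usm_pool_desc_t PoolDesc{};
  PoolDesc.pNext = &Limits;
  PoolDesc.flags = UR_USM_POOL_FLAG_ZERO_INITIALIZE_BLOCK;

  ur_usm_pool_handle_t Pool{};
  ur_result_t Res = urUSMPoolCreate(Context, &PoolDesc, &Pool);
  if (Res != UR_RESULT_SUCCESS)
    return Res;

  // 1 KiB device allocation served from the pool, 64-byte aligned.
  ur_usm_desc_t USMDesc{};
  USMDesc.align = 64;

  void *Mem = nullptr;
  Res = urUSMDeviceAlloc(Context, Device, &USMDesc, Pool, 1024, &Mem);
  if (Res == UR_RESULT_SUCCESS)
    Res = urUSMFree(Context, Mem);

  urUSMPoolRelease(Pool); // pool is deleted once its reference count reaches zero
  return Res;
}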