From da707813513b3fa2e3b4da843c5cbfa474bc8deb Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 4 Apr 2023 09:57:35 +0100 Subject: [PATCH 01/45] [SYCL][CUDA] Export loader interface for CUDA UR adapter [UR] add ur_adapter_cuda target [UR] add license --- sycl/plugins/cuda/CMakeLists.txt | 11 + sycl/plugins/unified_runtime/CMakeLists.txt | 15 + .../ur/adapters/cuda/ur_interface_loader.cpp | 257 ++++++++++++++++++ 3 files changed, 283 insertions(+) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index d3e742267af34..7df7f549c9981 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -48,6 +48,15 @@ endif() add_sycl_plugin(cuda SOURCES + # Some code is shared with the UR adapter + "../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" + "../unified_runtime/ur/usm_allocator.cpp" + "../unified_runtime/ur/usm_allocator.hpp" + "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" @@ -57,9 +66,11 @@ add_sycl_plugin(cuda INCLUDE_DIRS ${sycl_inc_dir} ${XPTI_INCLUDE} + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime LIBRARIES cudadrv ${XPTI_LIBS} + UnifiedRuntime-Headers HEADER "${CMAKE_CURRENT_SOURCE_DIR}/include/features.hpp" ) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index e829d012e55b4..6ed2b57fcd4ce 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -123,6 +123,21 @@ set_target_properties("ur_adapter_level_zero" PROPERTIES SOVERSION "0" ) +# Build CUDA adapter +add_sycl_library("ur_adapter_cuda" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + # --- + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads +) if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp new file mode 100644 index 0000000000000..9446515bd435e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -0,0 +1,257 @@ +//===--------- ur_interface_loader.cpp - Unified Runtime ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce loader and adapter must have same version. + // Post 1.0 only major version match should be required. 
+ if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +#if defined(__cplusplus) +extern "C" { +#endif + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = nullptr; + pDdiTable->pfnGetApiVersion = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetExtendedDeleter = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetProfilingInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnWait = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = nullptr; + pDdiTable->pfnCompile = nullptr; + pDdiTable->pfnCreateWithBinary = nullptr; + pDdiTable->pfnCreateWithIL = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnLink = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetGroupInfo = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetSubGroupInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSetArgLocal = nullptr; + pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgPointer = nullptr; + pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgValue = nullptr; + pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + 
ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferPartition = nullptr; + pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; + pDdiTable->pfnImageCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnImageCreate = nullptr; + pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnEventsWait = nullptr; + pDdiTable->pfnEventsWaitWithBarrier = nullptr; + pDdiTable->pfnKernelLaunch = nullptr; + pDdiTable->pfnMemBufferCopy = nullptr; + pDdiTable->pfnMemBufferCopyRect = nullptr; + pDdiTable->pfnMemBufferFill = nullptr; + pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWriteRect = nullptr; + pDdiTable->pfnMemImageCopy = nullptr; + pDdiTable->pfnMemImageRead = nullptr; + pDdiTable->pfnMemImageWrite = nullptr; + pDdiTable->pfnMemUnmap = nullptr; + pDdiTable->pfnUSMFill2D = nullptr; + pDdiTable->pfnUSMFill = nullptr; + pDdiTable->pfnUSMAdvise = nullptr; + pDdiTable->pfnUSMMemcpy2D = nullptr; + pDdiTable->pfnUSMMemcpy = nullptr; + pDdiTable->pfnUSMPrefetch = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnInit = nullptr; + pDdiTable->pfnTearDown = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnFinish = nullptr; + pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + 
pDdiTable->pfnDeviceAlloc = nullptr; + pDdiTable->pfnFree = nullptr; + pDdiTable->pfnGetMemAllocInfo = nullptr; + pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnPoolCreate = nullptr; + pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnSharedAlloc = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = nullptr; + pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnPartition = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnSelectBinary = nullptr; + return UR_RESULT_SUCCESS; +} + +#if defined(__cplusplus) +} // extern "C" +#endif From 46dca606721ee9fdd22321e17a90d77bdfa497c5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 6 Apr 2023 12:36:32 +0100 Subject: [PATCH 02/45] [SYCL][PI][UR][CUDA] Port CUDA platform, device, context to Unified Runtime --- sycl/plugins/cuda/CMakeLists.txt | 8 + sycl/plugins/cuda/pi_cuda.cpp | 1491 +---------------- sycl/plugins/cuda/pi_cuda.hpp | 135 +- sycl/plugins/unified_runtime/CMakeLists.txt | 11 +- sycl/plugins/unified_runtime/pi2ur.hpp | 4 + .../ur/adapters/cuda/common.cpp | 87 + .../ur/adapters/cuda/common.hpp | 51 + .../ur/adapters/cuda/context.cpp | 151 ++ .../ur/adapters/cuda/context.hpp | 108 ++ .../ur/adapters/cuda/device.cpp | 1119 +++++++++++++ .../ur/adapters/cuda/device.hpp | 59 + .../ur/adapters/cuda/platform.cpp | 174 ++ .../ur/adapters/cuda/platform.hpp | 15 + .../ur/adapters/cuda/ur_interface_loader.cpp | 37 +- sycl/plugins/unified_runtime/ur/ur.hpp | 15 +- sycl/unittests/pi/cuda/CMakeLists.txt | 2 + 16 files changed, 1876 insertions(+), 1591 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 7df7f549c9981..e4fa949eca8e9 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -55,6 +55,14 @@ add_sycl_plugin(cuda "../unified_runtime/ur/ur.cpp" "../unified_runtime/ur/usm_allocator.cpp" "../unified_runtime/ur/usm_allocator.hpp" + "../unified_runtime/ur/adapters/cuda/common.cpp" + "../unified_runtime/ur/adapters/cuda/common.hpp" + "../unified_runtime/ur/adapters/cuda/context.cpp" + "../unified_runtime/ur/adapters/cuda/context.hpp" + "../unified_runtime/ur/adapters/cuda/device.cpp" + "../unified_runtime/ur/adapters/cuda/device.hpp" + "../unified_runtime/ur/adapters/cuda/platform.cpp" + "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index dd68c196e94c1..b1183b662b137 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -33,16 +33,6 @@ void enableCUDATracing(); void disableCUDATracing(); namespace { -std::string getCudaVersionString() { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - // The version is returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "CUDA " << driver_version / 1000 << "." - << driver_version % 1000 / 10; - return stream.str(); -} - pi_result map_error(CUresult result) { switch (result) { case CUDA_SUCCESS: @@ -185,55 +175,13 @@ pi_result check_error(CUresult result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -/// ScopedContext is used across all PI CUDA plugin implementation to ensure -/// that the proper CUDA context is active for the given PI context. -// -/// This class will only replace the context if necessary, and will leave the -/// new context active on the current thread. If there was an active context -/// already it will simply be replaced. -// -/// Previously active contexts are not restored for two reasons: -/// * Performance: context switches are expensive so leaving the context active -/// means subsequent SYCL calls with the same context will be cheaper. -/// * Multi-threading cleanup: contexts are set active per thread and deleting a -/// context will only deactivate it for the current thread. This means other -/// threads may end up with deleted active contexts. In particular this can -/// happen with host_tasks as they run in a thread pool. When the context -/// associated with these tasks is deleted it will remain active in the -/// threads of the thread pool. So it would be invalid for any other task -/// running on these threads to try to restore the deleted context. With the -/// current implementation this is not an issue because the active deleted -/// context will just be replaced. -// -/// This approach does mean that CUDA interop tasks should NOT expect their -/// contexts to be restored by SYCL. 
-class ScopedContext { -public: - ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); +ScopedContext::ScopedContext(pi_context ctxt) { + if (!ctxt) { + throw PI_ERROR_INVALID_CONTEXT; } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } - - ~ScopedContext() {} - -private: - void set_context(CUcontext desired) { - CUcontext original = nullptr; - - PI_CHECK_ERROR(cuCtxGetCurrent(&original)); - - // Make sure the desired context is active on the current thread, setting - // it if necessary - if (original != desired) { - PI_CHECK_ERROR(cuCtxSetCurrent(desired)); - } - } -}; + set_context(ctxt->get()); +} /// \cond NODOXY template @@ -648,7 +596,7 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, if (queue_ != nullptr) { cuda_piQueueRetain(queue_); } - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_event::_pi_event(pi_context context, CUevent eventNative) @@ -657,14 +605,14 @@ _pi_event::_pi_event(pi_context context, CUevent eventNative) streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{context} { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } _pi_event::~_pi_event() { if (queue_ != nullptr) { cuda_piQueueRelease(queue_); } - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_result _pi_event::start() { @@ -702,14 +650,6 @@ bool _pi_event::is_completed() const noexcept { return true; } -pi_uint64 _pi_device::get_elapsed_time(CUevent ev) const { - float miliSeconds = 0.0f; - - PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); - - return static_cast(miliSeconds * 1.0e6); -} - pi_uint64 _pi_event::get_queued_time() const { assert(is_started()); return queue_->get_device()->get_elapsed_time(evQueued_); @@ -797,10 +737,10 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { _pi_program::_pi_program(pi_context ctxt) : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } -_pi_program::~_pi_program() { cuda_piContextRelease(context_); } +_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } std::pair splitMetadataName(const std::string &metadataName) { @@ -917,201 +857,6 @@ std::string getKernelNames(pi_program) { //-- PI API implementation extern "C" { - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); - -/// Obtains the CUDA platform. -/// There is only one CUDA platform, and contains all devices on the system. -/// Triggers the CUDA Driver initialization (cuInit) the first time, so this -/// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
-/// -pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { - - try { - static std::once_flag initFlag; - static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; - - if (num_entries == 0 && platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (platforms == nullptr && num_platforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - pi_result err = PI_SUCCESS; - - std::call_once( - initFlag, - [](pi_result &err) { - if (cuInit(0) != CUDA_SUCCESS) { - numPlatforms = 0; - return; - } - int numDevices = 0; - err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; - return; - } - try { - // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); - CUcontext context; - err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); - - ScopedContext active(context); - CUevent evBase; - err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); - - // Use default stream to record base event counter - err = PI_CHECK_ERROR(cuEventRecord(evBase, 0)); - - platformIds[i].devices_.emplace_back( - new _pi_device{device, context, evBase, &platformIds[i]}); - - { - const auto &dev = platformIds[i].devices_.back().get(); - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - pi_result retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - assert(retError == PI_SUCCESS); - (void)retError; - - retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - assert(retError == PI_SUCCESS); - - dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - dev->save_max_work_group_size(maxWorkGroupSize); - } - } - } catch (const std::bad_alloc &) { - // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - err = PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - throw; - } - }, - err); - - if (num_platforms != nullptr) { - *num_platforms = numPlatforms; - } - - if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piPlatformGetInfo([[maybe_unused]] pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(platform != nullptr); - - switch (param_name) { - case PI_PLATFORM_INFO_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA CUDA BACKEND"); - case PI_PLATFORM_INFO_VENDOR: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - case PI_PLATFORM_INFO_PROFILE: - return getInfo(param_value_size, param_value, param_value_size_ret, - "FULL PROFILE"); - case PI_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_PLATFORM_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_EXT_PLATFORM_INFO_BACKEND: { - return getInfo(param_value_size, param_value, - param_value_size_ret, - PI_EXT_PLATFORM_BACKEND_CUDA); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Platform info request not implemented"); - return {}; -} - -/// \param devices List of devices available on the system -/// \param num_devices Number of elements in the list of devices -/// Requesting a non-GPU device triggers an error, all PI CUDA devices -/// are GPUs. -/// -pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type, - pi_uint32 num_entries, pi_device *devices, - pi_uint32 *num_devices) { - - pi_result err = PI_SUCCESS; - const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForGPU; - - size_t numDevices = returnDevices ? platform->devices_.size() : 0; - - try { - if (num_devices) { - *num_devices = numDevices; - } - - if (returnDevices && devices) { - for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) { - devices[i] = platform->devices_[i].get(); - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -/// \return PI_SUCCESS if the function is executed successfully -/// CUDA devices are always root devices so retain always returns success. -pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; } - pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { @@ -1150,27 +895,6 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, return PI_ERROR_OUT_OF_RESOURCES; } -pi_result cuda_piContextRetain(pi_context context) { - assert(context != nullptr); - assert(context->get_reference_count() > 0); - - context->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piextContextSetExtendedDeleter( - pi_context context, pi_context_extended_deleter function, void *user_data) { - context->set_extended_deleter(function, user_data); - return PI_SUCCESS; -} - -/// Not applicable to CUDA, devices cannot be partitioned. 
-pi_result cuda_piDevicePartition(pi_device, - const pi_device_partition_property *, - pi_uint32, pi_device *, pi_uint32 *) { - return {}; -} - /// \return If available, the first binary that is PTX /// pi_result cuda_piextDeviceSelectBinary(pi_device device, @@ -1224,1155 +948,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// \return PI_SUCCESS always since CUDA devices are always root devices. -/// -pi_result cuda_piDeviceRelease(pi_device) { return PI_SUCCESS; } - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - - assert(device != nullptr); - - ScopedContext active(device->get_context()); - - switch (param_name) { - case PI_DEVICE_INFO_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_TYPE_GPU); - } - case PI_DEVICE_INFO_VENDOR_ID: { - return getInfo(param_value_size, param_value, param_value_size_ret, 4318u); - } - case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&compute_units, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(compute_units >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(compute_units)); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - max_work_item_dimensions); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - size_t return_sizes[max_work_item_dimensions]; - - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - size_t return_sizes[max_work_item_dimensions]; - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; - sycl::detail::pi::assertion( - 
cuDeviceGetAttribute(&max_work_group_size, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion(max_work_group_size >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_work_group_size)); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_threads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - bool ifp = (major >= 7); - return getInfo(param_value_size, param_value, param_value_size_ret, ifp); - } - - case PI_DEVICE_INFO_ATOMIC_64: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool atomic64 = (major >= 6) ? 
true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - atomic64); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - pi_memory_order_capabilities capabilities = - (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE | - PI_MEMORY_SCOPE_SYSTEM - : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence_order_capabilities. - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence/memory_scope_capabilities. - // Because scopes are hierarchical, wider scopes support all narrower - // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and - // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - pi_memory_scope_capabilities capabilities = PI_MEMORY_SCOPE_WORK_ITEM | - PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool bfloat16 = (major >= 8) ? true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - bfloat16); - } - case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - size_t sizes[1] = {static_cast(warpSize)}; - return getInfoArray(1, param_value_size, param_value, - param_value_size_ret, sizes); - } - case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(clock_freq >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(clock_freq) / 1000u); - } - case PI_DEVICE_INFO_ADDRESS_BITS: { - auto bits = pi_uint32{std::numeric_limits::digits}; - return getInfo(param_value_size, param_value, param_value_size_ret, bits); - } - case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { - // Max size of memory object allocation in bytes. 
- // The minimum value is max(min(1024 × 1024 × - // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), - // 32 × 1024 × 1024) for devices that are not of type - // CL_DEVICE_TYPE_CUSTOM. - - size_t global = 0; - sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) == - CUDA_SUCCESS); - - auto quarter_global = static_cast(global / 4u); - - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{max_alloc}); - } - case PI_DEVICE_INFO_IMAGE_SUPPORT: { - pi_bool enabled = PI_FALSE; - - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - enabled = PI_TRUE; - } else { - sycl::detail::pi::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " - "runtime."); - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - enabled); - } - case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. 
- int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int tex_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_depth >= 0); - int surf_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_depth >= 0); - - int min = std::min(tex_depth, surf_depth); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(0)); - } - case PI_DEVICE_INFO_MAX_SAMPLERS: { - // This call is kind of meaningless for cuda, as samplers don't exist. - // Closest thing is textures, which is 128. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters - // __global__ function parameters are passed to the device via constant - // memory and are limited to 4 KB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{4000u}); - } - case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&mem_base_addr_align, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - device->get()) == CUDA_SUCCESS); - // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, - mem_base_addr_align); - } - case PI_DEVICE_INFO_HALF_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | - PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { - // The value is documented for all existing GPUs in the CUDA programming - // guidelines, section "H.3.2. Global Memory". - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(cache_size >= 0); - // The L2 cache is global to the GPU. - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(cache_size)); - } - case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) == - CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{bytes}); - } - case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(constant_memory >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(constant_memory)); - } - case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: { - // TODO: is there a way to retrieve this from CUDA driver API? - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, 9u); - } - case PI_DEVICE_INFO_LOCAL_MEM_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); - } - case PI_DEVICE_INFO_LOCAL_MEM_SIZE: { - // OpenCL's "local memory" maps most closely to CUDA's "shared memory". 
- // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int local_mem_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&local_mem_size, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(local_mem_size >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(local_mem_size)); - } - case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1000u}); - } - case PI_DEVICE_INFO_ENDIAN_LITTLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_COMPILER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_LINKER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE | - PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_BUILT_IN_KERNELS: { - // An empty string is returned if no built-in kernels are supported by the - // device. 
- return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_PLATFORM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_platform()); - } - case PI_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::pi::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == CUDA_SUCCESS); - return getInfoArray(strlen(name) + 1, param_value_size, param_value, - param_value_size_ret, name); - } - case PI_DEVICE_INFO_VENDOR: { - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - } - case PI_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_DEVICE_INFO_PROFILE: { - return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA"); - } - case PI_DEVICE_INFO_REFERENCE_COUNT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_reference_count()); - } - case PI_DEVICE_INFO_VERSION: { - std::stringstream s; - int major; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - s << major; - - int minor; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - s << "." << minor; - return getInfo(param_value_size, param_value, param_value_size_ret, - s.str().c_str()); - } - case PI_DEVICE_INFO_OPENCL_C_VERSION: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_EXTENSIONS: { - - std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; - SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT; - SupportedExtensions += " "; - - int major = 0; - int minor = 0; - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - if ((major >= 6) || ((major == 5) && (minor >= 3))) { - SupportedExtensions += "cl_khr_fp16 "; - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - SupportedExtensions.c_str()); - } - case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { - // The minimum value for the FULL profile is 1 MB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1024u}); - } - case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_PARENT_DEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - nullptr); - } - case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_PROPERTIES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - - // Intel USM extensions - - case PI_DEVICE_INFO_USM_HOST_SUPPORT: { - // from cl_intel_unified_shared_memory: "The host memory access capabilities - // apply to any host allocation." - // - // query if/how the device can access page-locked host memory, possibly - // through PCIe, using the same pointer as the host - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The device memory access capabilities apply to any device allocation - // associated with this device." - // - // query how the device can access memory allocated on the device itself (?) - pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | - PI_USM_CONCURRENT_ATOMIC_ACCESS; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The single device shared memory access capabilities apply to any shared - // allocation associated with this device." 
- // - // query if/how the device can access managed memory associated to it - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // the device can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The cross-device shared memory access capabilities apply to any shared - // allocation associated with this device, or to any shared memory - // allocation on another device that also supports the same cross-device - // shared memory access capability." - // - // query if/how the device can access managed memory associated to other - // devices - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value |= PI_USM_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - // attribute can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - if (value & PI_USM_ACCESS) - value |= PI_USM_ATOMIC_ACCESS; - if (value & PI_USM_CONCURRENT_ACCESS) - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The shared system memory access capabilities apply to any allocations - // made by a system allocator, such as malloc or new." 
- // - // query if/how the device can access pageable host memory allocated by the - // system allocator - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { - // the device suppports coherently accessing pageable memory without - // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(device, - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { - // the link between the device and the host supports native atomic - // operations - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // the link between the device and the host does not support native - // atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: { - int value = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_BACKEND_VERSION: { - int major = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int minor = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return getInfo(param_value_size, param_value, param_value_size_ret, - result.c_str()); - } - - case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; - sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); - return getInfo(param_value_size, param_value, param_value_size_ret, - FreeMemory); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - // Convert kilohertz to megahertz when returning. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - value / 1000); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_int32{1}); - } - - case PI_DEVICE_INFO_DEVICE_ID: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - - case PI_DEVICE_INFO_UUID: { - CUuuid uuid; -#if (CUDA_VERSION >= 11040) - sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == - CUDA_SUCCESS); -#else - sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) == - CUDA_SUCCESS); -#endif - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return getInfoArray(16, param_value_size, param_value, param_value_size_ret, - name.data()); - } - - case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - int minor = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - // Some specific devices seem to need special handling. See reference - // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool is_xavier_agx = major == 7 && minor == 2; - bool is_orin_agx = major == 8 && minor == 7; - - int memory_clock_khz = 0; - if (is_xavier_agx) { - memory_clock_khz = 2133000; - } else if (is_orin_agx) { - memory_clock_khz = 3200000; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_clock_khz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - } - - int memory_bus_width = 0; - if (is_orin_agx) { - memory_bus_width = 256; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_bus_width, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - } - - uint64_t memory_bandwidth = - uint64_t(memory_clock_khz) * memory_bus_width * 250; - - return getInfo(param_value_size, param_value, param_value_size_ret, - memory_bandwidth); - } - case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: { - // The mem-channel buffer property is not supported on CUDA devices. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - case PI_DEVICE_INFO_IMAGE_SRGB: { - // The sRGB images are not supported on CUDA. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - - case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { - // Maximum number of 32-bit registers available to a thread block. - // Note: This number is shared by all thread blocks simultaneously resident - // on a multiprocessor. 
- int max_registers{-1}; - PI_CHECK_ERROR(cuDeviceGetAttribute( - &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); - - sycl::detail::pi::assertion(max_registers >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(max_registers)); - } - - case PI_DEVICE_INFO_PCI_ADDRESS: { - constexpr size_t AddressBufferSize = 13; - char AddressBuffer[AddressBufferSize]; - sycl::detail::pi::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == - CUDA_SUCCESS); - // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == - 12); - return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, - param_value_size, param_value, param_value_size_ret, - AddressBuffer); - } - // TODO: Investigate if this information is available on CUDA. - case PI_DEVICE_INFO_GPU_EU_COUNT: - case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH: - case PI_DEVICE_INFO_GPU_SLICES: - case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: - case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: - case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU: - return PI_ERROR_INVALID_VALUE; - - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Device info request not implemented"); - return {}; -} - -/// Gets the native CUDA handle of a PI device object -/// -/// \param[in] device The PI device to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI device object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextDeviceGetNativeHandle(pi_device device, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(device->get()); - return PI_SUCCESS; -} - -/// Created a PI device object from a CUDA device handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI device object from. -/// \param[in] platform is the PI platform of the device. -/// \param[out] device Set to the PI device object created from native handle. 
-/// -/// \return TBD -pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_platform platform, - pi_device *piDevice) { - assert(piDevice != nullptr); - - CUdevice cu_device = static_cast(nativeHandle); - - auto is_device = [=](std::unique_ptr<_pi_device> &dev) { - return dev->get() == cu_device; - }; - - // If a platform is provided just check if the device is in it - if (platform) { - auto search_res = std::find_if(begin(platform->devices_), - end(platform->devices_), is_device); - if (search_res != end(platform->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // Get list of platforms - pi_uint32 num_platforms; - pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms); - if (result != PI_SUCCESS) - return result; - - pi_platform *plat = - static_cast(malloc(num_platforms * sizeof(pi_platform))); - result = cuda_piPlatformsGet(num_platforms, plat, nullptr); - if (result != PI_SUCCESS) - return result; - - // Iterate through platforms to find device that matches nativeHandle - for (pi_uint32 j = 0; j < num_platforms; ++j) { - auto search_res = std::find_if(begin(plat[j]->devices_), - end(plat[j]->devices_), is_device); - if (search_res != end(plat[j]->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // If the provided nativeHandle cannot be matched to an - // existing device return error - return PI_ERROR_INVALID_OPERATION; -} - -/* Context APIs */ - -/// Create a PI CUDA context. -/// -/// By default creates a scoped context and keeps the last active CUDA context -/// on top of the CUDA context stack. -/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of -/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context -/// stack. -/// -/// \param[in] properties 0 terminated array of key/id-value combinations. Can -/// be nullptr. Only accepts property key/id -/// __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY with a pi_bool value. -/// \param[in] num_devices Number of devices to create the context for. -/// \param[in] devices Devices to create the context for. -/// \param[in] pfn_notify Callback, currently unused. -/// \param[in] user_data User data for callback. -/// \param[out] retcontext Set to created context on success. -/// -/// \return PI_SUCCESS on success, otherwise an error return code. -pi_result cuda_piContextCreate( - [[maybe_unused]] const pi_context_properties *properties, - [[maybe_unused]] pi_uint32 num_devices, const pi_device *devices, - [[maybe_unused]] void (*pfn_notify)(const char *errinfo, - const void *private_info, size_t cb, - void *user_data), - [[maybe_unused]] void *user_data, pi_context *retcontext) { - - assert(devices != nullptr); - // TODO: How to implement context callback? - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_devices == 1); - // Need input context - assert(retcontext != nullptr); - pi_result errcode_ret = PI_SUCCESS; - - std::unique_ptr<_pi_context> piContextPtr{nullptr}; - try { - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); - *retcontext = piContextPtr.release(); - } catch (pi_result err) { - errcode_ret = err; - } catch (...) 
{ - errcode_ret = PI_ERROR_OUT_OF_RESOURCES; - } - return errcode_ret; -} - -pi_result cuda_piContextRelease(pi_context ctxt) { - assert(ctxt != nullptr); - - if (ctxt->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - ctxt->invoke_extended_deleters(); - - std::unique_ptr<_pi_context> context{ctxt}; - - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI context object -/// -/// \param[in] context The PI context to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI context object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextContextGetNativeHandle(pi_context context, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(context->get()); - return PI_SUCCESS; -} - -/// Created a PI context object from a CUDA context handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI context object from. -/// \param[out] context Set to the PI context object created from native handle. -/// -/// \return TBD -pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_uint32 num_devices, - const pi_device *devices, - bool ownNativeHandle, - pi_context *piContext) { - (void)nativeHandle; - (void)num_devices; - (void)devices; - (void)ownNativeHandle; - (void)piContext; - assert(piContext != nullptr); - assert(ownNativeHandle == false); - - return PI_ERROR_INVALID_OPERATION; -} - /// Creates a PI Memory object using a CUDA memory allocation. /// Can trigger a manual copy depending on the mode. /// \TODO Implement USE_HOST_PTR using cuHostRegister @@ -2874,7 +1449,7 @@ pi_result cuda_piextQueueCreateWithNativeHandle( *queue = new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, - context->get_device(), + reinterpret_cast(context->get_device()), properties, flags, /*backend_owns*/ false}; @@ -5389,7 +3964,8 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, pi_uint32 num_events_in_waitlist, const pi_event *events_waitlist, pi_event *event) { - pi_device device = queue->get_context()->get_device(); + pi_device device = + reinterpret_cast(queue->get_context()->get_device()); // Certain cuda devices and Windows do not have support for some Unified // Memory features. 
cuMemPrefetchAsync requires concurrent memory access @@ -5459,7 +4035,8 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || advice == PI_MEM_ADVICE_RESET) { - pi_device device = queue->get_context()->get_device(); + pi_device device = + reinterpret_cast(queue->get_context()->get_device()); if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -5727,10 +4304,12 @@ pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, // the same index std::vector platforms; platforms.resize(device_idx + 1); - result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr); + result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); // get the device from the platform - pi_device device = platforms[device_idx]->devices_[0].get(); + // TODO(ur): Remove cast when this entry point is moved to UR + pi_device device = + reinterpret_cast(platforms[device_idx]->devices_[0].get()); return getInfo(param_value_size, param_value, param_value_size_ret, device); } @@ -5915,28 +4494,28 @@ pi_result piPluginInit(pi_plugin *PluginInit) { (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api); // Platform - _PI_CL(piPlatformsGet, cuda_piPlatformsGet) - _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo) + _PI_CL(piPlatformsGet, pi2ur::piPlatformsGet) + _PI_CL(piPlatformGetInfo, pi2ur::piPlatformGetInfo) // Device - _PI_CL(piDevicesGet, cuda_piDevicesGet) - _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo) - _PI_CL(piDevicePartition, cuda_piDevicePartition) - _PI_CL(piDeviceRetain, cuda_piDeviceRetain) - _PI_CL(piDeviceRelease, cuda_piDeviceRelease) + _PI_CL(piDevicesGet, pi2ur::piDevicesGet) + _PI_CL(piDeviceGetInfo, pi2ur::piDeviceGetInfo) + _PI_CL(piDevicePartition, pi2ur::piDevicePartition) + _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) + _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) _PI_CL(piextDeviceSelectBinary, cuda_piextDeviceSelectBinary) _PI_CL(piextGetDeviceFunctionPointer, cuda_piextGetDeviceFunctionPointer) - _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) + _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, - cuda_piextDeviceCreateWithNativeHandle) + pi2ur::piextDeviceCreateWithNativeHandle) // Context - _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) - _PI_CL(piContextCreate, cuda_piContextCreate) - _PI_CL(piContextGetInfo, cuda_piContextGetInfo) - _PI_CL(piContextRetain, cuda_piContextRetain) - _PI_CL(piContextRelease, cuda_piContextRelease) - _PI_CL(piextContextGetNativeHandle, cuda_piextContextGetNativeHandle) + _PI_CL(piextContextSetExtendedDeleter, pi2ur::piextContextSetExtendedDeleter) + _PI_CL(piContextCreate, pi2ur::piContextCreate) + _PI_CL(piContextGetInfo, pi2ur::piContextGetInfo) + _PI_CL(piContextRetain, pi2ur::piContextRetain) + _PI_CL(piContextRelease, pi2ur::piContextRelease) + _PI_CL(piextContextGetNativeHandle, pi2ur::piextContextGetNativeHandle) _PI_CL(piextContextCreateWithNativeHandle, - cuda_piextContextCreateWithNativeHandle) + pi2ur::piextContextCreateWithNativeHandle) // Queue _PI_CL(piQueueCreate, cuda_piQueueCreate) _PI_CL(piextQueueCreate, cuda_piextQueueCreate) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index a4864cf673392..f6a95ff8d0ab5 100644 
--- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -42,13 +42,16 @@ #include #include +#include +#include +#include + +// Share code between the PI Plugin and UR Adapter +#include + extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piContextRetain(pi_context); -pi_result cuda_piContextRelease(pi_context); -pi_result cuda_piDeviceRelease(pi_device); -pi_result cuda_piDeviceRetain(pi_device); pi_result cuda_piProgramRetain(pi_program); pi_result cuda_piProgramRelease(pi_program); pi_result cuda_piQueueRelease(pi_queue); @@ -71,8 +74,8 @@ using _pi_stream_guard = std::unique_lock; /// available devices since initialization is done /// when devices are used. /// -struct _pi_platform { - std::vector> devices_; +struct _pi_platform : ur_platform_handle_t_ { + using ur_platform_handle_t_::ur_platform_handle_t_; }; /// PI device mapping to a CUdevice. @@ -80,53 +83,8 @@ struct _pi_platform { /// and implements the reference counting semantics since /// CUDA objects are not refcounted. /// -struct _pi_device { -private: - using native_type = CUdevice; - - native_type cuDevice_; - CUcontext cuContext_; - CUevent evBase_; // CUDA event used as base counter - std::atomic_uint32_t refCount_; - pi_platform platform_; - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - size_t max_work_item_sizes[max_work_item_dimensions]; - int max_work_group_size; - -public: - _pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase, - pi_platform platform) - : cuDevice_(cuDevice), cuContext_(cuContext), - evBase_(evBase), refCount_{1}, platform_(platform) {} - - ~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); } - - native_type get() const noexcept { return cuDevice_; }; - - CUcontext get_context() const noexcept { return cuContext_; }; - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_platform get_platform() const noexcept { return platform_; }; - - pi_uint64 get_elapsed_time(CUevent) const; - - void save_max_work_item_sizes(size_t size, - size_t *save_max_work_item_sizes) noexcept { - memcpy(max_work_item_sizes, save_max_work_item_sizes, size); - }; - - void save_max_work_group_size(int value) noexcept { - max_work_group_size = value; - }; - - void get_max_work_item_sizes(size_t ret_size, - size_t *ret_max_work_item_sizes) const noexcept { - memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size); - }; - - int get_max_work_group_size() const noexcept { return max_work_group_size; }; +struct _pi_device : ur_device_handle_t_ { + using ur_device_handle_t_::ur_device_handle_t_; }; /// PI context mapping to a CUDA context object. @@ -167,54 +125,8 @@ struct _pi_device { /// called upon destruction of the PI Context. /// See proposal for details. 
/// -struct _pi_context { - - struct deleter_data { - pi_context_extended_deleter function; - void *user_data; - - void operator()() { function(user_data); } - }; - - using native_type = CUcontext; - - native_type cuContext_; - _pi_device *deviceId_; - std::atomic_uint32_t refCount_; - - _pi_context(_pi_device *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - cuda_piDeviceRetain(deviceId_); - }; - - ~_pi_context() { cuda_piDeviceRelease(deviceId_); } - - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); - } - } - - void set_extended_deleter(pi_context_extended_deleter function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); - } - - pi_device get_device() const noexcept { return deviceId_; } - - native_type get() const noexcept { return cuContext_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - -private: - std::mutex mutex_; - std::vector extended_deleters_; +struct _pi_context : ur_context_handle_t_ { + using ur_context_handle_t_::ur_context_handle_t_; }; /// PI Mem mapping to CUDA memory allocations, both data and texture/surface. @@ -345,7 +257,7 @@ struct _pi_mem { if (is_sub_buffer()) { cuda_piMemRetain(mem_.buffer_mem_.parent_); } else { - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } }; @@ -359,7 +271,7 @@ struct _pi_mem { mem_.surface_mem_.array_ = array; mem_.surface_mem_.surfObj_ = surf; mem_.surface_mem_.imageType_ = image_type; - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); } ~_pi_mem() { @@ -369,7 +281,7 @@ struct _pi_mem { return; } } - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } // TODO: Move as many shared funcs up as possible @@ -444,13 +356,13 @@ struct _pi_queue { num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, flags_(flags), has_ownership_{backend_owns} { - cuda_piContextRetain(context_); - cuda_piDeviceRetain(device_); + pi2ur::piContextRetain(context_); + pi2ur::piDeviceRetain(device_); } ~_pi_queue() { - cuda_piContextRelease(context_); - cuda_piDeviceRelease(device_); + pi2ur::piContextRelease(context_); + pi2ur::piDeviceRelease(device_); } void compute_stream_wait_for_barrier_if_needed(CUstream stream, @@ -917,10 +829,11 @@ struct _pi_kernel { : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, name_{name}, context_{ctxt}, program_{program}, refCount_{1} { cuda_piProgramRetain(program_); - cuda_piContextRetain(context_); + pi2ur::piContextRetain(context_); /// Note: this code assumes that there is only one device per context pi_result retError = cuda_piKernelGetGroupInfo( - this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + this, reinterpret_cast(ctxt->get_device()), + PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); (void)retError; assert(retError == PI_SUCCESS); @@ -928,7 +841,7 @@ struct _pi_kernel { ~_pi_kernel() { cuda_piProgramRelease(program_); - cuda_piContextRelease(context_); + pi2ur::piContextRelease(context_); } pi_program get_program() const noexcept { return program_; } diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 6ed2b57fcd4ce..bec6aed6131c8 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -116,6 +116,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED LevelZeroLoader-Headers LevelZeroLoader Threads::Threads + cudadrv ) set_target_properties("ur_adapter_level_zero" PROPERTIES @@ -130,13 +131,21 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/ur.cpp" "ur/usm_allocator.cpp" "ur/usm_allocator.hpp" + "ur/adapters/cuda/common.cpp" + "ur/adapters/cuda/common.hpp" + "ur/adapters/cuda/context.cpp" + "ur/adapters/cuda/context.hpp" + "ur/adapters/cuda/device.cpp" + "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/platform.cpp" + "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" - # --- INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES UnifiedRuntime-Headers Threads::Threads + cudadrv ) if (TARGET UnifiedRuntimeLoader) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 4ba4104ce6c3a..2408fa452351f 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1017,6 +1017,10 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION; break; } + case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + InfoType = UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP; + break; + } default: return PI_ERROR_UNKNOWN; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp new file mode 100644 index 0000000000000..264d7588f3229 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -0,0 +1,87 @@ +//===--------- common.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" + +#include + +#include + +ur_result_t map_error_ur(CUresult result) { + switch (result) { + case CUDA_SUCCESS: + return UR_RESULT_SUCCESS; + case CUDA_ERROR_NOT_PERMITTED: + return UR_RESULT_ERROR_INVALID_OPERATION; + case CUDA_ERROR_INVALID_CONTEXT: + return UR_RESULT_ERROR_INVALID_CONTEXT; + case CUDA_ERROR_INVALID_DEVICE: + return UR_RESULT_ERROR_INVALID_DEVICE; + case CUDA_ERROR_INVALID_VALUE: + return UR_RESULT_ERROR_INVALID_VALUE; + case CUDA_ERROR_OUT_OF_MEMORY: + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} + +ur_result_t check_error_ur(CUresult result, const char *function, int line, + const char *file) { + if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { + return UR_RESULT_SUCCESS; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *errorString = nullptr; + const char *errorName = nullptr; + cuGetErrorName(result, &errorName); + cuGetErrorString(result, &errorString); + std::stringstream ss; + ss << "\nUR CUDA ERROR:" + << "\n\tValue: " << result + << "\n\tName: " << errorName + << "\n\tDescription: " << errorString + << "\n\tFunction: " << function << "\n\tSource Location: " << file + << ":" << line << "\n" + << std::endl; + std::cerr << ss.str(); + } + + if (std::getenv("PI_CUDA_ABORT") != nullptr) { + std::abort(); + } + + throw map_error_ur(result); +} + +std::string getCudaVersionString() { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "CUDA " << driver_version / 1000 << "." + << driver_version % 1000 / 10; + return stream.str(); +} + +void sycl::detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); +} + +void sycl::detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +void sycl::detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp new file mode 100644 index 0000000000000..16cabc37a2b16 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -0,0 +1,51 @@ +//===--------- common.hpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +ur_result_t map_error_ur(CUresult result); + +/// Converts CUDA error into UR error codes, and outputs error information +/// to stderr. +/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of +/// throwing the error. This is intended for debugging purposes. +/// \return UR_RESULT_SUCCESS if \param result was CUDA_SUCCESS. +/// \throw ur_result_t exception (integer) if input was not success. 
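+/// Typically invoked through the UR_CHECK_ERROR(result) macro defined below,
+/// which fills in the call-site details. A minimal illustrative use (not part
+/// of this patch):
+///   CUcontext current = nullptr;
+///   UR_CHECK_ERROR(cuCtxGetCurrent(&current));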
+/// +ur_result_t check_error_ur(CUresult result, const char *function, int line, + const char *file); + +#define UR_CHECK_ERROR(result) \ + check_error_ur(result, __func__, __LINE__, __FILE__) + +std::string getCudaVersionString(); + +/// ------ Error handling, matching OpenCL plugin semantics. +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +namespace ur { + +// Report error and no return (keeps compiler from printing warnings). +// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. +// +[[noreturn]] void die(const char *Message); + +// Reports error messages +void cuPrint(const char *Message); + +void assertion(bool Condition, const char *Message = nullptr); + +} // namespace ur +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp new file mode 100644 index 0000000000000..a84d4c71c8dd2 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -0,0 +1,151 @@ +//===--------- context.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "context.hpp" + +#include + +/// Create a UR CUDA context. +/// +/// By default creates a scoped context and keeps the last active CUDA context +/// on top of the CUDA context stack. +/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of +/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context +/// stack. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { + UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + assert(DeviceCount == 1); + ur_result_t errcode_ret = UR_RESULT_SUCCESS; + + std::unique_ptr piContextPtr{nullptr}; + try { + piContextPtr = std::unique_ptr( + new ur_context_handle_t_{*phDevices}); + *phContext = piContextPtr.release(); + } catch (ur_result_t err) { + errcode_ret = err; + } catch (...) 
{ + errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return errcode_ret; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t hContext, ur_context_info_t ContextInfoType, + size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + + switch (uint32_t{ContextInfoType}) { + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(1); + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(hContext->get_device()); + case UR_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(hContext->get_reference_count()); + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint32_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hContext->get_device()->get()) == CUDA_SUCCESS); + uint32_t capabilities = + (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(capabilities); + } + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(true); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM operations currently not supported. 
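+    // (The CUDA driver API does expose cuMemsetD2D8/16/32 for pitched fills,
+    // but they are not hooked up in this adapter yet.)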
+ return ReturnValue(false); + + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { + UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (ctxt->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + ctxt->invoke_extended_deleters(); + + std::unique_ptr context{ctxt}; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) { + UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + assert(ctxt->get_reference_count() > 0); + + ctxt->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( + ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeContext = reinterpret_cast(hContext->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t hNativeContext, uint32_t numDevices, + const ur_device_handle_t *phDevices, + const ur_context_native_properties_t *pProperties, + ur_context_handle_t *phContext) { + (void)hNativeContext; + (void)phContext; + + // TODO(ur): Needed for the conformance test to pass, but it may be valid + // to have a null CUDA context + UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_ERROR_INVALID_OPERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( + ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter, + void *pUserData) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hContext->set_extended_deleter(pfnDeleter, pUserData); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp new file mode 100644 index 0000000000000..34575829c318b --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -0,0 +1,108 @@ +//===--------- context.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include + +// We need this declaration temporarily while UR and PI share ScopedContext +class _pi_context; +using pi_context = _pi_context *; + +#include "common.hpp" +#include "device.hpp" + +typedef void (*ur_context_extended_deleter_t)(void *user_data); + +struct ur_context_handle_t_ { + + struct deleter_data { + ur_context_extended_deleter_t function; + void *user_data; + + void operator()() { function(user_data); } + }; + + using native_type = CUcontext; + + native_type cuContext_; + ur_device_handle_t deviceId_; + std::atomic_uint32_t refCount_; + + ur_context_handle_t_(ur_device_handle_t_ *devId) + : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { + urDeviceRetain(deviceId_); + }; + + ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + + void invoke_extended_deleters() { + std::lock_guard guard(mutex_); + for (auto &deleter : extended_deleters_) { + deleter(); + } + } + + void set_extended_deleter(ur_context_extended_deleter_t function, + void *user_data) { + std::lock_guard guard(mutex_); + extended_deleters_.emplace_back(deleter_data{function, user_data}); + } + + ur_device_handle_t get_device() const noexcept { return deviceId_; } + + native_type get() const noexcept { return cuContext_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + +private: + std::mutex mutex_; + std::vector extended_deleters_; +}; + +namespace { +class ScopedContext { +public: + // TODO(ur): Needed for compatibility with PI; once the CUDA PI plugin is + // fully moved over we can drop this constructor + ScopedContext(pi_context ctxt); + + ScopedContext(ur_context_handle_t ctxt) { + if (!ctxt) { + throw UR_RESULT_ERROR_INVALID_CONTEXT; + } + + set_context(ctxt->get()); + } + + ScopedContext(CUcontext ctxt) { set_context(ctxt); } + + ~ScopedContext() {} + +private: + void set_context(CUcontext desired) { + CUcontext original = nullptr; + + UR_CHECK_ERROR(cuCtxGetCurrent(&original)); + + // Make sure the desired context is active on the current thread, setting + // it if necessary + if (original != desired) { + UR_CHECK_ERROR(cuCtxSetCurrent(desired)); + } + } +}; +} // namespace diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp new file mode 100644 index 0000000000000..d0b11b23cc74d --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -0,0 +1,1119 @@ +//===--------- device.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "platform.hpp" + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); + return value; +} + +uint64_t ur_device_handle_t_::get_elapsed_time(CUevent ev) const { + float miliSeconds = 0.0f; + + UR_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); + + return static_cast(miliSeconds * 1.0e6); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, + ur_device_info_t infoType, + size_t propSize, + void *pDeviceInfo, + size_t *pPropSizeRet) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + + static constexpr uint32_t max_work_item_dimensions = 3u; + + ScopedContext active(device->get_context()); + + switch ((uint32_t)infoType) { + case UR_DEVICE_INFO_TYPE: { + return ReturnValue(UR_DEVICE_TYPE_GPU); + } + case UR_DEVICE_INFO_VENDOR_ID: { + return ReturnValue(4318u); + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int compute_units = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&compute_units, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(compute_units >= 0); + return ReturnValue(static_cast(compute_units)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return ReturnValue(max_work_item_dimensions); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t sizes[max_work_item_dimensions]; + } return_sizes; + int max_x = 0, max_y = 0, max_z = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_x >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_y >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(max_z >= 0); + + return_sizes.sizes[0] = size_t(max_x); + return_sizes.sizes[1] = size_t(max_y); + return_sizes.sizes[2] = size_t(max_z); + return ReturnValue(return_sizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int max_work_group_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_work_group_size, + 
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion(max_work_group_size >= 0); + + return ReturnValue(size_t(max_work_group_size)); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_threads, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(maxWarps); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + bool ifp = (major >= 7); + return ReturnValue(ifp); + } + + case UR_DEVICE_INFO_ATOMIC_64: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool atomic64 = (major >= 6) ? true : false; + return ReturnValue(atomic64); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + uint64_t capabilities = + (major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(capabilities); + } + + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence_order_capabilities. + ur_memory_order_capability_flags_t capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence/memory_scope_capabilities. + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + ur_memory_scope_capability_flags_t capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(capabilities); + } + case UR_DEVICE_INFO_BFLOAT16: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + bool bfloat16 = (major >= 8) ? true : false; + return ReturnValue(bfloat16); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + device->get()) == CUDA_SUCCESS); + size_t sizes[1] = {static_cast(warpSize)}; + return ReturnValue(sizes, 1); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int clock_freq = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(clock_freq >= 0); + return ReturnValue(static_cast(clock_freq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { + auto bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(bits); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_CUSTOM. + + size_t global = 0; + sycl::detail::ur::assertion(cuDeviceTotalMem(&global, device->get()) == + CUDA_SUCCESS); + + auto quarter_global = static_cast(global / 4u); + + auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), + 32u * 1024u * 1024u); + + return ReturnValue(uint64_t{max_alloc}); + } + case UR_DEVICE_INFO_IMAGE_SUPPORTED: { + bool enabled = false; + + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + enabled = true; + } else { + sycl::detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. 
Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " + "runtime."); + } + + return ReturnValue(uint32_t{enabled}); + } + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int tex_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_height >= 0); + int surf_height = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_height, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_height >= 0); + + int min = std::min(tex_height, surf_height); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
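+    // (As with the 2D queries above, the minimum of the texture and surface
+    // limits is reported, since an image may be read through a texture and
+    // written through a surface, so both limits have to hold.)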
+ int tex_depth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_depth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_depth >= 0); + int surf_depth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_depth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_depth >= 0); + + int min = std::min(tex_depth, surf_depth); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int tex_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&tex_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(tex_width >= 0); + int surf_width = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&surf_width, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(surf_width >= 0); + + int min = std::min(tex_width, surf_width); + + return ReturnValue(static_cast(min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return ReturnValue(0lu); + } + case UR_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for cuda, as samplers don't exist. + // Closest thing is textures, which is 128. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. + return ReturnValue(4000lu); + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int mem_base_addr_align = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&mem_base_addr_align, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + device->get()) == CUDA_SUCCESS); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + mem_base_addr_align *= 8; + return ReturnValue(mem_base_addr_align); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA | + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return ReturnValue(config); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA; + return ReturnValue(config); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is documented for all existing GPUs in the CUDA programming + // guidelines, section "H.3.2. Global Memory". 
+ return ReturnValue(128u); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int cache_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(cache_size >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(cache_size)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. + sycl::detail::ur::assertion(cuDeviceTotalMem(&bytes, device->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int constant_memory = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&constant_memory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(constant_memory >= 0); + + return ReturnValue(static_cast(constant_memory)); + } + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from CUDA driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(9u); + } + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to CUDA's "shared memory". + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". + int local_mem_size = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&local_mem_size, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(local_mem_size >= 0); + return ReturnValue(static_cast(local_mem_size)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ecc_enabled = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); + auto result = static_cast(ecc_enabled); + return ReturnValue(result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int is_integrated = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + device->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); + auto result = static_cast(is_integrated); + return ReturnValue(result); + } + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(1000lu); + } + case UR_DEVICE_INFO_ENDIAN_LITTLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_COMPILER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_LINKER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto capability = ur_device_exec_capability_flags_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = 
UR_QUEUE_FLAG_PROFILING_ENABLE | + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + // The mandated minimum capability: + uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(capability); + } + case UR_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. + return ReturnValue(""); + } + case UR_DEVICE_INFO_PLATFORM: { + return ReturnValue(device->get_platform()); + } + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; + char name[MAX_DEVICE_NAME_LENGTH]; + sycl::detail::ur::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, + device->get()) == CUDA_SUCCESS); + return ReturnValue(name, strlen(name) + 1); + } + case UR_DEVICE_INFO_VENDOR: { + return ReturnValue("NVIDIA Corporation"); + } + case UR_DEVICE_INFO_DRIVER_VERSION: { + auto version = getCudaVersionString(); + return ReturnValue(version.c_str()); + } + case UR_DEVICE_INFO_PROFILE: { + return ReturnValue("CUDA"); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: { + return ReturnValue(device->get_reference_count()); + } + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + SS << Major; + int Minor; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + SS << "." << Minor; + return ReturnValue(SS.str().c_str()); + } + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { + return ReturnValue(""); + } + case UR_DEVICE_INFO_EXTENSIONS: { + + std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; + SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += " "; + + int major = 0; + int minor = 0; + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + if ((major >= 6) || ((major == 5) && (minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return ReturnValue(1024lu); + } + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_PARENT_DEVICE: { + return ReturnValue(nullptr); + } + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + return ReturnValue(static_cast(0u)); + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_TYPE: { + return ReturnValue(static_cast(0u)); + } + + // Intel USM extensions + + case UR_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
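+    // (Host USM allocations on this backend are expected to be page-locked
+    // memory from cuMemAllocHost/cuMemHostAlloc, hence the unified-addressing
+    // check below.)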
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + uint64_t value = {}; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + uint64_t value = + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." + // + // query if/how the device can access managed memory associated to it + uint64_t value = {}; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + // the device can allocate managed memory on this system + value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + } + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + // the device can coherently access managed memory concurrently with the + // CPU + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + } + return ReturnValue(value); + } + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+ //
+ // query if/how the device can access managed memory associated to other
+ // devices
+ uint64_t value = {};
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
+ // the device can allocate managed memory on this system
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS;
+ }
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+ // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ // attribute can coherently access managed memory concurrently with the
+ // CPU
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+ }
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+ 6) {
+ // compute capability 6.x introduces operations that are atomic with
+ // respect to other CPUs and GPUs in the system
+ if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS;
+ if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS)
+ value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+ }
+ return ReturnValue(value);
+ }
+ case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
+ // from cl_intel_unified_shared_memory:
+ // "The shared system memory access capabilities apply to any allocations
+ // made by a system allocator, such as malloc or new."
+ //
+ // query if/how the device can access pageable host memory allocated by the
+ // system allocator
+ uint64_t value = {};
+ if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
+ // the device supports coherently accessing pageable memory without
+ // calling cuMemHostRegister/cudaHostRegister on it
+ if (getAttribute(device,
+ CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
+ // the link between the device and the host supports native atomic
+ // operations
+ value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
+ } else {
+ // the link between the device and the host does not support native
+ // atomic operations
+ value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
+ UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
+ }
+ }
+ return ReturnValue(value);
+ }
+ case UR_DEVICE_INFO_ASYNC_BARRIER: {
+ int value =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8;
+ return ReturnValue(static_cast<bool>(value));
+ }
+ case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: {
+ int major =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
+ int minor =
+ getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
+ std::string result = std::to_string(major) + "." + std::to_string(minor);
+ return ReturnValue(result.c_str());
+ }
+
+ case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
+ size_t FreeMemory = 0;
+ size_t TotalMemory = 0;
+ sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
+ CUDA_SUCCESS,
+ "failed cuMemGetInfo() API.");
+ return ReturnValue(FreeMemory);
+ }
+ case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: {
+ int value = 0;
+ sycl::detail::ur::assertion(
+ cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
+ device->get()) == CUDA_SUCCESS);
+ sycl::detail::ur::assertion(value >= 0);
+ // Convert kilohertz to megahertz when returning.
+ return ReturnValue(value / 1000); + } + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + return ReturnValue(int32_t{1}); + } + case UR_DEVICE_INFO_DEVICE_ID: { + int value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + device->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(value >= 0); + return ReturnValue(value); + } + case UR_DEVICE_INFO_UUID: { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + int major = driver_version / 1000; + int minor = driver_version % 1000 / 10; + CUuuid uuid; + if ((major > 11) || (major == 11 && minor >= 4)) { + sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == + CUDA_SUCCESS); + } else { + sycl::detail::ur::assertion(cuDeviceGetUuid(&uuid, device->get()) == + CUDA_SUCCESS); + } + std::array name; + std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); + return ReturnValue(name.data(), 16); + } + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { + int major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device->get()) == CUDA_SUCCESS); + + int minor = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device->get()) == CUDA_SUCCESS); + + // Some specific devices seem to need special handling. See reference + // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu + bool is_xavier_agx = major == 7 && minor == 2; + bool is_orin_agx = major == 8 && minor == 7; + + int memory_clock_khz = 0; + if (is_xavier_agx) { + memory_clock_khz = 2133000; + } else if (is_orin_agx) { + memory_clock_khz = 3200000; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&memory_clock_khz, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + device->get()) == CUDA_SUCCESS); + } + + int memory_bus_width = 0; + if (is_orin_agx) { + memory_bus_width = 256; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&memory_bus_width, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + device->get()) == CUDA_SUCCESS); + } + + uint64_t memory_bandwidth = + uint64_t(memory_clock_khz) * memory_bus_width * 250; + + return ReturnValue(memory_bandwidth); + } + case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + // Maximum number of 32-bit registers available to a thread block. + // Note: This number is shared by all thread blocks simultaneously resident + // on a multiprocessor. 
+ int max_registers{-1}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + device->get())); + + sycl::detail::ur::assertion(max_registers >= 0); + + return ReturnValue(static_cast(max_registers)); + } + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(false); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(false); + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + sycl::detail::ur::assertion( + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == + CUDA_SUCCESS); + // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } + // TODO: Investigate if this information is available on CUDA. + case UR_DEVICE_INFO_GPU_EU_COUNT: + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + case UR_DEVICE_INFO_GPU_EU_SLICES: + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// \return PI_SUCCESS if the function is executed successfully +/// CUDA devices are always root devices so retain always returns success. +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \return UR_RESULT_SUCCESS always since CUDA devices are always root +/// devices. +ur_result_t urDeviceRelease(ur_device_handle_t device) { + UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t DeviceType, + uint32_t NumEntries, + ur_device_handle_t *phDevices, + uint32_t *pNumDevices) { + ur_result_t err = UR_RESULT_SUCCESS; + const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool returnDevices = askingForDefault || askingForAll || askingForGPU; + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + + try { + UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); + + if (pNumDevices) { + *pNumDevices = numDevices; + } + + if (returnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { + phDevices[i] = hPlatform->devices_[i].get(); + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +/// Gets the native CUDA handle of a UR device object +/// +/// \param[in] device The UR device to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the UR device object. 
+///
+/// \return PI_SUCCESS
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle(
+ ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) {
+ UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(phNativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ *phNativeHandle = reinterpret_cast<ur_native_handle_t>(hDevice->get());
+ return UR_RESULT_SUCCESS;
+}
+
+/// Creates a UR device object from a CUDA device handle.
+/// NOTE: The created UR object does not take ownership of the native handle.
+///
+/// \param[in] nativeHandle The native handle to create UR device object from.
+/// \param[in] platform is the UR platform of the device.
+/// \param[out] device Set to the UR device object created from native handle.
+///
+/// \return TBD
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
+ ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform,
+ ur_device_handle_t *phDevice) {
+ // TODO(ur): This is needed for the UR CTS, but it might be valid to have a
+ // null native handle
+ UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+
+ // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits
+ // instead
+ CUdevice cu_device = 0;
+ memcpy(&cu_device, hNativeDevice, sizeof(CUdevice));
+
+ auto is_device = [=](std::unique_ptr<ur_device_handle_t_> &dev) {
+ return dev->get() == cu_device;
+ };
+
+ // If a platform is provided just check if the device is in it
+ if (hPlatform) {
+ auto search_res = std::find_if(begin(hPlatform->devices_),
+ end(hPlatform->devices_), is_device);
+ if (search_res != end(hPlatform->devices_)) {
+ *phDevice = search_res->get();
+ return UR_RESULT_SUCCESS;
+ }
+ }
+
+ // Get list of platforms
+ uint32_t num_platforms = 0;
+ ur_result_t result = urPlatformGet(0, nullptr, &num_platforms);
+ if (result != UR_RESULT_SUCCESS)
+ return result;
+
+ ur_platform_handle_t *plat = static_cast<ur_platform_handle_t *>(
+ malloc(num_platforms * sizeof(ur_platform_handle_t)));
+ result = urPlatformGet(num_platforms, plat, nullptr);
+ if (result != UR_RESULT_SUCCESS)
+ return result;
+
+ // Iterate through platforms to find device that matches nativeHandle
+ for (uint32_t j = 0; j < num_platforms; ++j) {
+ auto search_res = std::find_if(begin(plat[j]->devices_),
+ end(plat[j]->devices_), is_device);
+ if (search_res != end(plat[j]->devices_)) {
+ *phDevice = static_cast<ur_device_handle_t>((*search_res).get());
+ return UR_RESULT_SUCCESS;
+ }
+ }
+
+ // If the provided nativeHandle cannot be matched to an
+ // existing device return error
+ return UR_RESULT_ERROR_INVALID_OPERATION;
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp
new file mode 100644
index 0000000000000..c2195c958cfd7
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp
@@ -0,0 +1,59 @@
+//===--------- device.hpp - CUDA Adapter -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+#pragma once
+
+#include
+
+struct ur_device_handle_t_ {
+private:
+ using native_type = CUdevice;
+
+ native_type cuDevice_;
+ CUcontext cuContext_;
+ CUevent evBase_; // CUDA event used as base counter
+ std::atomic_uint32_t refCount_;
+ ur_platform_handle_t platform_;
+
+ static constexpr uint32_t max_work_item_dimensions = 3u;
+ size_t max_work_item_sizes[max_work_item_dimensions];
+ int max_work_group_size;
+
+public:
+ ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
+ ur_platform_handle_t platform)
+ : cuDevice_(cuDevice), cuContext_(cuContext), evBase_(evBase),
+ refCount_{1}, platform_(platform) {}
+
+ ~ur_device_handle_t_() { cuDevicePrimaryCtxRelease(cuDevice_); }
+
+ native_type get() const noexcept { return cuDevice_; };
+
+ CUcontext get_context() const noexcept { return cuContext_; };
+
+ uint32_t get_reference_count() const noexcept { return refCount_; }
+
+ ur_platform_handle_t get_platform() const noexcept { return platform_; };
+
+ uint64_t get_elapsed_time(CUevent) const;
+
+ void save_max_work_item_sizes(size_t size,
+ size_t *save_max_work_item_sizes) noexcept {
+ memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
+ };
+
+ void save_max_work_group_size(int value) noexcept {
+ max_work_group_size = value;
+ };
+
+ void get_max_work_item_sizes(size_t ret_size,
+ size_t *ret_max_work_item_sizes) const noexcept {
+ memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size);
+ };
+
+ int get_max_work_group_size() const noexcept { return max_work_group_size; };
+};
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
new file mode 100644
index 0000000000000..dd8503f1f8907
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
@@ -0,0 +1,174 @@
+//===--------- platform.cpp - CUDA Adapter ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+
+#include "platform.hpp"
+#include "common.hpp"
+#include "context.hpp"
+#include "device.hpp"
+
+#include
+#include
+#include
+
+ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform,
+ ur_platform_info_t PlatformInfoType, size_t Size,
+ void *pPlatformInfo, size_t *pSizeRet) {
+
+ UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet);
+
+ switch (PlatformInfoType) {
+ case UR_PLATFORM_INFO_NAME:
+ return ReturnValue("NVIDIA CUDA BACKEND");
+ case UR_PLATFORM_INFO_VENDOR_NAME:
+ return ReturnValue("NVIDIA Corporation");
+ case UR_PLATFORM_INFO_PROFILE:
+ return ReturnValue("FULL PROFILE");
+ case UR_PLATFORM_INFO_VERSION: {
+ auto version = getCudaVersionString();
+ return ReturnValue(version.c_str());
+ }
+ case UR_PLATFORM_INFO_EXTENSIONS: {
+ return ReturnValue("");
+ }
+ case UR_PLATFORM_INFO_BACKEND: {
+ return ReturnValue(UR_PLATFORM_BACKEND_CUDA);
+ }
+ default:
+ return UR_RESULT_ERROR_INVALID_ENUMERATION;
+ }
+
+ return UR_RESULT_SUCCESS;
+}
+
+/// Obtains the CUDA platform.
+/// There is only one CUDA platform, and it contains all devices on the system.
+/// Triggers the CUDA Driver initialization (cuInit) the first time, so this +/// must be the first PI API called. +/// +/// However because multiple devices in a context is not currently supported, +/// place each device in a separate platform. +/// +ur_result_t urPlatformGet(uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { + + try { + static std::once_flag initFlag; + static uint32_t numPlatforms = 1; + static std::vector platformIds; + + UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t err = UR_RESULT_SUCCESS; + + std::call_once( + initFlag, + [](ur_result_t &err) { + if (cuInit(0) != CUDA_SUCCESS) { + numPlatforms = 0; + return; + } + int numDevices = 0; + err = UR_CHECK_ERROR(cuDeviceGetCount(&numDevices)); + if (numDevices == 0) { + numPlatforms = 0; + return; + } + try { + // make one platform per device + numPlatforms = numDevices; + platformIds.resize(numDevices); + + for (int i = 0; i < numDevices; ++i) { + CUdevice device; + err = UR_CHECK_ERROR(cuDeviceGet(&device, i)); + CUcontext context; + err = UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); + + ScopedContext active(context); + CUevent evBase; + err = UR_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); + + // Use default stream to record base event counter + err = UR_CHECK_ERROR(cuEventRecord(evBase, 0)); + + platformIds[i].devices_.emplace_back(new ur_device_handle_t_{ + device, context, evBase, &platformIds[i]}); + { + const auto &dev = platformIds[i].devices_.back().get(); + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + ur_result_t retError = urDeviceGetInfo( + dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); + if (retError != UR_RESULT_SUCCESS) { + throw retError; + } + + retError = urDeviceGetInfo( + dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); + if (retError != UR_RESULT_SUCCESS) { + throw retError; + } + + dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), + maxThreadsPerBlock); + dev->save_max_work_group_size(maxWorkGroupSize); + } + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + for (int i = 0; i < numDevices; ++i) { + platformIds[i].devices_.clear(); + } + platformIds.clear(); + throw; + } + }, + err); + + if (pNumPlatforms != nullptr) { + *pNumPlatforms = numPlatforms; + } + + if (phPlatforms != nullptr) { + for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { + phPlatforms[i] = &platformIds[i]; + } + } + + return err; + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, + ur_api_version_t *pVersion) { + UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *pVersion = UR_API_VERSION_CURRENT; + return UR_RESULT_SUCCESS; +} + +ur_result_t urInit(ur_device_init_flags_t) { return UR_RESULT_SUCCESS; } + +ur_result_t urTearDown(void *) { return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp new file mode 100644 index 0000000000000..5b2e79f49be8d --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp @@ -0,0 +1,15 @@ +//===--------- platform.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +struct ur_platform_handle_t_ { + std::vector> devices_; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 9446515bd435e..015dadcbaa074 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -6,6 +6,7 @@ // //===-----------------------------------------------------------------===// +#include #include namespace { @@ -36,9 +37,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( return result; } pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGet = nullptr; - pDdiTable->pfnGetApiVersion = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetInfo = urPlatformGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; return UR_RESULT_SUCCESS; } @@ -49,13 +50,13 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetExtendedDeleter = nullptr; + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; return UR_RESULT_SUCCESS; } @@ -195,8 +196,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( return result; } pDdiTable->pfnGetLastResult = nullptr; - pDdiTable->pfnInit = nullptr; - pDdiTable->pfnTearDown = nullptr; + pDdiTable->pfnInit = urInit; + pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; } @@ -240,14 +241,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGet = nullptr; + pDdiTable->pfnCreateWithNativeHandle = 
urDeviceCreateWithNativeHandle; + pDdiTable->pfnGet = urDeviceGet; pDdiTable->pfnGetGlobalTimestamps = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetInfo = urDeviceGetInfo; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; pDdiTable->pfnPartition = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urDeviceRelease; + pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnSelectBinary = nullptr; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d0d1fb8f46912..c2f3a3782f9a0 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -39,6 +39,9 @@ template <> uint32_t inline ur_cast(uint64_t Value) { const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; +const ur_device_info_t UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = + (ur_device_info_t)((uint32_t)UR_DEVICE_INFO_FORCE_UINT32 - 1); + const ur_command_t UR_EXT_COMMAND_TYPE_USER = (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); @@ -197,6 +200,7 @@ extern bool PiPlatformCachePopulated; // The getInfo*/ReturnHelper facilities provide shortcut way of // writing return bytes for the various getInfo APIs. +namespace ur { template ur_result_t getInfoImpl(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value, @@ -260,6 +264,7 @@ getInfo(size_t param_value_size, void *param_value, return getInfoArray(strlen(value) + 1, param_value_size, param_value, param_value_size_ret, value); } +} // namespace ur class UrReturnHelper { public: @@ -276,20 +281,20 @@ class UrReturnHelper { // Scalar return value template ur_result_t operator()(const T &t) { - return getInfo(param_value_size, param_value, param_value_size_ret, t); + return ur::getInfo(param_value_size, param_value, param_value_size_ret, t); } // Array return value template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, param_value_size_ret, - t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } // Array return value where element type is differrent from T template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } protected: diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt index 94ac39f07e474..7808340cc4302 100644 --- a/sycl/unittests/pi/cuda/CMakeLists.txt +++ b/sycl/unittests/pi/cuda/CMakeLists.txt @@ -22,9 +22,11 @@ target_include_directories(PiCudaTests "${sycl_inc_dir}/sycl/detail/" "${sycl_inc_dir}" "${sycl_plugin_dir}/cuda/" + "${sycl_plugin_dir}/unified_runtime/" ) target_link_libraries(PiCudaTests PRIVATE cudadrv + UnifiedRuntime-Headers ) From 8073f6c46b8c93b84bbfbae37d3847dab72d1fae Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 10 Apr 2023 13:53:15 +0100 Subject: [PATCH 03/45] Port program and kernel entry points --- sycl/plugins/cuda/CMakeLists.txt | 4 + sycl/plugins/cuda/pi_cuda.cpp | 840 +----------------- sycl/plugins/cuda/pi_cuda.hpp | 217 +---- sycl/plugins/unified_runtime/CMakeLists.txt | 4 + .../ur/adapters/cuda/kernel.cpp | 281 ++++++ .../ur/adapters/cuda/kernel.hpp | 183 ++++ .../ur/adapters/cuda/program.cpp | 439 +++++++++ .../ur/adapters/cuda/program.hpp | 55 ++ 
8 files changed, 997 insertions(+), 1026 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index e4fa949eca8e9..76d730967a7c0 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -63,6 +63,10 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/device.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" + "../unified_runtime/ur/adapters/cuda/program.cpp" + "../unified_runtime/ur/adapters/cuda/program.hpp" + "../unified_runtime/ur/adapters/cuda/kernel.cpp" + "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b1183b662b137..c09ccea8ef6a3 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -228,14 +228,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, array_length * sizeof(T), memcpy); } -template <> -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, - const char *value) { - return getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - int getAttribute(pi_device device, CUdevice_attribute attribute) { int value; sycl::detail::pi::assertion( @@ -370,41 +362,6 @@ void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, } } -bool getMaxRegistersJitOptionValue(const std::string &build_options, - unsigned int &value) { - using namespace std::string_view_literals; - const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); - if (optionPos == std::string::npos) { - return false; - } - - const std::size_t delimPos = build_options.find('=', optionPos + 1u); - if (delimPos == std::string::npos) { - return false; - } - - const std::size_t length = build_options.length(); - const std::size_t startPos = delimPos + 1u; - if (delimPos == std::string::npos || startPos >= length) { - return false; - } - - std::size_t pos = startPos; - while (pos < length && - std::isdigit(static_cast(build_options[pos]))) { - pos++; - } - - const std::string valueString = - build_options.substr(startPos, pos - startPos); - if (valueString.empty()) { - return false; - } - - value = static_cast(std::stoi(valueString)); - return true; -} - // Helper to verify out-of-registers case (exceeded block max registers). 
// If the kernel requires a number of registers for the entire thread // block exceeds the hardware limitations, then the cuLaunchKernel call @@ -734,127 +691,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { return PI_SUCCESS; } -_pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - pi2ur::piContextRetain(context_); -} - -_pi_program::~_pi_program() { pi2ur::piContextRelease(context_); } - -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - -pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata, - size_t length) { - for (size_t i = 0; i < length; ++i) { - const pi_device_binary_property metadataElement = metadata[i]; - std::string metadataElementName{metadataElement->Name}; - - auto [prefix, tag] = splitMetadataName(metadataElementName); - - if (tag == __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. - size_t MDElemsSize = metadataElement->ValSize - sizeof(std::uint64_t); - - // Expect between 1 and 3 32-bit integer values. - assert(MDElemsSize >= sizeof(std::uint32_t) && - MDElemsSize <= sizeof(std::uint32_t) * 3 && - "Unexpected size for reqd_work_group_size metadata"); - - // Get pointer to data, skipping 64-bit size at the start of the data. - const char *ValuePtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. 
- std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); - kernelReqdWorkGroupSizeMD_[prefix] = - std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], - reqdWorkGroupElements[2]); - } else if (tag == __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *metadataValPtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - const char *metadataValPtrEnd = - metadataValPtr + metadataElement->ValSize - sizeof(std::uint64_t); - globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; - } - } - return PI_SUCCESS; -} - -pi_result _pi_program::set_binary(const char *source, size_t length) { - assert((binary_ == nullptr && binarySizeInBytes_ == 0) && - "Re-setting program binary data which has already been set"); - binary_ = source; - binarySizeInBytes_ = length; - return PI_SUCCESS; -} - -pi_result _pi_program::build_program(const char *build_options) { - - this->buildOptions_ = build_options; - - constexpr const unsigned int numberOfOptions = 4u; - - std::vector options(numberOfOptions); - std::vector optionVals(numberOfOptions); - - // Pass a buffer for info messages - options[0] = CU_JIT_INFO_LOG_BUFFER; - optionVals[0] = (void *)infoLog_; - // Pass the size of the info buffer - options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; - // Pass a buffer for error message - options[2] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[2] = (void *)errorLog_; - // Pass the size of the error buffer - options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - if (!buildOptions_.empty()) { - unsigned int maxRegs; - bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); - if (valid) { - options.push_back(CU_JIT_MAX_REGISTERS); - optionVals.push_back(reinterpret_cast(maxRegs)); - } - } - - auto result = PI_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - options.size(), options.data(), optionVals.data())); - - const auto success = (result == PI_SUCCESS); - - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; - - // If no exception, result is correct - return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; -} - -/// Finds kernel names by searching for entry points in the PTX source, as the -/// CUDA driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use cuModuleGetFunction to check for a kernel. -/// Note: Another alternative is to add kernel names as metadata, like with -/// reqd_work_group_size. 
-std::string getKernelNames(pi_program) { - sycl::detail::pi::die("getKernelNames not implemented"); - return {}; -} - //-- PI API implementation extern "C" { pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, @@ -1586,63 +1422,6 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { } } -pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, - pi_kernel *kernel) { - assert(kernel != nullptr); - assert(program != nullptr); - - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_kernel> retKernel{nullptr}; - - try { - ScopedContext active(program->get_context()); - - CUfunction cuFunc; - retErr = PI_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); - - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); - - // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); - } - - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, - program->get_context()}); - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *kernel = retKernel.release(); - return retErr; -} - -pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, const void *arg_value) { - - assert(kernel != nullptr); - pi_result retErr = PI_SUCCESS; - try { - if (arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - } else { - kernel->set_kernel_local_arg(arg_index, arg_size); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1700,119 +1479,6 @@ pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, return retErr; } -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - // Here we want to query about a kernel's cuda blocks! 
- - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; - - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimX, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimY, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - kernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); - } - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - int numRegs = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, 
param_value, param_value_size_ret, - pi_uint32(numRegs)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - - return PI_ERROR_INVALID_KERNEL; -} - pi_result cuda_piEnqueueKernelLaunch( pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -1984,13 +1650,6 @@ pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, return {}; } -pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, - pi_program, bool, - pi_kernel *) { - sycl::detail::pi::die("Unsupported operation"); - return PI_SUCCESS; -} - /// \TODO Not implemented pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, const pi_image_format *image_format, @@ -2161,457 +1820,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Not used as CUDA backend only creates programs from binary. -/// See \ref cuda_piclProgramCreateWithBinary. -/// -pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **, - const size_t *, pi_program *) { - sycl::detail::pi::cuPrint("cuda_piclProgramCreateWithSource not implemented"); - return PI_ERROR_INVALID_OPERATION; -} - -/// Loads the images from a PI program into a CUmodule that can be -/// used later on to extract functions (kernels). -/// See \ref _pi_program for implementation details. -/// -pi_result cuda_piProgramBuild( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { - sycl::detail::pi::die("cuda_piProgramCreate not implemented"); - return {}; -} - -/// Loads images from a list of PTX or CUBIN binaries. -/// Note: No calls to CUDA driver API in this function, only store binaries -/// for later. -/// -/// Note: Only supports one device -/// -pi_result cuda_piProgramCreateWithBinary( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const size_t *lengths, - const unsigned char **binaries, size_t num_metadata_entries, - const pi_device_binary_property *metadata, pi_int32 *binary_status, - pi_program *program) { - // Ignore unused parameter - (void)binary_status; - - assert(context != nullptr); - assert(binaries != nullptr); - assert(program != nullptr); - assert(device_list != nullptr); - assert(num_devices == 1 && "CUDA contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); - - pi_result retError = PI_SUCCESS; - - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retProgram->set_metadata(metadata, num_metadata_entries); - - const bool has_length = (lengths != nullptr); - size_t length = has_length - ? 
lengths[0] - : strlen(reinterpret_cast(binaries[0])) + 1; - - assert(length != 0); - - retProgram->set_binary(reinterpret_cast(binaries[0]), length); - - *program = retProgram.release(); - - return retError; -} - -pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->get_reference_count()); - case PI_PROGRAM_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->context_); - case PI_PROGRAM_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); - case PI_PROGRAM_INFO_SOURCE: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->binary_); - case PI_PROGRAM_INFO_BINARY_SIZES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binarySizeInBytes_); - case PI_PROGRAM_INFO_BINARIES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binary_); - case PI_PROGRAM_INFO_KERNEL_NAMES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - getKernelNames(program).c_str()); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program info request not implemented"); - return {}; -} - -/// Creates a new PI program object that is the outcome of linking all input -/// programs. -/// \TODO Implement linker options, requires mapping of OpenCL to CUDA -/// -pi_result cuda_piProgramLink( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - pi_uint32 num_input_programs, const pi_program *input_programs, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data, pi_program *ret_program) { - - assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(context); - - CUlinkState state; - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); - try { - for (size_t i = 0; i < num_input_programs; ++i) { - pi_program program = input_programs[i]; - retError = PI_CHECK_ERROR(cuLinkAddData( - state, CU_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); - } - void *cubin = nullptr; - size_t cubinSize = 0; - retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); - - retError = - retProgram->set_binary(static_cast(cubin), cubinSize); - - if (retError != PI_SUCCESS) { - return retError; - } - - retError = retProgram->build_program(options); - - if (retError != PI_SUCCESS) { - return retError; - } - } catch (...) 
{ - // Upon error attempt cleanup - PI_CHECK_ERROR(cuLinkDestroy(state)); - throw; - } - - retError = PI_CHECK_ERROR(cuLinkDestroy(state)); - *ret_program = retProgram.release(); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// Creates a new program that is the outcome of the compilation of the headers -/// and the program. -/// \TODO Implement asynchronous compilation -/// -pi_result cuda_piProgramCompile( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] pi_uint32 num_input_headers, - const pi_program *input_headers, const char **header_include_names, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - // Ignore unused parameters - (void)header_include_names; - (void)input_headers; - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_input_headers == 0); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device, - pi_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - // Ignore unused parameter - (void)device; - - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_BUILD_INFO_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildStatus_); - } - case PI_PROGRAM_BUILD_INFO_OPTIONS: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildOptions_.c_str()); - case PI_PROGRAM_BUILD_INFO_LOG: - return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value, - param_value_size_ret, program->infoLog_); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program Build info request not implemented"); - return {}; -} - -pi_result cuda_piProgramRetain(pi_program program) { - assert(program != nullptr); - assert(program->get_reference_count() > 0); - program->increment_reference_count(); - return PI_SUCCESS; -} - -/// Decreases the reference count of a pi_program object. -/// When the reference count reaches 0, it unloads the module from -/// the context. -pi_result cuda_piProgramRelease(pi_program program) { - assert(program != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(program->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piProgramRelease."); - - // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { - - std::unique_ptr<_pi_program> program_ptr{program}; - - pi_result result = PI_ERROR_INVALID_PROGRAM; - - try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); - result = PI_CHECK_ERROR(cuModuleUnload(cuModule)); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; - } - - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI program object -/// -/// \param[in] program The PI program to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. 
-/// -/// \return TBD -pi_result cuda_piextProgramGetNativeHandle(pi_program program, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); - return PI_SUCCESS; -} - -/// Created a PI program object from a CUDA program handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI program object from. -/// \param[in] context The PI context of the program. -/// \param[out] program Set to the PI program object created from native handle. -/// -/// \return TBD -pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_program *) { - sycl::detail::pi::die( - "Creation of PI program from native handle not implemented"); - return {}; -} - -pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_INFO_FUNCTION_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_name()); - case PI_KERNEL_INFO_NUM_ARGS: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_num_args()); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_reference_count()); - case PI_KERNEL_INFO_CONTEXT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_context()); - } - case PI_KERNEL_INFO_PROGRAM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_program()); - } - case PI_KERNEL_INFO_ATTRIBUTES: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - default: { - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelGetSubGroupInfo( - pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) { - // Ignore unused parameters - (void)input_value_size; - (void)input_value; - - if (kernel != nullptr) { - switch (param_name) { - case PI_KERNEL_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - int warpSize = 0; - cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, - 0, nullptr, sizeof(uint32_t), &warpSize, - nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { - // Return value of 0 => not specified - // TODO: Revisit if PTX is generated for compile-time work-group sizes - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { - // Return value of 0 => unspecified 
or "auto" sub-group size - // Correct for now, since warp size may be read from special register - // TODO: Return warp size once default is primary sub-group size - // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelRetain(pi_kernel kernel) { - assert(kernel != nullptr); - assert(kernel->get_reference_count() > 0u); - - kernel->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piKernelRelease(pi_kernel kernel) { - assert(kernel != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(kernel->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piKernelRelease."); - - // decrement ref count. If it is 0, delete the program. - if (kernel->decrement_reference_count() == 0) { - // no internal cuda resources to clean up. Just delete it. - delete kernel; - return PI_SUCCESS; - } - - return PI_SUCCESS; -} - -// A NOP for the CUDA backend -pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t, - const void *) { - return PI_SUCCESS; -} - -pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32, - size_t, const void *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the CUDA plugin is AOT only so this entry point is not supported. - sycl::detail::pi::die("Native specialization constants are not supported"); - return {}; -} - -pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, - const void *arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - return PI_SUCCESS; -} - // // Events // @@ -4538,33 +3746,35 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) // Program - _PI_CL(piProgramCreate, cuda_piProgramCreate) - _PI_CL(piclProgramCreateWithSource, cuda_piclProgramCreateWithSource) - _PI_CL(piProgramCreateWithBinary, cuda_piProgramCreateWithBinary) - _PI_CL(piProgramGetInfo, cuda_piProgramGetInfo) - _PI_CL(piProgramCompile, cuda_piProgramCompile) - _PI_CL(piProgramBuild, cuda_piProgramBuild) - _PI_CL(piProgramLink, cuda_piProgramLink) - _PI_CL(piProgramGetBuildInfo, cuda_piProgramGetBuildInfo) - _PI_CL(piProgramRetain, cuda_piProgramRetain) - _PI_CL(piProgramRelease, cuda_piProgramRelease) - _PI_CL(piextProgramGetNativeHandle, cuda_piextProgramGetNativeHandle) + _PI_CL(piProgramCreate, pi2ur::piProgramCreate) + _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) + _PI_CL(piProgramCreateWithBinary, pi2ur::piProgramCreateWithBinary) + _PI_CL(piProgramGetInfo, pi2ur::piProgramGetInfo) + _PI_CL(piProgramCompile, pi2ur::piProgramCompile) + _PI_CL(piProgramBuild, pi2ur::piProgramBuild) + _PI_CL(piProgramLink, pi2ur::piProgramLink) + _PI_CL(piProgramGetBuildInfo, pi2ur::piProgramGetBuildInfo) + _PI_CL(piProgramRetain, pi2ur::piProgramRetain) + _PI_CL(piProgramRelease, pi2ur::piProgramRelease) + _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, - cuda_piextProgramCreateWithNativeHandle) + pi2ur::piextProgramCreateWithNativeHandle) // Kernel - _PI_CL(piKernelCreate, cuda_piKernelCreate) - 
_PI_CL(piKernelSetArg, cuda_piKernelSetArg) - _PI_CL(piKernelGetInfo, cuda_piKernelGetInfo) - _PI_CL(piKernelGetGroupInfo, cuda_piKernelGetGroupInfo) - _PI_CL(piKernelGetSubGroupInfo, cuda_piKernelGetSubGroupInfo) - _PI_CL(piKernelRetain, cuda_piKernelRetain) - _PI_CL(piKernelRelease, cuda_piKernelRelease) - _PI_CL(piKernelSetExecInfo, cuda_piKernelSetExecInfo) + _PI_CL(piKernelCreate, pi2ur::piKernelCreate) + _PI_CL(piKernelSetArg, pi2ur::piKernelSetArg) + _PI_CL(piKernelGetInfo, pi2ur::piKernelGetInfo) + _PI_CL(piKernelGetGroupInfo, pi2ur::piKernelGetGroupInfo) + _PI_CL(piKernelGetSubGroupInfo, pi2ur::piKernelGetSubGroupInfo) + _PI_CL(piKernelRetain, pi2ur::piKernelRetain) + _PI_CL(piKernelRelease, pi2ur::piKernelRelease) + _PI_CL(piextKernelGetNativeHandle, pi2ur::piextKernelGetNativeHandle) + _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) _PI_CL(piextProgramSetSpecializationConstant, - cuda_piextProgramSetSpecializationConstant) - _PI_CL(piextKernelSetArgPointer, cuda_piextKernelSetArgPointer) + pi2ur::piextProgramSetSpecializationConstant) + _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) _PI_CL(piextKernelCreateWithNativeHandle, - cuda_piextKernelCreateWithNativeHandle) + pi2ur::piextKernelCreateWithNativeHandle) + // Event _PI_CL(piEventCreate, cuda_piEventCreate) _PI_CL(piEventGetInfo, cuda_piEventGetInfo) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f6a95ff8d0ab5..51f6b7f2a34b4 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -44,7 +44,9 @@ #include #include +#include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -52,18 +54,10 @@ extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piProgramRetain(pi_program); -pi_result cuda_piProgramRelease(pi_program); pi_result cuda_piQueueRelease(pi_queue); pi_result cuda_piQueueRetain(pi_queue); pi_result cuda_piMemRetain(pi_mem); pi_result cuda_piMemRelease(pi_mem); -pi_result cuda_piKernelRetain(pi_kernel); -pi_result cuda_piKernelRelease(pi_kernel); -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); /// \endcond } @@ -670,44 +664,8 @@ struct _pi_event { /// Implementation of PI Program on CUDA Module object /// -struct _pi_program { - using native_type = CUmodule; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - _pi_context *context_; - - // Metadata - std::unordered_map> - kernelReqdWorkGroupSizeMD_; - std::unordered_map globalIDMD_; - - constexpr static size_t MAX_LOG_SIZE = 8192u; - - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - pi_program_build_status buildStatus_ = PI_PROGRAM_BUILD_STATUS_NONE; - - _pi_program(pi_context ctxt); - ~_pi_program(); - - pi_result set_metadata(const pi_device_binary_property *metadata, - size_t length); - - pi_result set_binary(const char *binary, size_t binarySizeInBytes); - - pi_result build_program(const char *build_options); - - pi_context get_context() const { return context_; }; - - native_type get() const noexcept { return module_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_program : ur_program_handle_t_ { + using 
ur_program_handle_t_::ur_program_handle_t_; }; /// Implementation of a PI Kernel for CUDA @@ -726,171 +684,8 @@ struct _pi_program { /// CUDA shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. /// -struct _pi_kernel { - using native_type = CUfunction; - - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - pi_context context_; - pi_program program_; - std::atomic_uint32_t refCount_; - - static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; - - /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes - /// from the kernel signature. - /// This is not something can be queried from the CUDA API - /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) - /// and a storage. - /// - struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; - using args_size_t = std::vector; - using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; - - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; - - arguments() { - // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); - } - - /// Adds an argument to the kernel. - /// If the argument existed before, it is replaced. - /// Otherwise, it is added. - /// Gaps are filled with empty arguments. - /// Implicit offset argument is kept at the back of the indices collection. - void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { - // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); - // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); - } - paramSizes_[index] = size; - // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); - // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; - } - - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); - - // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; - - // for arguments smaller than the maximum alignment simply align to the - // size of the argument - const size_t alignment = std::min(max_alignment, size); - - // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 0) { - alignedLocalOffset += alignment - (localOffset % alignment); - } - - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); - } - - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); - } - - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); - } - - const args_index_t &get_indices() const noexcept { return indices_; } - - pi_uint32 get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); - } - } args_; - - _pi_kernel(CUfunction func, 
CUfunction funcWithOffsetParam, const char *name, - pi_program program, pi_context ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - cuda_piProgramRetain(program_); - pi2ur::piContextRetain(context_); - /// Note: this code assumes that there is only one device per context - pi_result retError = cuda_piKernelGetGroupInfo( - this, reinterpret_cast(ctxt->get_device()), - PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); - (void)retError; - assert(retError == PI_SUCCESS); - } - - ~_pi_kernel() { - cuda_piProgramRelease(program_); - pi2ur::piContextRelease(context_); - } - - pi_program get_program() const noexcept { return program_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; - - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; - } - - pi_context get_context() const noexcept { return context_; }; - - const char *get_name() const noexcept { return name_.c_str(); } - - /// Returns the number of arguments, excluding the implicit global offset. - /// Note this only returns the current known number of arguments, not the - /// real one required by the kernel, since this cannot be queried from - /// the CUDA Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } - - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); - } - - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); - } - - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - args_.set_implicit_offset(size, implicitOffset); - } - - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); - } - - pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); } - - void clear_local_size() { args_.clear_local_size(); } +struct _pi_kernel : ur_kernel_handle_t_ { + using ur_kernel_handle_t_::ur_kernel_handle_t_; }; /// Implementation of samplers for CUDA diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index bec6aed6131c8..c5ac46747fd73 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -139,6 +139,10 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/device.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" + "ur/adapters/cuda/program.cpp" + "ur/adapters/cuda/program.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp new file mode 100644 index 0000000000000..ea341f47ee167 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -0,0 +1,281 @@ +//===--------- kernel.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "kernel.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, + ur_kernel_handle_t *phKernel) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t retErr = UR_RESULT_SUCCESS; + std::unique_ptr retKernel{nullptr}; + + try { + ScopedContext active(hProgram->get_context()); + + CUfunction cuFunc; + retErr = UR_CHECK_ERROR( + cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName)); + + std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; + CUfunction cuFuncWithOffsetParam; + CUresult offsetRes = cuModuleGetFunction( + &cuFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (offsetRes == CUDA_ERROR_NOT_FOUND) { + cuFuncWithOffsetParam = nullptr; + } else { + retErr = UR_CHECK_ERROR(offsetRes); + } + retKernel = std::unique_ptr( + new ur_kernel_handle_t_{cuFunc, cuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->get_context()}); + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phKernel = retKernel.release(); + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + // Here we want to query about a kernel's cuda blocks! 
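+  // UR work-groups map onto CUDA thread blocks, so the queries below are
+  // answered from CUfunction/CUdevice attributes such as
+  // CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK and CU_DEVICE_ATTRIBUTE_WARP_SIZE.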
+ UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int max_threads = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(size_t(max_threads)); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t group_size[3] = {0, 0, 0}; + const auto &reqd_wg_size_md_map = + hKernel->program_->kernelReqdWorkGroupSizeMD_; + const auto reqd_wg_size_md = reqd_wg_size_md_map.find(hKernel->name_); + if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { + const auto reqd_wg_size = reqd_wg_size_md->second; + group_size[0] = std::get<0>(reqd_wg_size); + group_size[1] = std::get<1>(reqd_wg_size); + group_size[2] = std::get<2>(reqd_wg_size); + } + return ReturnValue(group_size, 3); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(bytes)); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(bytes)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->get_reference_count() > 0u, + UR_RESULT_ERROR_INVALID_KERNEL); + + hKernel->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelRelease(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hKernel->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_KERNEL); + + // decrement ref count. If it is 0, delete the program. + if (hKernel->decrement_reference_count() == 0) { + // no internal cuda resources to clean up. Just delete it. + delete hKernel; + return UR_RESULT_SUCCESS; + } + + return UR_RESULT_SUCCESS; +} + +// TODO(ur): Not implemented on cuda atm. Also, need to add tests for this +// feature. 
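+// A possible direction (a sketch only, not implemented in this patch): since a
+// kernel wraps a CUfunction, the handle could be exposed the same way
+// urProgramGetNativeHandle exposes its CUmodule, e.g.
+//   *phNativeKernel = reinterpret_cast<ur_native_handle_t>(hKernel->get());
+// Ownership and lifetime semantics would need to be settled first.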
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { + (void)hKernel; + (void)phNativeKernel; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, const void *pArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (pArgValue) { + hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + } else { + hKernel->set_kernel_local_arg(argIndex, argSize); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, + size_t propSize, + void *pKernelInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_INFO_FUNCTION_NAME: + return ReturnValue(hKernel->get_name()); + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(hKernel->get_num_args()); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(hKernel->get_reference_count()); + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(hKernel->get_context()); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(hKernel->get_program()); + case UR_KERNEL_INFO_ATTRIBUTES: + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int numRegs = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint32_t{numRegs}); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int warpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(warpSize)); + } + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int max_threads = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&max_threads, + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + int warpSize = 0; + urKernelGetSubGroupInfo(hKernel, hDevice, + UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, + sizeof(uint32_t), &warpSize, nullptr); + int maxWarps = (max_threads + warpSize - 1) / warpSize; + return ReturnValue(static_cast(maxWarps)); + } + case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return ReturnValue(0); + } + case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + 
return ReturnValue(0); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { + hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + return UR_RESULT_SUCCESS; +} + +// A NOP for the CUDA backend +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, + size_t propSize, const void *pPropValue) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + // This entry point is only used for native specialization constants (SPIR-V), + // and the CUDA plugin is AOT only so this entry point is not supported. + sycl::detail::ur::die("Native specialization constants are not supported"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp new file mode 100644 index 0000000000000..42e624cefba48 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -0,0 +1,183 @@ +//===--------- kernel.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include + +#include "program.hpp" + +struct ur_kernel_handle_t_ { + using native_type = CUfunction; + + native_type function_; + native_type functionWithOffsetParam_; + std::string name_; + ur_context_handle_t context_; + ur_program_handle_t program_; + std::atomic_uint32_t refCount_; + + static constexpr uint32_t REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; + size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + + /// Structure that holds the arguments to the kernel. + /// Note earch argument size is known, since it comes + /// from the kernel signature. + /// This is not something can be queried from the CUDA API + /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) + /// and a storage. + /// + struct arguments { + static constexpr size_t MAX_PARAM_BYTES = 4000u; + using args_t = std::array; + using args_size_t = std::vector; + using args_index_t = std::vector; + args_t storage_; + args_size_t paramSizes_; + args_index_t indices_; + args_size_t offsetPerIndex_; + + std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + + arguments() { + // Place the implicit offset index at the end of the indicies collection + indices_.emplace_back(&implicitOffsetArgs_); + } + + /// Adds an argument to the kernel. + /// If the argument existed before, it is replaced. + /// Otherwise, it is added. + /// Gaps are filled with empty arguments. + /// Implicit offset argument is kept at the back of the indices collection. 
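+    /// For illustration: after add_arg(0, 4, a) and add_arg(1, 8, b), the two
+    /// values sit back to back in storage_ at offsets 0 and 4, indices_[0] and
+    /// indices_[1] point at those offsets, and the implicit-offset pointer
+    /// remains the final entry of indices_.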
+ void add_arg(size_t index, size_t size, const void *arg, + size_t localSize = 0) { + if (index + 2 > indices_.size()) { + // Move implicit offset argument index with the end + indices_.resize(index + 2, indices_.back()); + // Ensure enough space for the new argument + paramSizes_.resize(index + 1); + offsetPerIndex_.resize(index + 1); + } + paramSizes_[index] = size; + // calculate the insertion point on the array + size_t insertPos = std::accumulate(std::begin(paramSizes_), + std::begin(paramSizes_) + index, 0); + // Update the stored value for the argument + std::memcpy(&storage_[insertPos], arg, size); + indices_[index] = &storage_[insertPos]; + offsetPerIndex_[index] = localSize; + } + + void add_local_arg(size_t index, size_t size) { + size_t localOffset = this->get_local_size(); + + // maximum required alignment is the size of the largest vector type + const size_t max_alignment = sizeof(double) * 16; + + // for arguments smaller than the maximum alignment simply align to the + // size of the argument + const size_t alignment = std::min(max_alignment, size); + + // align the argument + size_t alignedLocalOffset = localOffset; + if (localOffset % alignment != 0) { + alignedLocalOffset += alignment - (localOffset % alignment); + } + + add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), + size + (alignedLocalOffset - localOffset)); + } + + void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { + assert(size == sizeof(std::uint32_t) * 3); + std::memcpy(implicitOffsetArgs_, implicitOffset, size); + } + + void clear_local_size() { + std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + } + + const args_index_t &get_indices() const noexcept { return indices_; } + + uint32_t get_local_size() const { + return std::accumulate(std::begin(offsetPerIndex_), + std::end(offsetPerIndex_), 0); + } + } args_; + + ur_kernel_handle_t_(CUfunction func, CUfunction funcWithOffsetParam, + const char *name, ur_program_handle_t program, + ur_context_handle_t ctxt) + : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, + name_{name}, context_{ctxt}, program_{program}, refCount_{1} { + urProgramRetain(program_); + urContextRetain(context_); + /// Note: this code assumes that there is only one device per context + ur_result_t retError = urKernelGetGroupInfo( + this, ctxt->get_device(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); + assert(retError == UR_RESULT_SUCCESS); + } + + ~ur_kernel_handle_t_() { + urProgramRelease(program_); + urContextRelease(context_); + } + + ur_program_handle_t get_program() const noexcept { return program_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + native_type get() const noexcept { return function_; }; + + native_type get_with_offset_parameter() const noexcept { + return functionWithOffsetParam_; + }; + + bool has_with_offset_parameter() const noexcept { + return functionWithOffsetParam_ != nullptr; + } + + ur_context_handle_t get_context() const noexcept { return context_; }; + + const char *get_name() const noexcept { return name_.c_str(); } + + /// Returns the number of arguments, excluding the implicit global offset. 
+ /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the CUDA Driver API + uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + + void set_kernel_arg(int index, size_t size, const void *arg) { + args_.add_arg(index, size, arg); + } + + void set_kernel_local_arg(int index, size_t size) { + args_.add_local_arg(index, size); + } + + void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { + return args_.set_implicit_offset(size, implicitOffset); + } + + const arguments::args_index_t &get_arg_indices() const { + return args_.get_indices(); + } + + uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + + void clear_local_size() { args_.clear_local_size(); } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp new file mode 100644 index 0000000000000..7a56620180fef --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -0,0 +1,439 @@ +//===--------- program.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "program.hpp" + +bool getMaxRegistersJitOptionValue(const std::string &build_options, + unsigned int &value) { + using namespace std::string_view_literals; + const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); + if (optionPos == std::string::npos) { + return false; + } + + const std::size_t delimPos = build_options.find('=', optionPos + 1u); + if (delimPos == std::string::npos) { + return false; + } + + const std::size_t length = build_options.length(); + const std::size_t startPos = delimPos + 1u; + if (delimPos == std::string::npos || startPos >= length) { + return false; + } + + std::size_t pos = startPos; + while (pos < length && + std::isdigit(static_cast(build_options[pos]))) { + pos++; + } + + const std::string valueString = + build_options.substr(startPos, pos - startPos); + if (valueString.empty()) { + return false; + } + + value = static_cast(std::stoi(valueString)); + return true; +} + +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) + : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, + context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { + urContextRetain(context_); +} + +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } + +std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) + return std::make_pair(metadataName, std::string{}); + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} + +ur_result_t +ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, + size_t length) { + for (size_t i = 0; i < length; ++i) { + const ur_program_metadata_t metadataElement = metadata[i]; + std::string metadataElementName{metadataElement.pName}; + + auto [prefix, tag] = splitMetadataName(metadataElementName); + + if (tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size, record it for the corresponding + // kernel name. 
+ size_t MDElemsSize = metadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && + MDElemsSize <= sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const char *ValuePtr = + reinterpret_cast(metadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); + kernelReqdWorkGroupSizeMD_[prefix] = + std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], + reqdWorkGroupElements[2]); + } else if (tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *metadataValPtr = + reinterpret_cast(metadataElement.value.pData) + + sizeof(std::uint64_t); + const char *metadataValPtrEnd = + metadataValPtr + metadataElement.size - sizeof(std::uint64_t); + globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::set_binary(const char *source, + size_t length) { + // Do not re-set program binary data which has already been set as that will + // delete the old binary data. + UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_RESULT_ERROR_INVALID_OPERATION); + binary_ = source; + binarySizeInBytes_ = length; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::build_program(const char *build_options) { + + this->buildOptions_ = build_options; + + constexpr const unsigned int numberOfOptions = 4u; + + std::vector options(numberOfOptions); + std::vector optionVals(numberOfOptions); + + // Pass a buffer for info messages + options[0] = CU_JIT_INFO_LOG_BUFFER; + optionVals[0] = (void *)infoLog_; + // Pass the size of the info buffer + options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[1] = (void *)(long)MAX_LOG_SIZE; + // Pass a buffer for error message + options[2] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[2] = (void *)errorLog_; + // Pass the size of the error buffer + options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[3] = (void *)(long)MAX_LOG_SIZE; + + if (!buildOptions_.empty()) { + unsigned int maxRegs; + bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); + if (valid) { + options.push_back(CU_JIT_MAX_REGISTERS); + optionVals.push_back(reinterpret_cast(maxRegs)); + } + } + + auto result = UR_CHECK_ERROR( + cuModuleLoadDataEx(&module_, static_cast(binary_), + options.size(), options.data(), optionVals.data())); + + const auto success = (result == UR_RESULT_SUCCESS); + + buildStatus_ = + success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + + // If no exception, result is correct + return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// CUDA driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to PI and use cuModuleGetFunction to check for a kernel. +/// Note: Another alternative is to add kernel names as metadata, like with +/// reqd_work_group_size. 
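+/// Note: A sketch of the PTX search (assuming binary_ holds PTX text rather
+/// than CUBIN): each kernel in PTX is introduced by an ".entry" directive, so
+/// collecting the identifier that follows every ".entry" token in the module
+/// source would recover the kernel names.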
+std::string getKernelNames(ur_program_handle_t) { + sycl::detail::ur::die("getKernelNames not implemented"); + return {}; +} + +/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. +/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in +/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_device_handle_t hDevice = hContext->get_device(); + auto pBinary = reinterpret_cast(pIL); + + return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, + pProperties, phProgram); +} + +/// CUDA will handle the PTX/CUBIN binaries internally through a call to +/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent +/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, + const char *pOptions) { + return urProgramBuild(hContext, hProgram, pOptions); +} + +/// Loads the images from a UR program into a CUmodule that can be +/// used later on to extract functions (kernels). +/// See \ref ur_program_handle_t for implementation details. +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hProgram->get_context()); + + hProgram->build_program(pOptions); + + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +/// Creates a new UR program object that is the outcome of linking all input +/// programs. +/// \TODO Implement linker options, requires mapping of OpenCL to CUDA +/// +UR_APIEXPORT ur_result_t UR_APICALL +urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, const char *pOptions, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(count, UR_RESULT_ERROR_PROGRAM_LINK_FAILURE); + UR_ASSERT(phPrograms, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hContext); + + CUlinkState state; + std::unique_ptr retProgram{ + new ur_program_handle_t_{hContext}}; + + retError = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); + try { + for (size_t i = 0; i < count; ++i) { + ur_program_handle_t program = phPrograms[i]; + retError = UR_CHECK_ERROR(cuLinkAddData( + state, CU_JIT_INPUT_PTX, const_cast(program->binary_), + program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); + } + void *cubin = nullptr; + size_t cubinSize = 0; + retError = UR_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); + + retError = + retProgram->set_binary(static_cast(cubin), cubinSize); + + retError = retProgram->build_program(pOptions); + } catch (...) 
{ + // Upon error attempt cleanup + UR_CHECK_ERROR(cuLinkDestroy(state)); + throw; + } + + retError = UR_CHECK_ERROR(cuLinkDestroy(state)); + *phProgram = retProgram.release(); + + } catch (ur_result_t err) { + retError = err; + } + return retError; +} + +/// Created a UR program object from a CUDA program handle. +/// TODO: Implement this. +/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR program object from. +/// \param[in] context The UR context of the program. +/// \param[out] program Set to the UR program object created from native handle. +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + ur_program_handle_t *phProgram) { + sycl::detail::ur::die( + "Creation of UR program from native handle not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, + ur_program_build_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_BUILD_INFO_STATUS: { + return ReturnValue(hProgram->buildStatus_); + } + case UR_PROGRAM_BUILD_INFO_OPTIONS: + return ReturnValue(hProgram->buildOptions_.c_str()); + case UR_PROGRAM_BUILD_INFO_LOG: + return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, + size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(hProgram->get_reference_count()); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(hProgram->context_); + case UR_PROGRAM_INFO_NUM_DEVICES: + return ReturnValue(1u); + case UR_PROGRAM_INFO_DEVICES: + return ReturnValue(&hProgram->context_->deviceId_, 1); + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(hProgram->binary_); + case UR_PROGRAM_INFO_BINARY_SIZES: + return ReturnValue(&hProgram->binarySizeInBytes_, 1); + case UR_PROGRAM_INFO_BINARIES: + return ReturnValue(&hProgram->binary_, 1); + case UR_PROGRAM_INFO_NUM_KERNELS: + return ReturnValue(getKernelNames(hProgram).c_str()); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(program->get_reference_count() > 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + program->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of a ur_program_handle_t object. +/// When the reference count reaches 0, it unloads the module from +/// the context. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. 
+ UR_ASSERT(program->get_reference_count() != 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + + // decrement ref count. If it is 0, delete the program. + if (program->decrement_reference_count() == 0) { + + std::unique_ptr program_ptr{program}; + + ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + + try { + ScopedContext active(program->get_context()); + auto cuModule = program->get(); + result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } catch (...) { + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR program object +/// +/// \param[in] program The PI program to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the PI program object. +/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t program, ur_native_handle_t *nativeHandle) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + *nativeHandle = reinterpret_cast(program->get()); + return UR_RESULT_SUCCESS; +} + +/// Loads images from a list of PTX or CUBIN binaries. +/// Note: No calls to CUDA driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); + UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_RESULT_ERROR_INVALID_CONTEXT); + + ur_result_t retError = UR_RESULT_SUCCESS; + + std::unique_ptr retProgram{ + new ur_program_handle_t_{hContext}}; + + retError = + retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + auto pBinary_string = reinterpret_cast(pBinary); + if (size == 0) { + size = strlen(pBinary_string) + 1; + } + + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); + + retError = retProgram->set_binary(pBinary_string, size); + UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + + *phProgram = retProgram.release(); + + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp new file mode 100644 index 0000000000000..35ac6fb215ea0 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp @@ -0,0 +1,55 @@ +//===--------- program.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include + +#include "context.hpp" + +struct ur_program_handle_t_ { + using native_type = CUmodule; + native_type module_; + const char *binary_; + size_t binarySizeInBytes_; + std::atomic_uint32_t refCount_; + ur_context_handle_t context_; + + // Metadata + std::unordered_map> + kernelReqdWorkGroupSizeMD_; + std::unordered_map globalIDMD_; + + constexpr static size_t MAX_LOG_SIZE = 8192u; + + char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; + std::string buildOptions_; + ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + + ur_program_handle_t_(ur_context_handle_t ctxt); + ~ur_program_handle_t_(); + + ur_result_t set_metadata(const ur_program_metadata_t *metadata, + size_t length); + + ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + + ur_result_t build_program(const char *build_options); + ur_context_handle_t get_context() const { return context_; }; + + native_type get() const noexcept { return module_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; From 506130ec0bc7f44c034c1d2fbd4e3bfc9617e733 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 13 Apr 2023 17:18:54 +0100 Subject: [PATCH 04/45] Add UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE to kernel group info --- .../ur/adapters/cuda/kernel.cpp | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index ea341f47ee167..e34976394c5ff 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -52,12 +52,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, ur_kernel_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Here we want to query about a kernel's cuda blocks! 
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t global_work_size[3] = {0, 0, 0}; + + int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimX, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimY, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_block_dimZ, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + global_work_size[0] = max_block_dimX * max_grid_dimX; + global_work_size[1] = max_block_dimY * max_grid_dimY; + global_work_size[2] = max_block_dimZ * max_grid_dimZ; + return ReturnValue(global_work_size, 3); + } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { int max_threads = 0; sycl::detail::ur::assertion( From 625f1f8958496054588d7bc2b5303dc7ecfcb3f6 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 14 Apr 2023 10:53:05 +0100 Subject: [PATCH 05/45] [SYCL][PI][UR][CUDA] Port a few miscellaneous CUDA entry points to UR Namely: * piTearDown * piPluginGetLastError * piGetDeviceAndHostTimer --- sycl/plugins/cuda/CMakeLists.txt | 2 +- sycl/plugins/cuda/pi_cuda.cpp | 75 +++---------------- sycl/plugins/unified_runtime/CMakeLists.txt | 1 + .../ur/adapters/cuda/common.cpp | 25 +++++++ .../ur/adapters/cuda/common.hpp | 8 ++ .../ur/adapters/cuda/device.cpp | 28 +++++++ .../ur/adapters/cuda/platform.cpp | 14 +++- .../ur/adapters}/cuda/tracing.cpp | 0 .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 9 files changed, 88 insertions(+), 71 deletions(-) rename sycl/plugins/{ => unified_runtime/ur/adapters}/cuda/tracing.cpp (100%) diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 76d730967a7c0..cbc5e8f9e9638 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -68,12 +68,12 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/kernel.cpp" "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" "pi_cuda.cpp" - "tracing.cpp" ${XPTI_PROXY_SRC} INCLUDE_DIRS ${sycl_inc_dir} diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c09ccea8ef6a3..31c85b3877091 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -54,24 +54,6 @@ pi_result map_error(CUresult result) { } } -// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -thread_local pi_result ErrorMessageCode = PI_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -static void setErrorMessage(const char *message, pi_result error_code) { - assert(strlen(message) <= 
MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -// Returns plugin specific error and warning messages -pi_result cuda_piPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} - // Returns plugin specific backend option. // Current support is only for optimization options. // Return empty string for cuda. @@ -713,7 +695,7 @@ pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, // These queries should be dealt with in context_impl.cpp by calling the // queries of each device separately and building the intersection set. setErrorMessage("These queries should have never come here.", - PI_ERROR_INVALID_ARG_VALUE); + UR_RESULT_ERROR_INVALID_ARGUMENT); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: @@ -1448,7 +1430,7 @@ pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, arrayDesc.Format != CU_AD_FORMAT_FLOAT) { setErrorMessage("PI CUDA kernels only support images with channel " "types int32, uint32, float, and half.", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); + UR_RESULT_ERROR_ADAPTER_SPECIFIC); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface(); @@ -1618,7 +1600,7 @@ pi_result cuda_piEnqueueKernelLaunch( if (env_val <= 0 || env_val > device_max_local_mem) { setErrorMessage("Invalid value specified for " "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); + UR_RESULT_ERROR_ADAPTER_SPECIFIC); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } PI_CHECK_ERROR(cuFuncSetAttribute( @@ -3182,7 +3164,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3191,7 +3173,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); if (!is_managed) { setErrorMessage("Prefetch hint ignored as prefetch only works with USM", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3248,7 +3230,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3263,7 +3245,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (!is_managed) { setErrorMessage( "Memory advice ignored as memory advices only works with USM", - PI_SUCCESS); + UR_RESULT_SUCCESS); return PI_ERROR_PLUGIN_SPECIFIC_ERROR; } @@ -3641,43 +3623,6 @@ pi_result cuda_piextEnqueueWriteHostPipe( return {}; } -// This API is called by Sycl RT to notify the end of the plugin lifetime. -// Windows: dynamically loaded plugins might have been unloaded already -// when this is called. Sycl RT holds onto the PI plugin so it can be -// called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. -// TODO: add a global variable lifetime management code here (see -// pi_level_zero.cpp for reference) Currently this is just a NOOP. 
-pi_result cuda_piTearDown(void *) { - disableCUDATracing(); - return PI_SUCCESS; -} - -pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, - uint64_t *HostTime) { - _pi_event::native_type event; - ScopedContext active(Device->get_context()); - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventRecord(event, 0)); - } - if (HostTime) { - - using namespace std::chrono; - *HostTime = - duration_cast(steady_clock::now().time_since_epoch()) - .count(); - } - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventSynchronize(event)); - *DeviceTime = Device->get_elapsed_time(event); - } - - return PI_SUCCESS; -} - const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -3835,9 +3780,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) - _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError) - _PI_CL(piTearDown, cuda_piTearDown) - _PI_CL(piGetDeviceAndHostTimer, cuda_piGetDeviceAndHostTimer) + _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) + _PI_CL(piTearDown, pi2ur::piTearDown) + _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, cuda_piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c5ac46747fd73..014938c9ba542 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -144,6 +144,7 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/kernel.cpp" "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index 264d7588f3229..f25aa88b3e292 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -85,3 +85,28 @@ void sycl::detail::ur::assertion(bool Condition, const char *Message) { void sycl::detail::ur::cuPrint(const char *Message) { std::cerr << "ur_print: " << Message << std::endl; } + + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code) { + assert(strlen(message) <= MaxMessageSize); + strcpy(ErrorMessage, message); + ErrorMessageCode = error_code; +} + +ur_result_t zerPluginGetLastError(char **message) { + *message = &ErrorMessage[0]; + return ErrorMessageCode; +} + +// Returns plugin specific error and warning messages; common implementation +// that can be shared between adapters +ur_result_t urGetLastResult(ur_platform_handle_t, const char **ppMessage) { + *ppMessage = &ErrorMessage[0]; + return ErrorMessageCode; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp index 16cabc37a2b16..3aa23c67bf492 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -28,6 +28,14 @@ ur_result_t check_error_ur(CUresult result, const 
char *function, int line, std::string getCudaVersionString(); +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *message, + ur_result_t error_code); + /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index d0b11b23cc74d..ae987ab4a7c6e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1117,3 +1117,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // existing device return error return UR_RESULT_ERROR_INVALID_OPERATION; } + +ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, + uint64_t *pDeviceTimestamp, + uint64_t *pHostTimestamp) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + CUevent event; + ScopedContext active(hDevice->get_context()); + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventRecord(event, 0)); + } + if (pHostTimestamp) { + + using namespace std::chrono; + *pHostTimestamp = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); + } + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventSynchronize(event)); + *pDeviceTimestamp = hDevice->get_elapsed_time(event); + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index dd8503f1f8907..5a4e43c320af0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -15,6 +15,9 @@ #include #include +void enableCUDATracing(); +void disableCUDATracing(); + ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, size_t Size, void *pPlatformInfo, size_t *pSizeRet) { @@ -169,6 +172,13 @@ ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, return UR_RESULT_SUCCESS; } -ur_result_t urInit(ur_device_init_flags_t) { return UR_RESULT_SUCCESS; } +ur_result_t urInit(ur_device_init_flags_t) { + enableCUDATracing(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urTearDown(void *) { + disableCUDATracing(); + return UR_RESULT_SUCCESS; +} -ur_result_t urTearDown(void *) { return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/cuda/tracing.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp similarity index 100% rename from sycl/plugins/cuda/tracing.cpp rename to sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 015dadcbaa074..d7f9ad75d38cd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -195,7 +195,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnGetLastResult = nullptr; + pDdiTable->pfnGetLastResult = urGetLastResult; pDdiTable->pfnInit = urInit; pDdiTable->pfnTearDown = urTearDown; return UR_RESULT_SUCCESS; @@ -243,10 
+243,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( } pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; pDdiTable->pfnGet = urDeviceGet; - pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; pDdiTable->pfnGetInfo = urDeviceGetInfo; pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; - pDdiTable->pfnPartition = nullptr; + pDdiTable->pfnPartition = urDevicePartition; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnRetain = urDeviceRetain; pDdiTable->pfnSelectBinary = nullptr; From 3ae23298f2869e319739518938c88a45cce96aec Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 7 Apr 2023 15:53:48 +0100 Subject: [PATCH 06/45] [SYCL][PI][UR][CUDA] Port CUDA queue and event to Unified Runtime --- sycl/plugins/cuda/CMakeLists.txt | 5 + sycl/plugins/cuda/pi_cuda.cpp | 794 ++---------------- sycl/plugins/cuda/pi_cuda.hpp | 419 ++------- sycl/plugins/unified_runtime/CMakeLists.txt | 5 + .../ur/adapters/cuda/enqueue.cpp | 110 +++ .../ur/adapters/cuda/event.cpp | 309 +++++++ .../ur/adapters/cuda/event.hpp | 191 +++++ .../ur/adapters/cuda/queue.cpp | 326 +++++++ .../ur/adapters/cuda/queue.hpp | 253 ++++++ 9 files changed, 1337 insertions(+), 1075 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index cbc5e8f9e9638..6339f1e3466ea 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -61,12 +61,17 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/context.hpp" "../unified_runtime/ur/adapters/cuda/device.cpp" "../unified_runtime/ur/adapters/cuda/device.hpp" + "../unified_runtime/ur/adapters/cuda/enqueue.cpp" + "../unified_runtime/ur/adapters/cuda/event.cpp" + "../unified_runtime/ur/adapters/cuda/event.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/program.cpp" "../unified_runtime/ur/adapters/cuda/program.hpp" "../unified_runtime/ur/adapters/cuda/kernel.cpp" "../unified_runtime/ur/adapters/cuda/kernel.hpp" + "../unified_runtime/ur/adapters/cuda/queue.hpp" + "../unified_runtime/ur/adapters/cuda/queue.cpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 31c85b3877091..c2c08b645b03a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -73,6 +73,27 @@ pi_result cuda_piPluginGetBackendOption(pi_platform, return PI_ERROR_INVALID_VALUE; } +pi_result map_ur_error(ur_result_t result) { + switch (result) { + case UR_RESULT_SUCCESS: + return PI_SUCCESS; + case UR_RESULT_ERROR_INVALID_OPERATION: + return PI_ERROR_INVALID_OPERATION; + case UR_RESULT_ERROR_INVALID_CONTEXT: + return PI_ERROR_INVALID_CONTEXT; + case UR_RESULT_ERROR_INVALID_DEVICE: + return PI_ERROR_INVALID_DEVICE; + case UR_RESULT_ERROR_INVALID_VALUE: + return PI_ERROR_INVALID_VALUE; + case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: + return PI_ERROR_OUT_OF_HOST_MEMORY; + case UR_RESULT_ERROR_OUT_OF_RESOURCES: + return 
PI_ERROR_OUT_OF_RESOURCES; + default: + return PI_ERROR_UNKNOWN; + } +} + // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -411,257 +432,11 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); -pi_result cuda_piEventRelease(pi_event event); -pi_result cuda_piEventRetain(pi_event event); } // extern "C" /// \endcond -void _pi_queue::compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; - } -} - -void _pi_queue::transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - transfer_applied_barrier_[stream_i] = true; - } -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) { - pi_uint32 stream_i; - pi_uint32 token; - while (true) { - if (num_compute_streams_ < compute_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); - } - } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); - // if a stream has been reused before it was next selected round-robin - // fashion, we want to delay its next use and instead select another one - // that is more likely to have completed all the enqueued work. 
- if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; - } else { - break; - } - } - if (stream_token) { - *stream_token = token; - } - CUstream res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token) { - for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) { - pi_uint32 token = event_wait_list[i]->get_compute_stream_token(); - if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); - // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - pi_uint32 stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; - } - guard = _pi_stream_guard{std::move(compute_sync_guard)}; - CUstream res = event_wait_list[i]->get_stream(); - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; - } - } - } - guard = {}; - return get_next_compute_stream(stream_token); -} - -CUstream _pi_queue::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); - } - if (num_transfer_streams_ < transfer_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); - } - } - pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - CUstream res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token) - : commandType_{type}, refCount_{1}, has_ownership_{true}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { - - bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; - - PI_CHECK_ERROR(cuEventCreate( - &evEnd_, profilingEnabled ? 
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - - if (profilingEnabled) { - PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); - } - - if (queue_ != nullptr) { - cuda_piQueueRetain(queue_); - } - pi2ur::piContextRetain(context_); -} - -_pi_event::_pi_event(pi_context context, CUevent eventNative) - : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { - pi2ur::piContextRetain(context_); -} - -_pi_event::~_pi_event() { - if (queue_ != nullptr) { - cuda_piQueueRelease(queue_); - } - pi2ur::piContextRelease(context_); -} - -pi_result _pi_event::start() { - assert(!is_started()); - pi_result result = PI_SUCCESS; - - try { - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - // NOTE: This relies on the default stream to be unused. - result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_)); - } - } catch (pi_result error) { - result = error; - } - - isStarted_ = true; - return result; -} - -bool _pi_event::is_completed() const noexcept { - if (!isRecorded_) { - return false; - } - if (!hasBeenWaitedOn_) { - const CUresult ret = cuEventQuery(evEnd_); - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { - PI_CHECK_ERROR(ret); - return false; - } - if (ret == CUDA_ERROR_NOT_READY) { - return false; - } - } - return true; -} - -pi_uint64 _pi_event::get_queued_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evQueued_); -} - -pi_uint64 _pi_event::get_start_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evStart_); -} - -pi_uint64 _pi_event::get_end_time() const { - assert(is_started() && is_recorded()); - return queue_->get_device()->get_elapsed_time(evEnd_); -} - -pi_result _pi_event::record() { - - if (is_recorded() || !is_started()) { - return PI_ERROR_INVALID_EVENT; - } - - pi_result result = PI_ERROR_INVALID_OPERATION; - - if (!queue_) { - return PI_ERROR_INVALID_QUEUE; - } - - try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { - sycl::detail::pi::die( - "Unrecoverable program state reached in event identifier overflow"); - } - result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); - } catch (pi_result error) { - result = error; - } - - if (result == PI_SUCCESS) { - isRecorded_ = true; - } - - return result; -} - -pi_result _pi_event::wait() { - pi_result retErr; - try { - retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; - } catch (pi_result error) { - retErr = error; - } - - return retErr; -} - -pi_result _pi_event::release() { - if (!backend_has_ownership()) - return PI_SUCCESS; - - assert(queue_ != nullptr); - - PI_CHECK_ERROR(cuEventDestroy(evEnd_)); - - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - PI_CHECK_ERROR(cuEventDestroy(evQueued_)); - PI_CHECK_ERROR(cuEventDestroy(evStart_)); - } - - return PI_SUCCESS; -} - // makes all future work submitted to queue wait for all work captured in event. pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. 
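The enqueue entry points that remain in this file all follow the same interim pattern while the port is in progress: CUDA work is still submitted directly, but event bookkeeping now goes through _pi_event (a thin wrapper over ur_event_handle_t_), and map_ur_error translates the ur_result_t returned by its methods back into a pi_result. A minimal sketch of that pattern, using a hypothetical copy wrapper and only helpers already present in this file (error propagation trimmed for brevity):

pi_result cuda_piEnqueueExampleCopy(pi_queue command_queue, CUdeviceptr dst,
                                    CUdeviceptr src, size_t size,
                                    pi_event *event) {
  pi_result result = PI_SUCCESS;
  ScopedContext active(command_queue->get_context());
  CUstream cuStream = command_queue->get_next_compute_stream();

  std::unique_ptr<_pi_event> retImplEv{nullptr};
  if (event) {
    retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native(
        PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, cuStream));
    // The ported event methods return ur_result_t; map it back to PI.
    result = map_ur_error(retImplEv->start());
  }

  result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, cuStream));

  if (event) {
    result = map_ur_error(retImplEv->record());
    *event = retImplEv.release();
  }
  return result;
}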
@@ -1028,254 +803,6 @@ pi_result cuda_piextMemImageCreateWithNativeHandle(pi_native_handle, pi_context, return {}; } -/// Creates a `pi_queue` object on the CUDA backend. -/// Valid properties -/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT -/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -/// \return Pi queue object mapping to a CUStream -/// -pi_result cuda_piQueueCreate(pi_context context, pi_device device, - pi_queue_properties properties, pi_queue *queue) { - try { - std::unique_ptr<_pi_queue> queueImpl{nullptr}; - - if (context->get_device() != device) { - *queue = nullptr; - return PI_ERROR_INVALID_DEVICE; - } - - unsigned int flags = 0; - if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { - flags = CU_STREAM_DEFAULT; - } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) { - flags = 0; - } else { - flags = CU_STREAM_NON_BLOCKING; - } - - const bool is_out_of_order = - properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - - std::vector computeCuStreams( - is_out_of_order ? _pi_queue::default_num_compute_streams : 1); - std::vector transferCuStreams( - is_out_of_order ? _pi_queue::default_num_transfer_streams : 0); - - queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), - context, device, properties, flags}); - - *queue = queueImpl.release(); - - return PI_SUCCESS; - } catch (pi_result err) { - - return err; - - } catch (...) { - - return PI_ERROR_OUT_OF_RESOURCES; - } -} -pi_result cuda_piextQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - assert(Properties); - // Expect flags mask to be passed first. - assert(Properties[0] == PI_QUEUE_FLAGS); - if (Properties[0] != PI_QUEUE_FLAGS) - return PI_ERROR_INVALID_VALUE; - pi_queue_properties Flags = Properties[1]; - // Extra data isn't supported yet. - assert(Properties[2] == 0); - if (Properties[2] != 0) - return PI_ERROR_INVALID_VALUE; - return cuda_piQueueCreate(Context, Device, Flags, Queue); -} - -pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(command_queue != nullptr); - - switch (param_name) { - case PI_QUEUE_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->context_); - case PI_QUEUE_INFO_DEVICE: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->device_); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->get_reference_count()); - case PI_QUEUE_INFO_PROPERTIES: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->properties_); - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - try { - bool IsReady = command_queue->all_of([](CUstream s) -> bool { - const CUresult ret = cuStreamQuery(s); - if (ret == CUDA_SUCCESS) - return true; - - if (ret == CUDA_ERROR_NOT_READY) - return false; - - PI_CHECK_ERROR(ret); - return false; - }); - return getInfo(param_value_size, param_value, param_value_size_ret, - IsReady); - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Queue info request not implemented"); - return {}; -} - -pi_result cuda_piQueueRetain(pi_queue command_queue) { - assert(command_queue != nullptr); - assert(command_queue->get_reference_count() > 0); - - command_queue->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piQueueRelease(pi_queue command_queue) { - assert(command_queue != nullptr); - - if (command_queue->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - try { - std::unique_ptr<_pi_queue> queueImpl(command_queue); - - if (!command_queue->backend_has_ownership()) - return PI_SUCCESS; - - ScopedContext active(command_queue->get_context()); - - command_queue->for_each_stream([](CUstream s) { - PI_CHECK_ERROR(cuStreamSynchronize(s)); - PI_CHECK_ERROR(cuStreamDestroy(s)); - }); - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piQueueFinish(pi_queue command_queue) { - pi_result result = PI_SUCCESS; - - try { - - assert(command_queue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); - - command_queue->sync_streams([&result](CUstream s) { - result = PI_CHECK_ERROR(cuStreamSynchronize(s)); - }); - - } catch (pi_result err) { - - result = err; - - } catch (...) { - - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; -} - -// There is no CUDA counterpart for queue flushing and we don't run into the -// same problem of having to flush cross-queue dependencies as some of the -// other plugins, so it can be left as no-op. -pi_result cuda_piQueueFlush(pi_queue command_queue) { - (void)command_queue; - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI queue object -/// -/// \param[in] queue The PI queue to get the native CUDA object of. -/// \param[in] NativeHandleDesc Pointer to additional native handle info. -/// \param[out] nativeHandle Set to the native handle of the PI queue object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, - pi_native_handle *nativeHandle, - int32_t *NativeHandleDesc) { - *NativeHandleDesc = 0; - ScopedContext active(queue->get_context()); - *nativeHandle = - reinterpret_cast(queue->get_next_compute_stream()); - return PI_SUCCESS; -} - -/// Created a PI queue object from a CUDA queue handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI queue object from. -/// \param[in] nativeHandleDesc Info about the native handle. -/// \param[in] context is the PI context of the queue. -/// \param[out] queue Set to the PI queue object created from native handle. -/// \param ownNativeHandle tells if SYCL RT should assume the ownership of -/// the native handle, if it can. 
-/// -/// \return TBD -pi_result cuda_piextQueueCreateWithNativeHandle( - pi_native_handle nativeHandle, int32_t NativeHandleDesc, pi_context context, - pi_device device, bool ownNativeHandle, pi_queue_properties *Properties, - pi_queue *queue) { - (void)NativeHandleDesc; - (void)device; - (void)ownNativeHandle; - (void)Properties; - assert(ownNativeHandle == false); - - unsigned int flags; - CUstream cuStream = reinterpret_cast(nativeHandle); - - auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags)); - - pi_queue_properties properties = 0; - if (flags == CU_STREAM_DEFAULT) - properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM; - else if (flags == CU_STREAM_NON_BLOCKING) - properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT; - else - sycl::detail::pi::die("Unknown cuda stream"); - - std::vector computeCuStreams(1, cuStream); - std::vector transferCuStreams(0); - - // Create queue and set num_compute_streams to 1, as computeCuStreams has - // valid stream - *queue = new _pi_queue{std::move(computeCuStreams), - std::move(transferCuStreams), - context, - reinterpret_cast(context->get_device()), - properties, - flags, - /*backend_owns*/ false}; - (*queue)->num_compute_streams_ = 1; - - return retErr; -} - pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, size_t offset, size_t size, const void *ptr, @@ -1306,7 +833,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -1352,7 +879,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream)); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -1369,41 +896,6 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, return retErr; } -pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { - - try { - assert(num_events != 0); - assert(event_list); - if (num_events == 0) { - return PI_ERROR_INVALID_VALUE; - } - - if (!event_list) { - return PI_ERROR_INVALID_EVENT; - } - - auto context = event_list[0]->get_context(); - ScopedContext active(context); - - auto waitFunc = [context](pi_event event) -> pi_result { - if (!event) { - return PI_ERROR_INVALID_EVENT; - } - - if (event->get_context() != context) { - return PI_ERROR_INVALID_CONTEXT; - } - - return event->wait(); - }; - return forLatestEvents(event_list, num_events, waitFunc); - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, const pi_mem *arg_value) { @@ -1526,14 +1018,15 @@ pi_result cuda_piEnqueueKernelLaunch( kernelLocalWorkGroupSize += local_work_size[dim]; } - if (hasExceededMaxRegistersPerBlock(command_queue->device_, kernel, - kernelLocalWorkGroupSize)) { + if (hasExceededMaxRegistersPerBlock( + reinterpret_cast(command_queue->device_), kernel, + kernelLocalWorkGroupSize)) { return PI_ERROR_INVALID_WORK_GROUP_SIZE; } } else { - guessLocalWorkSize(command_queue->device_, threadsPerBlock, - global_work_size, maxThreadsPerBlock, kernel, - local_size); + guessLocalWorkSize(reinterpret_cast(command_queue->device_), + threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel, local_size); } } @@ -1554,7 +1047,9 @@ pi_result cuda_piEnqueueKernelLaunch( pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); CUfunction cuFunc = kernel->get(); retError = enqueueEventsWait(command_queue, cuStream, @@ -1615,7 +1110,7 @@ pi_result cuda_piEnqueueKernelLaunch( kernel->clear_local_size(); if (event) { - retError = retImplEv->record(); + retError = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } } catch (pi_result err) { @@ -1802,124 +1297,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -// -// Events -// -pi_result cuda_piEventCreate(pi_context, pi_event *) { - sycl::detail::pi::die("PI Event Create not implemented in CUDA backend"); -} - -pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(event != nullptr); - - switch (param_name) { - case PI_EVENT_INFO_COMMAND_QUEUE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_queue()); - case PI_EVENT_INFO_COMMAND_TYPE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_command_type()); - case PI_EVENT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_reference_count()); - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(event->get_execution_status())); - } - case PI_EVENT_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_context()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_INVALID_EVENT; -} - -/// Obtain profiling information from PI CUDA events -/// \TODO Timings from CUDA are only elapsed time. 
-pi_result cuda_piEventGetProfilingInfo(pi_event event, - pi_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - - assert(event != nullptr); - - pi_queue queue = event->get_queue(); - if (queue == nullptr || - !(queue->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE)) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - switch (param_name) { - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No user for this case - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_queued_time()); - case PI_PROFILING_INFO_COMMAND_START: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_start_time()); - case PI_PROFILING_INFO_COMMAND_END: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_end_time()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Event Profiling info request not implemented"); - return {}; -} - -pi_result cuda_piEventSetCallback(pi_event, pi_int32, pfn_notify, void *) { - sycl::detail::pi::die("Event Callback not implemented in CUDA backend"); - return PI_SUCCESS; -} - -pi_result cuda_piEventSetStatus(pi_event, pi_int32) { - sycl::detail::pi::die("Event Set Status not implemented in CUDA backend"); - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEventRetain(pi_event event) { - assert(event != nullptr); - - const auto refCount = event->increment_reference_count(); - - sycl::detail::pi::assertion( - refCount != 0, - "Reference count overflow detected in cuda_piEventRetain."); - - return PI_SUCCESS; -} - -pi_result cuda_piEventRelease(pi_event event) { - assert(event != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - event->get_reference_count() != 0, - "Reference count overflow detected in cuda_piEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (event->decrement_reference_count() == 0) { - std::unique_ptr<_pi_event> event_ptr{event}; - pi_result result = PI_ERROR_INVALID_EVENT; - try { - ScopedContext active(event->get_context()); - result = event->release(); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - return result; - } - - return PI_SUCCESS; -} - /// Enqueues a wait on the given CUstream for all events. /// See \ref enqueueEventWait /// TODO: Add support for multiple streams once the Event class is properly @@ -1962,7 +1339,9 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); + num_events_in_wait_list, + reinterpret_cast(event_wait_list), guard, + &stream_token); { std::lock_guard guard(command_queue->barrier_mutex_); if (command_queue->barrier_event_ == nullptr) { @@ -2027,41 +1406,6 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } } -/// Gets the native CUDA handle of a PI event object -/// -/// \param[in] event The PI event to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI event object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_EVENT if given a user event. 
-pi_result cuda_piextEventGetNativeHandle(pi_event event, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(event->get()); - return PI_SUCCESS; -} - -/// Created a PI event object from a CUDA event handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI event object from. -/// \param[out] event Set to the PI event object created from native handle. -/// -/// \return TBD -pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_event *event) { - (void)ownNativeHandle; - assert(!ownNativeHandle); - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - *event = _pi_event::make_with_native(context, - reinterpret_cast(nativeHandle)); - - return PI_SUCCESS; -} - /// Creates a PI sampler object /// /// \param[in] context The context the sampler is created for. @@ -2297,7 +1641,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( host_offset, host_row_pitch, host_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_read) { @@ -2347,7 +1691,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( buffer_row_pitch, buffer_slice_pitch); if (event) { - retErr = retImplEv->record(); + retErr = map_ur_error(retImplEv->record()); } if (blocking_write) { @@ -2387,7 +1731,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; @@ -2396,7 +1740,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -2489,7 +1833,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = retImplEv->start(); + result = map_ur_error(retImplEv->start()); } auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; @@ -2541,7 +1885,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, } if (event) { - result = retImplEv->record(); + result = map_ur_error(retImplEv->record()); *event = retImplEv.release(); } @@ -3088,7 +2432,9 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_uint32 stream_token; _pi_stream_guard guard; CUstream cuStream = queue->get_next_compute_stream( - num_events_in_waitlist, events_waitlist, guard, &stream_token); + num_events_in_waitlist, + reinterpret_cast(events_waitlist), guard, + &stream_token); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -3099,7 +2445,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, result = PI_CHECK_ERROR(cuMemsetD8Async( (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3134,7 +2480,7 @@ pi_result 
cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, result = PI_CHECK_ERROR(cuMemcpyAsync( (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); } if (blocking) { result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); @@ -3198,7 +2544,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, result = PI_CHECK_ERROR( cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream)); if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3299,7 +2645,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, sycl::detail::pi::die("Unknown advice"); } if (event) { - result = event_ptr->record(); + result = map_ur_error(event_ptr->record()); *event = event_ptr.release(); } } catch (pi_result err) { @@ -3670,16 +3016,20 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextContextCreateWithNativeHandle, pi2ur::piextContextCreateWithNativeHandle) // Queue - _PI_CL(piQueueCreate, cuda_piQueueCreate) - _PI_CL(piextQueueCreate, cuda_piextQueueCreate) - _PI_CL(piQueueGetInfo, cuda_piQueueGetInfo) - _PI_CL(piQueueFinish, cuda_piQueueFinish) - _PI_CL(piQueueFlush, cuda_piQueueFlush) - _PI_CL(piQueueRetain, cuda_piQueueRetain) - _PI_CL(piQueueRelease, cuda_piQueueRelease) - _PI_CL(piextQueueGetNativeHandle, cuda_piextQueueGetNativeHandle) + _PI_CL(piQueueCreate, pi2ur::piQueueCreate) + _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate) + _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2) + _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) + _PI_CL(piQueueFinish, pi2ur::piQueueFinish) + _PI_CL(piQueueFlush, pi2ur::piQueueFlush) + _PI_CL(piQueueRetain, pi2ur::piQueueRetain) + _PI_CL(piQueueRelease, pi2ur::piQueueRelease) + _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) + _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2) _PI_CL(piextQueueCreateWithNativeHandle, - cuda_piextQueueCreateWithNativeHandle) + pi2ur::piextQueueCreateWithNativeHandle) + _PI_CL(piextQueueCreateWithNativeHandle2, + pi2ur::piextQueueCreateWithNativeHandle2) // Memory _PI_CL(piMemBufferCreate, cuda_piMemBufferCreate) _PI_CL(piMemImageCreate, cuda_piMemImageCreate) @@ -3721,17 +3071,17 @@ pi_result piPluginInit(pi_plugin *PluginInit) { pi2ur::piextKernelCreateWithNativeHandle) // Event - _PI_CL(piEventCreate, cuda_piEventCreate) - _PI_CL(piEventGetInfo, cuda_piEventGetInfo) - _PI_CL(piEventGetProfilingInfo, cuda_piEventGetProfilingInfo) - _PI_CL(piEventsWait, cuda_piEventsWait) - _PI_CL(piEventSetCallback, cuda_piEventSetCallback) - _PI_CL(piEventSetStatus, cuda_piEventSetStatus) - _PI_CL(piEventRetain, cuda_piEventRetain) - _PI_CL(piEventRelease, cuda_piEventRelease) - _PI_CL(piextEventGetNativeHandle, cuda_piextEventGetNativeHandle) + _PI_CL(piEventCreate, pi2ur::piEventCreate) + _PI_CL(piEventGetInfo, pi2ur::piEventGetInfo) + _PI_CL(piEventGetProfilingInfo, pi2ur::piEventGetProfilingInfo) + _PI_CL(piEventsWait, pi2ur::piEventsWait) + _PI_CL(piEventSetCallback, pi2ur::piEventSetCallback) + _PI_CL(piEventSetStatus, pi2ur::piEventSetStatus) + _PI_CL(piEventRetain, pi2ur::piEventRetain) + _PI_CL(piEventRelease, pi2ur::piEventRelease) + _PI_CL(piextEventGetNativeHandle, pi2ur::piextEventGetNativeHandle) _PI_CL(piextEventCreateWithNativeHandle, - cuda_piextEventCreateWithNativeHandle) + pi2ur::piextEventCreateWithNativeHandle) // Sampler 
_PI_CL(piSamplerCreate, cuda_piSamplerCreate) _PI_CL(piSamplerGetInfo, cuda_piSamplerGetInfo) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 51f6b7f2a34b4..1a8c7e64537cd 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -47,6 +47,8 @@ #include #include #include +#include +#include // Share code between the PI Plugin and UR Adapter #include @@ -54,8 +56,6 @@ extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piQueueRelease(pi_queue); -pi_result cuda_piQueueRetain(pi_queue); pi_result cuda_piMemRetain(pi_mem); pi_result cuda_piMemRelease(pi_mem); /// \endcond @@ -298,368 +298,81 @@ struct _pi_mem { /// PI queue mapping on to CUstream objects. /// -struct _pi_queue { - using native_type = CUstream; - static constexpr int default_num_compute_streams = 128; - static constexpr int default_num_transfer_streams = 64; - - std::vector compute_streams_; - std::vector transfer_streams_; - // delay_compute_ keeps track of which streams have been recently reused and - // their next use should be delayed. If a stream has been recently reused it - // will be skipped the next time it would be selected round-robin style. When - // skipped, its delay flag is cleared. - std::vector delay_compute_; - // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - _pi_context *context_; - _pi_device *device_; - pi_queue_properties properties_; - CUevent barrier_event_ = nullptr; - CUevent barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first - // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - bool has_ownership_; - - _pi_queue(std::vector &&compute_streams, - std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties, - unsigned int flags, bool backend_owns = true) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, - last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, - flags_(flags), has_ownership_{backend_owns} { - pi2ur::piContextRetain(context_); - pi2ur::piDeviceRetain(device_); - } - - ~_pi_queue() { - pi2ur::piContextRelease(context_); - pi2ur::piDeviceRelease(device_); - } - - void compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - void transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - - // get_next_compute/transfer_stream() functions return streams from - // appropriate pools in round-robin fashion - native_type get_next_compute_stream(pi_uint32 
*stream_token = nullptr); - // this overload tries select a stream that was used by one of dependancies. - // If that is not possible returns a new stream. If a stream is reused it - // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - return last_sync_compute_streams_ > stream_token; - } - - bool can_reuse_stream(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - // If the command represented by the stream token was not the last command - // enqueued to the stream we can not reuse the stream - we need to allow for - // commands enqueued after it and the one we are about to enqueue to run - // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); - // If there was a barrier enqueued to the queue after the command - // represented by the stream token we should not reuse the stream, as we can - // not take that stream into account for the bookkeeping for the next - // barrier - such a stream would not be synchronized with. Performance-wise - // it does not matter that we do not reuse the stream, as the work - // represented by the stream token is guaranteed to be complete by the - // barrier before any work we are about to enqueue to the stream will start, - // so the event does not need to be synchronized with. 
- return is_last_command && !has_been_synchronized(stream_token); - } - - template bool all_of(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) - return false; - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) - return false; - } - return true; - } - - template void for_each_stream(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); - } - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); - } - } - } - - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; - } - }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - } - }; - { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard( - compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? num_compute_streams_ - : compute_stream_idx_.load(); - if (end - start >= size) { - sync_compute(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_compute(start, end); - } else { - sync_compute(start, size); - sync_compute(0, end); - } - } - if (ResetUsed) { - last_sync_compute_streams_ = end; - } - } - { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? 
num_transfer_streams_ - : transfer_stream_idx_.load(); - if (end - start >= size) { - sync_transfer(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_transfer(start, end); - } else { - sync_transfer(start, size); - sync_transfer(0, end); - } - } - if (ResetUsed) { - last_sync_transfer_streams_ = end; - } - } - } - } - - _pi_context *get_context() const { return context_; }; - - _pi_device *get_device() const { return device_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_uint32 get_next_event_id() noexcept { return ++eventCount_; } - - bool backend_has_ownership() const noexcept { return has_ownership_; } +struct _pi_queue : ur_queue_handle_t_ { + using ur_queue_handle_t_::ur_queue_handle_t_; }; typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, void *userData); -/// PI Event mapping to CUevent -/// -struct _pi_event { -public: - using native_type = CUevent; - - pi_result record(); - - pi_result wait(); - - pi_result start(); - - native_type get() const noexcept { return evEnd_; }; - - pi_queue get_queue() const noexcept { return queue_; } - - CUstream get_stream() const noexcept { return stream_; } - - pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; } - - pi_command_type get_command_type() const noexcept { return commandType_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - bool is_recorded() const noexcept { return isRecorded_; } - - bool is_started() const noexcept { return isStarted_; } - - bool is_completed() const noexcept; - - pi_int32 get_execution_status() const noexcept { - - if (!is_recorded()) { - return PI_EVENT_SUBMITTED; - } - - if (!is_completed()) { - return PI_EVENT_RUNNING; - } - return PI_EVENT_COMPLETE; - } - - pi_context get_context() const noexcept { return context_; }; - - pi_uint32 increment_reference_count() { return ++refCount_; } - - pi_uint32 decrement_reference_count() { return --refCount_; } - pi_uint32 get_event_id() const noexcept { return eventId_; } +struct _pi_event : ur_event_handle_t_ { + using ur_event_handle_t_::ur_event_handle_t_; - bool backend_has_ownership() const noexcept { return has_ownership_; } - - // Returns the counter time when the associated command(s) were enqueued - // - pi_uint64 get_queued_time() const; - - // Returns the counter time when the associated command(s) started execution - // - pi_uint64 get_start_time() const; - - // Returns the counter time when the associated command(s) completed - // - pi_uint64 get_end_time() const; - - // construct a native CUDA. This maps closely to the underlying CUDA event. 
+ // Helpers for queue command implementations until they also get ported to UR static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream, - pi_uint32 stream_token = std::numeric_limits::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, - stream_token); - } + uint32_t stream_token = std::numeric_limits::max()) { + auto urQueue = reinterpret_cast(queue); + static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { + {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, + {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, + {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, + UR_COMMAND_MEM_BUFFER_WRITE_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, + {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, + {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, + {PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, + {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, + {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, + {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, + }; - static pi_event make_with_native(pi_context context, CUevent eventNative) { - return new _pi_event(context, eventNative); + // TODO(ur): There is no exact mapping for the following commands. Just + // default to KERNEL_LAUNCH for now. + // PI_COMMAND_TYPE_USER + // PI_COMMAND_TYPE_MEM_BUFFER_FILL, + // PI_COMMAND_TYPE_IMAGE_READ, + // PI_COMMAND_TYPE_IMAGE_WRITE, + // PI_COMMAND_TYPE_IMAGE_COPY, + // PI_COMMAND_TYPE_NATIVE_KERNEL, + // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, + // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, + // PI_COMMAND_TYPE_MAP_IMAGE, + // PI_COMMAND_TYPE_MARKER, + // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, + // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, + // PI_COMMAND_TYPE_BARRIER, + // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, + // PI_COMMAND_TYPE_FILL_IMAGE + // PI_COMMAND_TYPE_SVM_FREE + // PI_COMMAND_TYPE_SVM_MEMCPY + // PI_COMMAND_TYPE_SVM_MEMFILL + // PI_COMMAND_TYPE_SVM_MAP + // PI_COMMAND_TYPE_SVM_UNMAP + + ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; + auto cmdIt = cmdMap.find(type); + if (cmdIt != cmdMap.end()) { + urCmd = cmdIt->second; + } + return reinterpret_cast( + ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); } - pi_result release(); - - ~_pi_event(); - -private: - // This constructor is private to force programmers to use the make_native / - // make_user static members in order to create a pi_event for CUDA. - _pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token); - - // This constructor is private to force programmers to use the - // make_with_native for event introp - _pi_event(pi_context context, CUevent eventNative); - - pi_command_type commandType_; // The type of command associated with event. - - std::atomic_uint32_t refCount_; // Event reference count. - - bool has_ownership_; // Signifies if event owns the native type. 
- - bool hasBeenWaitedOn_; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. - - bool isRecorded_; // Signifies wether a native CUDA event has been recorded - // yet. - bool isStarted_; // Signifies wether the operation associated with the - // PI event has started or not - // - - pi_uint32 streamToken_; - pi_uint32 eventId_; // Queue identifier of the event. - - native_type evEnd_; // CUDA event handle. If this _pi_event represents a user - // event, this will be nullptr. - - native_type evStart_; // CUDA event handle associated with the start - - native_type evQueued_; // CUDA event handle associated with the time - // the command was enqueued - - pi_queue queue_; // pi_queue associated with the event. If this is a user - // event, this will be nullptr. - - CUstream stream_; // CUstream associated with the event. If this is a user - // event, this will be uninitialized. - - pi_context context_; // pi_context associated with the event. If this is a - // native event, this will be the same context associated - // with the queue_ member. + static pi_event make_with_native(ur_context_handle_t context, + CUevent eventNative) { + auto urContext = reinterpret_cast(context); + return reinterpret_cast( + ur_event_handle_t_::make_with_native(urContext, eventNative)); + } }; /// Implementation of PI Program on CUDA Module object diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 014938c9ba542..dc572bd5e7e9c 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -137,12 +137,17 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/context.hpp" "ur/adapters/cuda/device.cpp" "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/enqueue.cpp" + "ur/adapters/cuda/event.cpp" + "ur/adapters/cuda/event.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/program.cpp" "ur/adapters/cuda/program.hpp" "ur/adapters/cuda/kernel.cpp" "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/queue.cpp" + "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp new file mode 100644 index 0000000000000..3dfa1ba1dbd5c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -0,0 +1,110 @@ +//===--------- enqueue.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" +#include "queue.hpp" + +#include + +/// Enqueues a wait on the given CUstream for all specified events (See +/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued +/// wait will wait on all previous events in the queue. +/// +ur_result_t urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // This function makes one stream work on the previous work (or work + // represented by input events) and then all future work waits on that stream. 
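+  // Concretely: one compute stream is chosen and made to wait on the given
+  // events (or, when the wait list is empty, on a temporary event recorded on
+  // every other in-use stream), barrier_event_ is then recorded on that
+  // stream, and the applied-barrier flags are cleared so that every stream
+  // handed out afterwards first waits on barrier_event_ before accepting new
+  // work.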
+ if (!hQueue) { + return UR_RESULT_ERROR_INVALID_QUEUE; + } + + ur_result_t result; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + { + std::lock_guard guard(hQueue->barrier_mutex_); + if (hQueue->barrier_event_ == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->barrier_event_, CU_EVENT_DISABLE_TIMING)); + } + if (numEventsInWaitList == 0) { // wait on all work + if (hQueue->barrier_tmp_event_ == nullptr) { + UR_CHECK_ERROR(cuEventCreate(&hQueue->barrier_tmp_event_, + CU_EVENT_DISABLE_TIMING)); + } + hQueue->sync_streams( + [cuStream, tmp_event = hQueue->barrier_tmp_event_](CUstream s) { + if (cuStream != s) { + // record a new CUDA event on every stream and make one stream + // wait for these events + UR_CHECK_ERROR(cuEventRecord(tmp_event, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); + } + }); + } else { // wait just on given events + forLatestEvents(phEventWaitList, numEventsInWaitList, + [cuStream](ur_event_handle_t event) -> ur_result_t { + if (event->get_queue()->has_been_synchronized( + event->get_compute_stream_token())) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR( + cuStreamWaitEvent(cuStream, event->get(), 0)); + } + }); + } + + result = UR_CHECK_ERROR(cuEventRecord(hQueue->barrier_event_, cuStream)); + for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); + i++) { + hQueue->compute_applied_barrier_[i] = false; + } + for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); + i++) { + hQueue->transfer_applied_barrier_[i] = false; + } + } + if (result != UR_RESULT_SUCCESS) { + return result; + } + + if (phEvent) { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, cuStream, stream_token); + (*phEvent)->start(); + (*phEvent)->record(); + } + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// Enqueues a wait on the given CUstream for all events. +/// See \ref enqueueEventWait +/// TODO: Add support for multiple streams once the Event class is properly +/// refactored. +/// +ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp new file mode 100644 index 0000000000000..6788de883e971 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -0,0 +1,309 @@ +//===--------- event.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "event.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "queue.hpp" + +#include +#include + +ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, + ur_context_handle_t context, + ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token) + : commandType_{type}, refCount_{1}, has_ownership_{true}, + hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, + streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { + + bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; + + UR_CHECK_ERROR(cuEventCreate( + &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + + if (profilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); + } + + if (queue_ != nullptr) { + urQueueRetain(queue_); + } + urContextRetain(context_); +} + +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, + CUevent eventNative) + // TODO(ur): Missing user command type + : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, + hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, + streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ + context} { + urContextRetain(context_); +} + +ur_event_handle_t_::~ur_event_handle_t_() { + if (queue_ != nullptr) { + urQueueRelease(queue_); + } + urContextRelease(context_); +} + +ur_result_t ur_event_handle_t_::start() { + assert(!is_started()); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
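+      // evQueued_ is recorded on the (assumed idle) default stream, so it
+      // completes right away and captures the submission time; evStart_ is
+      // recorded on the command's own stream, so it captures the point at
+      // which earlier work on that stream has drained and this command can
+      // begin.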
+ result = UR_CHECK_ERROR(cuEventRecord(evQueued_, 0)); + result = UR_CHECK_ERROR(cuEventRecord(evStart_, stream_)); + } + } catch (ur_result_t error) { + result = error; + } + + isStarted_ = true; + return result; +} + +bool ur_event_handle_t_::is_completed() const noexcept { + if (!isRecorded_) { + return false; + } + if (!hasBeenWaitedOn_) { + const CUresult ret = cuEventQuery(evEnd_); + if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(ret); + return false; + } + if (ret == CUDA_ERROR_NOT_READY) { + return false; + } + } + return true; +} + +uint64_t ur_event_handle_t_::get_queued_time() const { + assert(is_started()); + return queue_->get_device()->get_elapsed_time(evQueued_); +} + +uint64_t ur_event_handle_t_::get_start_time() const { + assert(is_started()); + return queue_->get_device()->get_elapsed_time(evStart_); +} + +uint64_t ur_event_handle_t_::get_end_time() const { + assert(is_started() && is_recorded()); + return queue_->get_device()->get_elapsed_time(evEnd_); +} + +ur_result_t ur_event_handle_t_::record() { + + if (is_recorded() || !is_started()) { + return UR_RESULT_ERROR_INVALID_EVENT; + } + + ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + + UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + + try { + eventId_ = queue_->get_next_event_id(); + if (eventId_ == 0) { + sycl::detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + result = UR_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); + } catch (ur_result_t error) { + result = error; + } + + if (result == UR_RESULT_SUCCESS) { + isRecorded_ = true; + } + + return result; +} + +ur_result_t ur_event_handle_t_::wait() { + ur_result_t retErr; + try { + retErr = UR_CHECK_ERROR(cuEventSynchronize(evEnd_)); + hasBeenWaitedOn_ = true; + } catch (ur_result_t error) { + retErr = error; + } + + return retErr; +} + +ur_result_t ur_event_handle_t_::release() { + if (!backend_has_ownership()) + return UR_RESULT_SUCCESS; + + assert(queue_ != nullptr); + + UR_CHECK_ERROR(cuEventDestroy(evEnd_)); + + if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(evQueued_)); + UR_CHECK_ERROR(cuEventDestroy(evStart_)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + ur_event_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + switch (propName) { + case UR_EVENT_INFO_COMMAND_QUEUE: + return ReturnValue(hEvent->get_queue()); + case UR_EVENT_INFO_COMMAND_TYPE: + return ReturnValue(hEvent->get_command_type()); + case UR_EVENT_INFO_REFERENCE_COUNT: + return ReturnValue(hEvent->get_reference_count()); + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: + return ReturnValue(hEvent->get_execution_status()); + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->get_context()); + default: + sycl::detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// Obtain profiling information from PI CUDA events +/// \TODO Timings from CUDA are only elapsed time. 
+UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t hEvent, ur_profiling_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + ur_queue_handle_t queue = hEvent->get_queue(); + if (queue == nullptr || + !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No user for this case + return ReturnValue(static_cast(hEvent->get_queued_time())); + case UR_PROFILING_INFO_COMMAND_START: + return ReturnValue(static_cast(hEvent->get_start_time())); + case UR_PROFILING_INFO_COMMAND_END: + return ReturnValue(static_cast(hEvent->get_end_time())); + default: + break; + } + sycl::detail::ur::die("Event Profiling info request not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { + sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + try { + UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); + + auto context = phEventWaitList[0]->get_context(); + ScopedContext active(context); + + auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { + UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(event->get_context() == context, + UR_RESULT_ERROR_INVALID_CONTEXT); + + return event->wait(); + }; + return forLatestEvents(phEventWaitList, numEvents, waitFunc); + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + const auto refCount = hEvent->increment_reference_count(); + + sycl::detail::ur::assertion( + refCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hEvent->get_reference_count() != 0, + "Reference count overflow detected in urEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (hEvent->decrement_reference_count() == 0) { + std::unique_ptr event_ptr{hEvent}; + ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + try { + ScopedContext active(hEvent->get_context()); + result = hEvent->release(); + } catch (...) 
{ + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return result; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { + *phNativeEvent = reinterpret_cast(hEvent->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + (void)pProperties; + + std::unique_ptr event_ptr{nullptr}; + + *phEvent = ur_event_handle_t_::make_with_native( + hContext, reinterpret_cast(hNativeEvent)); + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp new file mode 100644 index 0000000000000..d0c7fef8a2b48 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -0,0 +1,191 @@ +//===--------- event.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include "queue.hpp" + +/// UR Event mapping to CUevent +/// +struct ur_event_handle_t_ { +public: + using native_type = CUevent; + + ur_result_t record(); + + ur_result_t wait(); + + ur_result_t start(); + + native_type get() const noexcept { return evEnd_; }; + + ur_queue_handle_t get_queue() const noexcept { return queue_; } + + CUstream get_stream() const noexcept { return stream_; } + + uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + + ur_command_t get_command_type() const noexcept { return commandType_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + bool is_recorded() const noexcept { return isRecorded_; } + + bool is_started() const noexcept { return isStarted_; } + + bool is_completed() const noexcept; + + uint32_t get_execution_status() const noexcept { + + if (!is_recorded()) { + return UR_EVENT_STATUS_SUBMITTED; + } + + if (!is_completed()) { + return UR_EVENT_STATUS_RUNNING; + } + return UR_EVENT_STATUS_COMPLETE; + } + + ur_context_handle_t get_context() const noexcept { return context_; }; + + uint32_t increment_reference_count() { return ++refCount_; } + + uint32_t decrement_reference_count() { return --refCount_; } + + uint32_t get_event_id() const noexcept { return eventId_; } + + bool backend_has_ownership() const noexcept { return has_ownership_; } + + // Returns the counter time when the associated command(s) were enqueued + // + uint64_t get_queued_time() const; + + // Returns the counter time when the associated command(s) started execution + // + uint64_t get_start_time() const; + + // Returns the counter time when the associated command(s) completed + // + uint64_t get_end_time() const; + + // construct a native CUDA. This maps closely to the underlying CUDA event. 
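+  // The adapter keeps ownership of the created CUevent (has_ownership_ is
+  // true), so it is destroyed in release().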
+ static ur_event_handle_t + make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token = std::numeric_limits::max()) { + // TODO(ur): Remove cast when pi_event is ported to UR + return new ur_event_handle_t_(type, queue->get_context(), queue, stream, + stream_token); + } + + static ur_event_handle_t make_with_native(ur_context_handle_t context, + CUevent eventNative) { + return new ur_event_handle_t_(context, eventNative); + } + + ur_result_t release(); + + ~ur_event_handle_t_(); + +private: + // This constructor is private to force programmers to use the make_native / + // make_user static members in order to create a pi_event for CUDA. + ur_event_handle_t_(ur_command_t type, ur_context_handle_t context, + ur_queue_handle_t queue, CUstream stream, + uint32_t stream_token); + + // This constructor is private to force programmers to use the + // make_with_native for event introp + ur_event_handle_t_(ur_context_handle_t context, CUevent eventNative); + + ur_command_t commandType_; // The type of command associated with event. + + std::atomic_uint32_t refCount_; // Event reference count. + + bool has_ownership_; // Signifies if event owns the native type. + + bool hasBeenWaitedOn_; // Signifies whether the event has been waited + // on through a call to wait(), which implies + // that it has completed. + + bool isRecorded_; // Signifies wether a native CUDA event has been recorded + // yet. + bool isStarted_; // Signifies wether the operation associated with the + // PI event has started or not + // + + uint32_t streamToken_; + uint32_t eventId_; // Queue identifier of the event. + + native_type evEnd_; // CUDA event handle. If this _pi_event represents a user + // event, this will be nullptr. + + native_type evStart_; // CUDA event handle associated with the start + + native_type evQueued_; // CUDA event handle associated with the time + // the command was enqueued + + ur_queue_handle_t queue_; // pi_queue associated with the event. If this is a + // user event, this will be nullptr. + + CUstream stream_; // CUstream associated with the event. If this is a user + // event, this will be uninitialized. + + ur_context_handle_t context_; // pi_context associated with the event. If this + // is a native event, this will be the same + // context associated with the queue_ member. +}; + +// Iterates over the event wait list, returns correct ur_result_t error codes. +// Invokes the callback for the latest event of each queue in the wait list. +// The callback must take a single pi_event argument and return a ur_result_t. +template +ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, + std::size_t num_events_in_wait_list, Func &&f) { + + if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + // Fast path if we only have a single event + if (num_events_in_wait_list == 1) { + return f(event_wait_list[0]); + } + + std::vector events{ + event_wait_list, event_wait_list + num_events_in_wait_list}; + std::sort(events.begin(), events.end(), + [](ur_event_handle_t e0, ur_event_handle_t e1) { + // Tiered sort creating sublists of streams (smallest value first) + // in which the corresponding events are sorted into a sequence of + // newest first. 
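+              // For example, events tagged (stream, id) as (A,3) (B,1) (A,5)
+              // sort to (A,5) (A,3) (B,1) when stream A orders before B, so
+              // the first event visited for each stream is its newest one.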
+ return e0->get_stream() < e1->get_stream() || + (e0->get_stream() == e1->get_stream() && + e0->get_event_id() > e1->get_event_id()); + }); + + bool first = true; + CUstream lastSeenStream = 0; + for (ur_event_handle_t event : events) { + if (!event || (!first && event->get_stream() == lastSeenStream)) { + continue; + } + + first = false; + lastSeenStream = event->get_stream(); + + auto result = f(event); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp new file mode 100644 index 0000000000000..1d10cedd82c91 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -0,0 +1,326 @@ +//===--------- queue.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "queue.hpp" +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" + +#include +#include + +void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( + CUstream stream, uint32_t stream_i) { + if (barrier_event_ && !compute_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); + compute_applied_barrier_[stream_i] = true; + } +} + +void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( + CUstream stream, uint32_t stream_i) { + if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { + UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); + transfer_applied_barrier_[stream_i] = true; + } +} + +CUstream ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { + uint32_t stream_i; + uint32_t token; + while (true) { + if (num_compute_streams_ < compute_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(compute_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_compute_streams_ after that + if (num_compute_streams_ < compute_streams_.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); + } + } + token = compute_stream_idx_++; + stream_i = token % compute_streams_.size(); + // if a stream has been reused before it was next selected round-robin + // fashion, we want to delay its next use and instead select another one + // that is more likely to have completed all the enqueued work. 
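+    // A delayed stream is only skipped once: clear the flag and move on to
+    // the next round-robin candidate.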
+ if (delay_compute_[stream_i]) { + delay_compute_[stream_i] = false; + } else { + break; + } + } + if (stream_token) { + *stream_token = token; + } + CUstream res = compute_streams_[stream_i]; + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +CUstream ur_queue_handle_t_::get_next_compute_stream( + uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, + ur_stream_guard_ &guard, uint32_t *stream_token) { + for (uint32_t i = 0; i < num_events_in_wait_list; i++) { + uint32_t token = event_wait_list[i]->get_compute_stream_token(); + if (reinterpret_cast(event_wait_list[i]->get_queue()) == + this && + can_reuse_stream(token)) { + std::unique_lock compute_sync_guard( + compute_stream_sync_mutex_); + // redo the check after lock to avoid data races on + // last_sync_compute_streams_ + if (can_reuse_stream(token)) { + uint32_t stream_i = token % delay_compute_.size(); + delay_compute_[stream_i] = true; + if (stream_token) { + *stream_token = token; + } + guard = ur_stream_guard_{std::move(compute_sync_guard)}; + CUstream res = event_wait_list[i]->get_stream(); + compute_stream_wait_for_barrier_if_needed(res, stream_i); + return res; + } + } + } + guard = {}; + return get_next_compute_stream(stream_token); +} + +CUstream ur_queue_handle_t_::get_next_transfer_stream() { + if (transfer_streams_.empty()) { // for example in in-order queue + return get_next_compute_stream(); + } + if (num_transfer_streams_ < transfer_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(transfer_stream_mutex_); + // The second check is done after mutex is locked so other threads can not + // change num_transfer_streams_ after that + if (num_transfer_streams_ < transfer_streams_.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); + } + } + uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); + CUstream res = transfer_streams_[stream_i]; + transfer_stream_wait_for_barrier_if_needed(res, stream_i); + return res; +} + +/// Creates a `ur_queue_handle_t` object on the CUDA backend. +/// Valid properties +/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT +/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING +/// +UR_APIEXPORT ur_result_t UR_APICALL +urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { + try { + std::unique_ptr queueImpl{nullptr}; + + if (hContext->get_device() != hDevice) { + *phQueue = nullptr; + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + unsigned int flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t urFlags = 0; + bool is_out_of_order = false; + if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + urFlags = pProps->flags; + if (urFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { + flags = CU_STREAM_DEFAULT; + } else if (urFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { + flags = 0; + } + + if (urFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + is_out_of_order = true; + } + } + + std::vector computeCuStreams( + is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); + std::vector transferCuStreams( + is_out_of_order ? 
ur_queue_handle_t_::default_num_transfer_streams : 0); + + queueImpl = std::unique_ptr(new ur_queue_handle_t_{ + std::move(computeCuStreams), std::move(transferCuStreams), hContext, + hDevice, flags, urFlags}); + + *phQueue = queueImpl.release(); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + + return err; + + } catch (...) { + + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + assert(hQueue != nullptr); + assert(hQueue->get_reference_count() > 0); + + hQueue->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + assert(hQueue != nullptr); + + if (hQueue->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + try { + std::unique_ptr queueImpl(hQueue); + + if (!hQueue->backend_has_ownership()) + return UR_RESULT_SUCCESS; + + ScopedContext active(hQueue->get_context()); + + hQueue->for_each_stream([](CUstream s) { + UR_CHECK_ERROR(cuStreamSynchronize(s)); + UR_CHECK_ERROR(cuStreamDestroy(s)); + }); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + ur_result_t result = UR_RESULT_SUCCESS; + + try { + + assert(hQueue != + nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code + ScopedContext active(hQueue->get_context()); + + hQueue->sync_streams([&result](CUstream s) { + result = UR_CHECK_ERROR(cuStreamSynchronize(s)); + }); + + } catch (ur_result_t err) { + + result = err; + + } catch (...) { + + result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return result; +} + +// There is no CUDA counterpart for queue flushing and we don't run into the +// same problem of having to flush cross-queue dependencies as some of the +// other plugins, so it can be left as no-op. 
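+// Work is dispatched to the CUDA streams as soon as it is enqueued, so this
+// entry point simply reports success.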
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { + (void)hQueue; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( + ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { + ScopedContext active(hQueue->get_context()); + *phNativeQueue = + reinterpret_cast(hQueue->get_next_compute_stream()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + (void)pProperties; + + unsigned int cuFlags; + CUstream cuStream = reinterpret_cast(hNativeQueue); + UR_ASSERT(hContext->get_device() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); + + auto retErr = UR_CHECK_ERROR(cuStreamGetFlags(cuStream, &cuFlags)); + + ur_queue_flags_t flags = 0; + if (cuFlags == CU_STREAM_DEFAULT) + flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; + else if (cuFlags == CU_STREAM_NON_BLOCKING) + flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; + else + sycl::detail::ur::die("Unknown cuda stream"); + + std::vector computeCuStreams(1, cuStream); + std::vector transferCuStreams(0); + + // Create queue and set num_compute_streams to 1, as computeCuStreams has + // valid stream + *phQueue = new ur_queue_handle_t_{std::move(computeCuStreams), + std::move(transferCuStreams), + hContext, + hDevice, + cuFlags, + flags, + /*backend_owns*/ false}; + (*phQueue)->num_compute_streams_ = 1; + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, + ur_queue_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropSizeRet) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (uint32_t{propName}) { + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hQueue->context_); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hQueue->device_); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(hQueue->get_reference_count()); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->ur_flags_); + case UR_QUEUE_INFO_EMPTY: { + try { + bool IsReady = hQueue->all_of([](CUstream s) -> bool { + const CUresult ret = cuStreamQuery(s); + if (ret == CUDA_SUCCESS) + return true; + + if (ret == CUDA_ERROR_NOT_READY) + return false; + + UR_CHECK_ERROR(ret); + return false; + }); + return ReturnValue(IsReady); + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + } + default: + break; + } + sycl::detail::ur::die("Queue info request not implemented"); + return {}; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp new file mode 100644 index 0000000000000..99a7904b82b7e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -0,0 +1,253 @@ +//===--------- queue.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +#include +#include + +using ur_stream_guard_ = std::unique_lock; + +/// UR queue mapping on to CUstream objects. 
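+/// Each queue owns a pool of compute streams and, for out-of-order queues, a
+/// pool of transfer streams handed out in round-robin fashion. A rough usage
+/// sketch (names as declared below):
+///   uint32_t token;
+///   ur_stream_guard_ guard;
+///   CUstream s = q->get_next_compute_stream(numEvents, waitList, guard, &token);
+///   // launch work on s and record the resulting event with this token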
+/// +struct ur_queue_handle_t_ { + + using native_type = CUstream; + static constexpr int default_num_compute_streams = 128; + static constexpr int default_num_transfer_streams = 64; + + std::vector compute_streams_; + std::vector transfer_streams_; + // delay_compute_ keeps track of which streams have been recently reused and + // their next use should be delayed. If a stream has been recently reused it + // will be skipped the next time it would be selected round-robin style. When + // skipped, its delay flag is cleared. + std::vector delay_compute_; + // keep track of which streams have applied barrier + std::vector compute_applied_barrier_; + std::vector transfer_applied_barrier_; + ur_context_handle_t_ *context_; + ur_device_handle_t_ *device_; + // ur_queue_properties_t properties_; + CUevent barrier_event_ = nullptr; + CUevent barrier_tmp_event_ = nullptr; + std::atomic_uint32_t refCount_; + std::atomic_uint32_t eventCount_; + std::atomic_uint32_t compute_stream_idx_; + std::atomic_uint32_t transfer_stream_idx_; + unsigned int num_compute_streams_; + unsigned int num_transfer_streams_; + unsigned int last_sync_compute_streams_; + unsigned int last_sync_transfer_streams_; + unsigned int flags_; + ur_queue_flags_t ur_flags_; + // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be + // locked at the same time, compute_stream_sync_mutex_ should be locked first + // to avoid deadlocks + std::mutex compute_stream_sync_mutex_; + std::mutex compute_stream_mutex_; + std::mutex transfer_stream_mutex_; + std::mutex barrier_mutex_; + bool has_ownership_; + + ur_queue_handle_t_(std::vector &&compute_streams, + std::vector &&transfer_streams, + ur_context_handle_t_ *context, ur_device_handle_t_ *device, + unsigned int flags, ur_queue_flags_t ur_flags, + bool backend_owns = true) + : compute_streams_{std::move(compute_streams)}, + transfer_streams_{std::move(transfer_streams)}, + delay_compute_(compute_streams_.size(), false), + compute_applied_barrier_(compute_streams_.size()), + transfer_applied_barrier_(transfer_streams_.size()), context_{context}, + device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, + transfer_stream_idx_{0}, num_compute_streams_{0}, + num_transfer_streams_{0}, last_sync_compute_streams_{0}, + last_sync_transfer_streams_{0}, flags_(flags), + ur_flags_(ur_flags), has_ownership_{backend_owns} { + urContextRetain(context_); + urDeviceRetain(device_); + } + + ~ur_queue_handle_t_() { + urContextRelease(context_); + urDeviceRelease(device_); + } + + void compute_stream_wait_for_barrier_if_needed(CUstream stream, + uint32_t stream_i); + void transfer_stream_wait_for_barrier_if_needed(CUstream stream, + uint32_t stream_i); + + // get_next_compute/transfer_stream() functions return streams from + // appropriate pools in round-robin fashion + native_type get_next_compute_stream(uint32_t *stream_token = nullptr); + // this overload tries select a stream that was used by one of dependancies. + // If that is not possible returns a new stream. 
If a stream is reused it + // returns a lock that needs to remain locked as long as the stream is in use + native_type get_next_compute_stream(uint32_t num_events_in_wait_list, + const ur_event_handle_t *event_wait_list, + ur_stream_guard_ &guard, + uint32_t *stream_token = nullptr); + native_type get_next_transfer_stream(); + native_type get() { return get_next_compute_stream(); }; + + bool has_been_synchronized(uint32_t stream_token) { + // stream token not associated with one of the compute streams + if (stream_token == std::numeric_limits::max()) { + return false; + } + return last_sync_compute_streams_ >= stream_token; + } + + bool can_reuse_stream(uint32_t stream_token) { + // stream token not associated with one of the compute streams + if (stream_token == std::numeric_limits::max()) { + return false; + } + // If the command represented by the stream token was not the last command + // enqueued to the stream we can not reuse the stream - we need to allow for + // commands enqueued after it and the one we are about to enqueue to run + // concurrently + bool is_last_command = + (compute_stream_idx_ - stream_token) <= compute_streams_.size(); + // If there was a barrier enqueued to the queue after the command + // represented by the stream token we should not reuse the stream, as we can + // not take that stream into account for the bookkeeping for the next + // barrier - such a stream would not be synchronized with. Performance-wise + // it does not matter that we do not reuse the stream, as the work + // represented by the stream token is guaranteed to be complete by the + // barrier before any work we are about to enqueue to the stream will start, + // so the event does not need to be synchronized with. + return is_last_command && !has_been_synchronized(stream_token); + } + + template bool all_of(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, + f)) + return false; + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + if (!std::all_of(transfer_streams_.begin(), + transfer_streams_.begin() + end, f)) + return false; + } + return true; + } + + template void for_each_stream(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + for (unsigned int i = 0; i < end; i++) { + f(compute_streams_[i]); + } + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + for (unsigned int i = 0; i < end; i++) { + f(transfer_streams_[i]); + } + } + } + + template void sync_streams(T &&f) { + auto sync_compute = [&f, &streams = compute_streams_, + &delay = delay_compute_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + delay[i] = false; + } + }; + auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, + unsigned int stop) { + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + } + }; + { + unsigned int size = static_cast(compute_streams_.size()); + std::lock_guard compute_sync_guard(compute_stream_sync_mutex_); + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int start = 
last_sync_compute_streams_; + unsigned int end = num_compute_streams_ < size + ? num_compute_streams_ + : compute_stream_idx_.load(); + if (ResetUsed) { + last_sync_compute_streams_ = end; + } + if (end - start >= size) { + sync_compute(0, size); + } else { + start %= size; + end %= size; + if (start <= end) { + sync_compute(start, end); + } else { + sync_compute(start, size); + sync_compute(0, end); + } + } + } + { + unsigned int size = static_cast(transfer_streams_.size()); + if (size > 0) { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int start = last_sync_transfer_streams_; + unsigned int end = num_transfer_streams_ < size + ? num_transfer_streams_ + : transfer_stream_idx_.load(); + if (ResetUsed) { + last_sync_transfer_streams_ = end; + } + if (end - start >= size) { + sync_transfer(0, size); + } else { + start %= size; + end %= size; + if (start <= end) { + sync_transfer(start, end); + } else { + sync_transfer(start, size); + sync_transfer(0, end); + } + } + } + } + } + + ur_context_handle_t_ *get_context() const { return context_; }; + + ur_device_handle_t_ *get_device() const { return device_; }; + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } + + uint32_t get_next_event_id() noexcept { return ++eventCount_; } + + bool backend_has_ownership() const noexcept { return has_ownership_; } +}; From 103cec35c6b58abace79d0b662404e4d8ba0fc90 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 18 Apr 2023 14:40:56 +0100 Subject: [PATCH 07/45] AAdd program and kernel ddi tables --- sycl/plugins/cuda/pi_cuda.cpp | 4 +- .../ur/adapters/cuda/kernel.cpp | 10 +--- .../ur/adapters/cuda/program.cpp | 18 ++++--- .../ur/adapters/cuda/ur_interface_loader.cpp | 47 ++++++++++--------- 4 files changed, 38 insertions(+), 41 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c2c08b645b03a..e9dfa9c74ab35 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3054,6 +3054,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, pi2ur::piextProgramCreateWithNativeHandle) + _PI_CL(piextProgramSetSpecializationConstant, + pi2ur::piextProgramSetSpecializationConstant) // Kernel _PI_CL(piKernelCreate, pi2ur::piKernelCreate) _PI_CL(piKernelSetArg, pi2ur::piKernelSetArg) @@ -3064,8 +3066,6 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piKernelRelease, pi2ur::piKernelRelease) _PI_CL(piextKernelGetNativeHandle, pi2ur::piextKernelGetNativeHandle) _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) - _PI_CL(piextProgramSetSpecializationConstant, - pi2ur::piextProgramSetSpecializationConstant) _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) _PI_CL(piextKernelCreateWithNativeHandle, pi2ur::piextKernelCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e34976394c5ff..e80960f7ceb3c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -226,7 +226,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, sycl::detail::ur::assertion( cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, hKernel->get()) == 
CUDA_SUCCESS); - return ReturnValue(uint32_t{numRegs}); + return ReturnValue(static_cast(numRegs)); } default: break; @@ -297,14 +297,6 @@ urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( - ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the CUDA plugin is AOT only so this entry point is not supported. - sycl::detail::ur::die("Native specialization constants are not supported"); - return {}; -} - UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, ur_program_handle_t hProgram, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 7a56620180fef..bca41b4c0b5ba 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -163,9 +163,8 @@ ur_result_t ur_program_handle_t_::build_program(const char *build_options) { /// query to PI and use cuModuleGetFunction to check for a kernel. /// Note: Another alternative is to add kernel names as metadata, like with /// reqd_work_group_size. -std::string getKernelNames(ur_program_handle_t) { - sycl::detail::ur::die("getKernelNames not implemented"); - return {}; +ur_result_t getKernelNames(ur_program_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. @@ -282,9 +281,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, ur_program_handle_t *phProgram) { - sycl::detail::ur::die( - "Creation of UR program from native handle not implemented"); - return {}; + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL @@ -335,7 +332,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->binary_, 1); case UR_PROGRAM_INFO_NUM_KERNELS: - return ReturnValue(getKernelNames(hProgram).c_str()); + return getKernelNames(hProgram); default: break; } @@ -437,3 +434,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return retError; } + +// This entry point is only used for native specialization constants (SPIR-V), +// and the CUDA plugin is AOT only so this entry point is not supported. 
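+// Callers receive UR_RESULT_ERROR_UNSUPPORTED_FEATURE rather than an abort.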
+UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index d7f9ad75d38cd..9d408ff9d939f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -83,19 +83,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBuild = nullptr; - pDdiTable->pfnCompile = nullptr; - pDdiTable->pfnCreateWithBinary = nullptr; - pDdiTable->pfnCreateWithIL = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; pDdiTable->pfnGetFunctionPointer = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnLink = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; return UR_RESULT_SUCCESS; } @@ -105,20 +106,20 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetGroupInfo = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetSubGroupInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetNativeHandle = urKernelGetNativeHandle; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; pDdiTable->pfnSetArgMemObj = nullptr; - pDdiTable->pfnSetArgPointer = nullptr; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; - pDdiTable->pfnSetArgValue = nullptr; - pDdiTable->pfnSetExecInfo = nullptr; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; return UR_RESULT_SUCCESS; } From c64033957addac8e4255ba8d60398a5162911011 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 17 Apr 2023 14:41:26 +0100 Subject: [PATCH 08/45] [SYCL][PI][UR][CUDA] Port piEnqueueKernelLaunch to UR --- sycl/plugins/cuda/pi_cuda.cpp | 392 +----------------- .../ur/adapters/cuda/enqueue.cpp | 282 +++++++++++++ 
.../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 3 files changed, 292 insertions(+), 388 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index e9dfa9c74ab35..baf4f4a4983d1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -239,74 +239,6 @@ int getAttribute(pi_device device, CUdevice_attribute attribute) { } /// \endcond -// Determine local work sizes that result in uniform work groups. -// The default threadsPerBlock only require handling the first work_dim -// dimension. -void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel, - pi_uint32 local_size) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int minGrid, maxBlockSize, maxBlockDim[3]; - - static auto isPrime = [](size_t number) -> bool { - auto lastNumToCheck = ceil(sqrt(number)); - if (number < 2) - return false; - if (number == 2) - return true; - if (number % 2 == 0) - return false; - for (int i = 3; i <= lastNumToCheck; i += 2) { - if (number % i == 0) - return false; - } - return true; - }; - - cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()); - cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()); - - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, - maxThreadsPerBlock[0])); - - threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = - std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], - size_t(maxBlockDim[1]))); - maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], size_t(maxBlockDim[0]))); - - // When global_work_size[0] is prime threadPerBlock[0] will later computed as - // 1, which is not efficient configuration. In such case we use - // global_work_size[0] + 1 to compute threadPerBlock[0]. - int adjusted_0_dim_global_work_size = - (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; - - static auto isPowerOf2 = [](size_t value) -> bool { - return value && !(value & (value - 1)); - }; - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. - // Additionally, for best compute utilisation, the local size has - // to be a power of two. - while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0])) { - --threadsPerBlock[0]; - } -} - pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list) { @@ -365,27 +297,6 @@ void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, } } -// Helper to verify out-of-registers case (exceeded block max registers). -// If the kernel requires a number of registers for the entire thread -// block exceeds the hardware limitations, then the cuLaunchKernel call -// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. 
-bool hasExceededMaxRegistersPerBlock(pi_device device, pi_kernel kernel, - size_t blockSize) { - assert(device); - assert(kernel); - - int maxRegsPerBlock{0}; - PI_CHECK_ERROR(cuDeviceGetAttribute( - &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); - - int regsPerThread{0}; - PI_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get())); - - return blockSize * regsPerThread > size_t(maxRegsPerBlock); -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -421,20 +332,6 @@ void assertion(bool Condition, const char *Message) { //-------------- // PI object implementation -extern "C" { - -// Required in a number of functions, so forward declare here -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); - -} // extern "C" - /// \endcond // makes all future work submitted to queue wait for all work captured in event. @@ -953,172 +850,6 @@ pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, return retErr; } -pi_result cuda_piEnqueueKernelLaunch( - pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - // Preconditions - assert(command_queue != nullptr); - assert(command_queue->get_context() == kernel->get_context()); - assert(kernel != nullptr); - assert(global_work_offset != nullptr); - assert(work_dim > 0); - assert(work_dim < 4); - - if (*global_work_size == 0) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (local_work_size != nullptr); - pi_uint32 local_size = kernel->get_local_size(); - pi_result retError = PI_SUCCESS; - - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); - { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; - maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); - command_queue->device_->get_max_work_item_sizes( - sizeof(maxThreadsPerBlock), maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - local_work_size[dim] != reqdThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. 
- if (0u == local_work_size[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = local_work_size[dim]; - return PI_SUCCESS; - }; - - size_t kernelLocalWorkGroupSize = 0; - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; - // If no error then sum the total local work size per dim. - kernelLocalWorkGroupSize += local_work_size[dim]; - } - - if (hasExceededMaxRegistersPerBlock( - reinterpret_cast(command_queue->device_), kernel, - kernelLocalWorkGroupSize)) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(reinterpret_cast(command_queue->device_), - threadsPerBlock, global_work_size, - maxThreadsPerBlock, kernel, local_size); - } - } - - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t blocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - CUfunction cuFunc = kernel->get(); - - retError = enqueueEventsWait(command_queue, cuStream, - num_events_in_wait_list, event_wait_list); - - // Set the implicit global offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { - std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; - if (global_work_offset) { - for (size_t i = 0; i < work_dim; i++) { - cuda_implicit_offset[i] = - static_cast(global_work_offset[i]); - if (global_work_offset[i] != 0) { - cuFunc = kernel->get_with_offset_parameter(); - } - } - } - kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), - cuda_implicit_offset); - } - - auto &argIndices = kernel->get_arg_indices(); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue, - cuStream, stream_token)); - retImplEv->start(); - } - - // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = - std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; - cuDeviceGetAttribute( - &device_max_local_mem, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - command_queue->get_device()->get()); - - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - PI_CHECK_ERROR(cuFuncSetAttribute( - cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); - } - - retError = PI_CHECK_ERROR(cuLaunchKernel( - cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, - cuStream, const_cast(argIndices.data()), nullptr)); - if (local_size != 0) - kernel->clear_local_size(); - - if (event) { - retError = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - } catch (pi_result err) { - retError = err; - } - return retError; -} - /// \TODO Not implemented 
pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, pi_uint32, const pi_mem *, const void **, @@ -1297,115 +1028,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Enqueues a wait on the given CUstream for all events. -/// See \ref enqueueEventWait -/// TODO: Add support for multiple streams once the Event class is properly -/// refactored. -/// -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); -} - -/// Enqueues a wait on the given CUstream for all specified events (See -/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued -/// wait will wait on all previous events in the queue. -/// -/// \param[in] command_queue A valid PI queue. -/// \param[in] num_events_in_wait_list Number of events in event_wait_list. -/// \param[in] event_wait_list Events to wait on. -/// \param[out] event Event for when all events in event_wait_list have finished -/// or, if event_wait_list is empty, when all previous events in the queue have -/// finished. -/// -/// \return TBD -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - // This function makes one stream work on the previous work (or work - // represented by input events) and then all future work waits on that stream. - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - pi_result result; - - try { - ScopedContext active(command_queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, - reinterpret_cast(event_wait_list), guard, - &stream_token); - { - std::lock_guard guard(command_queue->barrier_mutex_); - if (command_queue->barrier_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_event_, - CU_EVENT_DISABLE_TIMING)); - } - if (num_events_in_wait_list == 0) { // wait on all work - if (command_queue->barrier_tmp_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_tmp_event_, - CU_EVENT_DISABLE_TIMING)); - } - command_queue->sync_streams( - [cuStream, - tmp_event = command_queue->barrier_tmp_event_](CUstream s) { - if (cuStream != s) { - // record a new CUDA event on every stream and make one stream - // wait for these events - PI_CHECK_ERROR(cuEventRecord(tmp_event, s)); - PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); - } - }); - } else { // wait just on given events - forLatestEvents(event_wait_list, num_events_in_wait_list, - [cuStream](pi_event event) -> pi_result { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR( - cuStreamWaitEvent(cuStream, event->get(), 0)); - } - }); - } - - result = PI_CHECK_ERROR( - cuEventRecord(command_queue->barrier_event_, cuStream)); - for (unsigned int i = 0; - i < command_queue->compute_applied_barrier_.size(); i++) { - command_queue->compute_applied_barrier_[i] = false; - } - for (unsigned int i = 0; - i < command_queue->transfer_applied_barrier_.size(); i++) { - command_queue->transfer_applied_barrier_[i] = false; - } - } - if (result != PI_SUCCESS) { - return result; - } - - if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, 
command_queue, - cuStream, stream_token); - (*event)->start(); - (*event)->record(); - } - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - /// Creates a PI sampler object /// /// \param[in] context The context the sampler is created for. @@ -2238,8 +1860,8 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, ScopedContext active(command_queue->get_context()); if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + ret_err = pi2ur::piEnqueueEventsWait( + command_queue, num_events_in_wait_list, event_wait_list, nullptr); } if (event) { @@ -2293,8 +1915,8 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, ScopedContext active(command_queue->get_context()); if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + ret_err = pi2ur::piEnqueueEventsWait( + command_queue, num_events_in_wait_list, event_wait_list, nullptr); } if (event) { @@ -3088,10 +2710,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerRetain, cuda_piSamplerRetain) _PI_CL(piSamplerRelease, cuda_piSamplerRelease) // Queue commands - _PI_CL(piEnqueueKernelLaunch, cuda_piEnqueueKernelLaunch) + _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) - _PI_CL(piEnqueueEventsWait, cuda_piEnqueueEventsWait) - _PI_CL(piEnqueueEventsWaitWithBarrier, cuda_piEnqueueEventsWaitWithBarrier) + _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) + _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, cuda_piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) _PI_CL(piEnqueueMemBufferWrite, cuda_piEnqueueMemBufferWrite) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 3dfa1ba1dbd5c..8b732a58fc7a1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -9,10 +9,129 @@ #include "common.hpp" #include "context.hpp" #include "event.hpp" +#include "kernel.hpp" #include "queue.hpp" +#include #include +ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, + uint32_t num_events_in_wait_list, + const ur_event_handle_t *event_wait_list) { + if (!event_wait_list) { + return UR_RESULT_SUCCESS; + } + try { + ScopedContext active(command_queue->get_context()); + + auto result = forLatestEvents( + event_wait_list, num_events_in_wait_list, + [stream](ur_event_handle_t event) -> ur_result_t { + if (event->get_stream() == stream) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + } + }); + + if (result != UR_RESULT_SUCCESS) { + return result; + } + return UR_RESULT_SUCCESS; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +// Determine local work sizes that result in uniform work groups. +// The default threadsPerBlock only require handling the first work_dim +// dimension. 
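+// Dimensions 1 and 2 are capped by the device block-dimension limits first;
+// the remaining budget for dimension 0 is then shrunk until it is a power of
+// two that divides the (possibly adjusted) global size.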
+void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, + const size_t *global_work_size, + const size_t maxThreadsPerBlock[3], + ur_kernel_handle_t kernel, uint32_t local_size) { + assert(threadsPerBlock != nullptr); + assert(global_work_size != nullptr); + assert(kernel != nullptr); + int minGrid, maxBlockSize, maxBlockDim[3]; + + static auto isPrime = [](size_t number) -> bool { + auto lastNumToCheck = ceil(sqrt(number)); + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; i <= lastNumToCheck; i += 2) { + if (number % i == 0) + return false; + } + return true; + }; + + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + device->get()); + cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + device->get()); + + UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( + &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, + maxThreadsPerBlock[0])); + + threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = + std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], + size_t(maxBlockDim[1]))); + maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); + threadsPerBlock[0] = + std::min(maxThreadsPerBlock[0], + std::min(global_work_size[0], size_t(maxBlockDim[0]))); + + // When global_work_size[0] is prime threadPerBlock[0] will later computed as + // 1, which is not efficient configuration. In such case we use + // global_work_size[0] + 1 to compute threadPerBlock[0]. + int adjusted_0_dim_global_work_size = + (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; + + static auto isPowerOf2 = [](size_t value) -> bool { + return value && !(value & (value - 1)); + }; + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. + while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || + !isPowerOf2(threadsPerBlock[0])) { + --threadsPerBlock[0]; + } +} + +// Helper to verify out-of-registers case (exceeded block max registers). +// If the kernel requires a number of registers for the entire thread +// block exceeds the hardware limitations, then the cuLaunchKernel call +// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, + ur_kernel_handle_t kernel, + size_t blockSize) { + int maxRegsPerBlock{0}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + device->get())); + + int regsPerThread{0}; + UR_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, + kernel->get())); + + return blockSize * regsPerThread > size_t(maxRegsPerBlock); +}; + /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
@@ -108,3 +227,166 @@ ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); } + +ur_result_t urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Preconditions + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t threadsPerBlock[3] = {32u, 1u, 1u}; + size_t maxWorkGroupSize = 0u; + size_t maxThreadsPerBlock[3] = {}; + bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + int32_t local_size = hKernel->get_local_size(); + ur_result_t retError = UR_RESULT_SUCCESS; + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(hQueue->get_context()); + { + size_t *reqdThreadsPerBlock = hKernel->reqdThreadsPerBlock_; + maxWorkGroupSize = hQueue->device_->get_max_work_group_size(); + hQueue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), + maxThreadsPerBlock); + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + if (reqdThreadsPerBlock[dim] != 0 && + pLocalWorkSize[dim] != reqdThreadsPerBlock[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (pLocalWorkSize[dim] > maxThreadsPerBlock[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == pLocalWorkSize[dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (pGlobalWorkSize[dim] % pLocalWorkSize[dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + threadsPerBlock[dim] = pLocalWorkSize[dim]; + return UR_RESULT_SUCCESS; + }; + + size_t kernelLocalWorkGroupSize = 0; + for (size_t dim = 0; dim < workDim; dim++) { + auto err = isValid(dim); + if (err != UR_RESULT_SUCCESS) + return err; + // If no error then sum the total local work size per dim. 
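+        // The summed size feeds hasExceededMaxRegistersPerBlock below to
+        // check the per-block register budget before launching.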
+ kernelLocalWorkGroupSize += pLocalWorkSize[dim]; + } + + if (hasExceededMaxRegistersPerBlock(hQueue->device_, hKernel, + kernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, + maxThreadsPerBlock, hKernel, local_size); + } + } + + if (maxWorkGroupSize < + size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + size_t blocksPerGrid[3] = {1u, 1u, 1u}; + + for (size_t i = 0; i < workDim; i++) { + blocksPerGrid[i] = + (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + } + + std::unique_ptr retImplEv{nullptr}; + + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + CUfunction cuFunc = hKernel->get(); + + retError = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + // Set the implicit global offset parameter if kernel has offset variant + if (hKernel->get_with_offset_parameter()) { + std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + if (pGlobalWorkOffset) { + for (size_t i = 0; i < workDim; i++) { + cuda_implicit_offset[i] = + static_cast(pGlobalWorkOffset[i]); + if (pGlobalWorkOffset[i] != 0) { + cuFunc = hKernel->get_with_offset_parameter(); + } + } + } + hKernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), + cuda_implicit_offset); + } + + auto &argIndices = hKernel->get_arg_indices(); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_KERNEL_LAUNCH, hQueue, cuStream, stream_token)); + retImplEv->start(); + } + + // Set local mem max size if env var is present + static const char *local_mem_sz_ptr = + std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); + + if (local_mem_sz_ptr) { + int device_max_local_mem = 0; + cuDeviceGetAttribute( + &device_max_local_mem, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + hQueue->get_device()->get()); + + static const int env_val = std::atoi(local_mem_sz_ptr); + if (env_val <= 0 || env_val > device_max_local_mem) { + setErrorMessage("Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); + } + + retError = UR_CHECK_ERROR(cuLaunchKernel( + cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], + threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, + cuStream, const_cast(argIndices.data()), nullptr)); + if (local_size != 0) + hKernel->clear_local_size(); + + if (phEvent) { + retError = retImplEv->record(); + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retError = err; + } + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 9d408ff9d939f..c77184d5f226f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -166,9 +166,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( } pDdiTable->pfnDeviceGlobalVariableRead = nullptr; pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; - pDdiTable->pfnEventsWait = nullptr; - pDdiTable->pfnEventsWaitWithBarrier = 
nullptr; - pDdiTable->pfnKernelLaunch = nullptr; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; pDdiTable->pfnMemBufferCopy = nullptr; pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; From 8c632473787ec896c98649f9c851704d1b70bc31 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 10:37:02 +0100 Subject: [PATCH 09/45] [SYCL][CUDA][UR] Add missing queue/event entry points to DDI table --- .../ur/adapters/cuda/ur_interface_loader.cpp | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index c77184d5f226f..0ffa5dd53e2f6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -66,14 +66,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnGetProfilingInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; - pDdiTable->pfnSetCallback = nullptr; - pDdiTable->pfnWait = nullptr; + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = urEventSetCallback; + pDdiTable->pfnWait = urEventWait; return UR_RESULT_SUCCESS; } @@ -208,14 +208,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; - pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnFinish = nullptr; - pDdiTable->pfnFlush = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; return UR_RESULT_SUCCESS; } From 76d4c5f9197c165ea1562edabbf2f1b71c0906b7 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 10:57:45 +0100 Subject: [PATCH 10/45] [SYCL][CUDA] Remove unused function from pi_cuda --- sycl/plugins/cuda/pi_cuda.cpp | 38 ----------------------------------- 1 file changed, 38 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index baf4f4a4983d1..af6b5759922d2 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -347,44 +347,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - switch (param_name) { - case 
PI_CONTEXT_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1); - case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_reference_count()); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. - setErrorMessage("These queries should have never come here.", - UR_RESULT_ERROR_INVALID_ARGUMENT); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - return getInfo(param_value_size, param_value, param_value_size_ret, - true); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM operations currently not supported. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_OUT_OF_RESOURCES; -} - /// \return If available, the first binary that is PTX /// pi_result cuda_piextDeviceSelectBinary(pi_device device, From 3742495c95c15a2dd181a19de7ced6cddad2f364 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 19 Apr 2023 11:26:08 +0100 Subject: [PATCH 11/45] [SYCL][CUDA] Add missing UR_APICALL, UR_APIEXPORT to entry points --- .../unified_runtime/ur/adapters/cuda/device.cpp | 2 +- .../ur/adapters/cuda/enqueue.cpp | 11 +++++------ .../ur/adapters/cuda/platform.cpp | 17 ++++++++--------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index ae987ab4a7c6e..567377be8796f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -997,7 +997,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since CUDA devices are always root /// devices. -ur_result_t urDeviceRelease(ur_device_handle_t device) { +UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 8b732a58fc7a1..68c70aa1ae9ec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -136,7 +136,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. 
/// -ur_result_t urEnqueueEventsWaitWithBarrier( +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // This function makes one stream work on the previous work (or work @@ -220,15 +220,14 @@ ur_result_t urEnqueueEventsWaitWithBarrier( /// TODO: Add support for multiple streams once the Event class is properly /// refactored. /// -ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); } -ur_result_t urEnqueueKernelLaunch( +UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index 5a4e43c320af0..2ca8c516c08e3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -18,9 +18,9 @@ void enableCUDATracing(); void disableCUDATracing(); -ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, - ur_platform_info_t PlatformInfoType, size_t Size, - void *pPlatformInfo, size_t *pSizeRet) { +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, + size_t Size, void *pPlatformInfo, size_t *pSizeRet) { UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); @@ -57,7 +57,7 @@ ur_result_t urPlatformGetInfo(ur_platform_handle_t hPlatform, /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
/// -ur_result_t urPlatformGet(uint32_t NumEntries, +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -163,8 +163,8 @@ ur_result_t urPlatformGet(uint32_t NumEntries, } } -ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, - ur_api_version_t *pVersion) { +UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -172,13 +172,12 @@ ur_result_t urPlatformGetApiVersion(ur_platform_handle_t hDriver, return UR_RESULT_SUCCESS; } -ur_result_t urInit(ur_device_init_flags_t) { +UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { enableCUDATracing(); return UR_RESULT_SUCCESS; } -ur_result_t urTearDown(void *) { +UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { disableCUDATracing(); return UR_RESULT_SUCCESS; } - From 7e0f0ecd636839e820babea0ceaa38e92c1fd697 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 19 Apr 2023 11:34:37 +0100 Subject: [PATCH 12/45] Small fixes --- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index bca41b4c0b5ba..129f4eb06b81e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -113,8 +113,9 @@ ur_result_t ur_program_handle_t_::set_binary(const char *source, } ur_result_t ur_program_handle_t_::build_program(const char *build_options) { - - this->buildOptions_ = build_options; + if (build_options) { + this->buildOptions_ = build_options; + } constexpr const unsigned int numberOfOptions = 4u; From 17f91fc331e90b29065db3b2c4c7f5d170bb9ab3 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 18 Apr 2023 11:48:29 +0100 Subject: [PATCH 13/45] [SYCL][PI][UR][CUDA] Port CUDA sampler to UR --- sycl/plugins/cuda/CMakeLists.txt | 2 + sycl/plugins/cuda/pi_cuda.cpp | 146 +----------------- sycl/plugins/cuda/pi_cuda.hpp | 16 +- sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/cuda/sampler.cpp | 84 ++++++++++ .../ur/adapters/cuda/sampler.hpp | 29 ++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 8 +- 7 files changed, 128 insertions(+), 159 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 6339f1e3466ea..7b8bb0377684e 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -72,6 +72,8 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/kernel.hpp" "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/queue.cpp" + "../unified_runtime/ur/adapters/cuda/sampler.cpp" + "../unified_runtime/ur/adapters/cuda/sampler.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" # --- diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index af6b5759922d2..0c2cc178eeec6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -990,144 +990,6 @@ pi_result cuda_piMemRetain(pi_mem mem) { return PI_SUCCESS; } -/// Creates a 
PI sampler object -/// -/// \param[in] context The context the sampler is created for. -/// \param[in] sampler_properties The properties for the sampler. -/// \param[out] result_sampler Set to the resulting sampler object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_VALUE if given an invalid -/// property -/// or if there is multiple of properties from the same category. -pi_result cuda_piSamplerCreate(pi_context context, - const pi_sampler_properties *sampler_properties, - pi_sampler *result_sampler) { - std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; - - bool propSeen[3] = {false, false, false}; - for (size_t i = 0; sampler_properties[i] != 0; i += 2) { - switch (sampler_properties[i]) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: - if (propSeen[0]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[0] = true; - retImplSampl->props_ |= sampler_properties[i + 1]; - break; - case PI_SAMPLER_PROPERTIES_FILTER_MODE: - if (propSeen[1]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[1] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; - break; - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: - if (propSeen[2]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[2] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; - break; - default: - return PI_ERROR_INVALID_VALUE; - } - } - - if (!propSeen[0]) { - retImplSampl->props_ |= PI_TRUE; - } - // Default filter mode to PI_SAMPLER_FILTER_MODE_NEAREST - if (!propSeen[2]) { - retImplSampl->props_ |= - (PI_SAMPLER_ADDRESSING_MODE_CLAMP % PI_SAMPLER_ADDRESSING_MODE_NONE) - << 2; - } - - *result_sampler = retImplSampl.release(); - return PI_SUCCESS; -} - -/// Gets information from a PI sampler object -/// -/// \param[in] sampler The sampler to get the information from. -/// \param[in] param_name The name of the information to get. -/// \param[in] param_value_size The size of the param_value. -/// \param[out] param_value Set to information value. -/// \param[out] param_value_size_ret Set to the size of the information value. -/// -/// \return PI_SUCCESS on success. -pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(sampler != nullptr); - - switch (param_name) { - case PI_SAMPLER_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->get_reference_count()); - case PI_SAMPLER_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->context_); - case PI_SAMPLER_INFO_NORMALIZED_COORDS: { - pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); - return getInfo(param_value_size, param_value, param_value_size_ret, - norm_coords_prop); - } - case PI_SAMPLER_INFO_FILTER_MODE: { - pi_sampler_filter_mode filter_prop = static_cast( - ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); - return getInfo(param_value_size, param_value, param_value_size_ret, - filter_prop); - } - case PI_SAMPLER_INFO_ADDRESSING_MODE: { - pi_sampler_addressing_mode addressing_prop = - static_cast( - (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); - return getInfo(param_value_size, param_value, param_value_size_ret, - addressing_prop); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - return {}; -} - -/// Retains a PI sampler object, incrementing its reference count. 
-/// -/// \param[in] sampler The sampler to increment the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRetain(pi_sampler sampler) { - assert(sampler != nullptr); - sampler->increment_reference_count(); - return PI_SUCCESS; -} - -/// Releases a PI sampler object, decrementing its reference count. If the -/// reference count reaches zero, the sampler object is destroyed. -/// -/// \param[in] sampler The sampler to decrement the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRelease(pi_sampler sampler) { - assert(sampler != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - sampler->get_reference_count() != 0, - "Reference count overflow detected in cuda_piSamplerRelease."); - - // decrement ref count. If it is 0, delete the sampler. - if (sampler->decrement_reference_count() == 0) { - delete sampler; - } - - return PI_SUCCESS; -} - /// General 3D memory copy operation. /// This function requires the corresponding CUDA context to be at the top of /// the context stack @@ -2667,10 +2529,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEventCreateWithNativeHandle, pi2ur::piextEventCreateWithNativeHandle) // Sampler - _PI_CL(piSamplerCreate, cuda_piSamplerCreate) - _PI_CL(piSamplerGetInfo, cuda_piSamplerGetInfo) - _PI_CL(piSamplerRetain, cuda_piSamplerRetain) - _PI_CL(piSamplerRelease, cuda_piSamplerRelease) + _PI_CL(piSamplerCreate, pi2ur::piSamplerCreate) + _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) + _PI_CL(piSamplerRetain, pi2ur::piSamplerRetain) + _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 1a8c7e64537cd..0df35e53c2d27 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -49,6 +49,7 @@ #include #include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -406,19 +407,8 @@ struct _pi_kernel : ur_kernel_handle_t_ { /// Sampler property layout: /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | -struct _pi_sampler { - std::atomic_uint32_t refCount_; - pi_uint32 props_; - pi_context context_; - - _pi_sampler(pi_context context) - : refCount_(1), props_(0), context_(context) {} - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_sampler : ur_sampler_handle_t_ { + using ur_sampler_handle_t_::ur_sampler_handle_t_; }; // ------------------------------------------------------------- diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index dc572bd5e7e9c..13ac8a5a1e138 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -148,6 +148,8 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/queue.cpp" "ur/adapters/cuda/queue.hpp" + "ur/adapters/cuda/sampler.cpp" + "ur/adapters/cuda/sampler.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp new file mode 100644 index 0000000000000..c07f548c92a26 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -0,0 +1,84 @@ +//===--------- sampler.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "sampler.hpp" +#include "common.hpp" + +ur_result_t urSamplerCreate(ur_context_handle_t hContext, + const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + std::unique_ptr retImplSampl{ + new ur_sampler_handle_t_(hContext)}; + + if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { + retImplSampl->props_ |= pDesc->normalizedCoords; + retImplSampl->props_ |= (pDesc->filterMode << 1); + retImplSampl->props_ |= (pDesc->addressingMode << 2); + } else { + // Set default values + retImplSampl->props_ |= true; // Normalized Coords + retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + } + + *phSampler = retImplSampl.release(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, + ur_sampler_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_SAMPLER_INFO_REFERENCE_COUNT: + return ReturnValue(hSampler->get_reference_count()); + case UR_SAMPLER_INFO_CONTEXT: + return ReturnValue(hSampler->context_); + case UR_SAMPLER_INFO_NORMALIZED_COORDS: { + bool norm_coords_prop = static_cast(hSampler->props_); + return ReturnValue(norm_coords_prop); + } + case UR_SAMPLER_INFO_FILTER_MODE: { + auto filter_prop = + static_cast(((hSampler->props_ >> 1) & 0x1)); + return ReturnValue(filter_prop); + } + case UR_SAMPLER_INFO_ADDRESSING_MODE: { + auto addressing_prop = + static_cast(hSampler->props_ >> 2); + return ReturnValue(addressing_prop); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return 
{}; +} + +ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + hSampler->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hSampler->get_reference_count() != 0, + "Reference count overflow detected in urSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (hSampler->decrement_reference_count() == 0) { + delete hSampler; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp new file mode 100644 index 0000000000000..61ed98325a5ed --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp @@ -0,0 +1,29 @@ +//===--------- sampler.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +/// Implementation of samplers for CUDA +/// +/// Sampler property layout: +/// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct ur_sampler_handle_t_ { + std::atomic_uint32_t refCount_; + uint32_t props_; + ur_context_handle_t context_; + + ur_sampler_handle_t_(ur_context_handle_t context) + : refCount_(1), props_(0), context_(context) {} + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 0ffa5dd53e2f6..06ae75db02dec 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -130,12 +130,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreate = urSamplerCreate; pDdiTable->pfnCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetInfo = urSamplerGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnRetain = urSamplerRetain; return UR_RESULT_SUCCESS; } From 6489ce147eb74d29930bfeb86d958dcd0b76344f Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 21 Apr 2023 14:51:21 +0100 Subject: [PATCH 14/45] [SYCL][CUDA] Fix missing input validation for various queue entry points --- .../ur/adapters/cuda/queue.cpp | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 1d10cedd82c91..371c3363b4e75 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ 
-122,6 +122,9 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { try { std::unique_ptr queueImpl{nullptr}; + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); if (hContext->get_device() != hDevice) { *phQueue = nullptr; @@ -167,7 +170,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, } UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { - assert(hQueue != nullptr); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); assert(hQueue->get_reference_count() > 0); hQueue->increment_reference_count(); @@ -175,7 +178,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { - assert(hQueue != nullptr); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); if (hQueue->decrement_reference_count() > 0) { return UR_RESULT_SUCCESS; @@ -206,9 +209,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { ur_result_t result = UR_RESULT_SUCCESS; try { - - assert(hQueue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); ScopedContext active(hQueue->get_context()); hQueue->sync_streams([&result](CUstream s) { @@ -231,12 +232,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { // same problem of having to flush cross-queue dependencies as some of the // other plugins, so it can be left as no-op. UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { - (void)hQueue; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ScopedContext active(hQueue->get_context()); *phNativeQueue = reinterpret_cast(hQueue->get_next_compute_stream()); @@ -285,7 +289,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue || pPropSizeRet, UR_RESULT_ERROR_INVALID_NULL_POINTER); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); @@ -321,6 +326,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, default: break; } - sycl::detail::ur::die("Queue info request not implemented"); - return {}; + + return UR_RESULT_ERROR_INVALID_ENUMERATION; } From ebe90a2117c1fcd5e70c41626c8acd6c49b636f5 Mon Sep 17 00:00:00 2001 From: Martin Morrison-Grant Date: Fri, 21 Apr 2023 10:40:49 +0100 Subject: [PATCH 15/45] Refactor memory object and entry points into new memory.hpp/cpp files. Add entry points to DDI table. 
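
The piMem* entry points in pi_cuda.cpp now forward through pi2ur, and small
helpers (map_ur_mem_type, pi2urMemFlags, pi2urMapFlags) translate memory types
and flag bitfields between the PI and UR representations for the image and
buffer-map paths that still live in the PI plugin.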
--- sycl/plugins/cuda/CMakeLists.txt | 2 + sycl/plugins/cuda/pi_cuda.cpp | 498 +++-------------- sycl/plugins/cuda/pi_cuda.hpp | 170 +----- sycl/plugins/unified_runtime/CMakeLists.txt | 2 + .../ur/adapters/cuda/memory.cpp | 513 ++++++++++++++++++ .../ur/adapters/cuda/memory.hpp | 195 +++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 20 +- 7 files changed, 801 insertions(+), 599 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 7b8bb0377684e..da4ce9476ee91 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -76,6 +76,8 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/sampler.hpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" + "../unified_runtime/ur/adapters/cuda/memory.cpp" + "../unified_runtime/ur/adapters/cuda/memory.hpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 0c2cc178eeec6..ab0d428e3613a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -94,6 +94,62 @@ pi_result map_ur_error(ur_result_t result) { } } +pi_mem_type map_ur_mem_type(ur_mem_type_t mem_type) { + switch (mem_type) { + case UR_MEM_TYPE_BUFFER: + default: + return PI_MEM_TYPE_BUFFER; + case UR_MEM_TYPE_IMAGE2D: + return PI_MEM_TYPE_IMAGE2D; + case UR_MEM_TYPE_IMAGE3D: + return PI_MEM_TYPE_IMAGE3D; + case UR_MEM_TYPE_IMAGE2D_ARRAY: + return PI_MEM_TYPE_IMAGE2D_ARRAY; + case UR_MEM_TYPE_IMAGE1D: + return PI_MEM_TYPE_IMAGE1D; + case UR_MEM_TYPE_IMAGE1D_ARRAY: + return PI_MEM_TYPE_IMAGE1D_ARRAY; + case UR_MEM_TYPE_IMAGE1D_BUFFER: + return PI_MEM_TYPE_IMAGE1D_BUFFER; + } +} + +template +inline pi_result +ConvertInputBitfield(pi_bitfield in, TypeOut *out, + const std::unordered_map &map) { + *out = 0; + for (auto &[FlagPI, FlagUR] : map) { + if (in & FlagPI) { + *out |= FlagUR; + } + } + + return PI_SUCCESS; +} + +// Convert bitfield flags from PI to UR for MemFlags +inline pi_result pi2urMemFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { + static const std::unordered_map MemFlagsMap = { + {PI_MEM_FLAGS_ACCESS_RW, UR_MEM_FLAG_READ_WRITE}, + {PI_MEM_ACCESS_READ_ONLY, UR_MEM_FLAG_READ_ONLY}, + {PI_MEM_FLAGS_HOST_PTR_USE, UR_MEM_FLAG_USE_HOST_POINTER}, + {PI_MEM_FLAGS_HOST_PTR_COPY, UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER}, + {PI_MEM_FLAGS_HOST_PTR_ALLOC, UR_MEM_FLAG_ALLOC_HOST_POINTER}, + }; + + return ConvertInputBitfield(piFlags, urFlags, MemFlagsMap); +} + +// Convert bitfield flags from PI to UR for MapFlags +inline pi_result pi2urMapFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { + static const std::unordered_map MapFlagsMap = { + {PI_MAP_READ, UR_MAP_FLAG_READ}, + {PI_MAP_WRITE, UR_MAP_FLAG_WRITE}, + }; + return ConvertInputBitfield(piFlags, urFlags, MapFlagsMap); +} + // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -400,245 +456,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Creates a PI Memory object using a CUDA memory allocation. -/// Can trigger a manual copy depending on the mode. 
-/// \TODO Implement USE_HOST_PTR using cuHostRegister -/// -pi_result -cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size, - void *host_ptr, pi_mem *ret_mem, - [[maybe_unused]] const pi_mem_properties *properties) { - // Need input memory object - assert(ret_mem != nullptr); - assert((properties == nullptr || *properties == 0) && - "no mem properties goes to cuda RT yet"); - // Currently, USE_HOST_PTR is not implemented using host register - // since this triggers a weird segfault after program ends. - // Setting this constant to true enables testing that behavior. - const bool enableUseHostPtr = false; - const bool performInitialCopy = - (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr); - pi_result retErr = PI_SUCCESS; - pi_mem retMemObj = nullptr; - - try { - ScopedContext active(context); - CUdeviceptr ptr; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) { - retErr = PI_CHECK_ERROR( - cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr; - } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - } else { - retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size)); - if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in; - } - } - - if (retErr == PI_SUCCESS) { - pi_mem parentBuffer = nullptr; - - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { - // Operates on the default stream of the current CUDA context. - retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other PI call - // uses it. - if (retErr == PI_SUCCESS) { - CUstream defaultStream = 0; - retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream)); - } - } - } else { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - } - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_RESOURCES; - } - - *ret_mem = retMemObj; - - return retErr; -} - -/// Decreases the reference count of the Mem object. 
-/// If this is zero, calls the relevant CUDA Free function -/// \return PI_SUCCESS unless deallocation error -/// -pi_result cuda_piMemRelease(pi_mem memObj) { - assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS"); - - pi_result ret = PI_SUCCESS; - - try { - - // Do nothing if there are other references - if (memObj->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - // make sure memObj is released in case PI_CHECK_ERROR throws - std::unique_ptr<_pi_mem> uniqueMemObj(memObj); - - if (memObj->is_sub_buffer()) { - return PI_SUCCESS; - } - - ScopedContext active(uniqueMemObj->get_context()); - - if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: - case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: - ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = PI_CHECK_ERROR( - cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = PI_CHECK_ERROR( - cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - }; - } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { - ret = PI_CHECK_ERROR( - cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); - ret = PI_CHECK_ERROR( - cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); - } - - } catch (pi_result err) { - ret = err; - } catch (...) { - ret = PI_ERROR_OUT_OF_RESOURCES; - } - - if (ret != PI_SUCCESS) { - // A reported CUDA error is either an implementation or an asynchronous CUDA - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. - sycl::detail::pi::die( - "Unrecoverable program state reached in cuda_piMemRelease"); - } - - return PI_SUCCESS; -} - -/// Implements a buffer partition in the CUDA backend. -/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented -/// as an offset over an existing CUDA allocation. -/// -pi_result cuda_piMemBufferPartition( - pi_mem parent_buffer, pi_mem_flags flags, - [[maybe_unused]] pi_buffer_create_type buffer_create_type, - void *buffer_create_info, pi_mem *memObj) { - assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT"); - assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS"); - assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT"); - - // Default value for flags means PI_MEM_FLAGS_ACCCESS_RW. - if (flags == 0) { - flags = PI_MEM_FLAGS_ACCESS_RW; - } - - assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) && - "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE"); - assert(memObj != nullptr); - - const auto bufferRegion = - *reinterpret_cast(buffer_create_info); - assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE"); - - assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) && - "Overflow"); - assert(((bufferRegion.origin + bufferRegion.size) <= - parent_buffer->mem_.buffer_mem_.get_size()) && - "PI_ERROR_INVALID_BUFFER_SIZE"); - // Retained indirectly due to retaining parent buffer below. 
- pi_context context = parent_buffer->context_; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - assert(parent_buffer->mem_.buffer_mem_.ptr_ != - _pi_mem::mem_::buffer_mem_::native_type{0}); - _pi_mem::mem_::buffer_mem_::native_type ptr = - parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin; - - void *hostPtr = nullptr; - if (parent_buffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(parent_buffer->mem_.buffer_mem_.hostPtr_) + - bufferRegion.origin; - } - - std::unique_ptr<_pi_mem> retMemObj{nullptr}; - try { - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; - } catch (pi_result err) { - *memObj = nullptr; - return err; - } catch (...) { - *memObj = nullptr; - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *memObj = retMemObj.release(); - return PI_SUCCESS; -} - -pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) { - sycl::detail::pi::die("cuda_piMemGetInfo not implemented"); -} - -/// Gets the native CUDA handle of a PI mem object -/// -/// \param[in] mem The PI mem to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI mem object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextMemGetNativeHandle(pi_mem mem, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); - return PI_SUCCESS; -} - -/// Created a PI mem object from a CUDA mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI mem object from. -/// \param[in] context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Indicates if we own the native memory handle or -/// it came from interop that asked to not transfer the ownership to SYCL RT. -/// \param[out] mem Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native handle not implemented"); - return {}; -} - /// Created a PI image mem object from a CUDA image mem handle. /// TODO: Implement this. /// NOTE: The created PI object takes ownership of the native handle. @@ -820,176 +637,6 @@ pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, return {}; } -/// \TODO Not implemented -pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, - const pi_image_format *image_format, - const pi_image_desc *image_desc, void *host_ptr, - pi_mem *ret_mem) { - // Need input memory object - assert(ret_mem != nullptr); - const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE)); - pi_result retErr = PI_SUCCESS; - - // We only support RBGA channel order - // TODO: check SYCL CTS and spec. May also have to support BGRA - if (image_format->image_channel_order != - pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) { - sycl::detail::pi::die( - "cuda_piMemImageCreate only supports RGBA channel order"); - } - - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives - // a minimum value of 1, so we need to convert the answer. 
- CUDA_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = image_desc->image_width; - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = image_desc->image_depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; - - switch (image_format->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = CU_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = CU_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; - break; - default: - sycl::detail::pi::die( - "cuda_piMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used image_desc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * - image_desc->image_height * image_desc->image_depth; - - ScopedContext active(context); - CUarray image_array; - retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); - - try { - if (performInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - 
cpy_desc.Height = image_desc->image_height; - cpy_desc.Depth = image_desc->image_depth; - retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); - } - } - - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC image_res_desc; - image_res_desc.res.array.hArray = image_array; - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.flags = 0; - - CUsurfObject surface; - retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); - - if (piMemObj == nullptr) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *ret_mem = piMemObj.release(); - } catch (pi_result err) { - cuArrayDestroy(image_array); - return err; - } catch (...) { - cuArrayDestroy(image_array); - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented -pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *, - size_t *) { - sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented"); - return {}; -} - -pi_result cuda_piMemRetain(pi_mem mem) { - assert(mem != nullptr); - assert(mem->get_reference_count() > 0); - mem->increment_reference_count(); - return PI_SUCCESS; -} - /// General 3D memory copy operation. /// This function requires the corresponding CUDA context to be at the top of /// the context stack @@ -1460,7 +1107,8 @@ pi_result cuda_piEnqueueMemImageRead( size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR( cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream)); @@ -1530,7 +1178,8 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR( cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream)); @@ -1601,7 +1250,8 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); + pi_mem_type imgType = + map_ur_mem_type(src_image->mem_.surface_mem_.get_image_type()); if (imgType == PI_MEM_TYPE_IMAGE1D) { retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, srcByteOffsetX, bytesToCopy)); @@ -1669,7 +1319,10 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, } // Allocate a pointer in the host to store the 
mapped information - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags); + // TODO(ur): Remove conversion when this is ported to UR. + ur_map_flags_t map_flags_ur; + pi2urMapFlags(map_flags, &map_flags_ur); + auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags_ur); *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); if (hostPtr) { ret_err = PI_SUCCESS; @@ -2477,15 +2130,16 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextQueueCreateWithNativeHandle2, pi2ur::piextQueueCreateWithNativeHandle2) // Memory - _PI_CL(piMemBufferCreate, cuda_piMemBufferCreate) - _PI_CL(piMemImageCreate, cuda_piMemImageCreate) - _PI_CL(piMemGetInfo, cuda_piMemGetInfo) - _PI_CL(piMemImageGetInfo, cuda_piMemImageGetInfo) - _PI_CL(piMemRetain, cuda_piMemRetain) - _PI_CL(piMemRelease, cuda_piMemRelease) - _PI_CL(piMemBufferPartition, cuda_piMemBufferPartition) - _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) - _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) + _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) + _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) + _PI_CL(piMemGetInfo, pi2ur::piMemGetInfo) + _PI_CL(piMemImageGetInfo, pi2ur::piMemImageGetInfo) + _PI_CL(piMemRetain, pi2ur::piMemRetain) + _PI_CL(piMemRelease, pi2ur::piMemRelease) + _PI_CL(piMemBufferPartition, pi2ur::piMemBufferPartition) + _PI_CL(piextMemGetNativeHandle, pi2ur::piextMemGetNativeHandle) + _PI_CL(piextMemCreateWithNativeHandle, pi2ur::piextMemCreateWithNativeHandle) + // Program _PI_CL(piProgramCreate, pi2ur::piProgramCreate) _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 0df35e53c2d27..c1c84fa2a4557 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -50,6 +50,7 @@ #include #include #include +#include // Share code between the PI Plugin and UR Adapter #include @@ -128,173 +129,8 @@ struct _pi_context : ur_context_handle_t_ { /// \brief Represents non-SVM allocations on the CUDA backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. -struct _pi_mem { - - // TODO: Move as much shared data up as possible - using pi_context = _pi_context *; - - // Context where the memory object is accessibles - pi_context context_; - - /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; - - /// A PI Memory object represents either plain memory allocations ("Buffers" - /// in OpenCL) or typed allocations ("Images" in OpenCL). - /// In CUDA their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces - /// This union allows implementation to use either from the same handler. - union mem_ { - // Handler for plain, pointer-based CUDA allocations - struct buffer_mem_ { - using native_type = CUdeviceptr; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - pi_mem parent_; - // CUDA handler for the pointer - native_type ptr_; - - /// Pointer associated with this device on the host - void *hostPtr_; - /// Size of the allocation in bytes - size_t size_; - /// Offset of the active mapped region. 
- size_t mapOffset_; - /// Pointer to the active mapped region, if any - void *mapPtr_; - /// Original flags for the mapped region - pi_map_flags mapFlags_; - - /** alloc_mode - * classic: Just a normal buffer allocated on the device via cuda malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; - - native_type get() const noexcept { return ptr_; } - - size_t get_size() const noexcept { return size_; } - - void *get_map_ptr() const noexcept { return mapPtr_; } - - size_t get_map_offset(void *) const noexcept { return mapOffset_; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the CUDA allocation. - /// - void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; - } else { - // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); - } - return mapPtr_; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(mapPtr_ != nullptr); - - if (mapPtr_ != hostPtr_) { - free(mapPtr_); - } - mapPtr_ = nullptr; - mapOffset_ = 0; - } - - pi_map_flags get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; - } - } buffer_mem_; - - // Handler data for surface object (i.e. Images) - struct surface_mem_ { - CUarray array_; - CUsurfObject surfObj_; - pi_mem_type imageType_; - - CUarray get_array() const noexcept { return array_; } - - CUsurfObject get_surface() const noexcept { return surfObj_; } - - pi_mem_type get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; - - /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - CUdeviceptr ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = PI_MAP_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - cuda_piMemRetain(mem_.buffer_mem_.parent_); - } else { - pi2ur::piContextRetain(context_); - } - }; - - /// Constructs the PI allocation for an Image object (surface in CUDA) - _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, - pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { - // Ignore unused parameter - (void)host_ptr; - - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.surfObj_ = surf; - mem_.surface_mem_.imageType_ = image_type; - pi2ur::piContextRetain(context_); - } - - ~_pi_mem() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - cuda_piMemRelease(mem_.buffer_mem_.parent_); - return; - } - } - pi2ur::piContextRelease(context_); - } - - // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } - - bool is_sub_buffer() const 
noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); - } - - bool is_image() const noexcept { return mem_type_ == mem_type::surface; } - - pi_context get_context() const noexcept { return context_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_mem : ur_mem_handle_t_ { + using ur_mem_handle_t_::ur_mem_handle_t_; }; /// PI queue mapping on to CUstream objects. diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 13ac8a5a1e138..86f3049697cf3 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -152,6 +152,8 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/sampler.hpp" "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp new file mode 100644 index 0000000000000..0827f09c79a9e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -0,0 +1,513 @@ +//===--------- memory.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "memory.hpp" + +/// Creates a UR Memory object using a CUDA memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using cuHostRegister +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Validate flags + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | + UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + UR_ASSERT(pProperties && pProperties->pHost, + UR_RESULT_ERROR_INVALID_HOST_PTR); + } + // Need input memory object + UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + uint64_t maxAlloc = 0; + urDeviceGetInfo(hContext->get_device(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(maxAlloc), &maxAlloc, nullptr); + UR_ASSERT(size <= maxAlloc, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. + const bool enableUseHostPtr = false; + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); + ur_result_t retErr = UR_RESULT_SUCCESS; + ur_mem_handle_t retMemObj = nullptr; + + try { + ScopedContext active(hContext); + CUdeviceptr ptr; + auto pHost = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { + retErr = UR_CHECK_ERROR( + cuMemHostRegister(pHost, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + retErr = UR_CHECK_ERROR(cuMemAllocHost(&pHost, size)); + retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); + allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + } else { + retErr = UR_CHECK_ERROR(cuMemAlloc(&ptr, size)); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + } + } + + if (retErr == UR_RESULT_SUCCESS) { + ur_mem_handle_t parentBuffer = nullptr; + + auto piMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, allocMode, ptr, pHost, size}); + if (piMemObj != nullptr) { + retMemObj = piMemObj.release(); + if (performInitialCopy) { + // Operates on the default stream of the current CUDA context. + retErr = UR_CHECK_ERROR(cuMemcpyHtoD(ptr, pHost, size)); + // Synchronize with default stream implicitly used by cuMemcpyHtoD + // to make buffer data available on device before any other UR call + // uses it. + if (retErr == UR_RESULT_SUCCESS) { + CUstream defaultStream = 0; + retErr = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + } + } + } else { + retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + } + } catch (ur_result_t err) { + retErr = err; + } catch (...) { + retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + *phBuffer = retMemObj; + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem->get_reference_count() > 0, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->increment_reference_count(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of the Mem object. 
+/// If this is zero, calls the relevant CUDA Free function +/// \return UR_RESULT_SUCCESS unless deallocation error +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t ret = UR_RESULT_SUCCESS; + + try { + + // Do nothing if there are other references + if (hMem->decrement_reference_count() > 0) { + return UR_RESULT_SUCCESS; + } + + // make sure hMem is released in case check_error_ur throws + std::unique_ptr uniqueMemObj(hMem); + + if (hMem->is_sub_buffer()) { + return UR_RESULT_SUCCESS; + } + + ScopedContext active(uniqueMemObj->get_context()); + + if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { + switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: + ret = UR_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: + ret = UR_CHECK_ERROR( + cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + break; + case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: + ret = UR_CHECK_ERROR( + cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + }; + } else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + ret = UR_CHECK_ERROR( + cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); + ret = UR_CHECK_ERROR( + cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); + } + + } catch (ur_result_t err) { + ret = err; + } catch (...) { + ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + if (ret != UR_RESULT_SUCCESS) { + // A reported CUDA error is either an implementation or an asynchronous CUDA + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. + sycl::detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR mem object +/// +/// \param[in] hMem The UR mem to get the native CUDA object of. +/// \param[out] phNativeMem Set to the native handle of the UR mem object. +/// +/// \return UR_RESULT_SUCCESS +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + *phNativeMem = + reinterpret_cast(hMem->mem_.buffer_mem_.get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + ScopedContext active(hMemory->get_context()); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + try { + size_t allocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &allocSize, + hMemory->mem_.buffer_mem_.ptr_)); + return ReturnValue(allocSize); + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->get_context()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( + ur_native_handle_t hNativeMem, ur_context_handle_t hContext, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, + const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, + void *pHost, ur_mem_handle_t *phMem) { + // Need input memory object + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); + } + const bool performInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); + + UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numMipLevel == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->numSamples == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pHost == nullptr && pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pHost == nullptr && pImageDesc->slicePitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + // We only support RBGA channel order + // TODO: check SYCL CTS and spec. May also have to support BGRA + UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + + // We have to use cuArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives + // a minimum value of 1, so we need to convert the answer. 
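+  // As a worked example (values are purely illustrative): a 2D RGBA float
+  // image of width 1024 and height 768 is described here with Width = 1024,
+  // Height = 768 and Depth = 0, even though pImageDesc reports depth = 1;
+  // a 1D image sets both Height and Depth to 0.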
+ CUDA_ARRAY3D_DESCRIPTOR array_desc; + array_desc.NumChannels = 4; // Only support 4 channel image + array_desc.Flags = 0; // No flags required + array_desc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + array_desc.Height = 0; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + array_desc.Height = pImageDesc->height; + array_desc.Depth = pImageDesc->depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t pixel_type_size_bytes; + + switch (pImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; + pixel_type_size_bytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + array_desc.Format = CU_AD_FORMAT_HALF; + pixel_type_size_bytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; + pixel_type_size_bytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + array_desc.Format = CU_AD_FORMAT_FLOAT; + pixel_type_size_bytes = 4; + break; + default: + sycl::detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + + // When a dimension isn't used pImageDesc has the size set to 1 + size_t pixel_size_bytes = + pixel_type_size_bytes * 4; // 4 is the only number of channels we support + size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; + + ScopedContext active(hContext); + CUarray image_array = nullptr; + try { + retErr = UR_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); + } catch (ur_result_t err) { + if (err == UR_RESULT_ERROR_INVALID_VALUE) { + return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; + } + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + try { + if (performInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + retErr = UR_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D cpy_desc; + memset(&cpy_desc, 0, sizeof(cpy_desc)); + cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + cpy_desc.srcHost = pHost; + cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + cpy_desc.dstArray = image_array; + cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; + cpy_desc.Height = pImageDesc->height; + cpy_desc.Depth = pImageDesc->depth; + retErr = UR_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); + } + } + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array + // handle. + // CUDA_RESOURCE_DESC::flags must be set to zero + + CUDA_RESOURCE_DESC image_res_desc; + image_res_desc.res.array.hArray = image_array; + image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; + image_res_desc.flags = 0; + + CUsurfObject surface; + retErr = UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); + + auto urMemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, image_array, surface, flags, pImageDesc->type, phMem)); + + if (urMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = urMemObj.release(); + } catch (ur_result_t err) { + if (image_array) { + cuArrayDestroy(image_array); + } + return err; + } catch (...) { + if (image_array) { + cuArrayDestroy(image_array); + } + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL +urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, + size_t propSize, void *pImgInfo, size_t *pPropSizeRet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// Implements a buffer partition in the CUDA backend. +/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented +/// as an offset over an existing CUDA allocation. +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
+ if (flags == 0) { + flags = UR_MEM_FLAG_READ_WRITE; + } + + UR_ASSERT(!(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), + UR_RESULT_ERROR_INVALID_VALUE); + if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + + UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); + UR_ASSERT(((pRegion->origin + pRegion->size) <= + hBuffer->mem_.buffer_mem_.get_size()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + // Retained indirectly due to retaining parent buffer below. + ur_context_handle_t context = hBuffer->context_; + + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + + assert(hBuffer->mem_.buffer_mem_.ptr_ != + ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}); + ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = + hBuffer->mem_.buffer_mem_.ptr_ + pRegion->origin; + + void *hostPtr = nullptr; + if (hBuffer->mem_.buffer_mem_.hostPtr_) { + hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + + pRegion->origin; + } + + std::unique_ptr retMemObj{nullptr}; + try { + retMemObj = std::unique_ptr{new ur_mem_handle_t_{ + context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; + } catch (ur_result_t err) { + *phMem = nullptr; + return err; + } catch (...) { + *phMem = nullptr; + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = retMemObj.release(); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp new file mode 100644 index 0000000000000..44484250f062b --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -0,0 +1,195 @@ +//===--------- memory.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +#include "common.hpp" + +/// UR Mem mapping to CUDA memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the CUDA backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. 
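+///
+/// A minimal buffer lifetime built on this handle, as a sketch (the context
+/// handle and size are illustrative):
+///   ur_mem_handle_t buf = nullptr;
+///   urMemBufferCreate(hContext, UR_MEM_FLAG_READ_WRITE, size, nullptr, &buf);
+///   urMemRetain(buf);   // refCount_ 1 -> 2
+///   urMemRelease(buf);  // refCount_ 2 -> 1
+///   urMemRelease(buf);  // refCount_ reaches 0, the CUDA allocation is freed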
+struct ur_mem_handle_t_ { + + // TODO: Move as much shared data up as possible + using ur_context = ur_context_handle_t_ *; + using ur_mem = ur_mem_handle_t_ *; + + // Context where the memory object is accessibles + ur_context context_; + + /// Reference counting of the handler + std::atomic_uint32_t refCount_; + enum class mem_type { buffer, surface } mem_type_; + + // Original mem flags passed + ur_mem_flags_t memFlags_; + + /// A UR Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In CUDA their API handlers are different. Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// This union allows implementation to use either from the same handler. + union mem_ { + // Handler for plain, pointer-based CUDA allocations + struct buffer_mem_ { + using native_type = CUdeviceptr; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem parent_; + // CUDA handler for the pointer + native_type ptr_; + + /// Pointer associated with this device on the host + void *hostPtr_; + /// Size of the allocation in bytes + size_t size_; + /// Offset of the active mapped region. + size_t mapOffset_; + /// Pointer to the active mapped region, if any + void *mapPtr_; + /// Original flags for the mapped region + ur_map_flags_t mapFlags_; + + /** alloc_mode + * classic: Just a normal buffer allocated on the device via cuda malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class alloc_mode { + classic, + use_host_ptr, + copy_in, + alloc_host_ptr + } allocMode_; + + native_type get() const noexcept { return ptr_; } + + size_t get_size() const noexcept { return size_; } + + void *get_map_ptr() const noexcept { return mapPtr_; } + + size_t get_map_offset(void *) const noexcept { return mapOffset_; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the CUDA allocation. + /// + void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { + assert(mapPtr_ == nullptr); + mapOffset_ = offset; + mapFlags_ = flags; + if (hostPtr_) { + mapPtr_ = static_cast(hostPtr_) + offset; + } else { + // TODO: Allocate only what is needed based on the offset + mapPtr_ = static_cast(malloc(this->get_size())); + } + return mapPtr_; + } + + /// Detach the allocation from the host memory. + void unmap(void *) noexcept { + assert(mapPtr_ != nullptr); + + if (mapPtr_ != hostPtr_) { + free(mapPtr_); + } + mapPtr_ = nullptr; + mapOffset_ = 0; + } + + ur_map_flags_t get_map_flags() const noexcept { + assert(mapPtr_ != nullptr); + return mapFlags_; + } + } buffer_mem_; + + // Handler data for surface object (i.e. 
Images) + struct surface_mem_ { + CUarray array_; + CUsurfObject surfObj_; + ur_mem_type_t imageType_; + + CUarray get_array() const noexcept { return array_; } + + CUsurfObject get_surface() const noexcept { return surfObj_; } + + ur_mem_type_t get_image_type() const noexcept { return imageType_; } + } surface_mem_; + } mem_; + + /// Constructs the UR mem handler for a non-typed allocation ("buffer") + ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, + mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, + void *host_ptr, size_t size) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, + memFlags_{mem_flags} { + mem_.buffer_mem_.ptr_ = ptr; + mem_.buffer_mem_.parent_ = parent; + mem_.buffer_mem_.hostPtr_ = host_ptr; + mem_.buffer_mem_.size_ = size; + mem_.buffer_mem_.mapOffset_ = 0; + mem_.buffer_mem_.mapPtr_ = nullptr; + mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; + mem_.buffer_mem_.allocMode_ = mode; + if (is_sub_buffer()) { + urMemRetain(mem_.buffer_mem_.parent_); + } else { + urContextRetain(context_); + } + }; + + /// Constructs the UR allocation for an Image object (surface in CUDA) + ur_mem_handle_t_(ur_context ctxt, CUarray array, CUsurfObject surf, + ur_mem_flags_t mem_flags, ur_mem_type_t image_type, + void *host_ptr) + : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, + memFlags_{mem_flags} { + // Ignore unused parameter + (void)host_ptr; + + mem_.surface_mem_.array_ = array; + mem_.surface_mem_.surfObj_ = surf; + mem_.surface_mem_.imageType_ = image_type; + urContextRetain(context_); + } + + ~ur_mem_handle_t_() { + if (mem_type_ == mem_type::buffer) { + if (is_sub_buffer()) { + urMemRelease(mem_.buffer_mem_.parent_); + return; + } + } + urContextRelease(context_); + } + + // TODO: Move as many shared funcs up as possible + bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + + bool is_sub_buffer() const noexcept { + return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + } + + bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + + ur_context get_context() const noexcept { return context_; } + + uint32_t increment_reference_count() noexcept { return ++refCount_; } + + uint32_t decrement_reference_count() noexcept { return --refCount_; } + + uint32_t get_reference_count() const noexcept { return refCount_; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 06ae75db02dec..35d807ffb6db4 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -145,16 +145,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBufferCreate = nullptr; - pDdiTable->pfnBufferPartition = nullptr; - pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; - pDdiTable->pfnImageCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; - pDdiTable->pfnGetNativeHandle = nullptr; - pDdiTable->pfnImageCreate = nullptr; - pDdiTable->pfnImageGetInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnBufferCreateWithNativeHandle = urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = 
urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; return UR_RESULT_SUCCESS; } From ef9f2243c146f19f52a1f4963cde6d5e74867a87 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Tue, 18 Apr 2023 14:16:38 +0100 Subject: [PATCH 16/45] Port USM entry points --- sycl/plugins/cuda/CMakeLists.txt | 1 + sycl/plugins/cuda/pi_cuda.cpp | 613 +----------------- sycl/plugins/unified_runtime/CMakeLists.txt | 5 +- .../ur/adapters/cuda/device.hpp | 2 + .../ur/adapters/cuda/enqueue.cpp | 372 +++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 22 +- .../unified_runtime/ur/adapters/cuda/usm.cpp | 256 ++++++++ 7 files changed, 659 insertions(+), 612 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index da4ce9476ee91..70e4e1a200e1a 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -74,6 +74,7 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/queue.cpp" "../unified_runtime/ur/adapters/cuda/sampler.cpp" "../unified_runtime/ur/adapters/cuda/sampler.hpp" + "../unified_runtime/ur/adapters/cuda/usm.cpp" "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" "../unified_runtime/ur/adapters/cuda/memory.cpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ab0d428e3613a..ed10a030b665c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -234,14 +234,6 @@ pi_result check_error(CUresult result, const char *function, int line, /// \cond NODOXY #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) -ScopedContext::ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); -} - /// \cond NODOXY template pi_result getInfoImpl(size_t param_value_size, void *param_value, @@ -286,13 +278,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, array_length * sizeof(T), memcpy); } - -int getAttribute(pi_device device, CUdevice_attribute attribute) { - int value; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; -} /// \endcond pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, @@ -325,34 +310,6 @@ pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, } } -template -void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, - CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - // checks with PI_CHECK_ERROR are not suggested - CUresult ret = cuPointerGetAttribute( - out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); - assert((*out_mem_type != CU_MEMORYTYPE_ARRAY && - *out_mem_type != CU_MEMORYTYPE_UNIFIED) && - "ARRAY, UNIFIED types are not supported!"); - - // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (ret == CUDA_ERROR_INVALID_VALUE) { - *out_mem_type = CU_MEMORYTYPE_HOST; - *out_dev_ptr = 0; - *out_host_ptr = usm_ptr; - - // todo: resets the above "non-stick" error - } else if 
(ret == CUDA_SUCCESS) { - *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast(usm_ptr) - : 0; - *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr; - } else { - PI_CHECK_ERROR(ret); - } -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -1413,548 +1370,6 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, return ret_err; } -/// USM: Implements USM Host allocations using CUDA Pinned Memory -/// -pi_result -cuda_piextUSMHostAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM device allocations using a normal CUDA device pointer -/// -pi_result -cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM Shared allocations using CUDA Managed Memory -/// -pi_result -cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size, - CU_MEM_ATTACH_GLOBAL)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Frees the given USM pointer associated with the context. 
-/// -pi_result cuda_piextUSMFree(pi_context context, void *ptr) { - assert(context != nullptr); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - bool is_managed; - unsigned int type; - void *attribute_values[2] = {&is_managed, &type}; - CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, - CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - result = PI_CHECK_ERROR(cuPointerGetAttributes( - 2, attributes, attribute_values, (CUdeviceptr)ptr)); - assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST); - if (is_managed || type == CU_MEMORYTYPE_DEVICE) { - // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed - // with cuMemFree - result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr)); - } else { - // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - result = PI_CHECK_ERROR(cuMemFreeHost(ptr)); - } - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, - size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = queue->get_next_compute_stream( - num_events_in_waitlist, - reinterpret_cast(events_waitlist), guard, - &stream_token); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemsetD8Async( - (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, - void *dst_ptr, const void *src_ptr, - size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(dst_ptr != nullptr); - assert(src_ptr != nullptr); - pi_result result = PI_SUCCESS; - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemcpyAsync( - (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - } - if (blocking) { - result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - if (event) { - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, - size_t size, - pi_usm_migration_flags flags, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - pi_device device = - reinterpret_cast(queue->get_context()->get_device()); - - // Certain cuda devices and Windows do not have support for 
some Unified - // Memory features. cuMemPrefetchAsync requires concurrent memory access - // for managed memory. Therfore, ignore prefetch hint if concurrent managed - // memory access is not available. - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Prefetch hint ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - unsigned int is_managed; - PI_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); - if (!is_managed) { - setErrorMessage("Prefetch hint ignored as prefetch only works with USM", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - // flags is currently unused so fail if set - if (flags != 0) - return PI_ERROR_INVALID_VALUE; - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream)); - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -/// USM: memadvise API to govern behavior of automatic migration mechanisms -pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, - size_t length, pi_mem_advice advice, - pi_event *event) { - assert(queue != nullptr); - assert(ptr != nullptr); - - // Certain cuda devices and Windows do not have support for some Unified - // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and - // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires - // the GPU device to report a non-zero value for - // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therfore, ignore memory - // advise if concurrent managed memory access is not available. - if (advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION || - advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION || - advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY || - advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY || - advice == PI_MEM_ADVICE_RESET) { - pi_device device = - reinterpret_cast(queue->get_context()->get_device()); - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - setErrorMessage("Mem advise ignored as device does not support " - "concurrent managed access", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - // TODO: If ptr points to valid system-allocated pageable memory we should - // check that the device also has the - // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property. 
- } - - unsigned int is_managed; - PI_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); - if (!is_managed) { - setErrorMessage( - "Memory advice ignored as memory advices only works with USM", - UR_RESULT_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream())); - event_ptr->start(); - } - - switch (advice) { - case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1), - queue->get_context()->get_device()->get())); - break; - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 - - (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST - - PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)), - CU_DEVICE_CPU)); - break; - case PI_MEM_ADVICE_RESET: - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_READ_MOSTLY, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_ACCESSED_BY, - queue->get_context()->get_device()->get())); - break; - default: - sycl::detail::pi::die("Unknown advice"); - } - if (event) { - result = map_ur_error(event_ptr->record()); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } catch (...) { - result = PI_ERROR_UNKNOWN; - } - return result; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. -pi_result cuda_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t, - const void *, size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented"); - return {}; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented. 
-pi_result cuda_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t, - size_t, pi_uint32, const pi_event *, - pi_event *) { - sycl::detail::pi::die("cuda_piextUSMEnqueueMemset2D: not implemented"); - return {}; -} - -/// 2D Memcpy API -/// -/// \param queue is the queue to submit to -/// \param blocking is whether this operation should block the host -/// \param dst_ptr is the location the data will be copied -/// \param dst_pitch is the total width of the destination memory including -/// padding -/// \param src_ptr is the data to be copied -/// \param dst_pitch is the total width of the source memory including padding -/// \param width is width in bytes of each row to be copied -/// \param height is height the columns to be copied -/// \param num_events_in_waitlist is the number of events to wait on -/// \param events_waitlist is an array of events to wait on -/// \param event is the event that represents this operation -pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking, - void *dst_ptr, size_t dst_pitch, - const void *src_ptr, size_t src_pitch, - size_t width, size_t height, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(queue != nullptr); - - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list, - event_wait_list); - if (event) { - (*event) = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, - queue, cuStream); - (*event)->start(); - } - - // Determine the direction of copy using cuPointerGetAttribute - // for both the src_ptr and dst_ptr - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - - getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, - &cpyDesc.srcHost); - getUSMHostOrDevicePtr(dst_ptr, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, - &cpyDesc.dstHost); - - cpyDesc.dstPitch = dst_pitch; - cpyDesc.srcPitch = src_pitch; - cpyDesc.WidthInBytes = width; - cpyDesc.Height = height; - - result = PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); - - if (event) { - (*event)->record(); - } - if (blocking) { - result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -/// API to query information about USM allocated pointers -/// Valid Queries: -/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value -/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if -/// the queried pointer fell inside an allocation. -/// Result must fit in void * -/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's -/// allocation is in bytes. Result is a size_t. 
-/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against -/// -/// \param context is the pi_context -/// \param ptr is the pointer to query -/// \param param_name is the type of query to perform -/// \param param_value_size is the size of the result in bytes -/// \param param_value is the result -/// \param param_value_size_ret is how many bytes were written -pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, - pi_mem_alloc_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - assert(context != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(context); - switch (param_name) { - case PI_MEM_ALLOC_TYPE: { - unsigned int value; - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult ret = cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr); - if (ret == CUDA_ERROR_INVALID_VALUE) { - // pointer not known to the CUDA subsystem - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - result = check_error(ret, __func__, __LINE__ - 5, __FILE__); - if (value) { - // pointer to managed memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_SHARED); - } - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr)); - assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST); - if (value == CU_MEMORYTYPE_DEVICE) { - // pointer to device memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_DEVICE); - } - if (value == CU_MEMORYTYPE_HOST) { - // pointer to host memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_HOST); - } - // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - case PI_MEM_ALLOC_BASE_PTR: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_SIZE: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_DEVICE: { - // get device index associated with this pointer - unsigned int device_idx; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr)); - - // currently each device is in its own platform, so find the platform at - // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = pi2ur::piPlatformsGet(device_idx + 1, platforms.data(), nullptr); - - // get the device from the platform - // TODO(ur): Remove cast when this entry point is moved to UR - pi_device device = - reinterpret_cast(platforms[device_idx]->devices_[0].get()); - return getInfo(param_value_size, param_value, 
param_value_size_ret, - device); - } - } - } catch (pi_result error) { - result = error; - } - return result; -} - pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -1984,7 +1399,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; - return cuda_piextUSMEnqueueMemcpy( + return pi2ur::piextUSMEnqueueMemcpy( queue, blocking_write, reinterpret_cast(device_global + offset), src, count, num_events_in_wait_list, event_wait_list, event); } catch (pi_result error) { @@ -2021,7 +1436,7 @@ pi_result cuda_piextEnqueueDeviceGlobalVariableRead( if (offset + count > device_global_size) return PI_ERROR_INVALID_VALUE; - return cuda_piextUSMEnqueueMemcpy( + return pi2ur::piextUSMEnqueueMemcpy( queue, blocking_read, dst, reinterpret_cast(device_global + offset), count, num_events_in_wait_list, event_wait_list, event); @@ -2206,18 +1621,18 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap) _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap) // USM - _PI_CL(piextUSMHostAlloc, cuda_piextUSMHostAlloc) - _PI_CL(piextUSMDeviceAlloc, cuda_piextUSMDeviceAlloc) - _PI_CL(piextUSMSharedAlloc, cuda_piextUSMSharedAlloc) - _PI_CL(piextUSMFree, cuda_piextUSMFree) - _PI_CL(piextUSMEnqueueMemset, cuda_piextUSMEnqueueMemset) - _PI_CL(piextUSMEnqueueMemcpy, cuda_piextUSMEnqueueMemcpy) - _PI_CL(piextUSMEnqueuePrefetch, cuda_piextUSMEnqueuePrefetch) - _PI_CL(piextUSMEnqueueMemAdvise, cuda_piextUSMEnqueueMemAdvise) - _PI_CL(piextUSMEnqueueFill2D, cuda_piextUSMEnqueueFill2D) - _PI_CL(piextUSMEnqueueMemset2D, cuda_piextUSMEnqueueMemset2D) - _PI_CL(piextUSMEnqueueMemcpy2D, cuda_piextUSMEnqueueMemcpy2D) - _PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo) + _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) + _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) + _PI_CL(piextUSMSharedAlloc, pi2ur::piextUSMSharedAlloc) + _PI_CL(piextUSMFree, pi2ur::piextUSMFree) + _PI_CL(piextUSMEnqueueMemset, pi2ur::piextUSMEnqueueMemset) + _PI_CL(piextUSMEnqueueMemcpy, pi2ur::piextUSMEnqueueMemcpy) + _PI_CL(piextUSMEnqueuePrefetch, pi2ur::piextUSMEnqueuePrefetch) + _PI_CL(piextUSMEnqueueMemAdvise, pi2ur::piextUSMEnqueueMemAdvise) + _PI_CL(piextUSMEnqueueFill2D, pi2ur::piextUSMEnqueueFill2D) + _PI_CL(piextUSMEnqueueMemset2D, pi2ur::piextUSMEnqueueMemset2D) + _PI_CL(piextUSMEnqueueMemcpy2D, pi2ur::piextUSMEnqueueMemcpy2D) + _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, cuda_piextEnqueueDeviceGlobalVariableWrite) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 86f3049697cf3..2288a8e9949e1 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -150,10 +150,11 @@ add_sycl_library("ur_adapter_cuda" SHARED "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/sampler.cpp" "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/ur_interface_loader.cpp" - "ur/adapters/cuda/tracing.cpp" "ur/adapters/cuda/memory.cpp" "ur/adapters/cuda/memory.hpp" + "ur/adapters/cuda/usm.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp index c2195c958cfd7..9d01edd8a5ec3 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp @@ -57,3 +57,5 @@ struct ur_device_handle_t_ { int get_max_work_group_size() const noexcept { return max_work_group_size; }; }; + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 68c70aa1ae9ec..7e0e7b5905f31 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -45,6 +45,89 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, } } +template +void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, + CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + // checks with PI_CHECK_ERROR are not suggested + CUresult ret = cuPointerGetAttribute( + out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); + // ARRAY, UNIFIED types are not supported! + assert(*out_mem_type != CU_MEMORYTYPE_ARRAY && + *out_mem_type != CU_MEMORYTYPE_UNIFIED); + + // pointer not known to the CUDA subsystem (possibly a system allocated ptr) + if (ret == CUDA_ERROR_INVALID_VALUE) { + *out_mem_type = CU_MEMORYTYPE_HOST; + *out_dev_ptr = 0; + *out_host_ptr = usm_ptr; + + // todo: resets the above "non-stick" error + } else if (ret == CUDA_SUCCESS) { + *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(usm_ptr) + : 0; + *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? 
usm_ptr : nullptr; + } else { + UR_CHECK_ERROR(ret); + } +} + +ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, + ur_usm_advice_flags_t ur_advice_flags, + CUdevice device) { + std::unordered_map + URToCUMemAdviseDeviceFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, + CU_MEM_ADVISE_UNSET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { + if (ur_advice_flags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, device)); + } + } + + std::unordered_map + URToCUMemAdviseHostFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + + for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { + if (ur_advice_flags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, CU_DEVICE_CPU)); + } + } + + std::array UnmappedMemAdviceFlags = { + UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; + + for (auto &unMappedFlag : UnmappedMemAdviceFlags) { + if (ur_advice_flags & unMappedFlag) { + throw UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return UR_RESULT_SUCCESS; +} + // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. @@ -389,3 +472,292 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } return retError; } + +/// TODO(ur): Add support for the offset. 
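A note on the getUSMHostOrDevicePtr helper introduced above: it relies on cuPointerGetAttribute returning CUDA_ERROR_INVALID_VALUE for pointers the CUDA driver does not track, and treats those as plain host memory. The standalone sketch below is illustrative only and not part of the patch; it assumes device 0, a freshly created context, and reduces error handling to asserts.

// Sketch: classify a pointer the same way getUSMHostOrDevicePtr does.
#include <cuda.h>
#include <cassert>
#include <cstdio>

static const char *classify(const void *ptr) {
  CUmemorytype memType{};
  CUresult ret = cuPointerGetAttribute(
      &memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
      reinterpret_cast<CUdeviceptr>(ptr));
  if (ret == CUDA_ERROR_INVALID_VALUE)
    return "system pointer (treated as host memory)";
  assert(ret == CUDA_SUCCESS);
  return memType == CU_MEMORYTYPE_DEVICE ? "device pointer" : "host pointer";
}

int main() {
  CUdevice dev;
  CUcontext ctx;
  assert(cuInit(0) == CUDA_SUCCESS);
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  CUdeviceptr devPtr;
  void *pinned;
  int stackVar = 0;
  assert(cuMemAlloc(&devPtr, 64) == CUDA_SUCCESS);
  assert(cuMemAllocHost(&pinned, 64) == CUDA_SUCCESS);

  std::printf("%s\n", classify(reinterpret_cast<void *>(devPtr))); // device
  std::printf("%s\n", classify(pinned));                           // host
  std::printf("%s\n", classify(&stackVar));                        // system

  cuMemFreeHost(pinned);
  cuMemFree(devPtr);
  cuCtxDestroy(ctx);
  return 0;
}

The same CUDA_ERROR_INVALID_VALUE convention is what urUSMGetMemAllocInfo, added later in this series, uses to report UR_USM_TYPE_UNKNOWN.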
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, void *ptr, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + uint32_t stream_token; + ur_stream_guard_ guard; + CUstream cuStream = hQueue->get_next_compute_stream( + numEventsInWaitList, phEventWaitList, guard, &stream_token); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_FILL, hQueue, cuStream, stream_token)); + event_ptr->start(); + } + switch (patternSize) { + case 1: + result = UR_CHECK_ERROR( + cuMemsetD8Async((CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, + size, cuStream)); + break; + case 2: + result = UR_CHECK_ERROR(cuMemsetD16Async( + (CUdeviceptr)ptr, *((const uint16_t *)pPattern) & 0xFFFF, size, + cuStream)); + break; + case 4: + result = UR_CHECK_ERROR(cuMemsetD32Async( + (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, size, + cuStream)); + break; + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_USM_MEMCPY, hQueue, cuStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, cuStream)); + if (phEvent) { + result = event_ptr->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + if (phEvent) { + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_device_handle_t device = hQueue->get_context()->get_device(); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. cuMemPrefetchAsync requires concurrent memory access + // for managed memory. Therefore, ignore prefetch hint if concurrent managed + // memory access is not available.
+ if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Prefetch hint ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + unsigned int is_managed; + UR_CHECK_ERROR(cuPointerGetAttribute( + &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!is_managed) { + setErrorMessage("Prefetch hint ignored as prefetch only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // flags is currently unused so fail if set + if (flags != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + event_ptr = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, cuStream)); + event_ptr->start(); + } + result = UR_CHECK_ERROR( + cuMemPrefetchAsync((CUdeviceptr)pMem, size, device->get(), cuStream)); + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +/// USM: memadvise API to govern behavior of automatic migration mechanisms +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION + // to cuMemAdvise on a GPU device requires the GPU device to report a non-zero + // value for CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore + // memory advice if concurrent managed memory access is not available. + if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { + ur_device_handle_t device = hQueue->get_context()->get_device(); + if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Mem advise ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // TODO: If ptr points to valid system-allocated pageable memory we should + // check that the device also has the + // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
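Both urEnqueueUSMPrefetch above and urEnqueueUSMAdvise here gate managed-memory hints on device attributes queried through cuDeviceGetAttribute (wrapped by the getAttribute helper declared in device.hpp), and the TODO notes that system-allocated pageable pointers would additionally need CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. A standalone sketch of that guard, illustrative only and not part of the patch (assumes device 0 and uses asserts in place of real error handling):

#include <cuda.h>
#include <cassert>

int main() {
  CUdevice dev;
  CUcontext ctx;
  assert(cuInit(0) == CUDA_SUCCESS);
  assert(cuDeviceGet(&dev, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&ctx, 0, dev) == CUDA_SUCCESS);

  int concurrentManaged = 0, pageableAccess = 0;
  assert(cuDeviceGetAttribute(&concurrentManaged,
                              CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
                              dev) == CUDA_SUCCESS);
  // Would matter for advising system-allocated pageable pointers (see TODO).
  assert(cuDeviceGetAttribute(&pageableAccess,
                              CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS,
                              dev) == CUDA_SUCCESS);
  (void)pageableAccess;

  CUdeviceptr managed;
  assert(cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL) ==
         CUDA_SUCCESS);

  // Mirror the adapter: skip the hints entirely when concurrent managed
  // access is unavailable (e.g. on Windows or older devices).
  if (concurrentManaged) {
    assert(cuMemAdvise(managed, 1 << 20, CU_MEM_ADVISE_SET_READ_MOSTLY, dev) ==
           CUDA_SUCCESS);
    assert(cuMemPrefetchAsync(managed, 1 << 20, dev, /*hStream=*/0) ==
           CUDA_SUCCESS);
    assert(cuCtxSynchronize() == CUDA_SUCCESS);
  }

  cuMemFree(managed);
  cuCtxDestroy(ctx);
  return 0;
}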
+ } + + unsigned int is_managed; + UR_CHECK_ERROR(cuPointerGetAttribute( + &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!is_managed) { + setErrorMessage( + "Memory advice ignored as memory advices only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + ur_result_t result = UR_RESULT_SUCCESS; + std::unique_ptr event_ptr{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + if (phEvent) { + event_ptr = std::unique_ptr( + ur_event_handle_t_::make_native(UR_COMMAND_USM_ADVISE, hQueue, + hQueue->get_next_transfer_stream())); + event_ptr->start(); + } + + if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_READ_MOSTLY, + hQueue->get_context()->get_device()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, + hQueue->get_context()->get_device()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_ACCESSED_BY, + hQueue->get_context()->get_device()->get())); + } else { + result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->get_context()->get_device()->get()); + } + + if (phEvent) { + result = event_ptr->record(); + *phEvent = event_ptr.release(); + } + } catch (ur_result_t err) { + result = err; + } catch (...) { + result = UR_RESULT_ERROR_UNKNOWN; + } + return result; +} + +// TODO: Implement this. Remember to return true for +// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + (*phEvent) = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream); + (*phEvent)->start(); + } + + // Determine the direction of copy using cuPointerGetAttribute + // for both the src_ptr and dst_ptr + CUDA_MEMCPY2D cpyDesc = {0}; + + getUSMHostOrDevicePtr(pSrc, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, + &cpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, + &cpyDesc.dstHost); + + cpyDesc.dstPitch = dstPitch; + cpyDesc.srcPitch = srcPitch; + cpyDesc.WidthInBytes = width; + cpyDesc.Height = height; + + result = UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); + + if (phEvent) { + (*phEvent)->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 
35d807ffb6db4..07ed631c5b31e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -181,12 +181,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemImageRead = nullptr; pDdiTable->pfnMemImageWrite = nullptr; pDdiTable->pfnMemUnmap = nullptr; - pDdiTable->pfnUSMFill2D = nullptr; - pDdiTable->pfnUSMFill = nullptr; - pDdiTable->pfnUSMAdvise = nullptr; - pDdiTable->pfnUSMMemcpy2D = nullptr; - pDdiTable->pfnUSMMemcpy = nullptr; - pDdiTable->pfnUSMPrefetch = nullptr; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; + pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; return UR_RESULT_SUCCESS; } @@ -225,14 +225,14 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnDeviceAlloc = nullptr; - pDdiTable->pfnFree = nullptr; - pDdiTable->pfnGetMemAllocInfo = nullptr; - pDdiTable->pfnHostAlloc = nullptr; + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; pDdiTable->pfnPoolDestroy = nullptr; pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnSharedAlloc = nullptr; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp new file mode 100644 index 0000000000000..0309d4a7b627a --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -0,0 +1,256 @@ +//===--------- usm.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" +#include "platform.hpp" +#include "queue.hpp" + +#include + +/// USM: Implements USM Host allocations using CUDA Pinned Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM device allocations using a normal CUDA device pointer +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Implements USM Shared allocations using CUDA Managed Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t device_max_mem_alloc_size = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&device_max_mem_alloc_size), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size 
<= device_max_mem_alloc_size, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + result = UR_CHECK_ERROR( + cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); + } catch (ur_result_t error) { + result = error; + } + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return result; +} + +/// USM: Frees the given USM pointer associated with the context. +/// +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, + void *pMem) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(hContext); + bool is_managed; + unsigned int type; + void *attribute_values[2] = {&is_managed, &type}; + CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; + result = UR_CHECK_ERROR(cuPointerGetAttributes( + 2, attributes, attribute_values, (CUdeviceptr)pMem)); + UR_ASSERT(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (is_managed || type == CU_MEMORYTYPE_DEVICE) { + // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed + // with cuMemFree + result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); + } else { + // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost + result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); + } + } catch (ur_result_t error) { + result = error; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t result = UR_RESULT_SUCCESS; + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + try { + ScopedContext active(hContext); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + unsigned int value; + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + CUresult ret = cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (ret == CUDA_ERROR_INVALID_VALUE) { + // pointer not known to the CUDA subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); + if (value) { + // pointer to managed memory + return ReturnValue(UR_USM_TYPE_SHARED); + } + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (value == CU_MEMORYTYPE_DEVICE) { + // pointer to device memory + return ReturnValue(UR_USM_TYPE_DEVICE); + } + if (value == CU_MEMORYTYPE_HOST) { + // pointer to host memory + return ReturnValue(UR_USM_TYPE_HOST); + } + // should never get here +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + case UR_USM_ALLOC_INFO_BASE_PTR: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR 
was introduced in CUDA 10.2 + unsigned int value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_SIZE: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 + unsigned int value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_DEVICE: { + // get device index associated with this pointer + unsigned int device_idx; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)pMem)); + + // currently each device is in its own platform, so find the platform at + // the same index + std::vector platforms; + platforms.resize(device_idx + 1); + result = urPlatformGet(device_idx + 1, platforms.data(), nullptr); + + // get the device from the platform + ur_device_handle_t device = platforms[device_idx]->devices_[0].get(); + return ReturnValue(device); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } catch (ur_result_t error) { + result = error; + } + return result; +} From d185543356b377e682ab8e2076c99691b2666b72 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Thu, 27 Apr 2023 10:55:46 +0100 Subject: [PATCH 17/45] [UR][CUDA][SYCL] Fix sycl-e2e tests --- .../ur/adapters/cuda/memory.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index 0827f09c79a9e..59975b0a7b821 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -23,8 +23,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Validate flags UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - if (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | - UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + if (flags & + (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { UR_ASSERT(pProperties && pProperties->pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } @@ -251,8 +251,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - if (flags & (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | - UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { + if (flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } const bool performInitialCopy = @@ -267,10 +267,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); UR_ASSERT(pImageDesc->numSamples == 0, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pHost == nullptr && pImageDesc->rowPitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pHost == nullptr && pImageDesc->slicePitch == 0, - UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + if (!pHost) { + UR_ASSERT(pImageDesc->rowPitch == 0, + UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + UR_ASSERT(pImageDesc->slicePitch == 0, + 
UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); + } ur_result_t retErr = UR_RESULT_SUCCESS; From 398f3e9603caa3cb70e5eb7eecc044c88c41a841 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 26 Apr 2023 11:24:54 +0100 Subject: [PATCH 18/45] [UR][CUDA] Port urEnqueueRead/Write & setArgMemObj --- sycl/plugins/cuda/pi_cuda.cpp | 10 +- .../ur/adapters/cuda/enqueue.cpp | 113 ++++++++++++++++++ .../ur/adapters/cuda/kernel.cpp | 34 ++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +- 4 files changed, 155 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ed10a030b665c..06650450a0a32 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1287,7 +1287,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { // Pinned host memory is already on host so it doesn't need to be read. - ret_err = cuda_piEnqueueMemBufferRead( + ret_err = pi2ur::piEnqueueMemBufferRead( command_queue, buffer, blocking_map, offset, size, hostPtr, num_events_in_wait_list, event_wait_list, event); } else { @@ -1340,7 +1340,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, (memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE_INVALIDATE_REGION))) { // Pinned host memory is only on host so it doesn't need to be written to. - ret_err = cuda_piEnqueueMemBufferWrite( + ret_err = pi2ur::piEnqueueMemBufferWrite( command_queue, memobj, true, memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), memobj->mem_.buffer_mem_.get_size(), mapped_ptr, @@ -1607,9 +1607,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) - _PI_CL(piEnqueueMemBufferRead, cuda_piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, cuda_piEnqueueMemBufferWrite) + _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) @@ -1643,7 +1643,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) - _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 7e0e7b5905f31..674bea82ddef9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -10,6 +10,7 @@ #include "context.hpp" #include "event.hpp" #include "kernel.hpp" +#include "memory.hpp" #include "queue.hpp" #include @@ -761,3 +762,115 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( } return result; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, 
ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_READ, hQueue, cuStream)); + retImplEv->start(); + } + + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, devPtr + offset, size, cuStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, cuStream)); + retImplEv->start(); + } + + UR_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, pSrc, size, cuStream)); + + if (phEvent) { + retErr = retImplEv->record(); + } + + if (blockingWrite) { + UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = retImplEv.release(); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e80960f7ceb3c..e0f07b41e611b 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ 
-7,6 +7,7 @@ //===-----------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -290,6 +291,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hArgValue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + UR_CHECK_ERROR(cuArray3DGetDescriptor( + &arrayDesc, hArgValue->mem_.surface_mem_.get_array())); + if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_HALF && + arrayDesc.Format != CU_AD_FORMAT_FLOAT) { + setErrorMessage("PI CUDA kernels only support images with channel " + "types int32, uint32, float, and half.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + CUsurfObject cuSurf = hArgValue->mem_.surface_mem_.get_surface(); + hKernel->set_kernel_arg(argIndex, sizeof(cuSurf), (void *)&cuSurf); + } else { + CUdeviceptr cuPtr = hArgValue->mem_.buffer_mem_.get(); + hKernel->set_kernel_arg(argIndex, sizeof(CUdeviceptr), (void *)&cuPtr); + } + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + // A NOP for the CUDA backend UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 07ed631c5b31e..085f87ab799ce 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -115,7 +115,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnRelease = urKernelRelease; pDdiTable->pfnRetain = urKernelRetain; pDdiTable->pfnSetArgLocal = nullptr; - pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; pDdiTable->pfnSetArgSampler = nullptr; pDdiTable->pfnSetArgValue = urKernelSetArgValue; @@ -173,9 +173,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; pDdiTable->pfnMemBufferWriteRect = nullptr; pDdiTable->pfnMemImageCopy = nullptr; pDdiTable->pfnMemImageRead = nullptr; From e02d3d302d32fec3abdd8e69d0c1fdbbbfd9d12f Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Fri, 28 Apr 2023 10:08:38 +0100 Subject: [PATCH 19/45] Port piextKernelSetArgSampler --- sycl/plugins/cuda/pi_cuda.cpp | 175 +----------------- .../ur/adapters/cuda/kernel.cpp | 24 ++- .../ur/adapters/cuda/ur_interface_loader.cpp | 2 +- 3 files changed, 25 insertions(+), 176 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index 06650450a0a32..7b599e17dd04f 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -413,179 +413,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// Created a PI image mem object from a CUDA image mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] pi_native_handle The native handle to create PI mem object from. -/// \param[in] pi_context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Boolean indicates if we own the native memory -/// handle or it came from interop that asked to not transfer the ownership to -/// SYCL RT. \param[in] pi_image_format The format of the image. \param[in] -/// pi_image_desc The description information for the image. \param[out] pi_mem -/// Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemImageCreateWithNativeHandle(pi_native_handle, pi_context, - bool, - const pi_image_format *, - const pi_image_desc *, - pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native image handle not implemented"); - return {}; -} - -pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_write, size_t offset, - size_t size, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_read, size_t offset, - size_t size, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream)); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if 
(event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, - const pi_mem *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - // Below sets kernel arg when zero-sized buffers are handled. - // In such case the corresponding memory is null. - if (*arg_value == nullptr) { - kernel->set_kernel_arg(arg_index, 0, nullptr); - return PI_SUCCESS; - } - - pi_result retErr = PI_SUCCESS; - try { - pi_mem arg_mem = *arg_value; - if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - PI_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, arg_mem->mem_.surface_mem_.get_array())); - if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_HALF && - arrayDesc.Format != CU_AD_FORMAT_FLOAT) { - setErrorMessage("PI CUDA kernels only support images with channel " - "types int32, uint32, float, and half.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface(); - kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf); - } else { - CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get(); - kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, - const pi_sampler *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - pi_result retErr = PI_SUCCESS; - try { - pi_uint32 samplerProps = (*arg_value)->props_; - kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - /// \TODO Not implemented pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, pi_uint32, const pi_mem *, const void **, @@ -1644,7 +1471,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) - _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) + _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index e0f07b41e611b..69f86ca319df5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -8,6 +8,7 @@ #include "kernel.hpp" #include "memory.hpp" +#include "sampler.hpp" UR_APIEXPORT ur_result_t UR_APICALL urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, @@ -295,7 +296,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hArgValue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // Below sets kernel arg when zero-sized buffers are handled. + // In such case the corresponding memory is null. 
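Background on the set_kernel_arg calls in this hunk: cuLaunchKernel takes kernelParams as an array of pointers to the argument values, so the adapter stores each argument by value (a CUdeviceptr for buffers, a CUsurfObject for images) and passes its address at launch. A minimal sketch, illustrative only (func is assumed to come from cuModuleGetFunction elsewhere, and the launch configuration is arbitrary):

#include <cuda.h>

// Launch a kernel taking (pointer, int); each kernelParams entry is the
// address of the corresponding argument's storage.
CUresult launchWithBufferArg(CUfunction func, CUdeviceptr buffer, int n,
                             CUstream stream) {
  void *kernelParams[] = {&buffer, &n};
  return cuLaunchKernel(func,
                        /*gridDimX=*/1, /*gridDimY=*/1, /*gridDimZ=*/1,
                        /*blockDimX=*/32, /*blockDimY=*/1, /*blockDimZ=*/1,
                        /*sharedMemBytes=*/0, stream, kernelParams,
                        /*extra=*/nullptr);
}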
+ if (hArgValue == nullptr) { + hKernel->set_kernel_arg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } ur_result_t retErr = UR_RESULT_SUCCESS; try { @@ -338,3 +345,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_kernel_handle_t *phKernel) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, + ur_sampler_handle_t hArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + try { + uint32_t samplerProps = hArgValue->props_; + hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 085f87ab799ce..d7751a02e9707 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -117,7 +117,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgLocal = nullptr; pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; - pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; From 816652ac6e2fe054b963f7b6bdaf32697f206791 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Fri, 28 Apr 2023 11:08:47 +0100 Subject: [PATCH 20/45] [UR][SYCL][CUDA] Point PI to correct entry point --- sycl/plugins/cuda/pi_cuda.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 7b599e17dd04f..51ffd1fb63455 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1436,7 +1436,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferWrite) _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) From 6d648f66266e0a921c3efcdb0227333fe75e7f18 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 27 Apr 2023 11:47:20 +0100 Subject: [PATCH 21/45] Port remaining queue entry-points --- sycl/plugins/cuda/pi_cuda.cpp | 916 +----------------- .../ur/adapters/cuda/enqueue.cpp | 768 ++++++++++++++- .../ur/adapters/cuda/ur_interface_loader.cpp | 20 +- 3 files changed, 780 insertions(+), 924 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 51ffd1fb63455..09c2fddc6e207 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -73,83 +73,6 @@ pi_result cuda_piPluginGetBackendOption(pi_platform, return PI_ERROR_INVALID_VALUE; } -pi_result map_ur_error(ur_result_t result) { - switch (result) { - case UR_RESULT_SUCCESS: - return PI_SUCCESS; - case UR_RESULT_ERROR_INVALID_OPERATION: - return 
PI_ERROR_INVALID_OPERATION; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return PI_ERROR_INVALID_CONTEXT; - case UR_RESULT_ERROR_INVALID_DEVICE: - return PI_ERROR_INVALID_DEVICE; - case UR_RESULT_ERROR_INVALID_VALUE: - return PI_ERROR_INVALID_VALUE; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -pi_mem_type map_ur_mem_type(ur_mem_type_t mem_type) { - switch (mem_type) { - case UR_MEM_TYPE_BUFFER: - default: - return PI_MEM_TYPE_BUFFER; - case UR_MEM_TYPE_IMAGE2D: - return PI_MEM_TYPE_IMAGE2D; - case UR_MEM_TYPE_IMAGE3D: - return PI_MEM_TYPE_IMAGE3D; - case UR_MEM_TYPE_IMAGE2D_ARRAY: - return PI_MEM_TYPE_IMAGE2D_ARRAY; - case UR_MEM_TYPE_IMAGE1D: - return PI_MEM_TYPE_IMAGE1D; - case UR_MEM_TYPE_IMAGE1D_ARRAY: - return PI_MEM_TYPE_IMAGE1D_ARRAY; - case UR_MEM_TYPE_IMAGE1D_BUFFER: - return PI_MEM_TYPE_IMAGE1D_BUFFER; - } -} - -template -inline pi_result -ConvertInputBitfield(pi_bitfield in, TypeOut *out, - const std::unordered_map &map) { - *out = 0; - for (auto &[FlagPI, FlagUR] : map) { - if (in & FlagPI) { - *out |= FlagUR; - } - } - - return PI_SUCCESS; -} - -// Convert bitfield flags from PI to UR for MemFlags -inline pi_result pi2urMemFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { - static const std::unordered_map MemFlagsMap = { - {PI_MEM_FLAGS_ACCESS_RW, UR_MEM_FLAG_READ_WRITE}, - {PI_MEM_ACCESS_READ_ONLY, UR_MEM_FLAG_READ_ONLY}, - {PI_MEM_FLAGS_HOST_PTR_USE, UR_MEM_FLAG_USE_HOST_POINTER}, - {PI_MEM_FLAGS_HOST_PTR_COPY, UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER}, - {PI_MEM_FLAGS_HOST_PTR_ALLOC, UR_MEM_FLAG_ALLOC_HOST_POINTER}, - }; - - return ConvertInputBitfield(piFlags, urFlags, MemFlagsMap); -} - -// Convert bitfield flags from PI to UR for MapFlags -inline pi_result pi2urMapFlags(pi_mem_flags piFlags, ur_mem_flags_t *urFlags) { - static const std::unordered_map MapFlagsMap = { - {PI_MAP_READ, UR_MAP_FLAG_READ}, - {PI_MAP_WRITE, UR_MAP_FLAG_WRITE}, - }; - return ConvertInputBitfield(piFlags, urFlags, MapFlagsMap); -} - // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -280,36 +203,6 @@ pi_result getInfoArray(size_t array_length, size_t param_value_size, } /// \endcond -pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list) { - if (!event_wait_list) { - return PI_SUCCESS; - } - try { - ScopedContext active(command_queue->get_context()); - - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { - if (event->get_stream() == stream) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); - } - }); - - if (result != PI_SUCCESS) { - return result; - } - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. 
@@ -413,790 +306,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -/// \TODO Not implemented -pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, - pi_uint32, const pi_mem *, const void **, - pi_uint32, const pi_event *, pi_event *) { - sycl::detail::pi::die("Not implemented in CUDA backend"); - return {}; -} - -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr -/// must be a pointer to a CUdeviceptr -static pi_result commonEnqueueMemBufferCopyRect( - CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, - const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - assert(region != nullptr); - assert(src_offset != nullptr); - assert(dst_offset != nullptr); - - assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); - - src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes - : src_row_pitch; - src_slice_pitch = - (!src_slice_pitch) - ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch) - : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? region->width_bytes + dst_offset->x_bytes - : dst_row_pitch; - dst_slice_pitch = - (!dst_slice_pitch) - ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch) - : dst_slice_pitch; - - CUDA_MEMCPY3D params = {}; - - params.WidthInBytes = region->width_bytes; - params.Height = region->height_scalar; - params.Depth = region->depth_scalar; - - params.srcMemoryType = src_type; - params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(src_ptr) - : 0; - params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; - params.srcXInBytes = src_offset->x_bytes; - params.srcY = src_offset->y_scalar; - params.srcZ = src_offset->z_scalar; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(dst_ptr) - : 0; - params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? 
dst_ptr : nullptr; - params.dstXInBytes = dst_offset->x_bytes; - params.dstY = dst_offset->y_scalar; - params.dstZ = dst_offset->z_scalar; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - return PI_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); -} - -pi_result cuda_piEnqueueMemBufferReadRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST, - host_offset, host_row_pitch, host_slice_pitch); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferWriteRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch, - host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch); - - if (event) { - retErr = map_ur_error(retImplEv->record()); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, - pi_mem dst_buffer, size_t src_offset, - size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, - 
const pi_event *event_wait_list, - pi_event *event) { - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - pi_result result; - - auto stream = command_queue->get_next_transfer_stream(); - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; - auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; - - result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -pi_result cuda_piEnqueueMemBufferCopyRect( - pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, - pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, - size_t dst_row_pitch, size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, - dst_origin, dst_row_pitch, dst_slice_pitch); - - if (event) { - retImplEv->record(); - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, - const void *pattern, size_t pattern_size, - size_t offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(command_queue != nullptr); - - auto args_are_multiples_of_pattern_size = - (offset % pattern_size == 0) || (size % pattern_size == 0); - - auto pattern_is_valid = (pattern != nullptr); - - auto pattern_size_is_valid = - ((pattern_size & (pattern_size - 1)) == 0) && // is power of two - (pattern_size > 0) && (pattern_size <= 128); // falls within valid range - - assert(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid); - (void)args_are_multiples_of_pattern_size; - (void)pattern_is_valid; - (void)pattern_size_is_valid; - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - - auto stream = command_queue->get_next_transfer_stream(); - pi_result result; - result = enqueueEventsWait(command_queue, stream, 
num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = map_ur_error(retImplEv->start()); - } - - auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; - auto N = size / pattern_size; - - // pattern size in bytes - switch (pattern_size) { - case 1: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); - break; - } - case 2: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); - break; - } - case 4: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); - break; - } - default: { - // CUDA has no memset functions that allow setting values more than 4 - // bytes. PI API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. We must break up the pattern - // into 4 byte values, and set the buffer using multiple strided calls. - // This means that one cuMemsetD2D32Async call is made for every 4 bytes - // in the pattern. - - auto number_of_steps = pattern_size / sizeof(uint32_t); - - // we walk up the pattern in 4-byte steps, and call cuMemset for each - // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { - // take 4 bytes of the pattern - auto value = *(static_cast(pattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); - - // set all of the pattern chunks - result = PI_CHECK_ERROR( - cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream)); - } - - break; - } - } - - if (event) { - result = map_ur_error(retImplEv->record()); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { - switch (array_desc.Format) { - case CU_AD_FORMAT_UNSIGNED_INT8: - case CU_AD_FORMAT_SIGNED_INT8: - return 1; - case CU_AD_FORMAT_UNSIGNED_INT16: - case CU_AD_FORMAT_SIGNED_INT16: - case CU_AD_FORMAT_HALF: - return 2; - case CU_AD_FORMAT_UNSIGNED_INT32: - case CU_AD_FORMAT_SIGNED_INT32: - case CU_AD_FORMAT_FLOAT: - return 4; - default: - sycl::detail::pi::die("Invalid image format."); - return 0; - } -} - -/// General ND memory copy operation for images (where N > 1). 
-/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr -/// must be a pointer to a CUarray -static pi_result commonEnqueueMemImageNDCopy( - CUstream cu_stream, pi_mem_type img_type, const size_t *region, - const void *src_ptr, const CUmemorytype_enum src_type, - const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type, - const size_t *dst_offset) { - assert(region != nullptr); - - assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST); - - if (img_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream)); - } - if (img_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - cpyDesc.srcZ = src_offset[2]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - cpyDesc.dstZ = dst_offset[2]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - cpyDesc.Depth = region[2]; - return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream)); - } - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEnqueueMemImageRead( - pi_queue command_queue, pi_mem image, pi_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)row_pitch; - (void)slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyAtoHAsync(ptr, array, byteOffsetX, 
bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, - srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result -cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, - pi_bool blocking_write, const size_t *origin, - const size_t *region, size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)blocking_write; - (void)input_row_pitch; - (void)input_slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr, - &array, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, - pi_mem dst_image, const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(src_image->mem_type_ == _pi_mem::mem_type::surface); - assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); - assert(src_image->mem_.surface_mem_.get_image_type() == - dst_image->mem_.surface_mem_.get_image_type()); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray srcArray = src_image->mem_.surface_mem_.get_array(); - CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); - - assert(srcArrayDesc.Format == dstArrayDesc.Format); - assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels); - - int elementByteSize = imageElementByteSize(srcArrayDesc); - - size_t dstByteOffsetX = - dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = - map_ur_mem_type(src_image->mem_.surface_mem_.get_image_type()); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented in CUDA. -pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, - const size_t *, const size_t *, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented"); - return {}; -} - -/// Implements mapping on the host using a BufferRead operation. -/// Mapped pointers are stored in the pi_mem object. -/// If the buffer uses pinned host memory a pointer to that memory is returned -/// and no read operation is done. 
-/// -pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_map, - pi_map_flags map_flags, size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event, void **ret_map) { - assert(ret_map != nullptr); - assert(command_queue != nullptr); - assert(buffer != nullptr); - assert(buffer->mem_type_ == _pi_mem::mem_type::buffer); - - pi_result ret_err = PI_ERROR_INVALID_OPERATION; - const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - // Currently no support for overlapping regions - if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; - } - - // Allocate a pointer in the host to store the mapped information - // TODO(ur): Remove conversion when this is ported to UR. - ur_map_flags_t map_flags_ur; - pi2urMapFlags(map_flags, &map_flags_ur); - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags_ur); - *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = PI_SUCCESS; - } - - if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. - ret_err = pi2ur::piEnqueueMemBufferRead( - command_queue, buffer, blocking_map, offset, size, hostPtr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = pi2ur::piEnqueueEventsWait( - command_queue, num_events_in_wait_list, event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - return ret_err; -} - -/// Implements the unmap from the host, using a BufferWrite operation. -/// Requires the mapped pointer to be already registered in the given memobj. -/// If memobj uses pinned host memory, this will not do a write. -/// -pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, - void *mapped_ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - pi_result ret_err = PI_SUCCESS; - - assert(command_queue != nullptr); - assert(mapped_ptr != nullptr); - assert(memobj != nullptr); - assert(memobj->mem_type_ == _pi_mem::mem_type::buffer); - assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr); - assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr); - - const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - if (!is_pinned && - ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) || - (memobj->mem_.buffer_mem_.get_map_flags() & - PI_MAP_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. 
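For readers following the port, the map path being deleted here (and re-created for UR later in this series) only needs a device-to-host read when the buffer is not pinned host memory and the map asks for read or write access. A minimal predicate capturing that rule, written against the PI flag names used above (illustrative only, not part of the patch):

#include <sycl/detail/pi.h> // for pi_map_flags, PI_MAP_READ, PI_MAP_WRITE

// Pinned (alloc_host_ptr) buffers are already host visible, so mapping them
// never triggers a BufferRead; everything else does when R or W is requested.
static bool mapNeedsInitialRead(bool isPinned, pi_map_flags mapFlags) {
  return !isPinned && ((mapFlags & PI_MAP_READ) || (mapFlags & PI_MAP_WRITE));
}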
- ret_err = pi2ur::piEnqueueMemBufferWrite( - command_queue, memobj, true, - memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), - memobj->mem_.buffer_mem_.get_size(), mapped_ptr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = pi2ur::piEnqueueEventsWait( - command_queue, num_events_in_wait_list, event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - memobj->mem_.buffer_mem_.unmap(mapped_ptr); - return ret_err; -} - pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( pi_queue queue, pi_program program, const char *name, pi_bool blocking_write, size_t count, size_t offset, const void *src, @@ -1431,22 +540,23 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) - _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) + _PI_CL(piEnqueueNativeKernel, pi2ur::piEnqueueNativeKernel) _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) - _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) + _PI_CL(piEnqueueMemBufferReadRect, pi2ur::piEnqueueMemBufferReadRect) _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferWrite) - _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) - _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) - _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) - _PI_CL(piEnqueueMemBufferFill, cuda_piEnqueueMemBufferFill) - _PI_CL(piEnqueueMemImageRead, cuda_piEnqueueMemImageRead) - _PI_CL(piEnqueueMemImageWrite, cuda_piEnqueueMemImageWrite) - _PI_CL(piEnqueueMemImageCopy, cuda_piEnqueueMemImageCopy) - _PI_CL(piEnqueueMemImageFill, cuda_piEnqueueMemImageFill) - _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap) - _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap) + _PI_CL(piEnqueueMemBufferWriteRect, pi2ur::piEnqueueMemBufferWriteRect) + _PI_CL(piEnqueueMemBufferCopy, pi2ur::piEnqueueMemBufferCopy) + _PI_CL(piEnqueueMemBufferCopyRect, pi2ur::piEnqueueMemBufferCopyRect) + _PI_CL(piEnqueueMemBufferFill, pi2ur::piEnqueueMemBufferFill) + _PI_CL(piEnqueueMemImageRead, pi2ur::piEnqueueMemImageRead) + _PI_CL(piEnqueueMemImageWrite, pi2ur::piEnqueueMemImageWrite) + _PI_CL(piEnqueueMemImageCopy, pi2ur::piEnqueueMemImageCopy) + _PI_CL(piEnqueueMemImageFill, pi2ur::piEnqueueMemImageFill) + _PI_CL(piEnqueueMemBufferMap, pi2ur::piEnqueueMemBufferMap) + _PI_CL(piEnqueueMemUnmap, pi2ur::piEnqueueMemUnmap) + // USM _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 674bea82ddef9..fd2106dd6c141 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -19,9 +19,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list) { - if (!event_wait_list) { - return 
UR_RESULT_SUCCESS; - } + UR_ASSERT(event_wait_list, UR_RESULT_SUCCESS); + try { ScopedContext active(command_queue->get_context()); @@ -34,11 +33,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); } }); - - if (result != UR_RESULT_SUCCESS) { - return result; - } - return UR_RESULT_SUCCESS; + return result; } catch (ur_result_t err) { return err; } catch (...) { @@ -225,9 +220,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // This function makes one stream work on the previous work (or work // represented by input events) and then all future work waits on that stream. - if (!hQueue) { - return UR_RESULT_ERROR_INVALID_QUEUE; - } + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); ur_result_t result; @@ -474,6 +467,759 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return retError; } +/// General 3D memory copy operation. +/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *src_ptr, + const CUmemorytype_enum src_type, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, + const CUmemorytype_enum dst_type, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + UR_ASSERT(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + src_row_pitch = + (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) + ? ((region.height + src_offset.y) * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = + (!dst_row_pitch) ? region.width + dst_offset.x : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) + ? ((region.height + dst_offset.y) * dst_row_pitch) + : dst_slice_pitch; + + CUDA_MEMCPY3D params = {}; + + params.WidthInBytes = region.width; + params.Height = region.height; + params.Depth = region.depth; + + params.srcMemoryType = src_type; + params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(src_ptr) + : 0; + params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + params.srcXInBytes = src_offset.x; + params.srcY = src_offset.y; + params.srcZ = src_offset.z; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = dst_type; + params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE + ? *static_cast(dst_ptr) + : 0; + params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? 
dst_ptr : nullptr;
+  params.dstXInBytes = dst_offset.x;
+  params.dstY = dst_offset.y;
+  params.dstZ = dst_offset.z;
+  params.dstPitch = dst_row_pitch;
+  params.dstHeight = dst_slice_pitch / dst_row_pitch;
+
+  return UR_CHECK_ERROR(cuMemcpy3DAsync(&params, cu_stream));
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead,
+    ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pDst,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+  CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get();
+  std::unique_ptr<ur_event_handle_t_> retImplEv{nullptr};
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    if (phEvent) {
+      retImplEv =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::make_native(
+              UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, cuStream));
+      retImplEv->start();
+    }
+
+    retErr = commonEnqueueMemBufferCopyRect(
+        cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+        bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin,
+        hostRowPitch, hostSlicePitch);
+
+    if (phEvent) {
+      retErr = retImplEv->record();
+    }
+
+    if (blockingRead) {
+      retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+
+    if (phEvent) {
+      *phEvent = retImplEv.release();
+    }
+
+  } catch (ur_result_t err) {
+    retErr = err;
+  }
+  return retErr;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite,
+    ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin,
+    ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+    size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+  CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get();
+  std::unique_ptr<ur_event_handle_t_> retImplEv{nullptr};
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    if (phEvent) {
+      retImplEv =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::make_native(
+              UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream));
+      retImplEv->start();
+    }
+
+    retErr = commonEnqueueMemBufferCopyRect(
+        cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch,
+        hostSlicePitch, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+        bufferRowPitch, bufferSlicePitch);
+
+    if (phEvent) {
+      retErr = retImplEv->record();
+    }
+
+    if (blockingWrite) {
+      retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+
+    if (phEvent) {
+      *phEvent = retImplEv.release();
+    }
+
+  } catch (ur_result_t err) {
+    retErr = err;
+  }
+  return retErr;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc,
+    ur_mem_handle_t hBufferDst, size_t srcOffset, size_t
dstOffset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + ur_result_t result; + + auto stream = hQueue->get_next_transfer_stream(); + result = + enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); + result = retImplEv->start(); + } + + auto src = hBufferSrc->mem_.buffer_mem_.get() + srcOffset; + auto dst = hBufferDst->mem_.buffer_mem_.get() + dstOffset; + + result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t retErr = UR_RESULT_SUCCESS; + CUdeviceptr srcPtr = hBufferSrc->mem_.buffer_mem_.get(); + CUdeviceptr dstPtr = hBufferDst->mem_.buffer_mem_.get(); + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); + retImplEv->start(); + } + + retErr = commonEnqueueMemBufferCopyRect( + cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &dstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + dstSlicePitch); + + if (phEvent) { + retImplEv->record(); + *phEvent = retImplEv.release(); + } + + } catch (ur_result_t err) { + retErr = err; + } + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + auto args_are_multiples_of_pattern_size = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto pattern_is_valid = (pPattern != nullptr); + + auto pattern_size_is_valid = + ((patternSize & (patternSize - 1)) == 0) && // is power of two + (patternSize > 0) && (patternSize <= 128); // falls within valid range + + UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && + pattern_size_is_valid, + UR_RESULT_ERROR_INVALID_SIZE); + + std::unique_ptr retImplEv{nullptr}; + + try { + ScopedContext active(hQueue->get_context()); + + auto stream = hQueue->get_next_transfer_stream(); + ur_result_t result; + result = 
+ enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + retImplEv = + std::unique_ptr(ur_event_handle_t_::make_native( + UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); + result = retImplEv->start(); + } + + auto dstDevice = hBuffer->mem_.buffer_mem_.get() + offset; + auto N = size / patternSize; + + // pattern size in bytes + switch (patternSize) { + case 1: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); + break; + } + case 2: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); + break; + } + case 4: { + auto value = *static_cast(pPattern); + result = UR_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); + break; + } + default: { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. PI API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one cuMemsetD2D32Async call is made for every 4 bytes + // in the pattern. + + auto number_of_steps = patternSize / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call cuMemset for each + // 4-byte chunk of the pattern. + for (auto step = 0u; step < number_of_steps; ++step) { + // take 4 bytes of the pattern + auto value = *(static_cast(pPattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); + + // set all of the pattern chunks + result = UR_CHECK_ERROR( + cuMemsetD2D32Async(offset_ptr, patternSize, value, 1, N, stream)); + } + + break; + } + } + + if (phEvent) { + result = retImplEv->record(); + *phEvent = retImplEv.release(); + } + + return result; + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { + switch (array_desc.Format) { + case CU_AD_FORMAT_UNSIGNED_INT8: + case CU_AD_FORMAT_SIGNED_INT8: + return 1; + case CU_AD_FORMAT_UNSIGNED_INT16: + case CU_AD_FORMAT_SIGNED_INT16: + case CU_AD_FORMAT_HALF: + return 2; + case CU_AD_FORMAT_UNSIGNED_INT32: + case CU_AD_FORMAT_SIGNED_INT32: + case CU_AD_FORMAT_FLOAT: + return 4; + default: + sycl::detail::ur::die("Invalid image format."); + return 0; + } +} + +/// General ND memory copy operation for images (where N > 1). 
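The default case of the pattern switch in urEnqueueMemBufferFill above is easiest to follow with a host-side model: 32-bit word k of the pattern is written at byte offset k*4 of every patternSize-sized repetition, which is exactly what each cuMemsetD2D32Async call with pitch = patternSize, width = 1 and height = N does on the device. A sketch of the same layout on the host (illustrative; the names are mine, not from the patch):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Reproduces on the host the byte layout produced by the strided device fill.
void fillWithLargePattern(std::vector<unsigned char> &buf, const void *pattern,
                          size_t patternSize) {
  const size_t numWords = patternSize / sizeof(uint32_t);
  const size_t repeats = buf.size() / patternSize;
  for (size_t step = 0; step < numWords; ++step) {
    uint32_t word;
    std::memcpy(&word, static_cast<const unsigned char *>(pattern) + step * 4,
                sizeof(word));
    // One cuMemsetD2D32Async call covers this inner loop on the device.
    for (size_t rep = 0; rep < repeats; ++rep)
      std::memcpy(buf.data() + rep * patternSize + step * 4, &word,
                  sizeof(word));
  }
}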
+/// This function requires the corresponding CUDA context to be at the top of
+/// the context stack
+/// If the source and/or destination is an array, src_ptr and/or dst_ptr
+/// must be a pointer to a CUarray
+static ur_result_t commonEnqueueMemImageNDCopy(
+    CUstream cu_stream, ur_mem_type_t img_type, const ur_rect_region_t region,
+    const void *src_ptr, const CUmemorytype_enum src_type,
+    const ur_rect_offset_t src_offset, void *dst_ptr,
+    const CUmemorytype_enum dst_type, const ur_rect_offset_t dst_offset) {
+  UR_ASSERT(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+  if (img_type == UR_MEM_TYPE_IMAGE2D) {
+    CUDA_MEMCPY2D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset.x;
+      cpyDesc.srcY = src_offset.y;
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset.x;
+      cpyDesc.dstY = dst_offset.y;
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region.width;
+    cpyDesc.Height = region.height;
+    return UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
+  }
+  if (img_type == UR_MEM_TYPE_IMAGE3D) {
+    CUDA_MEMCPY3D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset.x;
+      cpyDesc.srcY = src_offset.y;
+      cpyDesc.srcZ = src_offset.z;
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset.x;
+      cpyDesc.dstY = dst_offset.y;
+      cpyDesc.dstZ = dst_offset.z;
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region.width;
+    cpyDesc.Height = region.height;
+    cpyDesc.Depth = region.depth;
+    return UR_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
+  }
+  return UR_RESULT_ERROR_INVALID_VALUE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
+    ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead,
+    ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch,
+    size_t slicePitch, void *pDst, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface,
+            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+
+  ur_result_t retErr = UR_RESULT_SUCCESS;
+
+  try {
+    ScopedContext active(hQueue->get_context());
+    CUstream cuStream = hQueue->get_next_transfer_stream();
+    retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+                               phEventWaitList);
+
+    CUarray array = hImage->mem_.surface_mem_.get_array();
+
+    CUDA_ARRAY_DESCRIPTOR arrayDesc;
+    retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
+
+    int elementByteSize = imageElementByteSize(arrayDesc);
+
+    size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels;
+    size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width;
+
+    ur_mem_type_t imgType
= hImage->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, array, byteOffsetX, bytesToCopy, cuStream)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t srcOffset = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, + srcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_READ, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + + if (blockingRead) { + retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + CUarray array = hImage->mem_.surface_mem_.get_array(); + + CUDA_ARRAY_DESCRIPTOR arrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + + int elementByteSize = imageElementByteSize(arrayDesc); + + size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; + size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + + ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR( + cuMemcpyHtoAAsync(array, byteOffsetX, pSrc, bytesToCopy, cuStream)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t dstOffset = {byteOffsetX, origin.y, origin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &array, CU_MEMORYTYPE_ARRAY, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_WRITE, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == + hImageDst->mem_.surface_mem_.get_image_type(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t retErr = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->get_context()); + CUstream cuStream = hQueue->get_next_transfer_stream(); + retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + CUarray srcArray = hImageSrc->mem_.surface_mem_.get_array(); + CUarray dstArray = hImageDst->mem_.surface_mem_.get_array(); + + CUDA_ARRAY_DESCRIPTOR srcArrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); + CUDA_ARRAY_DESCRIPTOR dstArrayDesc; + retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); + + UR_ASSERT(srcArrayDesc.Format == dstArrayDesc.Format, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + int elementByteSize = imageElementByteSize(srcArrayDesc); + + size_t dstByteOffsetX = + dstOrigin.x * elementByteSize * srcArrayDesc.NumChannels; + size_t srcByteOffsetX = + srcOrigin.x * elementByteSize * dstArrayDesc.NumChannels; + size_t bytesToCopy = + elementByteSize * srcArrayDesc.NumChannels * region.width; + + ur_mem_type_t imgType = hImageSrc->mem_.surface_mem_.get_image_type(); + if (imgType == UR_MEM_TYPE_IMAGE1D) { + retErr = UR_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, + srcByteOffsetX, bytesToCopy)); + } else { + ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + region.depth}; + ur_rect_offset_t srcOffset = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t dstOffset = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + retErr = commonEnqueueMemImageNDCopy( + cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, + srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + + if (retErr != UR_RESULT_SUCCESS) { + return retErr; + } + } + + if (phEvent) { + auto new_event = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_IMAGE_COPY, hQueue, cuStream); + new_event->record(); + *phEvent = new_event; + } + } catch (ur_result_t err) { + return err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return retErr; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the pi_mem object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
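One detail shared by the image read, write and copy entry points above: origin.x and region.width arrive in texels and are converted to bytes by multiplying with the element size and channel count from the CUDA_ARRAY_DESCRIPTOR. A hypothetical helper making that conversion explicit (not part of the patch):

#include <cstddef>

struct ImageRowSpan {
  size_t byteOffsetX; // byte offset of the first requested texel in a row
  size_t bytesToCopy; // contiguous bytes covered by region.width texels
};

// elementByteSize comes from imageElementByteSize(); numChannels comes from
// the array descriptor's NumChannels field.
static ImageRowSpan toByteSpan(size_t originX, size_t widthInTexels,
                               size_t elementByteSize, size_t numChannels) {
  const size_t texelBytes = elementByteSize * numChannels;
  return {originX * texelBytes, widthInTexels * texelBytes};
}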
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { + UR_ASSERT(ppRetMap != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hQueue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t ret_err = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool is_pinned = + hBuffer->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + // Currently no support for overlapping regions + if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { + return ret_err; + } + + // Allocate a pointer in the host to store the mapped information + auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); + *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); + if (hostPtr) { + ret_err = UR_RESULT_SUCCESS; + } + + if (!is_pinned && + ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + hostPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = + ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, + hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + return ret_err; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given memobj. +/// If memobj uses pinned host memory, this will not do a write. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t ret_err = UR_RESULT_SUCCESS; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + const bool is_pinned = + hMem->mem_.buffer_mem_.allocMode_ == + ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + + if (!is_pinned && + (hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE)) { + // Pinned host memory is only on host so it doesn't need to be written to. 
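From the caller's point of view, the map/unmap pair implemented here behaves like a staged host copy: map stages the bytes on the host (via BufferRead unless the allocation is pinned), and unmap writes them back when the mapping requested write access. A hedged usage sketch against the entry points declared in this patch (error handling omitted, handle names are placeholders):

#include <cstring>

void zeroFirstBytes(ur_queue_handle_t queue, ur_mem_handle_t buffer,
                    size_t numBytes) {
  void *hostView = nullptr;
  urEnqueueMemBufferMap(queue, buffer, /*blockingMap=*/true, UR_MAP_FLAG_WRITE,
                        /*offset=*/0, numBytes, 0, nullptr, nullptr, &hostView);
  std::memset(hostView, 0, numBytes); // mutate the staged host copy
  // For non-pinned allocations the unmap performs the BufferWrite back to the
  // device and then releases the mapping.
  urEnqueueMemUnmap(queue, buffer, hostView, 0, nullptr, nullptr);
}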
+ ret_err = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), + hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext active(hQueue->get_context()); + + if (is_pinned) { + ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::make_native( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t error) { + ret_err = error; + } + } + } + + hMem->mem_.buffer_mem_.unmap(pMappedPtr); + return ret_err; +} + /// TODO(ur): Add support for the offset. UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index d7751a02e9707..c95eed5c24e05 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -169,18 +169,18 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; - pDdiTable->pfnMemBufferCopy = nullptr; - pDdiTable->pfnMemBufferCopyRect = nullptr; - pDdiTable->pfnMemBufferFill = nullptr; - pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; - pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; - pDdiTable->pfnMemBufferWriteRect = nullptr; - pDdiTable->pfnMemImageCopy = nullptr; - pDdiTable->pfnMemImageRead = nullptr; - pDdiTable->pfnMemImageWrite = nullptr; - pDdiTable->pfnMemUnmap = nullptr; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; pDdiTable->pfnUSMFill = urEnqueueUSMFill; pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; From 4f9277bf6ca7afdc0e1dc906a5aa34cac531bffb Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 2 May 2023 10:48:34 +0100 Subject: [PATCH 22/45] Don't check MAX_MEM_ALLOC_SIZE when creating a buffer Rely on cuMemAlloc failing if the given size is too big instead --- sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index 59975b0a7b821..abca91b594e19 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -31,10 +31,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Need input memory object UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size != 0, 
UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - uint64_t maxAlloc = 0; - urDeviceGetInfo(hContext->get_device(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, - sizeof(maxAlloc), &maxAlloc, nullptr); - UR_ASSERT(size <= maxAlloc, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Currently, USE_HOST_PTR is not implemented using host register // since this triggers a weird segfault after program ends. From 8968c1f0aaf38a0663c1c7333fa566df0fc4f2b2 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 28 Apr 2023 10:00:15 +0100 Subject: [PATCH 23/45] [SYCL][CUDA] Port CUDA global variable read/write to UR --- sycl/plugins/cuda/pi_cuda.cpp | 80 +------------------ .../ur/adapters/cuda/enqueue.cpp | 73 +++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 4 +- 3 files changed, 77 insertions(+), 80 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 09c2fddc6e207..992b69f4078d0 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -306,82 +306,6 @@ pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, return retError; } -pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( - pi_queue queue, pi_program program, const char *name, - pi_bool blocking_write, size_t count, size_t offset, const void *src, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || src == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. - auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return pi2ur::piextUSMEnqueueMemcpy( - queue, blocking_write, reinterpret_cast(device_global + offset), - src, count, num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextEnqueueDeviceGlobalVariableRead( - pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, - size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || dst == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. 
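Returning briefly to the urMemBufferCreate hunk above (patch 22): with the explicit UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE check removed, an oversized request is expected to fail inside cuMemAlloc instead. A minimal sketch of that assumption; the error mapping shown is mine, not taken from the patch:

#include <cuda.h>

// cuMemAlloc reports CUDA_ERROR_OUT_OF_MEMORY (among other errors) when the
// requested size cannot be satisfied, which now replaces the up-front check.
static ur_result_t allocDeviceBuffer(size_t bytes, CUdeviceptr *outPtr) {
  CUresult rc = cuMemAlloc(outPtr, bytes);
  return rc == CUDA_SUCCESS ? UR_RESULT_SUCCESS
                            : UR_RESULT_ERROR_OUT_OF_RESOURCES; // hypothetical
}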
- auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return pi2ur::piextUSMEnqueueMemcpy( - queue, blocking_read, dst, - reinterpret_cast(device_global + offset), count, - num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - /// Host Pipes pi_result cuda_piextEnqueueReadHostPipe( pi_queue queue, pi_program program, const char *pipe_symbol, @@ -572,9 +496,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, - cuda_piextEnqueueDeviceGlobalVariableWrite) + pi2ur::piextEnqueueDeviceGlobalVariableWrite) _PI_CL(piextEnqueueDeviceGlobalVariableRead, - cuda_piextEnqueueDeviceGlobalVariableRead) + pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index fd2106dd6c141..99a8285d2cd44 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1620,3 +1620,76 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( } return retErr; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pSrc, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. 
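The globalIDMD_ lookup on the following lines is a plain string-to-string map built from the program's metadata: the key is the unique ID the SYCL runtime passes in as name, and the value is the symbol that actually exists in the CUDA module and can be handed to cuModuleGetGlobal. Conceptually it looks like this (the entries below are invented for illustration):

#include <string>
#include <unordered_map>

static const std::unordered_map<std::string, std::string> exampleGlobalIDMD = {
    {"dev_global_counter", "_ZL18dev_global_counter"}, // hypothetical entry
};

static const char *lookupDeviceGlobalSymbol(const std::string &uniqueId) {
  auto it = exampleGlobalIDMD.find(uniqueId);
  return it == exampleGlobalIDMD.end() ? nullptr : it->second.c_str();
}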
+ auto device_global_name_it = hProgram->globalIDMD_.find(name); + if (device_global_name_it == hProgram->globalIDMD_.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string device_global_name = device_global_name_it->second; + + ur_result_t result = UR_RESULT_SUCCESS; + try { + CUdeviceptr device_global = 0; + size_t device_global_size = 0; + result = UR_CHECK_ERROR( + cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), + device_global_name.c_str())); + + if (offset + count > device_global_size) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingWrite, reinterpret_cast(device_global + offset), + pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t error) { + result = error; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pDst, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. + auto device_global_name_it = hProgram->globalIDMD_.find(name); + if (device_global_name_it == hProgram->globalIDMD_.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string device_global_name = device_global_name_it->second; + + ur_result_t result = UR_RESULT_SUCCESS; + try { + CUdeviceptr device_global = 0; + size_t device_global_size = 0; + result = UR_CHECK_ERROR( + cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), + device_global_name.c_str())); + + if (offset + count > device_global_size) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingRead, pDst, + reinterpret_cast(device_global + offset), count, + numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t error) { + result = error; + } + return result; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index c95eed5c24e05..b87e2f822d391 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -164,8 +164,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnDeviceGlobalVariableRead = nullptr; - pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; pDdiTable->pfnEventsWait = urEnqueueEventsWait; pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; From 1fb2afdb6a93d5944bee0ef0c84faefd75b4273c Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 3 May 2023 12:22:27 +0100 Subject: [PATCH 24/45] [SYCL][CUDA] Only build CUDA UR adapter when CUDA plugin is enabled --- sycl/plugins/unified_runtime/CMakeLists.txt | 78 +++++++++++---------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt 
b/sycl/plugins/unified_runtime/CMakeLists.txt index 2288a8e9949e1..eaec6367392b5 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -124,44 +124,46 @@ set_target_properties("ur_adapter_level_zero" PROPERTIES SOVERSION "0" ) -# Build CUDA adapter -add_sycl_library("ur_adapter_cuda" SHARED - SOURCES - "ur/ur.hpp" - "ur/ur.cpp" - "ur/usm_allocator.cpp" - "ur/usm_allocator.hpp" - "ur/adapters/cuda/common.cpp" - "ur/adapters/cuda/common.hpp" - "ur/adapters/cuda/context.cpp" - "ur/adapters/cuda/context.hpp" - "ur/adapters/cuda/device.cpp" - "ur/adapters/cuda/device.hpp" - "ur/adapters/cuda/enqueue.cpp" - "ur/adapters/cuda/event.cpp" - "ur/adapters/cuda/event.hpp" - "ur/adapters/cuda/platform.cpp" - "ur/adapters/cuda/platform.hpp" - "ur/adapters/cuda/program.cpp" - "ur/adapters/cuda/program.hpp" - "ur/adapters/cuda/kernel.cpp" - "ur/adapters/cuda/kernel.hpp" - "ur/adapters/cuda/queue.cpp" - "ur/adapters/cuda/queue.hpp" - "ur/adapters/cuda/sampler.cpp" - "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/memory.cpp" - "ur/adapters/cuda/memory.hpp" - "ur/adapters/cuda/usm.cpp" - "ur/adapters/cuda/ur_interface_loader.cpp" - "ur/adapters/cuda/tracing.cpp" - INCLUDE_DIRS - ${sycl_inc_dir} - LIBRARIES - UnifiedRuntime-Headers - Threads::Threads - cudadrv -) +if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) + # Build CUDA adapter + add_sycl_library("ur_adapter_cuda" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/cuda/common.cpp" + "ur/adapters/cuda/common.hpp" + "ur/adapters/cuda/context.cpp" + "ur/adapters/cuda/context.hpp" + "ur/adapters/cuda/device.cpp" + "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/enqueue.cpp" + "ur/adapters/cuda/event.cpp" + "ur/adapters/cuda/event.hpp" + "ur/adapters/cuda/platform.cpp" + "ur/adapters/cuda/platform.hpp" + "ur/adapters/cuda/program.cpp" + "ur/adapters/cuda/program.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/queue.cpp" + "ur/adapters/cuda/queue.hpp" + "ur/adapters/cuda/sampler.cpp" + "ur/adapters/cuda/sampler.hpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" + "ur/adapters/cuda/usm.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/tracing.cpp" + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads + cudadrv + ) +endif() if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) From 96c85ac0857de96ec3723badd5fd173b099ccdba Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 3 May 2023 14:14:27 +0100 Subject: [PATCH 25/45] [SYCL][CUDA] Don't link non-CUDA adapters with cudadrv --- sycl/plugins/unified_runtime/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index eaec6367392b5..9ceb01b670b98 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -116,7 +116,6 @@ add_sycl_library("ur_adapter_level_zero" SHARED LevelZeroLoader-Headers LevelZeroLoader Threads::Threads - cudadrv ) set_target_properties("ur_adapter_level_zero" PROPERTIES From 3812978c9af72f8ddfdc37249cab596bb344f4b5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 4 May 2023 13:56:37 +0100 Subject: [PATCH 26/45] [SYCL][CUDA] Port piextGetDeviceFunctionPointer and piextDeviceSelectBinary to UR --- sycl/plugins/cuda/pi_cuda.cpp | 56 
+------------------ .../ur/adapters/cuda/device.cpp | 25 +++++++++ .../ur/adapters/cuda/program.cpp | 25 +++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 4 +- 4 files changed, 54 insertions(+), 56 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 992b69f4078d0..6f3002e808cc7 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -253,58 +253,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -/// \return If available, the first binary that is PTX -/// -pi_result cuda_piextDeviceSelectBinary(pi_device device, - pi_device_binary *binaries, - pi_uint32 num_binaries, - pi_uint32 *selected_binary) { - // Ignore unused parameter - (void)device; - - if (!binaries) { - sycl::detail::pi::die("No list of device images provided"); - } - if (num_binaries < 1) { - sycl::detail::pi::die("No binary images in the list"); - } - - // Look for an image for the NVPTX64 target, and return the first one that is - // found - for (pi_uint32 i = 0; i < num_binaries; i++) { - if (strcmp(binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) { - *selected_binary = i; - return PI_SUCCESS; - } - } - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; -} - -pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, - pi_program program, - const char *func_name, - pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); - assert(func_pointer_ret != nullptr); - - CUfunction func; - CUresult ret = cuModuleGetFunction(&func, program->get(), func_name); - *func_pointer_ret = reinterpret_cast(func); - pi_result retError = PI_SUCCESS; - - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) - retError = PI_CHECK_ERROR(ret); - if (ret == CUDA_ERROR_NOT_FOUND) { - *func_pointer_ret = 0; - retError = PI_ERROR_INVALID_KERNEL_NAME; - } - - return retError; -} /// Host Pipes pi_result cuda_piextEnqueueReadHostPipe( @@ -375,8 +323,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piDevicePartition, pi2ur::piDevicePartition) _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) - _PI_CL(piextDeviceSelectBinary, cuda_piextDeviceSelectBinary) - _PI_CL(piextGetDeviceFunctionPointer, cuda_piextGetDeviceFunctionPointer) + _PI_CL(piextDeviceSelectBinary, pi2ur::piextDeviceSelectBinary) + _PI_CL(piextGetDeviceFunctionPointer, pi2ur::piextGetDeviceFunctionPointer) _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, pi2ur::piextDeviceCreateWithNativeHandle) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 567377be8796f..6d87373524341 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1145,3 +1145,28 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + +/// \return If available, the first binary that is PTX +/// +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); + 
UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); + + // Look for an image for the NVPTX64 target, and return the first one that is + // found + for (uint32_t i = 0; i < NumBinaries; i++) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, + UR_DEVICE_BINARY_TARGET_NVPTX64) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 129f4eb06b81e..82f6db76fda68 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -442,3 +442,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t hDevice, ur_program_handle_t hProgram, + const char *pFunctionName, void **ppFunctionPointer) { + // Check if device passed is the same the device bound to the context + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + CUfunction func; + CUresult ret = cuModuleGetFunction(&func, hProgram->get(), pFunctionName); + *ppFunctionPointer = func; + ur_result_t retError = UR_RESULT_SUCCESS; + + if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) + retError = UR_CHECK_ERROR(ret); + if (ret == CUDA_ERROR_NOT_FOUND) { + *ppFunctionPointer = 0; + retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + } + + return retError; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index b87e2f822d391..f7fb58c256d47 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -89,7 +89,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; - pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; pDdiTable->pfnGetInfo = urProgramGetInfo; pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; pDdiTable->pfnLink = urProgramLink; @@ -250,7 +250,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( pDdiTable->pfnPartition = urDevicePartition; pDdiTable->pfnRelease = urDeviceRelease; pDdiTable->pfnRetain = urDeviceRetain; - pDdiTable->pfnSelectBinary = nullptr; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; return UR_RESULT_SUCCESS; } From 764e683bcfb54c40ec28db0f2632e1817a8cbdc4 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 5 May 2023 15:30:55 +0100 Subject: [PATCH 27/45] [SYCL][CUDA] Port piPluginGetBackendOption to UR --- sycl/plugins/cuda/pi_cuda.cpp | 21 +------------------ .../ur/adapters/cuda/platform.cpp | 20 ++++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 1 + 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp 
b/sycl/plugins/cuda/pi_cuda.cpp index 6f3002e808cc7..70a7b319cb353 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -54,25 +54,6 @@ pi_result map_error(CUresult result) { } } -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return empty string for cuda. -// TODO: Determine correct string to be passed. -pi_result cuda_piPluginGetBackendOption(pi_platform, - const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) - return PI_ERROR_INVALID_VALUE; - if (frontend_option == "-O0"sv || frontend_option == "-O1"sv || - frontend_option == "-O2"sv || frontend_option == "-O3"sv || - frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - // Iterates over the event wait list, returns correct pi_result error codes. // Invokes the callback for the latest event of each queue in the wait list. // The callback must take a single pi_event argument and return a pi_result. @@ -457,7 +438,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) _PI_CL(piTearDown, pi2ur::piTearDown) _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) - _PI_CL(piPluginGetBackendOption, cuda_piPluginGetBackendOption) + _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) #undef _PI_CL diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index 2ca8c516c08e3..dbbb177926c32 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -181,3 +181,23 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { disableCUDATracing(); return UR_RESULT_SUCCESS; } + +// Returns plugin specific backend option. +// Current support is only for optimization options. +// Return empty string for cuda. +// TODO: Determine correct string to be passed. 
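// Illustrative sketch, separate from the patch itself: the contract of the
// urPlatformGetBackendOption entry point added below, assuming `Platform` is
// a valid ur_platform_handle_t. Each recognised frontend flag ("-O0" through
// "-O3", or the empty string) maps to an empty backend option for CUDA, and
// anything else is rejected:
//
//   const char *BackendOpt = nullptr;
//   urPlatformGetBackendOption(Platform, "-O2", &BackendOpt); // BackendOpt == ""
//   urPlatformGetBackendOption(Platform, "-funsafe", &BackendOpt);
//   // -> UR_RESULT_ERROR_INVALID_VALUE; a null pFrontendOption yields
//   //    UR_RESULT_ERROR_INVALID_NULL_POINTER.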
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t hPlatform, const char *pFrontendOption, + const char **ppPlatformOption) { + (void)hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index f7fb58c256d47..49189598be91d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -41,6 +41,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; pDdiTable->pfnGetInfo = urPlatformGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; return UR_RESULT_SUCCESS; } From d98adf801cbb0ceb14474071074bed3e9feb118d Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 5 May 2023 16:36:21 +0100 Subject: [PATCH 28/45] [SYCL][CUDA] Port read/write host pipe to UR --- sycl/plugins/cuda/pi_cuda.cpp | 41 +------------------ .../ur/adapters/cuda/enqueue.cpp | 37 +++++++++++++++++ .../ur/adapters/cuda/ur_interface_loader.cpp | 2 + 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 70a7b319cb353..1d28c08f64098 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -235,43 +235,6 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { //-- PI API implementation extern "C" { -/// Host Pipes -pi_result cuda_piextEnqueueReadHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueReadHostPipe not implemented"); - return {}; -} - -pi_result cuda_piextEnqueueWriteHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueWriteHostPipe not implemented"); - return {}; -} - const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -430,8 +393,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe - _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) - _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) + _PI_CL(piextEnqueueReadHostPipe, pi2ur::piextEnqueueReadHostPipe) + _PI_CL(piextEnqueueWriteHostPipe, pi2ur::piextEnqueueWriteHostPipe) _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) 
_PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 99a8285d2cd44..073a9ffce26a6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1693,3 +1693,40 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( } return result; } + +/// Host Pipes +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pDst; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pSrc; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 49189598be91d..ebb4bc771ccd2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -188,6 +188,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe; + pDdiTable->pfnWriteHostPipe = urEnqueueWriteHostPipe; return UR_RESULT_SUCCESS; } From 797d3f750c192b8c6051b9bc8710fc2914ea374d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Mon, 15 May 2023 13:11:05 +0100 Subject: [PATCH 29/45] [CUDA][UR]Fix program_info_kernel_names --- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 82f6db76fda68..91e0b5c85d1b1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -332,7 +332,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, return ReturnValue(&hProgram->binarySizeInBytes_, 1); case UR_PROGRAM_INFO_BINARIES: return ReturnValue(&hProgram->binary_, 1); - case UR_PROGRAM_INFO_NUM_KERNELS: + case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); default: break; From 6f68c7c7a085e1b63effaf919e0bc9995a58cad7 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 9 May 2023 16:38:53 +0100 Subject: [PATCH 30/45] [SYCL][CUDA] Remove unused code from CUDA PI and move remaining documentation to UR --- sycl/plugins/cuda/pi_cuda.cpp | 213 ------------------ sycl/plugins/cuda/pi_cuda.hpp | 175 -------------- 
.../ur/adapters/cuda/context.cpp | 4 - .../ur/adapters/cuda/context.hpp | 46 +++- .../ur/adapters/cuda/device.cpp | 3 - .../ur/adapters/cuda/enqueue.cpp | 1 - .../ur/adapters/cuda/event.hpp | 1 - .../ur/adapters/cuda/kernel.hpp | 16 ++ 8 files changed, 54 insertions(+), 405 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 1d28c08f64098..bc8cbaa1e7e2c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -16,221 +16,8 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - // Forward declarations void enableCUDATracing(); -void disableCUDATracing(); - -namespace { -pi_result map_error(CUresult result) { - switch (result) { - case CUDA_SUCCESS: - return PI_SUCCESS; - case CUDA_ERROR_NOT_PERMITTED: - return PI_ERROR_INVALID_OPERATION; - case CUDA_ERROR_INVALID_CONTEXT: - return PI_ERROR_INVALID_CONTEXT; - case CUDA_ERROR_INVALID_DEVICE: - return PI_ERROR_INVALID_DEVICE; - case CUDA_ERROR_INVALID_VALUE: - return PI_ERROR_INVALID_VALUE; - case CUDA_ERROR_OUT_OF_MEMORY: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -// Iterates over the event wait list, returns correct pi_result error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a pi_result. -template -pi_result forLatestEvents(const pi_event *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { - - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { - return PI_ERROR_INVALID_EVENT_WAIT_LIST; - } - - // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); - } - - std::vector events{event_wait_list, - event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { - // Tiered sort creating sublists of streams (smallest value first) in which - // the corresponding events are sorted into a sequence of newest first. - return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); - }); - - bool first = true; - CUstream lastSeenStream = 0; - for (pi_event event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { - continue; - } - - first = false; - lastSeenStream = event->get_stream(); - - auto result = f(event); - if (result != PI_SUCCESS) { - return result; - } - } - - return PI_SUCCESS; -} - -/// Converts CUDA error into PI error codes, and outputs error information -/// to stderr. -/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of -/// throwing the error. This is intended for debugging purposes. -/// \return PI_SUCCESS if \param result was CUDA_SUCCESS. -/// \throw pi_error exception (integer) if input was not success. 
-/// -pi_result check_error(CUresult result, const char *function, int line, - const char *file) { - if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { - return PI_SUCCESS; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - cuGetErrorName(result, &errorName); - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "\nPI CUDA ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" - << std::endl; - std::cerr << ss.str(); - } - - if (std::getenv("PI_CUDA_ABORT") != nullptr) { - std::abort(); - } - - throw map_error(result); -} - -/// \cond NODOXY -#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) - -/// \cond NODOXY -template -pi_result getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, size_t value_size, - Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return PI_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return PI_SUCCESS; -} - -template -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - // Ignore unused parameter - (void)value_size; - - *static_cast(param_value) = value; - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), assignment); -} - -template -pi_result getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - T *value) { - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), memcpy); -} -/// \endcond - -} // anonymous namespace - -/// ------ Error handling, matching OpenCL plugin semantics. -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -namespace pi { - -// Report error and no return (keeps compiler from printing warnings). -// TODO: Probably change that to throw a catchable exception, -// but for now it is useful to see every failure. -// -[[noreturn]] void die(const char *Message) { - std::cerr << "pi_die: " << Message << std::endl; - std::terminate(); -} - -// Reports error messages -void cuPrint(const char *Message) { - std::cerr << "pi_print: " << Message << std::endl; -} - -void assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); -} - -} // namespace pi -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -//-------------- -// PI object implementation - -/// \endcond - -// makes all future work submitted to queue wait for all work captured in event. -pi_result enqueueEventWait(pi_queue queue, pi_event event) { - // for native events, the cuStreamWaitEvent call is used. - // This makes all future work submitted to stream wait for all - // work captured in event. 
- queue->for_each_stream([e = event->get()](CUstream s) { - PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0)); - }); - return PI_SUCCESS; -} //-- PI API implementation extern "C" { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index c1c84fa2a4557..f1d15016bc0e5 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -25,23 +25,6 @@ #define _PI_CUDA_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION) -#include "sycl/detail/pi.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include #include #include @@ -55,200 +38,42 @@ // Share code between the PI Plugin and UR Adapter #include -extern "C" { - -/// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piMemRetain(pi_mem); -pi_result cuda_piMemRelease(pi_mem); -/// \endcond -} - using _pi_stream_guard = std::unique_lock; -/// A PI platform stores all known PI devices, -/// in the CUDA plugin this is just a vector of -/// available devices since initialization is done -/// when devices are used. -/// struct _pi_platform : ur_platform_handle_t_ { using ur_platform_handle_t_::ur_platform_handle_t_; }; -/// PI device mapping to a CUdevice. -/// Includes an observer pointer to the platform, -/// and implements the reference counting semantics since -/// CUDA objects are not refcounted. -/// struct _pi_device : ur_device_handle_t_ { using ur_device_handle_t_::ur_device_handle_t_; }; -/// PI context mapping to a CUDA context object. -/// -/// There is no direct mapping between a CUDA context and a PI context, -/// main differences described below: -/// -/// CUDA context vs PI context -/// -/// One of the main differences between the PI API and the CUDA driver API is -/// that the second modifies the state of the threads by assigning -/// `CUcontext` objects to threads. `CUcontext` objects store data associated -/// with a given device and control access to said device from the user side. -/// PI API context are objects that are passed to functions, and not bound -/// to threads. -/// The _pi_context object doesn't implement this behavior, only holds the -/// CUDA context data. The RAII object \ref ScopedContext implements the active -/// context behavior. -/// -/// Primary vs User-defined context -/// -/// CUDA has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// CUDA documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the CUDA Runtime API. -/// For PI applications to interop with CUDA Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `_pi_context` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the PI object interface is always the same. -/// -/// Destructor callback -/// -/// Required to implement CP023, SYCL Extended Context Destruction, -/// the PI Context can store a number of callback functions that will be -/// called upon destruction of the PI Context. -/// See proposal for details. -/// struct _pi_context : ur_context_handle_t_ { using ur_context_handle_t_::ur_context_handle_t_; }; -/// PI Mem mapping to CUDA memory allocations, both data and texture/surface. 
-/// \brief Represents non-SVM allocations on the CUDA backend. -/// Keeps tracks of all mapped regions used for Map/Unmap calls. -/// Only one region can be active at the same time per allocation. struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; -/// PI queue mapping on to CUstream objects. -/// struct _pi_queue : ur_queue_handle_t_ { using ur_queue_handle_t_::ur_queue_handle_t_; }; -typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, - void *userData); - struct _pi_event : ur_event_handle_t_ { using ur_event_handle_t_::ur_event_handle_t_; - - // Helpers for queue command implementations until they also get ported to UR - static pi_event - make_native(pi_command_type type, pi_queue queue, CUstream stream, - uint32_t stream_token = std::numeric_limits::max()) { - auto urQueue = reinterpret_cast(queue); - static std::unordered_map<_pi_command_type, ur_command_t> cmdMap = { - {PI_COMMAND_TYPE_NDRANGE_KERNEL, UR_COMMAND_KERNEL_LAUNCH}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ, UR_COMMAND_MEM_BUFFER_READ}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE, UR_COMMAND_MEM_BUFFER_WRITE}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY, UR_COMMAND_MEM_BUFFER_COPY}, - {PI_COMMAND_TYPE_MEM_BUFFER_MAP, UR_COMMAND_MEM_BUFFER_MAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, UR_COMMAND_MEM_UNMAP}, - {PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, UR_COMMAND_MEM_BUFFER_READ_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, - UR_COMMAND_MEM_BUFFER_WRITE_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, UR_COMMAND_MEM_BUFFER_COPY_RECT}, - {PI_COMMAND_TYPE_MEM_BUFFER_FILL, UR_COMMAND_MEM_BUFFER_FILL}, - {PI_COMMAND_TYPE_IMAGE_READ, UR_COMMAND_MEM_IMAGE_READ}, - {PI_COMMAND_TYPE_IMAGE_WRITE, UR_COMMAND_MEM_IMAGE_WRITE}, - {PI_COMMAND_TYPE_IMAGE_COPY, UR_COMMAND_MEM_IMAGE_COPY}, - {PI_COMMAND_TYPE_BARRIER, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_READ, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ}, - {PI_COMMAND_TYPE_DEVICE_GLOBAL_VARIABLE_WRITE, - UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE}, - }; - - // TODO(ur): There is no exact mapping for the following commands. Just - // default to KERNEL_LAUNCH for now. 
- // PI_COMMAND_TYPE_USER - // PI_COMMAND_TYPE_MEM_BUFFER_FILL, - // PI_COMMAND_TYPE_IMAGE_READ, - // PI_COMMAND_TYPE_IMAGE_WRITE, - // PI_COMMAND_TYPE_IMAGE_COPY, - // PI_COMMAND_TYPE_NATIVE_KERNEL, - // PI_COMMAND_TYPE_COPY_BUFFER_TO_IMAGE, - // PI_COMMAND_TYPE_COPY_IMAGE_TO_BUFFER, - // PI_COMMAND_TYPE_MAP_IMAGE, - // PI_COMMAND_TYPE_MARKER, - // PI_COMMAND_TYPE_ACQUIRE_GL_OBJECTS, - // PI_COMMAND_TYPE_RELEASE_GL_OBJECTS, - // PI_COMMAND_TYPE_BARRIER, - // PI_COMMAND_TYPE_MIGRATE_MEM_OBJECTS, - // PI_COMMAND_TYPE_FILL_IMAGE - // PI_COMMAND_TYPE_SVM_FREE - // PI_COMMAND_TYPE_SVM_MEMCPY - // PI_COMMAND_TYPE_SVM_MEMFILL - // PI_COMMAND_TYPE_SVM_MAP - // PI_COMMAND_TYPE_SVM_UNMAP - - ur_command_t urCmd = UR_COMMAND_KERNEL_LAUNCH; - auto cmdIt = cmdMap.find(type); - if (cmdIt != cmdMap.end()) { - urCmd = cmdIt->second; - } - return reinterpret_cast( - ur_event_handle_t_::make_native(urCmd, urQueue, stream, stream_token)); - } - - static pi_event make_with_native(ur_context_handle_t context, - CUevent eventNative) { - auto urContext = reinterpret_cast(context); - return reinterpret_cast( - ur_event_handle_t_::make_with_native(urContext, eventNative)); - } }; -/// Implementation of PI Program on CUDA Module object -/// struct _pi_program : ur_program_handle_t_ { using ur_program_handle_t_::ur_program_handle_t_; }; -/// Implementation of a PI Kernel for CUDA -/// -/// PI Kernels are used to set kernel arguments, -/// creating a state on the Kernel object for a given -/// invocation. This is not the case of CUFunction objects, -/// which are simply passed together with the arguments on the invocation. -/// The PI Kernel implementation for CUDA stores the list of arguments, -/// argument sizes and offsets to emulate the interface of PI Kernel, -/// saving the arguments for the later dispatch. -/// Note that in PI API, the Local memory is specified as a size per -/// individual argument, but in CUDA only the total usage of shared -/// memory is required since it is not passed as a parameter. -/// A compiler pass converts the PI API local memory model into the -/// CUDA shared model. This object simply calculates the total of -/// shared memory, and the initial offsets of each parameter. -/// struct _pi_kernel : ur_kernel_handle_t_ { using ur_kernel_handle_t_::ur_kernel_handle_t_; }; -/// Implementation of samplers for CUDA -/// -/// Sampler property layout: -/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | -/// | N/A | addressing mode | fiter mode | normalize coords | struct _pi_sampler : ur_sampler_handle_t_ { using ur_sampler_handle_t_::ur_sampler_handle_t_; }; -// ------------------------------------------------------------- -// Helper types and functions -// - #endif // PI_CUDA_HPP diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index a84d4c71c8dd2..27ed647639a6c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -133,10 +133,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( (void)hNativeContext; (void)phContext; - // TODO(ur): Needed for the conformance test to pass, but it may be valid - // to have a null CUDA context - UR_ASSERT(hNativeContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index 34575829c318b..bc3cb32f55b9c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -14,15 +14,49 @@ #include #include -// We need this declaration temporarily while UR and PI share ScopedContext -class _pi_context; -using pi_context = _pi_context *; - #include "common.hpp" #include "device.hpp" typedef void (*ur_context_extended_deleter_t)(void *user_data); +/// UR context mapping to a CUDA context object. +/// +/// There is no direct mapping between a CUDA context and a UR context, +/// main differences described below: +/// +/// CUDA context vs UR context +/// +/// One of the main differences between the UR API and the CUDA driver API is +/// that the second modifies the state of the threads by assigning +/// `CUcontext` objects to threads. `CUcontext` objects store data associated +/// with a given device and control access to said device from the user side. +/// UR API context are objects that are passed to functions, and not bound +/// to threads. +/// The _ur_context object doesn't implement this behavior, only holds the +/// CUDA context data. The RAII object \ref ScopedContext implements the active +/// context behavior. +/// +/// Primary vs User-defined context +/// +/// CUDA has two different types of context, the Primary context, +/// which is usable by all threads on a given process for a given device, and +/// the aforementioned custom contexts. +/// CUDA documentation, and performance analysis, indicates it is recommended +/// to use Primary context whenever possible. +/// Primary context is used as well by the CUDA Runtime API. +/// For UR applications to interop with CUDA Runtime API, they have to use +/// the primary context - and make that active in the thread. +/// The `_ur_context` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `user-defined` context, so that +/// the UR object interface is always the same. +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the PI Context can store a number of callback functions that will be +/// called upon destruction of the UR Context. +/// See proposal for details. 
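// A minimal illustrative sketch, separate from the adapter code in this
// patch, of the RAII "active context" idiom described above. The CUDA driver
// API binds a CUcontext to the calling thread, so each entry point makes the
// UR context's CUcontext current for the duration of the call and restores
// the previously bound context on exit. The class name below is invented for
// illustration.
#include <cuda.h>

class ScopedActiveContextSketch {
  CUcontext Previous = nullptr;
  bool NeedsRestore = false;

public:
  explicit ScopedActiveContextSketch(CUcontext Desired) {
    // Query the context currently bound to this thread, then bind the
    // requested one only if it differs.
    cuCtxGetCurrent(&Previous);
    if (Previous != Desired) {
      cuCtxSetCurrent(Desired);
      NeedsRestore = true;
    }
  }

  ~ScopedActiveContextSketch() {
    // Re-bind whatever was active before this call, so the entry point does
    // not disturb the application's thread state.
    if (NeedsRestore)
      cuCtxSetCurrent(Previous);
  }
};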
+/// struct ur_context_handle_t_ { struct deleter_data { @@ -76,10 +110,6 @@ struct ur_context_handle_t_ { namespace { class ScopedContext { public: - // TODO(ur): Needed for compatibility with PI; once the CUDA PI plugin is - // fully moved over we can drop this constructor - ScopedContext(pi_context ctxt); - ScopedContext(ur_context_handle_t ctxt) { if (!ctxt) { throw UR_RESULT_ERROR_INVALID_CONTEXT; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 6d87373524341..06544fbbfdba5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1067,9 +1067,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, ur_device_handle_t *phDevice) { - // TODO(ur): This is neede for the UR CTS, but it might be valid to to have a - // null native handle - UR_ASSERT(hNativeDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 073a9ffce26a6..1cb7b912da1a8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -1220,7 +1220,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( return ret_err; } -/// TODO(ur): Add support for the offset. UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( ur_queue_handle_t hQueue, void *ptr, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index d0c7fef8a2b48..b0f10b33a5822 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -80,7 +80,6 @@ struct ur_event_handle_t_ { static ur_event_handle_t make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, uint32_t stream_token = std::numeric_limits::max()) { - // TODO(ur): Remove cast when pi_event is ported to UR return new ur_event_handle_t_(type, queue->get_context(), queue, stream, stream_token); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 42e624cefba48..00f0792479979 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -16,6 +16,22 @@ #include "program.hpp" +/// Implementation of a UR Kernel for CUDA +/// +/// UR Kernels are used to set kernel arguments, +/// creating a state on the Kernel object for a given +/// invocation. This is not the case of CUFunction objects, +/// which are simply passed together with the arguments on the invocation. +/// The UR Kernel implementation for CUDA stores the list of arguments, +/// argument sizes and offsets to emulate the interface of UR Kernel, +/// saving the arguments for the later dispatch. +/// Note that in UR API, the Local memory is specified as a size per +/// individual argument, but in CUDA only the total usage of shared +/// memory is required since it is not passed as a parameter. 
+/// A compiler pass converts the UR API local memory model into the +/// CUDA shared model. This object simply calculates the total of +/// shared memory, and the initial offsets of each parameter. +/// struct ur_kernel_handle_t_ { using native_type = CUfunction; From ff2559fccce15333670b589ec340c7a4da1d3831 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 23 May 2023 11:51:39 +0100 Subject: [PATCH 31/45] [SYCL][CUDA] Add a few extra checks to the cuda UR program implementation. --- .../ur/adapters/cuda/program.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 91e0b5c85d1b1..de31ed7735a9e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -371,7 +371,15 @@ urProgramRelease(ur_program_handle_t program) { try { ScopedContext active(program->get_context()); auto cuModule = program->get(); - result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + // "0" is a valid handle for a cuModule, so the best way to check if we + // actually loaded a module and need to unload it is to look at the build + // status. + if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } else if(program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + // Nothing to free. + result = UR_RESULT_SUCCESS; + } } catch (...) { result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -391,6 +399,7 @@ urProgramRelease(ur_program_handle_t program) { UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( ur_program_handle_t program, ur_native_handle_t *nativeHandle) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(nativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); *nativeHandle = reinterpret_cast(program->get()); return UR_RESULT_SUCCESS; } @@ -417,8 +426,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( std::unique_ptr retProgram{ new ur_program_handle_t_{hContext}}; - retError = - retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + if (pProperties && pProperties->pMetadatas) { + retError = + retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + } UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); auto pBinary_string = reinterpret_cast(pBinary); From d69f029f2faa695a821d1498272fc114ded2e292 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 23 May 2023 12:35:23 +0100 Subject: [PATCH 32/45] [SYCL][CUDA] Implement UR_DEVICE_INFO_IL_VERSION query for cuda. 
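The query added below derives the supported PTX ISA version from the driver
version reported by cuDriverGetVersion, following the versioning table in the
PTX ISA release notes. A few worked examples of the mapping it implements
(illustrative only):

    driver 12010 -> major 12, minor 1 -> "nvptx-8.1"
    driver 11040 -> major 11, minor 4 -> "nvptx-7.4"
    driver 10020 -> major 10, minor 2 -> "nvptx-6.5"  (major == 10 special case: minor + 3)
    driver  9020 -> major  9           -> UR_RESULT_ERROR_INVALID_VALUE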
--- .../ur/adapters/cuda/device.cpp | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 06544fbbfdba5..9d6a80d98c907 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -938,6 +938,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(memory_bandwidth); } + case UR_DEVICE_INFO_IL_VERSION: { + std::string il_version = "nvptx-"; + + int driver_version = 0; + cuDriverGetVersion(&driver_version); + int major = driver_version / 1000; + int minor = driver_version % 1000 / 10; + + // We can work out which ptx ISA version we support based on the versioning + // table published here + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + // Major versions that we support are consistent in how they line up, so we + // can derive that easily. The minor versions for version 10 don't line up + // the same so it needs a special case. This is not ideal but it does seem + // to be the best bet to avoid a maintenance burden here. + il_version += std::to_string(major - 4) + "."; + if (major == 10) { + il_version += std::to_string(minor + 3); + } else if (major >= 11) { + il_version += std::to_string(minor); + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return ReturnValue(il_version.data(), il_version.size()); + } case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident From 190f3c7116738f74ca3af944638cb4bb362d8b87 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 24 May 2023 09:04:24 +0100 Subject: [PATCH 33/45] [SYCL][CUDA][UR] Remove queue backward compatability apis --- sycl/plugins/cuda/pi_cuda.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index bc8cbaa1e7e2c..9af47b47a6b2a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -71,18 +71,14 @@ pi_result piPluginInit(pi_plugin *PluginInit) { // Queue _PI_CL(piQueueCreate, pi2ur::piQueueCreate) _PI_CL(piextQueueCreate, pi2ur::piextQueueCreate) - _PI_CL(piextQueueCreate2, pi2ur::piextQueueCreate2) _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) _PI_CL(piQueueFinish, pi2ur::piQueueFinish) _PI_CL(piQueueFlush, pi2ur::piQueueFlush) _PI_CL(piQueueRetain, pi2ur::piQueueRetain) _PI_CL(piQueueRelease, pi2ur::piQueueRelease) _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) - _PI_CL(piextQueueGetNativeHandle2, pi2ur::piextQueueGetNativeHandle2) _PI_CL(piextQueueCreateWithNativeHandle, pi2ur::piextQueueCreateWithNativeHandle) - _PI_CL(piextQueueCreateWithNativeHandle2, - pi2ur::piextQueueCreateWithNativeHandle2) // Memory _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) From a4415034f35f6fdeecc59a113f20d0b10a2b6cb7 Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Thu, 25 May 2023 04:22:11 +0100 Subject: [PATCH 34/45] [SYCL][CUDA][UR] Add usmPool entry points to ddi tables and fix ur*nativeHandle apis --- sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 1 + sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp | 5 +++-- 
.../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp | 5 +++-- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 9d6a80d98c907..a5889ddba9b06 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1092,6 +1092,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index de31ed7735a9e..bf1af9441aed6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -281,6 +281,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, /// \return TBD UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( ur_native_handle_t hNativeProgram, ur_context_handle_t hContext, + const ur_program_native_properties_t *pProperties, ur_program_handle_t *phProgram) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 371c3363b4e75..2c13c6ea29d14 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -236,8 +236,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( - ur_queue_handle_t hQueue, ur_native_handle_t *phNativeQueue) { +UR_APIEXPORT ur_result_t UR_APICALL +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index ebb4bc771ccd2..bd57bfd762429 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -233,8 +233,9 @@ urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; pDdiTable->pfnHostAlloc = urUSMHostAlloc; pDdiTable->pfnPoolCreate = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; - pDdiTable->pfnPoolDestroy = nullptr; + pDdiTable->pfnPoolRetain = nullptr; + pDdiTable->pfnPoolRelease = nullptr; + pDdiTable->pfnPoolGetInfo = nullptr; pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; return UR_RESULT_SUCCESS; } From 3b6536941f9aad8dc196c63d6243c03ee6070bda Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 31 May 2023 11:48:56 +0100 Subject: [PATCH 35/45] Fix CUDA adapter formatting --- sycl/plugins/cuda/pi_cuda.hpp | 6 +++--- sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp | 1 - sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp | 3 ++- sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp | 4 ++-- 
sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp | 2 +- sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp | 6 +++--- sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp | 2 +- sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp | 4 ++-- .../ur/adapters/cuda/ur_interface_loader.cpp | 3 ++- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f1d15016bc0e5..8fb4664199286 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -27,13 +27,13 @@ #include #include +#include #include +#include #include #include -#include #include #include -#include // Share code between the PI Plugin and UR Adapter #include @@ -52,7 +52,7 @@ struct _pi_context : ur_context_handle_t_ { using ur_context_handle_t_::ur_context_handle_t_; }; -struct _pi_mem : ur_mem_handle_t_ { +struct _pi_mem : ur_mem_handle_t_ { using ur_mem_handle_t_::ur_mem_handle_t_; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index f25aa88b3e292..de767c929d638 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -86,7 +86,6 @@ void sycl::detail::ur::cuPrint(const char *Message) { std::cerr << "ur_print: " << Message << std::endl; } - // Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index a5889ddba9b06..f53caafcb587d 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -988,7 +988,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp index 6788de883e971..f1a0b9d2a97d2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -46,8 +46,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{ - context} { + evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, + context_{context} { urContextRetain(context_); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index 44484250f062b..5712218b06425 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -7,9 +7,9 @@ //===-----------------------------------------------------------------===// #pragma once +#include 
#include #include -#include #include "common.hpp" diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index dbbb177926c32..fdf0f723e168f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -57,9 +57,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. /// -UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, - ur_platform_handle_t *phPlatforms, - uint32_t *pNumPlatforms) { +UR_DLLEXPORT ur_result_t UR_APICALL +urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { try { static std::once_flag initFlag; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index bf1af9441aed6..0081e921ec677 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -377,7 +377,7 @@ urProgramRelease(ur_program_handle_t program) { // status. if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); - } else if(program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + } else if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { // Nothing to free. result = UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index 99a7904b82b7e..daa1017d0f0aa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -69,8 +69,8 @@ struct ur_queue_handle_t_ { device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, - last_sync_transfer_streams_{0}, flags_(flags), - ur_flags_(ur_flags), has_ownership_{backend_owns} { + last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags), + has_ownership_{backend_owns} { urContextRetain(context_); urDeviceRetain(device_); } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index bd57bfd762429..f0eb6008d8a36 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -148,7 +148,8 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { } pDdiTable->pfnBufferCreate = urMemBufferCreate; pDdiTable->pfnBufferPartition = urMemBufferPartition; - pDdiTable->pfnBufferCreateWithNativeHandle = urMemBufferCreateWithNativeHandle; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; From 0011b9178664eef57a928d6f1fd0097ebcacbf61 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 31 May 2023 14:08:11 +0100 Subject: [PATCH 36/45] Mark KernelFusion/sync_two_queues_event_dep as unsupported on cuda pending further investigation --- sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp | 1 + 1 file changed, 1 
insertion(+) diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 4fe263431aed2..4c3c4f5f8ecb7 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -1,5 +1,6 @@ // For this test, complete_fusion must be supported. // REQUIRES: fusion +// UNSUPPORTED: cuda // RUN: %{build} -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s From 9e97af7f1d97a7af8b0ef2708a25d216c74bb932 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Wed, 24 May 2023 15:09:53 +0100 Subject: [PATCH 37/45] [SYCL][CUDA] Fix assumption about work dimensions in EnqueueKernelLaunch. --- .../ur/adapters/cuda/enqueue.cpp | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index 1cb7b912da1a8..b0c4562d60525 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -128,7 +128,7 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, // The default threadsPerBlock only require handling the first work_dim // dimension. void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, - const size_t *global_work_size, + const size_t *global_work_size, const uint32_t work_dim, const size_t maxThreadsPerBlock[3], ur_kernel_handle_t kernel, uint32_t local_size) { assert(threadsPerBlock != nullptr); @@ -136,6 +136,13 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, assert(kernel != nullptr); int minGrid, maxBlockSize, maxBlockDim[3]; + // The below assumes a three dimensional range but this is not guaranteed by + // UR. + size_t global_size_normalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < work_dim; i++) { + global_size_normalized[i] = global_work_size[i]; + } + static auto isPrime = [](size_t number) -> bool { auto lastNumToCheck = ceil(sqrt(number)); if (number < 2) @@ -160,23 +167,24 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); - threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = - std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], - size_t(maxBlockDim[1]))); + threadsPerBlock[2] = + std::min(global_size_normalized[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = std::min( + global_size_normalized[1], + std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], size_t(maxBlockDim[0]))); + std::min(global_size_normalized[0], size_t(maxBlockDim[0]))); - // When global_work_size[0] is prime threadPerBlock[0] will later computed as - // 1, which is not efficient configuration. In such case we use - // global_work_size[0] + 1 to compute threadPerBlock[0]. + // When global_size_normalized[0] is prime threadPerBlock[0] will later + // computed as 1, which is not efficient configuration. In such case we use + // global_size_normalized[0] + 1 to compute threadPerBlock[0]. int adjusted_0_dim_global_work_size = - (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? 
global_work_size[0] + 1 - : global_work_size[0]; + (isPrime(global_size_normalized[0]) && + (threadsPerBlock[0] != global_size_normalized[0])) + ? global_size_normalized[0] + 1 + : global_size_normalized[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -209,7 +217,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, kernel->get())); return blockSize * regsPerThread > size_t(maxRegsPerBlock); -}; +} /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued @@ -309,7 +317,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - // Preconditions UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue->get_context() == hKernel->get_context(), @@ -376,7 +383,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } else { guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, - maxThreadsPerBlock, hKernel, local_size); + workDim, maxThreadsPerBlock, hKernel, local_size); } } From b538dd89ced1166b6bcdb6dc90462469c228702e Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 29 May 2023 16:50:05 +0100 Subject: [PATCH 38/45] [SYCL][CUDA] Correct return type of cuda USM capability queries. --- .../unified_runtime/ur/adapters/cuda/device.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index f53caafcb587d..b633086d057e8 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -707,7 +707,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= @@ -734,7 +734,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint64_t value = + uint32_t value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | @@ -747,7 +747,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // allocation associated with this device." 
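// Caller-side sketch of why the value written by these queries must be 32 bits
// wide: the USM support queries return a bit-field of
// UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_* values, so storing them through a
// uint64_t (as the old code did) writes more bytes than the caller provides.
// Assumes the standard ur_api.h header and a valid hDevice; error handling is
// omitted for brevity.
#include <ur_api.h>

bool deviceUsmSupportsAtomicAccess(ur_device_handle_t hDevice) {
  ur_device_usm_access_capability_flags_t Caps = 0;
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_USM_DEVICE_SUPPORT, sizeof(Caps),
                  &Caps, nullptr);
  return (Caps & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS) != 0;
}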
// // query if/how the device can access managed memory associated to it - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | @@ -775,7 +775,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access managed memory associated to other // devices - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; @@ -804,7 +804,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint64_t value = {}; + uint32_t value = {}; if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { // the device suppports coherently accessing pageable memory without // calling cuMemHostRegister/cudaHostRegister on it From 9811f9b258265c1da39cdee78bde6eac9fdd6f19 Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Mon, 5 Jun 2023 12:16:20 +0100 Subject: [PATCH 39/45] [SYCL][CUDA] A number of small cuda adapter fixes for cts/spec compliance. --- .../ur/adapters/cuda/device.cpp | 2 ++ .../ur/adapters/cuda/kernel.cpp | 25 ++++++++++++++++--- .../ur/adapters/cuda/kernel.hpp | 2 +- .../ur/adapters/cuda/program.cpp | 16 ++++++------ sycl/plugins/unified_runtime/ur/ur.hpp | 5 +++- 5 files changed, 38 insertions(+), 12 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index b633086d057e8..39d582405a1e1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -993,6 +993,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(AddressBuffer, strnlen(AddressBuffer, AddressBufferSize - 1) + 1); } + case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: + return ReturnValue(false); // TODO: Investigate if this information is available on CUDA. case UR_DEVICE_INFO_GPU_EU_COUNT: case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index 69f86ca319df5..900b23dd84306 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -15,6 +15,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ur_kernel_handle_t *phKernel) { UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); ur_result_t retErr = UR_RESULT_SUCCESS; std::unique_ptr retKernel{nullptr}; @@ -23,8 +24,16 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, ScopedContext active(hProgram->get_context()); CUfunction cuFunc; - retErr = UR_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName)); + CUresult functionResult = + cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName); + + // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's + // NOT_FOUND error applies to more than just functions. 
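// Reduced sketch of the lookup-failure mapping the lines below add:
// cuModuleGetFunction reports a missing symbol as CUDA_ERROR_NOT_FOUND, which
// for kernel creation should surface as UR_RESULT_ERROR_INVALID_KERNEL_NAME
// rather than a generic failure. The helper name is illustrative and all
// other CUresult values collapse to a single fallback here.
#include <cuda.h>
#include <ur_api.h>

ur_result_t findKernel(CUmodule Module, const char *Name, CUfunction *Func) {
  CUresult Res = cuModuleGetFunction(Func, Module, Name);
  if (Res == CUDA_ERROR_NOT_FOUND)
    return UR_RESULT_ERROR_INVALID_KERNEL_NAME;
  return Res == CUDA_SUCCESS ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_UNKNOWN;
}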
+ if (functionResult == CUDA_ERROR_NOT_FOUND) { + throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } else { + retErr = UR_CHECK_ERROR(functionResult); + } std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; CUfunction cuFuncWithOffsetParam; @@ -187,6 +196,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize, const void *pArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); ur_result_t retErr = UR_RESULT_SUCCESS; try { @@ -335,7 +345,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( UR_APIEXPORT ur_result_t UR_APICALL urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, size_t propSize, const void *pPropValue) { - return UR_RESULT_SUCCESS; + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + switch (propName) { + case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: + case UR_KERNEL_EXEC_INFO_USM_PTRS: + case UR_KERNEL_EXEC_INFO_CACHE_CONFIG: + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } } UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 00f0792479979..9308b7b408b44 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -175,7 +175,7 @@ struct ur_kernel_handle_t_ { /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - uint32_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + size_t get_num_args() const noexcept { return args_.indices_.size() - 1; } void set_kernel_arg(int index, size_t size, const void *arg) { args_.add_arg(index, size, arg); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 0081e921ec677..314a9a866c813 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -418,27 +418,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY); + UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext->get_device()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); ur_result_t retError = UR_RESULT_SUCCESS; std::unique_ptr retProgram{ new ur_program_handle_t_{hContext}}; - if (pProperties && pProperties->pMetadatas) { + if (pProperties) { + if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { + return UR_RESULT_ERROR_INVALID_SIZE; + } retError = retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); } UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); auto pBinary_string = reinterpret_cast(pBinary); - if (size == 0) { - size = 
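// Standalone sketch of the metadata validation rule used in
// urProgramCreateWithBinary above: a non-zero count with a null pMetadatas is
// a null-pointer error, while a zero count with a non-null pMetadatas is a
// size error. The helper name is illustrative.
#include <ur_api.h>

ur_result_t validateProgramProperties(const ur_program_properties_t *Props) {
  if (!Props)
    return UR_RESULT_SUCCESS; // properties are optional
  if (Props->count > 0 && Props->pMetadatas == nullptr)
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  if (Props->count == 0 && Props->pMetadatas != nullptr)
    return UR_RESULT_ERROR_INVALID_SIZE;
  return UR_RESULT_SUCCESS;
}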
strlen(pBinary_string) + 1; - } - - UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); retError = retProgram->set_binary(pBinary_string, size); UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); @@ -463,6 +464,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice == hProgram->get_context()->get_device(), UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(pFunctionName, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); CUfunction func; diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index c2f3a3782f9a0..2099b31529176 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -205,11 +205,14 @@ template ur_result_t getInfoImpl(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value, size_t value_size, Assign &&assign_func) { + if (!param_value && !param_value_size_ret) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } if (param_value != nullptr) { if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_VALUE; + return UR_RESULT_ERROR_INVALID_SIZE; } assign_func(param_value, value, value_size); From fce479c0c244cccbc41370e6e7d234cbe96d13fb Mon Sep 17 00:00:00 2001 From: Aaron Greig Date: Tue, 6 Jun 2023 14:44:50 +0100 Subject: [PATCH 40/45] [SYCL][UR] Avoid zero-length new in pi2ur. --- sycl/plugins/unified_runtime/pi2ur.hpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 2408fa452351f..5fed9d0f933f7 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1611,17 +1611,20 @@ inline pi_result piProgramCreateWithBinary( reinterpret_cast(Context); auto UrDevice = reinterpret_cast(DeviceList[0]); - std::unique_ptr pMetadatas( - new ur_program_metadata_t[NumMetadataEntries]); - for (unsigned i = 0; i < NumMetadataEntries; i++) { - HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); - } - - ur_program_properties_t Properties; + ur_program_properties_t Properties = {}; Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; Properties.pNext = nullptr; Properties.count = NumMetadataEntries; - Properties.pMetadatas = pMetadatas.get(); + + std::unique_ptr pMetadatas; + if (NumMetadataEntries) { + pMetadatas.reset(new ur_program_metadata_t[NumMetadataEntries]); + for (unsigned i = 0; i < NumMetadataEntries; i++) { + HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); + } + + Properties.pMetadatas = pMetadatas.get(); + } ur_program_handle_t *UrProgram = reinterpret_cast(Program); From 9b3448afbb123f67867d60c3c240f2c62bb0bd99 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 8 Jun 2023 16:32:07 +0100 Subject: [PATCH 41/45] [SYCL][CUDA] Mass fixup of code style in the CUDA adapter --- .../ur/adapters/cuda/common.cpp | 51 +- .../ur/adapters/cuda/common.hpp | 16 +- .../ur/adapters/cuda/context.cpp | 58 +- .../ur/adapters/cuda/context.hpp | 68 +- .../ur/adapters/cuda/device.cpp | 775 +++++----- .../ur/adapters/cuda/device.hpp | 52 +- .../ur/adapters/cuda/enqueue.cpp | 1363 ++++++++--------- .../ur/adapters/cuda/event.cpp | 209 ++- .../ur/adapters/cuda/event.hpp | 155 +- .../ur/adapters/cuda/kernel.cpp | 212 ++- .../ur/adapters/cuda/kernel.hpp | 162 +- .../ur/adapters/cuda/memory.cpp | 356 +++-- .../ur/adapters/cuda/memory.hpp | 181 ++- 
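// Caller-side sketch of the query contract that the getInfoImpl change above
// enforces: at least one of the two output pointers must be non-null, and a
// too-small buffer is now reported as UR_RESULT_ERROR_INVALID_SIZE. The usual
// pattern is to ask for the size first, then for the value. Error checks are
// omitted for brevity.
#include <string>
#include <ur_api.h>
#include <vector>

std::string queryDeviceName(ur_device_handle_t hDevice) {
  size_t Size = 0;
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_NAME, 0, nullptr, &Size);
  std::vector<char> Name(Size);
  urDeviceGetInfo(hDevice, UR_DEVICE_INFO_NAME, Size, Name.data(), nullptr);
  return std::string(Name.data());
}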
.../ur/adapters/cuda/platform.cpp | 115 +- .../ur/adapters/cuda/platform.hpp | 2 +- .../ur/adapters/cuda/program.cpp | 293 ++-- .../ur/adapters/cuda/program.hpp | 41 +- .../ur/adapters/cuda/queue.cpp | 259 ++-- .../ur/adapters/cuda/queue.hpp | 281 ++-- .../ur/adapters/cuda/sampler.cpp | 40 +- .../ur/adapters/cuda/sampler.hpp | 16 +- .../unified_runtime/ur/adapters/cuda/usm.cpp | 153 +- 22 files changed, 2410 insertions(+), 2448 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index de767c929d638..86975e5097257 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -12,8 +12,8 @@ #include -ur_result_t map_error_ur(CUresult result) { - switch (result) { +ur_result_t mapErrorUR(CUresult Result) { + switch (Result) { case CUDA_SUCCESS: return UR_RESULT_SUCCESS; case CUDA_ERROR_NOT_PERMITTED: @@ -33,33 +33,33 @@ ur_result_t map_error_ur(CUresult result) { } } -ur_result_t check_error_ur(CUresult result, const char *function, int line, - const char *file) { - if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File) { + if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) { return UR_RESULT_SUCCESS; } if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - cuGetErrorName(result, &errorName); - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "\nUR CUDA ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + cuGetErrorName(Result, &ErrorName); + cuGetErrorString(Result, &ErrorString); + std::stringstream SS; + SS << "\nUR CUDA ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" << std::endl; - std::cerr << ss.str(); + std::cerr << SS.str(); } if (std::getenv("PI_CUDA_ABORT") != nullptr) { std::abort(); } - throw map_error_ur(result); + throw mapErrorUR(Result); } std::string getCudaVersionString() { @@ -91,16 +91,11 @@ thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -ur_result_t zerPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode) { + assert(strlen(pMessage) <= MaxMessageSize); + strcpy(ErrorMessage, pMessage); + ErrorMessageCode = ErrorCode; } // Returns plugin specific error and warning messages; common implementation diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp index 3aa23c67bf492..5cfa609018b29 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -11,20 +11,20 @@ #include #include -ur_result_t map_error_ur(CUresult result); +ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information /// to stderr. /// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of /// throwing the error. This is intended for debugging purposes. -/// \return UR_RESULT_SUCCESS if \param result was CUDA_SUCCESS. +/// \return UR_RESULT_SUCCESS if \param Result was CUDA_SUCCESS. /// \throw ur_result_t exception (integer) if input was not success. /// -ur_result_t check_error_ur(CUresult result, const char *function, int line, - const char *file); +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File); -#define UR_CHECK_ERROR(result) \ - check_error_ur(result, __func__, __LINE__, __FILE__) +#define UR_CHECK_ERROR(Result) \ + checkErrorUR(Result, __func__, __LINE__, __FILE__) std::string getCudaVersionString(); @@ -33,8 +33,8 @@ extern thread_local ur_result_t ErrorMessageCode; extern thread_local char ErrorMessage[MaxMessageSize]; // Utility function for setting a message and warning -[[maybe_unused]] void setErrorMessage(const char *message, - ur_result_t error_code); +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode); /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index 27ed647639a6c..c922e8a3ddad6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -26,19 +26,19 @@ urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); assert(DeviceCount == 1); - ur_result_t errcode_ret = UR_RESULT_SUCCESS; + ur_result_t RetErr = UR_RESULT_SUCCESS; - std::unique_ptr piContextPtr{nullptr}; + std::unique_ptr ContextPtr{nullptr}; try { - piContextPtr = std::unique_ptr( + ContextPtr = std::unique_ptr( new ur_context_handle_t_{*phDevices}); - *phContext = piContextPtr.release(); - } catch (ur_result_t err) { - errcode_ret = err; + *phContext = ContextPtr.release(); + } catch (ur_result_t Err) { + RetErr = Err; } catch (...) 
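// Reduced sketch of the error-checking pattern declared in common.hpp above:
// CUresult values are translated to ur_result_t and failing results are
// thrown, so call sites can wrap driver calls in a single macro. This mirrors
// the shape of mapErrorUR/checkErrorUR but handles only a couple of codes and
// is not the adapter's full mapping.
#include <cuda.h>
#include <ur_api.h>

inline ur_result_t toUrResult(CUresult Result) {
  switch (Result) {
  case CUDA_SUCCESS:
    return UR_RESULT_SUCCESS;
  case CUDA_ERROR_OUT_OF_MEMORY:
    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
  default:
    return UR_RESULT_ERROR_UNKNOWN;
  }
}

inline ur_result_t checkCu(CUresult Result) {
  if (Result != CUDA_SUCCESS)
    throw toUrResult(Result);
  return UR_RESULT_SUCCESS;
}

#define CHECK_CU(Call) checkCu(Call)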
{ - errcode_ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return errcode_ret; + return RetErr; } UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( @@ -52,24 +52,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(1); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->get_device()); + return ReturnValue(hContext->getDevice()); case UR_CONTEXT_INFO_REFERENCE_COUNT: - return ReturnValue(hContext->get_reference_count()); + return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint32_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + uint32_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->get_device()->get()) == CUDA_SUCCESS); - uint32_t capabilities = - (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + hContext->getDevice()->get()) == CUDA_SUCCESS); + uint32_t Capabilities = + (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | @@ -78,7 +78,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: // 2D USM memcpy is supported. 
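// Standalone sketch of the capability selection in urContextGetInfo above:
// the system-wide memory scope is only reported for devices of compute
// capability 7.x (Volta) and newer, while the narrower scopes are always
// reported. The helper name is illustrative.
#include <ur_api.h>

ur_memory_scope_capability_flags_t scopeCapsForMajor(int ccMajor) {
  ur_memory_scope_capability_flags_t Caps =
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP |
      UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE;
  if (ccMajor >= 7)
    Caps |= UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM;
  return Caps;
}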
@@ -94,25 +94,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( return UR_RESULT_ERROR_INVALID_ENUMERATION; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRelease(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (ctxt->decrement_reference_count() > 0) { + if (hContext->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } - ctxt->invoke_extended_deleters(); + hContext->invokeExtendedDeleters(); - std::unique_ptr context{ctxt}; + std::unique_ptr Context{hContext}; return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urContextRetain(ur_context_handle_t ctxt) { - UR_ASSERT(ctxt, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(ctxt->get_reference_count() > 0); + assert(hContext->getReferenceCount() > 0); - ctxt->increment_reference_count(); + hContext->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -142,6 +144,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); - hContext->set_extended_deleter(pfnDeleter, pUserData); + hContext->setExtendedDeleter(pfnDeleter, pUserData); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index bc3cb32f55b9c..96103d4d52c14 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -60,78 +60,78 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); struct ur_context_handle_t_ { struct deleter_data { - ur_context_extended_deleter_t function; - void *user_data; + ur_context_extended_deleter_t Function; + void *UserData; - void operator()() { function(user_data); } + void operator()() { Function(UserData); } }; using native_type = CUcontext; - native_type cuContext_; - ur_device_handle_t deviceId_; - std::atomic_uint32_t refCount_; + native_type CUContext; + ur_device_handle_t DeviceID; + std::atomic_uint32_t RefCount; - ur_context_handle_t_(ur_device_handle_t_ *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - urDeviceRetain(deviceId_); + ur_context_handle_t_(ur_device_handle_t_ *DevID) + : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} { + urDeviceRetain(DeviceID); }; - ~ur_context_handle_t_() { urDeviceRelease(deviceId_); } + ~ur_context_handle_t_() { urDeviceRelease(DeviceID); } - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); + void invokeExtendedDeleters() { + std::lock_guard Guard(Mutex); + for (auto &Deleter : ExtendedDeleters) { + Deleter(); } } - void set_extended_deleter(ur_context_extended_deleter_t function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); + void setExtendedDeleter(ur_context_extended_deleter_t Function, + void *UserData) { + std::lock_guard Guard(Mutex); + ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); } - ur_device_handle_t get_device() const noexcept { return deviceId_; } + ur_device_handle_t getDevice() const noexcept { 
return DeviceID; } - native_type get() const noexcept { return cuContext_; } + native_type get() const noexcept { return CUContext; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } private: - std::mutex mutex_; - std::vector extended_deleters_; + std::mutex Mutex; + std::vector ExtendedDeleters; }; namespace { class ScopedContext { public: - ScopedContext(ur_context_handle_t ctxt) { - if (!ctxt) { + ScopedContext(ur_context_handle_t Context) { + if (!Context) { throw UR_RESULT_ERROR_INVALID_CONTEXT; } - set_context(ctxt->get()); + setContext(Context->get()); } - ScopedContext(CUcontext ctxt) { set_context(ctxt); } + ScopedContext(CUcontext NativeContext) { setContext(NativeContext); } ~ScopedContext() {} private: - void set_context(CUcontext desired) { - CUcontext original = nullptr; + void setContext(CUcontext Desired) { + CUcontext Original = nullptr; - UR_CHECK_ERROR(cuCtxGetCurrent(&original)); + UR_CHECK_ERROR(cuCtxGetCurrent(&Original)); // Make sure the desired context is active on the current thread, setting // it if necessary - if (original != desired) { - UR_CHECK_ERROR(cuCtxSetCurrent(desired)); + if (Original != Desired) { + UR_CHECK_ERROR(cuCtxSetCurrent(Desired)); } } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index 39d582405a1e1..c3028a58717c6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -20,27 +20,27 @@ int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { return value; } -uint64_t ur_device_handle_t_::get_elapsed_time(CUevent ev) const { - float miliSeconds = 0.0f; +uint64_t ur_device_handle_t_::getElapsedTime(CUevent ev) const { + float Milliseconds = 0.0f; - UR_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); + UR_CHECK_ERROR(cuEventElapsedTime(&Milliseconds, EvBase, ev)); - return static_cast(miliSeconds * 1.0e6); + return static_cast(Milliseconds * 1.0e6); } -UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, - ur_device_info_t infoType, +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, size_t propSize, - void *pDeviceInfo, + void *pPropValue, size_t *pPropSizeRet) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UrReturnHelper ReturnValue(propSize, pDeviceInfo, pPropSizeRet); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - static constexpr uint32_t max_work_item_dimensions = 3u; + static constexpr uint32_t MaxWorkItemDimensions = 3u; - ScopedContext active(device->get_context()); + ScopedContext Active(hDevice->getContext()); - switch ((uint32_t)infoType) { + switch ((uint32_t)propName) { case UR_DEVICE_INFO_TYPE: { return ReturnValue(UR_DEVICE_TYPE_GPU); } @@ -48,80 +48,80 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(4318u); } case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; + int ComputeUnits = 0; sycl::detail::ur::assertion( - 
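// Sketch of the context-activation guard defined in context.hpp above: the
// CUDA driver API is stateful per thread, so each entry point makes the UR
// context's CUcontext current before issuing driver calls. Like the adapter's
// ScopedContext, this version only switches when the desired context is not
// already current and does not restore the previous one on destruction.
#include <cuda.h>

class ScopedCudaContext {
public:
  explicit ScopedCudaContext(CUcontext Desired) {
    CUcontext Current = nullptr;
    cuCtxGetCurrent(&Current);
    if (Current != Desired)
      cuCtxSetCurrent(Desired);
  }
};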
cuDeviceGetAttribute(&compute_units, + cuDeviceGetAttribute(&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(compute_units >= 0); - return ReturnValue(static_cast(compute_units)); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); } case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return ReturnValue(max_work_item_dimensions); + return ReturnValue(MaxWorkItemDimensions); } case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { struct { - size_t sizes[max_work_item_dimensions]; - } return_sizes; + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; - int max_x = 0, max_y = 0, max_z = 0; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_x >= 0); + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_y >= 0); + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_z >= 0); + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); - return ReturnValue(return_sizes); + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return ReturnValue(ReturnSizes); } case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { struct { - size_t sizes[max_work_item_dimensions]; - } return_sizes; - int max_x = 0, max_y = 0, max_z = 0; + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_x >= 0); + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_y >= 0); + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_z >= 0); + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); - return_sizes.sizes[0] = size_t(max_x); - return_sizes.sizes[1] = size_t(max_y); - return_sizes.sizes[2] = size_t(max_z); - return ReturnValue(return_sizes); + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return 
ReturnValue(ReturnSizes); } case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; + int MaxWorkGroupSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_work_group_size, + cuDeviceGetAttribute(&MaxWorkGroupSize, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(max_work_group_size >= 0); + sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); - return ReturnValue(size_t(max_work_group_size)); + return ReturnValue(size_t(MaxWorkGroupSize)); } case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { return ReturnValue(1u); @@ -167,55 +167,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_threads, + cuDeviceGetAttribute(&MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - int warpSize = 0; + hDevice->get()) == CUDA_SUCCESS); + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(maxWarps); + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // Volta provides independent thread scheduling // TODO: Revisit for previous generation GPUs - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - bool ifp = (major >= 7); - return ReturnValue(ifp); + hDevice->get()) == CUDA_SUCCESS); + bool IFP = (Major >= 7); + return ReturnValue(IFP); } case UR_DEVICE_INFO_ATOMIC_64: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - bool atomic64 = (major >= 6) ? true : false; - return ReturnValue(atomic64); + bool Atomic64 = (Major >= 6) ? true : false; + return ReturnValue(Atomic64); } case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - uint64_t capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - uint64_t capabilities = - (major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + hDevice->get()) == CUDA_SUCCESS); + uint64_t Capabilities = + (Major >= 7) ? 
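// Sketch of the sub-group count computation used for
// UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS above: the number of sub-groups per
// work-group is the maximum block size divided by the warp size, rounded up.
// Attribute names are the CUDA driver API ones used in the hunk; error
// handling is omitted.
#include <cuda.h>

int maxSubGroupsPerWorkGroup(CUdevice Dev) {
  int MaxThreads = 0, WarpSize = 0;
  cuDeviceGetAttribute(&MaxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                       Dev);
  cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Dev);
  return (MaxThreads + WarpSize - 1) / WarpSize;
}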
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | @@ -224,18 +224,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for // atomic_fence_order_capabilities. - ur_memory_order_capability_flags_t capabilities = + ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for @@ -243,42 +243,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // Because scopes are hierarchical, wider scopes support all narrower // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - ur_memory_scope_capability_flags_t capabilities = + ur_memory_scope_capability_flags_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; - return ReturnValue(capabilities); + return ReturnValue(Capabilities); } case UR_DEVICE_INFO_BFLOAT16: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - bool bfloat16 = (major >= 8) ? true : false; - return ReturnValue(bfloat16); + bool BFloat16 = (Major >= 8) ? true : false; + return ReturnValue(BFloat16); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // NVIDIA devices only support one sub-group size (the warp size) - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - size_t sizes[1] = {static_cast(warpSize)}; - return ReturnValue(sizes, 1); + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + size_t Sizes[1] = {static_cast(WarpSize)}; + return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; + int ClockFreq = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(clock_freq >= 0); - return ReturnValue(static_cast(clock_freq) / 1000u); + cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); } case UR_DEVICE_INFO_ADDRESS_BITS: { - auto bits = uint32_t{std::numeric_limits::digits}; - return ReturnValue(bits); + auto Bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(Bits); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { // Max size of memory object allocation in bytes. 
@@ -287,22 +287,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // 32 × 1024 × 1024) for devices that are not of type // CL_DEVICE_TYPE_CUSTOM. - size_t global = 0; - sycl::detail::ur::assertion(cuDeviceTotalMem(&global, device->get()) == + size_t Global = 0; + sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == CUDA_SUCCESS); - auto quarter_global = static_cast(global / 4u); + auto QuarterGlobal = static_cast(Global / 4u); - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); + auto MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); - return ReturnValue(uint64_t{max_alloc}); + return ReturnValue(uint64_t{MaxAlloc}); } case UR_DEVICE_INFO_IMAGE_SUPPORTED: { - bool enabled = false; + bool Enabled = false; if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - enabled = true; + Enabled = true; } else { sycl::detail::ur::cuPrint( "Images are not fully supported by the CUDA BE, their support is " @@ -311,7 +311,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, "runtime."); } - return ReturnValue(uint32_t{enabled}); + return ReturnValue(uint32_t{Enabled}); } case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { // This call doesn't match to CUDA as it doesn't have images, but instead @@ -327,117 +327,117 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_height, + cuDeviceGetAttribute(&TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_height, + cuDeviceGetAttribute(&SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_height >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. 
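// Sketch of the allocation-size heuristic used for
// UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE above: report the larger of a quarter of
// total device memory (capped at 1 GiB) and the 32 MiB minimum the
// OpenCL-style query mandates. Same driver call as the hunk; error handling
// omitted.
#include <algorithm>
#include <cstdint>
#include <cuda.h>

uint64_t maxMemAllocSize(CUdevice Dev) {
  size_t TotalBytes = 0;
  cuDeviceTotalMem(&TotalBytes, Dev);
  uint32_t QuarterGlobal = static_cast<uint32_t>(TotalBytes / 4u);
  uint32_t MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal),
                               32u * 1024u * 1024u);
  return uint64_t{MaxAlloc};
}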
- int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; + int TexHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_height, + cuDeviceGetAttribute(&TexHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_height >= 0); - int surf_height = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_height, + cuDeviceGetAttribute(&SurfHeight, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_height >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); - int min = std::min(tex_height, surf_height); + int Min = std::min(TexHeight, SurfHeight); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { // Take the smaller of maximum surface and maximum texture depth. 
- int tex_depth = 0; + int TexDepth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_depth, + cuDeviceGetAttribute(&TexDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_depth >= 0); - int surf_depth = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_depth, + cuDeviceGetAttribute(&SurfDepth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_depth >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfDepth >= 0); - int min = std::min(tex_depth, surf_depth); + int Min = std::min(TexDepth, SurfDepth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; + int TexWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&tex_width, + cuDeviceGetAttribute(&TexWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(tex_width >= 0); - int surf_width = 0; + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&surf_width, + cuDeviceGetAttribute(&SurfWidth, CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(surf_width >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); - int min = std::min(tex_width, surf_width); + int Min = std::min(TexWidth, SurfWidth); - return ReturnValue(static_cast(min)); + return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { return ReturnValue(0lu); @@ -454,14 +454,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(4000lu); } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; + int MemBaseAddrAlign = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&mem_base_addr_align, + cuDeviceGetAttribute(&MemBaseAddrAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return ReturnValue(mem_base_addr_align); + MemBaseAddrAlign *= 8; + return ReturnValue(MemBaseAddrAlign); } case UR_DEVICE_INFO_HALF_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? @@ -469,7 +469,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, } case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? - uint64_t config = + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | @@ -477,17 +477,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA | UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { // TODO: is this config consistent across all NVIDIA GPUs? 
- uint64_t config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | UR_DEVICE_FP_CAPABILITY_FLAG_FMA; - return ReturnValue(config); + return ReturnValue(Config); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { // TODO: is this config consistent across all NVIDIA GPUs? @@ -499,30 +499,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(128u); } case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; + int CacheSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(cache_size >= 0); + cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(CacheSize >= 0); // The L2 cache is global to the GPU. - return ReturnValue(static_cast(cache_size)); + return ReturnValue(static_cast(CacheSize)); } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; + size_t Bytes = 0; // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::ur::assertion(cuDeviceTotalMem(&bytes, device->get()) == + sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t{bytes}); + return ReturnValue(uint64_t{Bytes}); } case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; + int ConstantMemory = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&constant_memory, + cuDeviceGetAttribute(&ConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(constant_memory >= 0); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ConstantMemory >= 0); - return ReturnValue(static_cast(constant_memory)); + return ReturnValue(static_cast(ConstantMemory)); } case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { // TODO: is there a way to retrieve this from CUDA driver API? @@ -537,32 +537,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // OpenCL's "local memory" maps most closely to CUDA's "shared memory". // CUDA has its own definition of "local memory", which maps to OpenCL's // "private memory". 
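// Sketch of the query the local-memory comment above refers to: OpenCL/SYCL
// "local memory" corresponds to CUDA's per-block shared memory, so the local
// memory size is read from CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK.
// Error handling omitted.
#include <cstdint>
#include <cuda.h>

uint64_t localMemSizeBytes(CUdevice Dev) {
  int SharedPerBlock = 0;
  cuDeviceGetAttribute(&SharedPerBlock,
                       CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Dev);
  return static_cast<uint64_t>(SharedPerBlock);
}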
- int local_mem_size = 0; + int LocalMemSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&local_mem_size, + cuDeviceGetAttribute(&LocalMemSize, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(local_mem_size >= 0); - return ReturnValue(static_cast(local_mem_size)); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); } case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; + int ECCEnabled = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - device->get()) == CUDA_SUCCESS); + cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return ReturnValue(result); + sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); } case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; + int IsIntegrated = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - device->get()) == CUDA_SUCCESS); + cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + hDevice->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); + sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); return ReturnValue(result); } case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { @@ -586,9 +586,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(true); } case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = ur_device_exec_capability_flags_t{ + auto Capability = ur_device_exec_capability_flags_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_PROPERTIES: return ReturnValue( @@ -596,14 +596,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, UR_QUEUE_FLAG_PROFILING_ENABLE)); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(capability); + return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { // The mandated minimum capability: - uint64_t capability = UR_QUEUE_FLAG_PROFILING_ENABLE; - return ReturnValue(capability); + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(Capability); } case UR_DEVICE_INFO_BUILT_IN_KERNELS: { // An empty string is returned if no built-in kernels are supported by the @@ -611,27 +611,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(""); } case UR_DEVICE_INFO_PLATFORM: { - return ReturnValue(device->get_platform()); + return ReturnValue(hDevice->getPlatform()); } case UR_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::ur::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == CUDA_SUCCESS); - return ReturnValue(name, 
strlen(name) + 1); + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; + sycl::detail::ur::assertion( + cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(Name, strlen(Name) + 1); } case UR_DEVICE_INFO_VENDOR: { return ReturnValue("NVIDIA Corporation"); } case UR_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); - return ReturnValue(version.c_str()); + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); } case UR_DEVICE_INFO_PROFILE: { return ReturnValue("CUDA"); } case UR_DEVICE_INFO_REFERENCE_COUNT: { - return ReturnValue(device->get_reference_count()); + return ReturnValue(hDevice->getReferenceCount()); } case UR_DEVICE_INFO_VERSION: { std::stringstream SS; @@ -639,13 +640,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, sycl::detail::ur::assertion( cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); SS << Major; int Minor; sycl::detail::ur::assertion( cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); SS << "." << Minor; return ReturnValue(SS.str().c_str()); } @@ -658,19 +659,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, SupportedExtensions += "pi_ext_intel_devicelib_assert "; SupportedExtensions += " "; - int major = 0; - int minor = 0; + int Major = 0; + int Minor = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&minor, + cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - if ((major >= 6) || ((major == 5) && (minor >= 3))) { + if ((Major >= 6) || ((Major == 5) && (Minor >= 3))) { SupportedExtensions += "cl_khr_fp16 "; } @@ -707,14 +708,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access page-locked host memory, possibly // through PCIe, using the same pointer as the host - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { // the device shares a unified address space with the host - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; @@ -722,11 +723,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // on GPU architectures with compute capability lower than 6.x, atomic // operations from the GPU to CPU memory will not be atomic with respect // to CPU initiated atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = 
UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -734,12 +735,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // associated with this device." // // query how the device can access memory allocated on the device itself (?) - uint32_t value = + uint32_t Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -747,24 +748,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // allocation associated with this device." // // query if/how the device can access managed memory associated to it - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { // the device can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -775,27 +776,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access managed memory associated to other // devices - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { // the device can allocate managed memory on this system - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS // attribute can coherently access managed memory concurrently with the // CPU - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + if (getAttribute(hDevice, 
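// A hedged sketch of the decision tree behind the USM *_SHARED_SUPPORT
// queries above, written as a free function. It assumes the capability flag
// enums come from ur_api.h; only the three CUDA attributes the adapter
// actually consults are read, and error checking is omitted.
#include <cuda.h>
#include <ur_api.h>
#include <cstdint>

static uint32_t sketchSingleSharedUsmCaps(CUdevice Dev) {
  int Managed = 0, Concurrent = 0, CcMajor = 0;
  cuDeviceGetAttribute(&Managed, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Dev);
  cuDeviceGetAttribute(&Concurrent,
                       CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Dev);
  cuDeviceGetAttribute(&CcMajor,
                       CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Dev);
  uint32_t Caps = 0;
  if (Managed) // device can allocate managed (cuMemAllocManaged) memory
    Caps = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS |
           UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS;
  if (Concurrent) // CPU and GPU may touch the allocation at the same time
    Caps |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS;
  if (Concurrent && CcMajor >= 6) // system-wide atomics need sm_60 or newer
    Caps |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS;
  return Caps;
}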
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 6) { // compute capability 6.x introduces operations that are atomic with // respect to other CPUs and GPUs in the system - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; - if (value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) - value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { // from cl_intel_unified_shared_memory: @@ -804,39 +805,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // // query if/how the device can access pageable host memory allocated by the // system allocator - uint32_t value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { // the device suppports coherently accessing pageable memory without // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(device, + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { // the link between the device and the host supports native atomic // operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; } else { // the link between the device and the host does not support native // atomic operations - value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } } - return ReturnValue(value); + return ReturnValue(Value); } case UR_DEVICE_INFO_ASYNC_BARRIER: { - int value = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return ReturnValue(static_cast(value)); + int Value = getAttribute(hDevice, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; + return ReturnValue(static_cast(Value)); } case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { - int major = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int minor = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return ReturnValue(result.c_str()); + int Major = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + int Minor = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + std::string Result = std::to_string(Major) + "." 
+ std::to_string(Minor); + return ReturnValue(Result.c_str()); } case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { @@ -848,103 +849,102 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, return ReturnValue(FreeMemory); } case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); // Convert kilohertz to megahertz when returning. - return ReturnValue(value / 1000); + return ReturnValue(Value / 1000); } case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { return ReturnValue(int32_t{1}); } case UR_DEVICE_INFO_DEVICE_ID: { - int value = 0; + int Value = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device->get()) == CUDA_SUCCESS); - sycl::detail::ur::assertion(value >= 0); - return ReturnValue(value); + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); } case UR_DEVICE_INFO_UUID: { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - int major = driver_version / 1000; - int minor = driver_version % 1000 / 10; - CUuuid uuid; - if ((major > 11) || (major == 11 && minor >= 4)) { - sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; + CUuuid UUID; + if ((Major > 11) || (Major == 11 && Minor >= 4)) { + sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == CUDA_SUCCESS); } else { - sycl::detail::ur::assertion(cuDeviceGetUuid(&uuid, device->get()) == + sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == CUDA_SUCCESS); } - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return ReturnValue(name.data(), 16); + std::array Name; + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); + return ReturnValue(Name.data(), 16); } case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { - int major = 0; + int Major = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&major, + cuDeviceGetAttribute(&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); - int minor = 0; + int Minor = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&minor, + cuDeviceGetAttribute(&Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); // Some specific devices seem to need special handling. 
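// cuDriverGetVersion packs the driver version as major * 1000 + minor * 10
// (11.4 is reported as 11040), which is why the UUID query above decodes it
// before choosing between cuDeviceGetUuid_v2 (driver 11.4+) and the older
// cuDeviceGetUuid. A small sketch of that decoding, illustration only:
#include <cuda.h>

static bool driverIsAtLeast(int WantMajor, int WantMinor) {
  int Packed = 0;
  cuDriverGetVersion(&Packed);      // e.g. 11040
  int Major = Packed / 1000;        // 11
  int Minor = (Packed % 1000) / 10; // 4
  return Major > WantMajor || (Major == WantMajor && Minor >= WantMinor);
}
// e.g. driverIsAtLeast(11, 4) is the condition that selects
// cuDeviceGetUuid_v2 in the code above.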
See reference // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool is_xavier_agx = major == 7 && minor == 2; - bool is_orin_agx = major == 8 && minor == 7; - - int memory_clock_khz = 0; - if (is_xavier_agx) { - memory_clock_khz = 2133000; - } else if (is_orin_agx) { - memory_clock_khz = 3200000; + bool IsXavierAGX = Major == 7 && Minor == 2; + bool IsOrinAGX = Major == 8 && Minor == 7; + + int MemoryClockKHz = 0; + if (IsXavierAGX) { + MemoryClockKHz = 2133000; + } else if (IsOrinAGX) { + MemoryClockKHz = 3200000; } else { sycl::detail::ur::assertion( - cuDeviceGetAttribute(&memory_clock_khz, + cuDeviceGetAttribute(&MemoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); } - int memory_bus_width = 0; - if (is_orin_agx) { - memory_bus_width = 256; + int MemoryBusWidth = 0; + if (IsOrinAGX) { + MemoryBusWidth = 256; } else { sycl::detail::ur::assertion( - cuDeviceGetAttribute(&memory_bus_width, + cuDeviceGetAttribute(&MemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); + hDevice->get()) == CUDA_SUCCESS); } - uint64_t memory_bandwidth = - uint64_t(memory_clock_khz) * memory_bus_width * 250; + uint64_t MemoryBandwidth = uint64_t(MemoryClockKHz) * MemoryBusWidth * 250; - return ReturnValue(memory_bandwidth); + return ReturnValue(MemoryBandwidth); } case UR_DEVICE_INFO_IL_VERSION: { - std::string il_version = "nvptx-"; + std::string ILVersion = "nvptx-"; - int driver_version = 0; - cuDriverGetVersion(&driver_version); - int major = driver_version / 1000; - int minor = driver_version % 1000 / 10; + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; // We can work out which ptx ISA version we support based on the versioning // table published here @@ -953,29 +953,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, // can derive that easily. The minor versions for version 10 don't line up // the same so it needs a special case. This is not ideal but it does seem // to be the best bet to avoid a maintenance burden here. - il_version += std::to_string(major - 4) + "."; - if (major == 10) { - il_version += std::to_string(minor + 3); - } else if (major >= 11) { - il_version += std::to_string(minor); + ILVersion += std::to_string(Major - 4) + "."; + if (Major == 10) { + ILVersion += std::to_string(Minor + 3); + } else if (Major >= 11) { + ILVersion += std::to_string(Minor); } else { return UR_RESULT_ERROR_INVALID_VALUE; } - return ReturnValue(il_version.data(), il_version.size()); + return ReturnValue(ILVersion.data(), ILVersion.size()); } case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { // Maximum number of 32-bit registers available to a thread block. // Note: This number is shared by all thread blocks simultaneously resident // on a multiprocessor. 
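// The peak-bandwidth estimate above folds all unit conversions into a single
// factor of 250:
//   bytes/s = clock[kHz] * 1000 (kHz -> Hz) * 2 (DDR) * busWidth[bits] / 8
//           = clock[kHz] * busWidth * 250
// A worked example with hypothetical numbers (no specific GPU implied):
#include <cstdint>

static uint64_t estimateBandwidth(uint64_t MemClockKHz, uint64_t BusWidthBits) {
  return MemClockKHz * BusWidthBits * 250; // bytes per second
}
// estimateBandwidth(5005000, 256) == 320320000000, roughly 320 GB/s.
//
// The IL version string above is derived arithmetically in the same spirit:
// driver 11.x and newer map to PTX ISA (major - 4).minor, while the 10.x
// series maps to 6.(minor + 3), e.g. driver 10.2 -> PTX ISA 6.5.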
- int max_registers{-1}; + int MaxRegisters{-1}; UR_CHECK_ERROR(cuDeviceGetAttribute( - &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); + &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + hDevice->get())); - sycl::detail::ur::assertion(max_registers >= 0); + sycl::detail::ur::assertion(MaxRegisters >= 0); - return ReturnValue(static_cast(max_registers)); + return ReturnValue(static_cast(MaxRegisters)); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: return ReturnValue(false); @@ -985,7 +985,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, constexpr size_t AddressBufferSize = 13; char AddressBuffer[AddressBufferSize]; sycl::detail::ur::assertion( - cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == @@ -1012,8 +1012,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t device, /// \return PI_SUCCESS if the function is executed successfully /// CUDA devices are always root devices so retain always returns success. -UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -1026,8 +1026,9 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, /// \return UR_RESULT_SUCCESS always since CUDA devices are always root /// devices. -UR_DLLEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t device) { - UR_ASSERT(device, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +UR_DLLEXPORT ur_result_t UR_APICALL +urDeviceRelease(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); return UR_RESULT_SUCCESS; } @@ -1037,32 +1038,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { - ur_result_t err = UR_RESULT_SUCCESS; - const bool askingForAll = DeviceType == UR_DEVICE_TYPE_ALL; - const bool askingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForAll || askingForGPU; + ur_result_t Result = UR_RESULT_SUCCESS; + const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool ReturnDevices = AskingForDefault || AskingForAll || AskingForGPU; UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t numDevices = returnDevices ? hPlatform->devices_.size() : 0; + size_t NumDevices = ReturnDevices ? 
hPlatform->Devices.size() : 0; try { UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); if (pNumDevices) { - *pNumDevices = numDevices; + *pNumDevices = NumDevices; } - if (returnDevices && phDevices) { - for (size_t i = 0; i < std::min(size_t(NumEntries), numDevices); ++i) { - phDevices[i] = hPlatform->devices_[i].get(); + if (ReturnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) { + phDevices[i] = hPlatform->Devices[i].get(); } } - return err; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -1101,41 +1102,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits // instead - CUdevice cu_device = 0; - memcpy(&cu_device, hNativeDevice, sizeof(CUdevice)); + CUdevice CuDevice = 0; + memcpy(&CuDevice, hNativeDevice, sizeof(CUdevice)); - auto is_device = [=](std::unique_ptr &dev) { - return dev->get() == cu_device; + auto IsDevice = [=](std::unique_ptr &Dev) { + return Dev->get() == CuDevice; }; // If a platform is provided just check if the device is in it if (hPlatform) { - auto search_res = std::find_if(begin(hPlatform->devices_), - end(hPlatform->devices_), is_device); - if (search_res != end(hPlatform->devices_)) { - *phDevice = search_res->get(); + auto SearchRes = std::find_if(begin(hPlatform->Devices), + end(hPlatform->Devices), IsDevice); + if (SearchRes != end(hPlatform->Devices)) { + *phDevice = SearchRes->get(); return UR_RESULT_SUCCESS; } } // Get list of platforms - uint32_t num_platforms = 0; - ur_result_t result = urPlatformGet(0, nullptr, &num_platforms); - if (result != UR_RESULT_SUCCESS) - return result; + uint32_t NumPlatforms = 0; + ur_result_t Result = urPlatformGet(0, nullptr, &NumPlatforms); + if (Result != UR_RESULT_SUCCESS) + return Result; - ur_platform_handle_t *plat = static_cast( - malloc(num_platforms * sizeof(ur_platform_handle_t))); - result = urPlatformGet(num_platforms, plat, nullptr); - if (result != UR_RESULT_SUCCESS) - return result; + ur_platform_handle_t *Plat = static_cast( + malloc(NumPlatforms * sizeof(ur_platform_handle_t))); + Result = urPlatformGet(NumPlatforms, Plat, nullptr); + if (Result != UR_RESULT_SUCCESS) + return Result; // Iterate through platforms to find device that matches nativeHandle - for (uint32_t j = 0; j < num_platforms; ++j) { - auto search_res = std::find_if(begin(plat[j]->devices_), - end(plat[j]->devices_), is_device); - if (search_res != end(plat[j]->devices_)) { - *phDevice = static_cast((*search_res).get()); + for (uint32_t j = 0; j < NumPlatforms; ++j) { + auto SearchRes = + std::find_if(begin(Plat[j]->Devices), end(Plat[j]->Devices), IsDevice); + if (SearchRes != end(Plat[j]->Devices)) { + *phDevice = static_cast((*SearchRes).get()); return UR_RESULT_SUCCESS; } } @@ -1150,12 +1151,12 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, uint64_t *pHostTimestamp) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - CUevent event; - ScopedContext active(hDevice->get_context()); + CUevent Event; + ScopedContext Active(hDevice->getContext()); if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventRecord(event, 0)); + UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventRecord(Event, 0)); } if (pHostTimestamp) { @@ -1166,8 +1167,8 @@ 
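// Callers typically drive urDeviceGet with the usual count-then-fill
// convention: query the device count with a null buffer, then call again
// with storage. A hedged usage sketch against the signature above; error
// handling is omitted and the enumerateGpus helper name is made up.
#include <ur_api.h>
#include <vector>

static std::vector<ur_device_handle_t>
enumerateGpus(ur_platform_handle_t Platform) {
  uint32_t Count = 0;
  urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, 0, nullptr, &Count);
  std::vector<ur_device_handle_t> Devices(Count);
  urDeviceGet(Platform, UR_DEVICE_TYPE_GPU, Count, Devices.data(), nullptr);
  return Devices;
}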
ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, } if (pDeviceTimestamp) { - UR_CHECK_ERROR(cuEventSynchronize(event)); - *pDeviceTimestamp = hDevice->get_elapsed_time(event); + UR_CHECK_ERROR(cuEventSynchronize(Event)); + *pDeviceTimestamp = hDevice->getElapsedTime(Event); } return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp index 9d01edd8a5ec3..ff8d85cf7a3d9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp @@ -13,49 +13,47 @@ struct ur_device_handle_t_ { private: using native_type = CUdevice; - native_type cuDevice_; - CUcontext cuContext_; - CUevent evBase_; // CUDA event used as base counter - std::atomic_uint32_t refCount_; - ur_platform_handle_t platform_; + native_type CuDevice; + CUcontext CuContext; + CUevent EvBase; // CUDA event used as base counter + std::atomic_uint32_t RefCount; + ur_platform_handle_t Platform; - static constexpr uint32_t max_work_item_dimensions = 3u; - size_t max_work_item_sizes[max_work_item_dimensions]; - int max_work_group_size; + static constexpr uint32_t MaxWorkItemDimensions = 3u; + size_t MaxWorkItemSizes[MaxWorkItemDimensions]; + int MaxWorkGroupSize; public: ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, ur_platform_handle_t platform) - : cuDevice_(cuDevice), cuContext_(cuContext), evBase_(evBase), - refCount_{1}, platform_(platform) {} + : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, + Platform(platform) {} - ur_device_handle_t_() { cuDevicePrimaryCtxRelease(cuDevice_); } + ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); } - native_type get() const noexcept { return cuDevice_; }; + native_type get() const noexcept { return CuDevice; }; - CUcontext get_context() const noexcept { return cuContext_; }; + CUcontext getContext() const noexcept { return CuContext; }; - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - ur_platform_handle_t get_platform() const noexcept { return platform_; }; + ur_platform_handle_t getPlatform() const noexcept { return Platform; }; - uint64_t get_elapsed_time(CUevent) const; + uint64_t getElapsedTime(CUevent) const; - void save_max_work_item_sizes(size_t size, - size_t *save_max_work_item_sizes) noexcept { - memcpy(max_work_item_sizes, save_max_work_item_sizes, size); + void saveMaxWorkItemSizes(size_t Size, + size_t *SaveMaxWorkItemSizes) noexcept { + memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size); }; - void save_max_work_group_size(int value) noexcept { - max_work_group_size = value; - }; + void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; }; - void get_max_work_item_sizes(size_t ret_size, - size_t *ret_max_work_item_sizes) const noexcept { - memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size); + void getMaxWorkItemSizes(size_t RetSize, + size_t *RetMaxWorkItemSizes) const noexcept { + memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize); }; - int get_max_work_group_size() const noexcept { return max_work_group_size; }; + int getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; }; }; -int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute); +int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git 
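// device.hpp above keeps EvBase, an event recorded when the device handle is
// set up, precisely so that getElapsedTime can turn a freshly recorded event
// into a device timestamp relative to that base. One plausible shape of that
// helper, stated as an assumption for illustration rather than the adapter's
// actual definition:
#include <cuda.h>
#include <cstdint>

static uint64_t elapsedSinceBaseNs(CUevent EvBase, CUevent Ev) {
  float Ms = 0.0f;
  // cuEventElapsedTime reports milliseconds with roughly 0.5us resolution.
  cuEventElapsedTime(&Ms, EvBase, Ev);
  return static_cast<uint64_t>(Ms * 1.0e6); // convert to nanoseconds
}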
a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index b0c4562d60525..ef87dab96d2fa 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -16,62 +16,62 @@ #include #include -ur_result_t enqueueEventsWait(ur_queue_handle_t command_queue, CUstream stream, - uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list) { - UR_ASSERT(event_wait_list, UR_RESULT_SUCCESS); +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); try { - ScopedContext active(command_queue->get_context()); + ScopedContext Active(CommandQueue->getContext()); - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](ur_event_handle_t event) -> ur_result_t { - if (event->get_stream() == stream) { + auto Result = forLatestEvents( + EventWaitList, NumEventsInWaitList, + [Stream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getStream() == Stream) { return UR_RESULT_SUCCESS; } else { - return UR_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + return UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0)); } }); - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } template -void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, - CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { +void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType, + CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) { // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE // checks with PI_CHECK_ERROR are not suggested - CUresult ret = cuPointerGetAttribute( - out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); + CUresult Ret = cuPointerGetAttribute( + OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr); // ARRAY, UNIFIED types are not supported! - assert(*out_mem_type != CU_MEMORYTYPE_ARRAY && - *out_mem_type != CU_MEMORYTYPE_UNIFIED); + assert(*OutMemType != CU_MEMORYTYPE_ARRAY && + *OutMemType != CU_MEMORYTYPE_UNIFIED); // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (ret == CUDA_ERROR_INVALID_VALUE) { - *out_mem_type = CU_MEMORYTYPE_HOST; - *out_dev_ptr = 0; - *out_host_ptr = usm_ptr; + if (Ret == CUDA_ERROR_INVALID_VALUE) { + *OutMemType = CU_MEMORYTYPE_HOST; + *OutDevPtr = 0; + *OutHostPtr = USMPtr; // todo: resets the above "non-stick" error - } else if (ret == CUDA_SUCCESS) { - *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast(usm_ptr) - : 0; - *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr; + } else if (Ret == CUDA_SUCCESS) { + *OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(USMPtr) + : 0; + *OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? 
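// cuPointerGetAttribute is the key to getUSMHostOrDevicePtr above: it
// reports CU_MEMORYTYPE_DEVICE or CU_MEMORYTYPE_HOST for pointers CUDA knows
// about, and fails with CUDA_ERROR_INVALID_VALUE for ordinary system
// allocations, which the adapter then treats as host memory. A minimal
// classification sketch (the PtrKind enum is invented for illustration):
#include <cuda.h>

enum class PtrKind { Device, PinnedHost, SystemHost };

static PtrKind classifyUsmPointer(const void *Ptr) {
  CUmemorytype MemType{};
  CUresult Ret = cuPointerGetAttribute(
      &MemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)Ptr);
  if (Ret == CUDA_ERROR_INVALID_VALUE)
    return PtrKind::SystemHost; // not registered with CUDA at all
  return MemType == CU_MEMORYTYPE_DEVICE ? PtrKind::Device
                                         : PtrKind::PinnedHost;
}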
USMPtr : nullptr; } else { - UR_CHECK_ERROR(ret); + UR_CHECK_ERROR(Ret); } } -ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, - ur_usm_advice_flags_t ur_advice_flags, - CUdevice device) { +ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size, + ur_usm_advice_flags_t URAdviceFlags, + CUdevice Device) { std::unordered_map URToCUMemAdviseDeviceFlagsMap = { {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, @@ -87,8 +87,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, CU_MEM_ADVISE_UNSET_ACCESSED_BY}, }; for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { - if (ur_advice_flags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, device)); + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device)); } } @@ -105,8 +105,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, }; for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { - if (ur_advice_flags & FlagPair.first) { - UR_CHECK_ERROR(cuMemAdvise(devPtr, size, FlagPair.second, CU_DEVICE_CPU)); + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU)); } } @@ -115,8 +115,8 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; - for (auto &unMappedFlag : UnmappedMemAdviceFlags) { - if (ur_advice_flags & unMappedFlag) { + for (auto &UnmappedFlag : UnmappedMemAdviceFlags) { + if (URAdviceFlags & UnmappedFlag) { throw UR_RESULT_ERROR_INVALID_ENUMERATION; } } @@ -127,76 +127,76 @@ ur_result_t setCuMemAdvise(CUdeviceptr devPtr, size_t size, // Determine local work sizes that result in uniform work groups. // The default threadsPerBlock only require handling the first work_dim // dimension. -void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, - const size_t *global_work_size, const uint32_t work_dim, - const size_t maxThreadsPerBlock[3], - ur_kernel_handle_t kernel, uint32_t local_size) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int minGrid, maxBlockSize, maxBlockDim[3]; +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize) { + assert(ThreadsPerBlock != nullptr); + assert(GlobalWorkSize != nullptr); + assert(Kernel != nullptr); + int MinGrid, MaxBlockSize, MaxBlockDim[3]; // The below assumes a three dimensional range but this is not guaranteed by // UR. 
- size_t global_size_normalized[3] = {1, 1, 1}; - for (uint32_t i = 0; i < work_dim; i++) { - global_size_normalized[i] = global_work_size[i]; + size_t GlobalSizeNormalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < WorkDim; i++) { + GlobalSizeNormalized[i] = GlobalWorkSize[i]; } - static auto isPrime = [](size_t number) -> bool { - auto lastNumToCheck = ceil(sqrt(number)); - if (number < 2) + static auto IsPrime = [](size_t Number) -> bool { + auto LastNumToCheck = ceil(sqrt(Number)); + if (Number < 2) return false; - if (number == 2) + if (Number == 2) return true; - if (number % 2 == 0) + if (Number % 2 == 0) return false; - for (int i = 3; i <= lastNumToCheck; i += 2) { - if (number % i == 0) + for (int i = 3; i <= LastNumToCheck; i += 2) { + if (Number % i == 0) return false; } return true; }; - cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()); - cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()); - - UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, - maxThreadsPerBlock[0])); - - threadsPerBlock[2] = - std::min(global_size_normalized[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = std::min( - global_size_normalized[1], - std::min(maxBlockSize / threadsPerBlock[2], size_t(maxBlockDim[1]))); - maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_size_normalized[0], size_t(maxBlockDim[0]))); - - // When global_size_normalized[0] is prime threadPerBlock[0] will later + cuDeviceGetAttribute(&MaxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + Device->get()); + cuDeviceGetAttribute(&MaxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + Device->get()); + + UR_CHECK_ERROR( + cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(), + NULL, LocalSize, MaxThreadsPerBlock[0])); + + ThreadsPerBlock[2] = + std::min(GlobalSizeNormalized[2], size_t(MaxBlockDim[2])); + ThreadsPerBlock[1] = std::min( + GlobalSizeNormalized[1], + std::min(MaxBlockSize / ThreadsPerBlock[2], size_t(MaxBlockDim[1]))); + MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]); + ThreadsPerBlock[0] = + std::min(MaxThreadsPerBlock[0], + std::min(GlobalSizeNormalized[0], size_t(MaxBlockDim[0]))); + + // When GlobalSizeNormalized[0] is prime threadPerBlock[0] will later // computed as 1, which is not efficient configuration. In such case we use - // global_size_normalized[0] + 1 to compute threadPerBlock[0]. - int adjusted_0_dim_global_work_size = - (isPrime(global_size_normalized[0]) && - (threadsPerBlock[0] != global_size_normalized[0])) - ? global_size_normalized[0] + 1 - : global_size_normalized[0]; - - static auto isPowerOf2 = [](size_t value) -> bool { - return value && !(value & (value - 1)); + // GlobalSizeNormalized[0] + 1 to compute threadPerBlock[0]. + int Adjusted0DimGlobalWorkSize = + (IsPrime(GlobalSizeNormalized[0]) && + (ThreadsPerBlock[0] != GlobalSizeNormalized[0])) + ? GlobalSizeNormalized[0] + 1 + : GlobalSizeNormalized[0]; + + static auto IsPowerOf2 = [](size_t Value) -> bool { + return Value && !(Value & (Value - 1)); }; // Find a local work group size that is a divisor of the global // work group size to produce uniform work groups. // Additionally, for best compute utilisation, the local size has // to be a power of two. 
- while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0])) { - --threadsPerBlock[0]; + while (0u != (Adjusted0DimGlobalWorkSize % ThreadsPerBlock[0]) || + !IsPowerOf2(ThreadsPerBlock[0])) { + --ThreadsPerBlock[0]; } } @@ -204,19 +204,19 @@ void guessLocalWorkSize(ur_device_handle_t device, size_t *threadsPerBlock, // If the kernel requires a number of registers for the entire thread // block exceeds the hardware limitations, then the cuLaunchKernel call // will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. -bool hasExceededMaxRegistersPerBlock(ur_device_handle_t device, - ur_kernel_handle_t kernel, - size_t blockSize) { - int maxRegsPerBlock{0}; +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize) { + int MaxRegsPerBlock{0}; UR_CHECK_ERROR(cuDeviceGetAttribute( - &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); + &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + Device->get())); - int regsPerThread{0}; - UR_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get())); + int RegsPerThread{0}; + UR_CHECK_ERROR(cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, + Kernel->get())); - return blockSize * regsPerThread > size_t(maxRegsPerBlock); + return BlockSize * RegsPerThread > size_t(MaxRegsPerBlock); } /// Enqueues a wait on the given CUstream for all specified events (See @@ -230,71 +230,69 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // represented by input events) and then all future work waits on that stream. UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - ur_result_t result; + ur_result_t Result; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); { - std::lock_guard guard(hQueue->barrier_mutex_); - if (hQueue->barrier_event_ == nullptr) { + std::lock_guard GuardBarrier(hQueue->BarrierMutex); + if (hQueue->BarrierEvent == nullptr) { UR_CHECK_ERROR( - cuEventCreate(&hQueue->barrier_event_, CU_EVENT_DISABLE_TIMING)); + cuEventCreate(&hQueue->BarrierEvent, CU_EVENT_DISABLE_TIMING)); } if (numEventsInWaitList == 0) { // wait on all work - if (hQueue->barrier_tmp_event_ == nullptr) { - UR_CHECK_ERROR(cuEventCreate(&hQueue->barrier_tmp_event_, - CU_EVENT_DISABLE_TIMING)); + if (hQueue->BarrierTmpEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierTmpEvent, CU_EVENT_DISABLE_TIMING)); } - hQueue->sync_streams( - [cuStream, tmp_event = hQueue->barrier_tmp_event_](CUstream s) { - if (cuStream != s) { + hQueue->syncStreams( + [CuStream, TmpEvent = hQueue->BarrierTmpEvent](CUstream s) { + if (CuStream != s) { // record a new CUDA event on every stream and make one stream // wait for these events - UR_CHECK_ERROR(cuEventRecord(tmp_event, s)); - UR_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); + UR_CHECK_ERROR(cuEventRecord(TmpEvent, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(CuStream, TmpEvent, 0)); } }); } else { // wait just on given events forLatestEvents(phEventWaitList, numEventsInWaitList, - [cuStream](ur_event_handle_t event) -> ur_result_t { - if 
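// A worked example of the local-size search above, with made-up numbers:
// suppose GlobalSizeNormalized[0] == 96 and the occupancy query capped
// ThreadsPerBlock[0] at 60. 60 neither divides 96 nor is a power of two, so
// the loop counts down until it reaches 32, which satisfies both conditions
// (96 % 32 == 0, and 32 is a power of two). For a prime size such as 97 the
// adjusted value 98 is used instead, so the search settles on 2 rather than
// collapsing straight to 1. A condensed standalone version of the same loop
// (helper name invented for illustration):
#include <cstddef>

static size_t pickLocalSize(size_t GlobalSize, size_t UpperBound) {
  auto IsPowerOf2 = [](size_t V) { return V && !(V & (V - 1)); };
  size_t Local = UpperBound;
  while (Local > 1 && (GlobalSize % Local != 0 || !IsPowerOf2(Local)))
    --Local;
  return Local; // pickLocalSize(96, 60) == 32
}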
(event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { + [CuStream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getQueue()->hasBeenSynchronized( + Event->getComputeStreamToken())) { return UR_RESULT_SUCCESS; } else { return UR_CHECK_ERROR( - cuStreamWaitEvent(cuStream, event->get(), 0)); + cuStreamWaitEvent(CuStream, Event->get(), 0)); } }); } - result = UR_CHECK_ERROR(cuEventRecord(hQueue->barrier_event_, cuStream)); - for (unsigned int i = 0; i < hQueue->compute_applied_barrier_.size(); - i++) { - hQueue->compute_applied_barrier_[i] = false; + Result = UR_CHECK_ERROR(cuEventRecord(hQueue->BarrierEvent, CuStream)); + for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { + hQueue->ComputeAppliedBarrier[i] = false; } - for (unsigned int i = 0; i < hQueue->transfer_applied_barrier_.size(); - i++) { - hQueue->transfer_applied_barrier_[i] = false; + for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { + hQueue->TransferAppliedBarrier[i] = false; } } - if (result != UR_RESULT_SUCCESS) { - return result; + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, cuStream, stream_token); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, CuStream, StreamToken); (*phEvent)->start(); (*phEvent)->record(); } return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -319,7 +317,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // Preconditions UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hQueue->get_context() == hKernel->get_context(), + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -333,162 +331,162 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (pLocalWorkSize != nullptr); - int32_t local_size = hKernel->get_local_size(); - ur_result_t retError = UR_RESULT_SUCCESS; + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + int32_t LocalSize = hKernel->getLocalSize(); + ur_result_t Result = UR_RESULT_SUCCESS; try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); { - size_t *reqdThreadsPerBlock = hKernel->reqdThreadsPerBlock_; - maxWorkGroupSize = hQueue->device_->get_max_work_group_size(); - hQueue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - pLocalWorkSize[dim] != reqdThreadsPerBlock[dim]) + size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); + 
hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (pLocalWorkSize[dim] > maxThreadsPerBlock[dim]) + if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; // Checks that local work sizes are a divisor of the global work sizes // which includes that the local work sizes are neither larger than // the global work sizes and not 0. - if (0u == pLocalWorkSize[dim]) + if (0u == pLocalWorkSize[Dim]) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[dim] % pLocalWorkSize[dim])) + if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = pLocalWorkSize[dim]; + ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; return UR_RESULT_SUCCESS; }; - size_t kernelLocalWorkGroupSize = 0; - for (size_t dim = 0; dim < workDim; dim++) { - auto err = isValid(dim); - if (err != UR_RESULT_SUCCESS) - return err; + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < workDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; // If no error then sum the total local work size per dim. - kernelLocalWorkGroupSize += pLocalWorkSize[dim]; + KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; } - if (hasExceededMaxRegistersPerBlock(hQueue->device_, hKernel, - kernelLocalWorkGroupSize)) { + if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, + KernelLocalWorkGroupSize)) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } } else { - guessLocalWorkSize(hQueue->device_, threadsPerBlock, pGlobalWorkSize, - workDim, maxThreadsPerBlock, hKernel, local_size); + guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, + workDim, MaxThreadsPerBlock, hKernel, LocalSize); } } - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + if (MaxWorkGroupSize < + size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2])) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } - size_t blocksPerGrid[3] = {1u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; for (size_t i = 0; i < workDim; i++) { - blocksPerGrid[i] = - (pGlobalWorkSize[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; + BlocksPerGrid[i] = + (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; } - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - CUfunction cuFunc = hKernel->get(); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + CUfunction CuFunc = hKernel->get(); - retError = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - phEventWaitList); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); // Set the implicit global offset parameter if kernel has offset variant if (hKernel->get_with_offset_parameter()) { - std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; if (pGlobalWorkOffset) { for (size_t i = 0; i < workDim; i++) { - cuda_implicit_offset[i] = + 
CudaImplicitOffset[i] = static_cast(pGlobalWorkOffset[i]); if (pGlobalWorkOffset[i] != 0) { - cuFunc = hKernel->get_with_offset_parameter(); + CuFunc = hKernel->get_with_offset_parameter(); } } } - hKernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), - cuda_implicit_offset); + hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); } - auto &argIndices = hKernel->get_arg_indices(); + auto &ArgIndices = hKernel->getArgIndices(); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_KERNEL_LAUNCH, hQueue, cuStream, stream_token)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); + RetImplEvent->start(); } // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = + static const char *LocalMemSizePtr = std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; + if (LocalMemSizePtr) { + int DeviceMaxLocalMem = 0; cuDeviceGetAttribute( - &device_max_local_mem, + &DeviceMaxLocalMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, hQueue->get_device()->get()); - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { + static const int EnvVal = std::atoi(LocalMemSizePtr); + if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) { setErrorMessage("Invalid value specified for " "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } UR_CHECK_ERROR(cuFuncSetAttribute( - cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, EnvVal)); } - retError = UR_CHECK_ERROR(cuLaunchKernel( - cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, - cuStream, const_cast(argIndices.data()), nullptr)); - if (local_size != 0) - hKernel->clear_local_size(); + Result = UR_CHECK_ERROR(cuLaunchKernel( + CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], + ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, + CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) + hKernel->clearLocalSize(); if (phEvent) { - retError = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// General 3D memory copy operation. 
/// This function requires the corresponding CUDA context to be at the top of /// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr /// must be a pointer to a CUdeviceptr static ur_result_t commonEnqueueMemBufferCopyRect( - CUstream cu_stream, ur_rect_region_t region, const void *src_ptr, - const CUmemorytype_enum src_type, ur_rect_offset_t src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, ur_rect_offset_t dst_offset, + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch) { - UR_ASSERT(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); src_row_pitch = @@ -508,22 +506,21 @@ static ur_result_t commonEnqueueMemBufferCopyRect( params.Height = region.height; params.Depth = region.depth; - params.srcMemoryType = src_type; - params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(src_ptr) + params.srcMemoryType = SrcType; + params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE + ? *static_cast(SrcPtr) : 0; - params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; + params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; params.srcXInBytes = src_offset.x; params.srcY = src_offset.y; params.srcZ = src_offset.z; params.srcPitch = src_row_pitch; params.srcHeight = src_slice_pitch / src_row_pitch; - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(dst_ptr) - : 0; - params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; + params.dstMemoryType = DstType; + params.dstDevice = + DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; params.dstXInBytes = dst_offset.x; params.dstY = dst_offset.y; params.dstZ = dst_offset.z; @@ -543,45 +540,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, bufferSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -594,44 +591,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( + Result = commonEnqueueMemBufferCopyRect( cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, - hostSlicePitch, &devPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch); if (phEvent) { - retErr = retImplEv->record(); + Result = 
RetImplEvent->record(); } if (blockingWrite) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -641,36 +638,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - ur_result_t result; + ScopedContext Active(hQueue->getContext()); + ur_result_t Result; - auto stream = hQueue->get_next_transfer_stream(); - result = - enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + auto Stream = hQueue->getNextTransferStream(); + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); + Result = RetImplEvent->start(); } - auto src = hBufferSrc->mem_.buffer_mem_.get() + srcOffset; - auto dst = hBufferDst->mem_.buffer_mem_.get() + dstOffset; + auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; + auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; - result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); + Result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } @@ -687,38 +684,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr srcPtr = hBufferSrc->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = hBufferDst->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); + CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, CuStream)); + RetImplEvent->start(); } - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, - srcSlicePitch, &dstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, dstSlicePitch); if (phEvent) { - retImplEv->record(); - *phEvent = retImplEv.release(); + RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - retErr = err; + Result = err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( @@ -728,54 +725,54 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - auto args_are_multiples_of_pattern_size = + auto ArgsAreMultiplesOfPatternSize = (offset % patternSize == 0) || (size % patternSize == 0); - auto pattern_is_valid = (pPattern != nullptr); + auto PatternIsValid = (pPattern != nullptr); - auto pattern_size_is_valid = + auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) && // is power of two (patternSize > 0) && (patternSize <= 128); // falls within valid range - UR_ASSERT(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid, + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); - std::unique_ptr retImplEv{nullptr}; + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - auto stream = hQueue->get_next_transfer_stream(); - ur_result_t result; - result = - enqueueEventsWait(hQueue, stream, numEventsInWaitList, phEventWaitList); + auto Stream = hQueue->getNextTransferStream(); + ur_result_t Result; + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, stream)); - result = retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_FILL, 
hQueue, Stream)); + Result = RetImplEvent->start(); } - auto dstDevice = hBuffer->mem_.buffer_mem_.get() + offset; + auto DstDevice = hBuffer->Mem.BufferMem.get() + offset; auto N = size / patternSize; // pattern size in bytes switch (patternSize) { case 1: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD8Async(DstDevice, Value, N, Stream)); break; } case 2: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD16Async(DstDevice, Value, N, Stream)); break; } case 4: { - auto value = *static_cast(pPattern); - result = UR_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD32Async(DstDevice, Value, N, Stream)); break; } default: { @@ -786,20 +783,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( // This means that one cuMemsetD2D32Async call is made for every 4 bytes // in the pattern. - auto number_of_steps = patternSize / sizeof(uint32_t); + auto NumberOfSteps = patternSize / sizeof(uint32_t); // we walk up the pattern in 4-byte steps, and call cuMemset for each // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { // take 4 bytes of the pattern - auto value = *(static_cast(pPattern) + step); + auto Value = *(static_cast(pPattern) + Step); // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); + auto OffsetPtr = DstDevice + (Step * sizeof(uint32_t)); // set all of the pattern chunks - result = UR_CHECK_ERROR( - cuMemsetD2D32Async(offset_ptr, patternSize, value, 1, N, stream)); + Result = UR_CHECK_ERROR( + cuMemsetD2D32Async(OffsetPtr, patternSize, Value, 1, N, Stream)); } break; @@ -807,20 +804,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } if (phEvent) { - result = retImplEv->record(); - *phEvent = retImplEv.release(); + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); } - return result; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { - switch (array_desc.Format) { +static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { + switch (ArrayDesc.Format) { case CU_AD_FORMAT_UNSIGNED_INT8: case CU_AD_FORMAT_SIGNED_INT8: return 1; @@ -841,66 +838,66 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { /// General ND memory copy operation for images (where N > 1). 
/// This function requires the corresponding CUDA context to be at the top of /// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr +/// If the source and/or destination is an array, SrcPtr and/or DstPtr /// must be a pointer to a CUarray static ur_result_t commonEnqueueMemImageNDCopy( - CUstream cu_stream, ur_mem_type_t img_type, const ur_rect_region_t region, - const void *src_ptr, const CUmemorytype_enum src_type, - const ur_rect_offset_t src_offset, void *dst_ptr, - const CUmemorytype_enum dst_type, const ur_rect_offset_t dst_offset) { - UR_ASSERT(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST, + CUstream CuStream, ur_mem_type_t ImgType, const ur_rect_region_t Region, + const void *SrcPtr, const CUmemorytype_enum SrcType, + const ur_rect_offset_t SrcOffset, void *DstPtr, + const CUmemorytype_enum DstType, const ur_rect_offset_t DstOffset) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_ARRAY || SrcType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST, + UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (img_type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset.x; - cpyDesc.srcY = src_offset.y; + if (ImgType == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; } else { - cpyDesc.srcHost = src_ptr; + CpyDesc.srcHost = SrcPtr; } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset.x; - cpyDesc.dstY = dst_offset.y; + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - cpyDesc.WidthInBytes = region.width; - cpyDesc.Height = region.height; - return UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream)); + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + return UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); } - if (img_type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset.x; - cpyDesc.srcY = src_offset.y; - cpyDesc.srcZ = src_offset.z; + if (ImgType == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + CpyDesc.srcZ = SrcOffset.z; } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset.x; - cpyDesc.dstY = dst_offset.y; - cpyDesc.dstZ = dst_offset.z; + CpyDesc.srcHost = SrcPtr; + } + 
CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + CpyDesc.dstZ = DstOffset.z; } else { - cpyDesc.dstHost = dst_ptr; + CpyDesc.dstHost = DstPtr; } - cpyDesc.WidthInBytes = region.width; - cpyDesc.Height = region.height; - cpyDesc.Depth = region.depth; - return UR_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream)); + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + CpyDesc.Depth = Region.depth; + return UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc, CuStream)); } return UR_RESULT_ERROR_INVALID_VALUE; } @@ -912,62 +909,62 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray array = hImage->mem_.surface_mem_.get_array(); + CUarray Array = hImage->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - int elementByteSize = imageElementByteSize(arrayDesc); + int ElementByteSize = imageElementByteSize(ArrayDesc); - size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyAtoHAsync(pDst, array, byteOffsetX, bytesToCopy, cuStream)); + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t srcOffset = {byteOffsetX, origin.y, origin.z}; + ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, - srcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_READ, hQueue, cuStream); - 
new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_READ, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } if (blockingRead) { - retErr = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( @@ -977,58 +974,58 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hImage->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray array = hImage->mem_.surface_mem_.get_array(); + CUarray Array = hImage->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); - int elementByteSize = imageElementByteSize(arrayDesc); + int ElementByteSize = imageElementByteSize(ArrayDesc); - size_t byteOffsetX = origin.x * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region.width; + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImage->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyHtoAAsync(array, byteOffsetX, pSrc, bytesToCopy, cuStream)); + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t dstOffset = {byteOffsetX, origin.y, origin.z}; + ur_rect_offset_t DstOffset = {ByteOffsetX, origin.y, origin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, pSrc, CU_MEMORYTYPE_HOST, - ur_rect_offset_t{}, &array, CU_MEMORYTYPE_ARRAY, dstOffset); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &Array, CU_MEMORYTYPE_ARRAY, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_WRITE, hQueue, cuStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = 
ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_WRITE, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( @@ -1037,76 +1034,76 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hImageSrc->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->mem_type_ == ur_mem_handle_t_::mem_type::surface, + UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageSrc->mem_.surface_mem_.get_image_type() == - hImageDst->mem_.surface_mem_.get_image_type(), + UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == + hImageDst->Mem.SurfaceMem.getImageType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray srcArray = hImageSrc->mem_.surface_mem_.get_array(); - CUarray dstArray = hImageDst->mem_.surface_mem_.get_array(); + CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = UR_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); + CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); + CUDA_ARRAY_DESCRIPTOR DstArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&DstArrayDesc, DstArray)); - UR_ASSERT(srcArrayDesc.Format == dstArrayDesc.Format, + UR_ASSERT(SrcArrayDesc.Format == DstArrayDesc.Format, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels, + UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - int elementByteSize = imageElementByteSize(srcArrayDesc); + int ElementByteSize = imageElementByteSize(SrcArrayDesc); - size_t dstByteOffsetX = - dstOrigin.x * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - srcOrigin.x * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = - elementByteSize * srcArrayDesc.NumChannels * region.width; + size_t DstByteOffsetX = + dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels; + size_t SrcByteOffsetX = + srcOrigin.x * ElementByteSize * DstArrayDesc.NumChannels; + size_t BytesToCopy = + ElementByteSize * SrcArrayDesc.NumChannels * region.width; - ur_mem_type_t imgType = hImageSrc->mem_.surface_mem_.get_image_type(); - if (imgType == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); + ur_mem_type_t ImgType = 
hImageSrc->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, + SrcByteOffsetX, BytesToCopy)); } else { - ur_rect_region_t adjustedRegion = {bytesToCopy, region.height, + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; - ur_rect_offset_t srcOffset = {srcByteOffsetX, srcOrigin.y, srcOrigin.z}; - ur_rect_offset_t dstOffset = {dstByteOffsetX, dstOrigin.y, dstOrigin.z}; + ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, + SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); - if (retErr != UR_RESULT_SUCCESS) { - return retErr; + if (Result != UR_RESULT_SUCCESS) { + return Result; } } if (phEvent) { - auto new_event = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_IMAGE_COPY, hQueue, cuStream); - new_event->record(); - *phEvent = new_event; + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; } - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } /// Implements mapping on the host using a BufferRead operation. @@ -1122,54 +1119,53 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_ASSERT(ppRetMap != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hQueue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hBuffer->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t ret_err = UR_RESULT_ERROR_INVALID_MEM_OBJECT; - const bool is_pinned = - hBuffer->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool IsPinned = + hBuffer->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; // Currently no support for overlapping regions - if (hBuffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; + if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + return Result; } // Allocate a pointer in the host to store the mapped information - auto hostPtr = hBuffer->mem_.buffer_mem_.map_to_ptr(offset, mapFlags); - *ppRetMap = hBuffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = UR_RESULT_SUCCESS; + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(offset, mapFlags); + *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + if (HostPtr) { + Result = UR_RESULT_SUCCESS; } - if (!is_pinned && + if (!IsPinned && ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { // Pinned host memory is already on host so it doesn't need to be read. 
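For reference, the map path here is emulated rather than backed by a real host mapping: map stages the buffer contents in a host allocation (the BufferRead just below), the user works on that host copy, and unmap later writes it back when the mapping was writable; pinned (alloc-host-ptr) buffers skip both copies. A minimal standalone sketch of the same idea against the raw CUDA driver API follows; the checkCu helper and the std::vector staging area are illustrative only, and error handling is reduced to an abort.

#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void checkCu(CUresult Res) {
  if (Res != CUDA_SUCCESS) {
    std::fprintf(stderr, "CUDA driver error %d\n", static_cast<int>(Res));
    std::exit(1);
  }
}

int main() {
  checkCu(cuInit(0));
  CUdevice Dev;
  checkCu(cuDeviceGet(&Dev, 0));
  CUcontext Ctx;
  checkCu(cuCtxCreate(&Ctx, 0, Dev));
  CUstream Stream;
  checkCu(cuStreamCreate(&Stream, CU_STREAM_DEFAULT));

  const size_t Size = 256;
  CUdeviceptr Buf;
  checkCu(cuMemAlloc(&Buf, Size));
  checkCu(cuMemsetD8Async(Buf, 0xAB, Size, Stream));

  // "Map": stage the device contents in a host allocation (a buffer read)
  // and hand the host pointer to the user.
  std::vector<unsigned char> Host(Size);
  checkCu(cuMemcpyDtoHAsync(Host.data(), Buf, Size, Stream));
  checkCu(cuStreamSynchronize(Stream)); // behaves like a blocking map

  Host[0] = 0xCD; // the user writes through the "mapped" pointer

  // "Unmap": write the possibly modified host copy back (a buffer write).
  checkCu(cuMemcpyHtoDAsync(Buf, Host.data(), Size, Stream));
  checkCu(cuStreamSynchronize(Stream));

  checkCu(cuMemFree(Buf));
  checkCu(cuStreamDestroy(Stream));
  checkCu(cuCtxDestroy(Ctx));
  return 0;
}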
- ret_err = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, - hostPtr, numEventsInWaitList, - phEventWaitList, phEvent); + Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + HostPtr, numEventsInWaitList, + phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = - ur_event_handle_t_::make_native(UR_COMMAND_MEM_BUFFER_MAP, hQueue, - hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Err) { + Result = Err; } } } - return ret_err; + return Result; } /// Implements the unmap from the host, using a BufferWrite operation. @@ -1180,51 +1176,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t ret_err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer, + UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() != nullptr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hMem->mem_.buffer_mem_.get_map_ptr() == pMappedPtr, + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - const bool is_pinned = - hMem->mem_.buffer_mem_.allocMode_ == - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + const bool IsPinned = + hMem->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; - if (!is_pinned && - (hMem->mem_.buffer_mem_.get_map_flags() & UR_MAP_FLAG_WRITE)) { + if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { // Pinned host memory is only on host so it doesn't need to be written to. 
- ret_err = urEnqueueMemBufferWrite( - hQueue, hMem, true, hMem->mem_.buffer_mem_.get_map_offset(pMappedPtr), - hMem->mem_.buffer_mem_.get_size(), pMappedPtr, numEventsInWaitList, + Result = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(pMappedPtr), + hMem->Mem.BufferMem.getSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - if (is_pinned) { - ret_err = urEnqueueEventsWait(hQueue, numEventsInWaitList, - phEventWaitList, nullptr); + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); } if (phEvent) { try { - *phEvent = ur_event_handle_t_::make_native( - UR_COMMAND_MEM_UNMAP, hQueue, hQueue->get_next_transfer_stream()); + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); (*phEvent)->start(); (*phEvent)->record(); - } catch (ur_result_t error) { - ret_err = error; + } catch (ur_result_t Err) { + Result = Err; } } } - hMem->mem_.buffer_mem_.unmap(pMappedPtr); - return ret_err; + hMem->Mem.BufferMem.unmap(pMappedPtr); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -1235,50 +1230,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - uint32_t stream_token; - ur_stream_guard_ guard; - CUstream cuStream = hQueue->get_next_compute_stream( - numEventsInWaitList, phEventWaitList, guard, &stream_token); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_FILL, hQueue, cuStream, stream_token)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_FILL, hQueue, CuStream, StreamToken)); + EventPtr->start(); } switch (patternSize) { case 1: - result = UR_CHECK_ERROR( + Result = UR_CHECK_ERROR( cuMemsetD8Async((CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, - size, cuStream)); + size, CuStream)); break; case 2: - result = UR_CHECK_ERROR(cuMemsetD16Async( + Result = UR_CHECK_ERROR(cuMemsetD16Async( (CUdeviceptr)ptr, *((const uint16_t *)pPattern) & 0xFFFF, size, - cuStream)); + CuStream)); break; case 4: - result = UR_CHECK_ERROR(cuMemsetD32Async( + Result = UR_CHECK_ERROR(cuMemsetD32Async( (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, size, - cuStream)); + CuStream)); break; default: return UR_RESULT_ERROR_INVALID_ARGUMENT; } if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( @@ -1288,36 +1283,36 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueUSMMemcpy( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_USM_MEMCPY, hQueue, cuStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_MEMCPY, hQueue, CuStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR( - cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, cuStream)); + Result = UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream)); if (phEvent) { - result = event_ptr->record(); + Result = EventPtr->record(); } if (blocking) { - result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = event_ptr.release(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( @@ -1325,23 +1320,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); - ur_device_handle_t device = hQueue->get_context()->get_device(); + ur_device_handle_t Device = hQueue->getContext()->getDevice(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. cuMemPrefetchAsync requires concurrent memory access // for managed memory. Therfore, ignore prefetch hint if concurrent managed // memory access is not available. 
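The checks that follow gate the prefetch on two conditions: the device must report CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, and the pointer must actually be managed memory. A compact sketch of just that gating plus the cuMemPrefetchAsync call (without the event bookkeeping the adapter also performs) is below; the helper name is illustrative and a current CUDA context owning Ptr is assumed.

#include <cuda.h>
#include <cstddef>

static CUresult prefetchIfSupported(CUdeviceptr Ptr, size_t Size, CUdevice Dev,
                                    CUstream Stream) {
  int ConcurrentManagedAccess = 0;
  CUresult Res = cuDeviceGetAttribute(
      &ConcurrentManagedAccess, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
      Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  if (!ConcurrentManagedAccess)
    return CUDA_SUCCESS; // hint is ignored, as in the adapter

  unsigned int IsManaged = 0;
  Res = cuPointerGetAttribute(&IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, Ptr);
  if (Res != CUDA_SUCCESS)
    return Res;
  if (!IsManaged)
    return CUDA_SUCCESS; // prefetch only applies to managed (USM-style) memory

  return cuMemPrefetchAsync(Ptr, Size, Dev, Stream);
}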
- if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Prefetch hint ignored as device does not support " "concurrent managed access", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - unsigned int is_managed; + unsigned int IsManaged; UR_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!is_managed) { + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { setErrorMessage("Prefetch hint ignored as prefetch only works with USM", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; @@ -1352,30 +1347,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_VALUE; UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); - result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - event_ptr = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_COPY, hQueue, cuStream)); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream)); + EventPtr->start(); } - result = UR_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)pMem, size, device->get(), cuStream)); + Result = UR_CHECK_ERROR( + cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream)); if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } - } catch (ur_result_t err) { - result = err; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } /// USM: memadvise API to govern behavior of automatic migration mechanisms @@ -1395,8 +1390,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { - ur_device_handle_t device = hQueue->get_context()->get_device(); - if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + ur_device_handle_t Device = hQueue->getContext()->getDevice(); + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", UR_RESULT_SUCCESS); @@ -1408,54 +1403,54 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property. 
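In the default-advice branch further below, UR_USM_ADVICE_FLAG_DEFAULT is lowered onto three cuMemAdvise unset calls that clear read-mostly, preferred-location and accessed-by hints. A minimal sketch of that mapping, assuming a managed pointer and a valid device; the helper name is not part of the adapter.

#include <cuda.h>
#include <cstddef>

static CUresult resetUsmAdvice(CUdeviceptr Ptr, size_t Size, CUdevice Dev) {
  // Clear each of the three hints the adapter resets for "default" advice.
  CUresult Res = cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_READ_MOSTLY, Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  Res = cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, Dev);
  if (Res != CUDA_SUCCESS)
    return Res;
  return cuMemAdvise(Ptr, Size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, Dev);
}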
} - unsigned int is_managed; + unsigned int IsManaged; UR_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); - if (!is_managed) { + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { setErrorMessage( "Memory advice ignored as memory advices only works with USM", UR_RESULT_SUCCESS); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - ur_result_t result = UR_RESULT_SUCCESS; - std::unique_ptr event_ptr{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; try { - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); if (phEvent) { - event_ptr = std::unique_ptr( - ur_event_handle_t_::make_native(UR_COMMAND_USM_ADVISE, hQueue, - hQueue->get_next_transfer_stream())); - event_ptr->start(); + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_ADVISE, hQueue, hQueue->getNextTransferStream())); + EventPtr->start(); } if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_READ_MOSTLY, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - hQueue->get_context()->get_device()->get())); + hQueue->getContext()->getDevice()->get())); } else { - result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, - hQueue->get_context()->get_device()->get()); + Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->getContext()->getDevice()->get()); } if (phEvent) { - result = event_ptr->record(); - *phEvent = event_ptr.release(); + Result = EventPtr->record(); + *phEvent = EventPtr.release(); } } catch (ur_result_t err) { - result = err; + Result = err; } catch (...) { - result = UR_RESULT_ERROR_UNKNOWN; + Result = UR_RESULT_ERROR_UNKNOWN; } - return result; + return Result; } // TODO: Implement this. 
Remember to return true for @@ -1477,31 +1472,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( ur_result_t result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - (*phEvent) = ur_event_handle_t_::make_native( + (*phEvent) = ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream); (*phEvent)->start(); } // Determine the direction of copy using cuPointerGetAttribute - // for both the src_ptr and dst_ptr - CUDA_MEMCPY2D cpyDesc = {0}; + // for both the SrcPtr and DstPtr + CUDA_MEMCPY2D CpyDesc = {0}; - getUSMHostOrDevicePtr(pSrc, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice, - &cpyDesc.srcHost); - getUSMHostOrDevicePtr(pDst, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice, - &cpyDesc.dstHost); + getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice, + &CpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &CpyDesc.dstMemoryType, &CpyDesc.dstDevice, + &CpyDesc.dstHost); - cpyDesc.dstPitch = dstPitch; - cpyDesc.srcPitch = srcPitch; - cpyDesc.WidthInBytes = width; - cpyDesc.Height = height; + CpyDesc.dstPitch = dstPitch; + CpyDesc.srcPitch = srcPitch; + CpyDesc.WidthInBytes = width; + CpyDesc.Height = height; - result = UR_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream)); + result = UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, cuStream)); if (phEvent) { (*phEvent)->record(); @@ -1522,7 +1517,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); if (phEventWaitList) { UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); @@ -1530,46 +1525,46 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(numEventsInWaitList == 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); } - UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_READ, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + RetImplEvent->start(); } - UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, devPtr + offset, size, cuStream)); + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); if (phEvent) { - retErr = 
retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( @@ -1579,7 +1574,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(!hBuffer->is_image(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); if (phEventWaitList) { UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); @@ -1587,44 +1582,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(numEventsInWaitList == 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); } - UR_ASSERT(offset + size <= hBuffer->mem_.buffer_mem_.size_, + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; - CUdeviceptr devPtr = hBuffer->mem_.buffer_mem_.get(); - std::unique_ptr retImplEv{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->get_context()); - CUstream cuStream = hQueue->get_next_transfer_stream(); + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); - retErr = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); if (phEvent) { - retImplEv = - std::unique_ptr(ur_event_handle_t_::make_native( - UR_COMMAND_MEM_BUFFER_WRITE, hQueue, cuStream)); - retImplEv->start(); + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, CuStream)); + RetImplEvent->start(); } - UR_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, pSrc, size, cuStream)); + UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr + offset, pSrc, size, CuStream)); if (phEvent) { - retErr = retImplEv->record(); + Result = RetImplEvent->record(); } if (blockingWrite) { - UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); } if (phEvent) { - *phEvent = retImplEv.release(); + *phEvent = RetImplEvent.release(); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( @@ -1638,29 +1633,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( // Since CUDA requires a the global variable to be referenced by name, we use // metadata to find the correct name to access it by. 
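The write path that follows resolves the metadata-provided global name to a device address with cuModuleGetGlobal, bounds-checks the access, and then reuses the USM memcpy path. A self-contained sketch of the same lookup-then-copy pattern, assuming an already loaded CUmodule and a current context; the helper name is illustrative.

#include <cuda.h>
#include <cstddef>

static CUresult writeDeviceGlobal(CUmodule Module, const char *Name,
                                  const void *Src, size_t Count, size_t Offset,
                                  CUstream Stream) {
  CUdeviceptr GlobalPtr = 0;
  size_t GlobalSize = 0;
  // Look the global up by name in the loaded module.
  CUresult Res = cuModuleGetGlobal(&GlobalPtr, &GlobalSize, Module, Name);
  if (Res != CUDA_SUCCESS)
    return Res;
  // Same bounds check as the adapter performs before copying.
  if (Offset + Count > GlobalSize)
    return CUDA_ERROR_INVALID_VALUE;
  return cuMemcpyHtoDAsync(GlobalPtr + Offset, Src, Count, Stream);
}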
- auto device_global_name_it = hProgram->globalIDMD_.find(name); - if (device_global_name_it == hProgram->globalIDMD_.end()) + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) return UR_RESULT_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = UR_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), - device_global_name.c_str())); + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); - if (offset + count > device_global_size) + if (offset + count > DeviceGlobalSize) return UR_RESULT_ERROR_INVALID_VALUE; return urEnqueueUSMMemcpy( - hQueue, blockingWrite, reinterpret_cast(device_global + offset), + hQueue, blockingWrite, reinterpret_cast(DeviceGlobal + offset), pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( @@ -1674,30 +1669,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( // Since CUDA requires a the global variable to be referenced by name, we use // metadata to find the correct name to access it by. - auto device_global_name_it = hProgram->globalIDMD_.find(name); - if (device_global_name_it == hProgram->globalIDMD_.end()) + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) return UR_RESULT_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = UR_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, hProgram->get(), - device_global_name.c_str())); + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); - if (offset + count > device_global_size) + if (offset + count > DeviceGlobalSize) return UR_RESULT_ERROR_INVALID_VALUE; return urEnqueueUSMMemcpy( hQueue, blockingRead, pDst, - reinterpret_cast(device_global + offset), count, + reinterpret_cast(DeviceGlobal + offset), count, numEventsInWaitList, phEventWaitList, phEvent); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } /// Host Pipes diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp index f1a0b9d2a97d2..8916197b73f1c 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -15,150 +15,148 @@ #include #include -ur_event_handle_t_::ur_event_handle_t_(ur_command_t type, - ur_context_handle_t context, - ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token) - : commandType_{type}, refCount_{1}, 
has_ownership_{true}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { +ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, + ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken) + : CommandType{Type}, RefCount{1}, HasOwnership{true}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, + EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool profilingEnabled = queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; UR_CHECK_ERROR(cuEventCreate( - &evEnd_, profilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (profilingEnabled) { - UR_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); - UR_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); + if (ProfilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } - if (queue_ != nullptr) { - urQueueRetain(queue_); + if (Queue != nullptr) { + urQueueRetain(Queue); } - urContextRetain(context_); + urContextRetain(Context); } -ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t context, - CUevent eventNative) - // TODO(ur): Missing user command type - : commandType_{UR_COMMAND_EVENTS_WAIT}, refCount_{1}, has_ownership_{false}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { - urContextRetain(context_); +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, + CUevent EventNative) + : CommandType{UR_COMMAND_EVENTS_WAIT}, RefCount{1}, HasOwnership{false}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{std::numeric_limits::max()}, EvEnd{EventNative}, + EvStart{nullptr}, EvQueued{nullptr}, Queue{nullptr}, Context{Context} { + urContextRetain(Context); } ur_event_handle_t_::~ur_event_handle_t_() { - if (queue_ != nullptr) { - urQueueRelease(queue_); + if (Queue != nullptr) { + urQueueRelease(Queue); } - urContextRelease(context_); + urContextRelease(Context); } ur_result_t ur_event_handle_t_::start() { - assert(!is_started()); - ur_result_t result = UR_RESULT_SUCCESS; + assert(!isStarted()); + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { // NOTE: This relies on the default stream to be unused. 
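The profiling path below records EvQueued on the default stream and EvStart on the command's stream, and record() later records EvEnd, so the profiling queries can be answered from elapsed times between pairs of CUDA events. A standalone sketch of that standard event-timing pattern, with a memset standing in for the real command and error checking elided for brevity:

#include <cuda.h>
#include <cstdio>

int main() {
  cuInit(0);
  CUdevice Dev;
  cuDeviceGet(&Dev, 0);
  CUcontext Ctx;
  cuCtxCreate(&Ctx, 0, Dev);
  CUstream Stream;
  cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);

  // CU_EVENT_DEFAULT keeps timing enabled (unlike CU_EVENT_DISABLE_TIMING).
  CUevent EvStart, EvEnd;
  cuEventCreate(&EvStart, CU_EVENT_DEFAULT);
  cuEventCreate(&EvEnd, CU_EVENT_DEFAULT);

  CUdeviceptr Buf;
  cuMemAlloc(&Buf, 1 << 20);

  cuEventRecord(EvStart, Stream);           // "start" of the command
  cuMemsetD8Async(Buf, 0, 1 << 20, Stream); // the timed work
  cuEventRecord(EvEnd, Stream);             // "end" of the command
  cuEventSynchronize(EvEnd);

  float Ms = 0.0f;
  cuEventElapsedTime(&Ms, EvStart, EvEnd);
  std::printf("elapsed: %f ms\n", Ms);

  cuMemFree(Buf);
  cuEventDestroy(EvStart);
  cuEventDestroy(EvEnd);
  cuStreamDestroy(Stream);
  cuCtxDestroy(Ctx);
  return 0;
}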
- result = UR_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = UR_CHECK_ERROR(cuEventRecord(evStart_, stream_)); + Result = UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); + Result = UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - isStarted_ = true; - return result; + IsStarted = true; + return Result; } -bool ur_event_handle_t_::is_completed() const noexcept { - if (!isRecorded_) { +bool ur_event_handle_t_::isCompleted() const noexcept { + if (!IsRecorded) { return false; } - if (!hasBeenWaitedOn_) { - const CUresult ret = cuEventQuery(evEnd_); - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { - UR_CHECK_ERROR(ret); + if (!HasBeenWaitedOn) { + const CUresult Result = cuEventQuery(EvEnd); + if (Result != CUDA_SUCCESS && Result != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(Result); return false; } - if (ret == CUDA_ERROR_NOT_READY) { + if (Result == CUDA_ERROR_NOT_READY) { return false; } } return true; } -uint64_t ur_event_handle_t_::get_queued_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evQueued_); +uint64_t ur_event_handle_t_::getQueuedTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvQueued); } -uint64_t ur_event_handle_t_::get_start_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evStart_); +uint64_t ur_event_handle_t_::getStartTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvStart); } -uint64_t ur_event_handle_t_::get_end_time() const { - assert(is_started() && is_recorded()); - return queue_->get_device()->get_elapsed_time(evEnd_); +uint64_t ur_event_handle_t_::getEndTime() const { + assert(isStarted() && isRecorded()); + return Queue->get_device()->getElapsedTime(EvEnd); } ur_result_t ur_event_handle_t_::record() { - if (is_recorded() || !is_started()) { + if (isRecorded() || !isStarted()) { return UR_RESULT_ERROR_INVALID_EVENT; } - ur_result_t result = UR_RESULT_ERROR_INVALID_OPERATION; + ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; - UR_ASSERT(queue_, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { + EventID = Queue->getNextEventID(); + if (EventID == 0) { sycl::detail::ur::die( "Unrecoverable program state reached in event identifier overflow"); } - result = UR_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); + Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); } catch (ur_result_t error) { - result = error; + Result = error; } - if (result == UR_RESULT_SUCCESS) { - isRecorded_ = true; + if (Result == UR_RESULT_SUCCESS) { + IsRecorded = true; } - return result; + return Result; } ur_result_t ur_event_handle_t_::wait() { - ur_result_t retErr; + ur_result_t Result; try { - retErr = UR_CHECK_ERROR(cuEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; + Result = UR_CHECK_ERROR(cuEventSynchronize(EvEnd)); + HasBeenWaitedOn = true; } catch (ur_result_t error) { - retErr = error; + Result = error; } - return retErr; + return Result; } ur_result_t ur_event_handle_t_::release() { - if (!backend_has_ownership()) + if (!backendHasOwnership()) return UR_RESULT_SUCCESS; - assert(queue_ != nullptr); + assert(Queue != nullptr); - UR_CHECK_ERROR(cuEventDestroy(evEnd_)); + UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (queue_->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE) { - UR_CHECK_ERROR(cuEventDestroy(evQueued_)); - 
UR_CHECK_ERROR(cuEventDestroy(evStart_)); + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(EvQueued)); + UR_CHECK_ERROR(cuEventDestroy(EvStart)); } return UR_RESULT_SUCCESS; @@ -174,15 +172,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, switch (propName) { case UR_EVENT_INFO_COMMAND_QUEUE: - return ReturnValue(hEvent->get_queue()); + return ReturnValue(hEvent->getQueue()); case UR_EVENT_INFO_COMMAND_TYPE: - return ReturnValue(hEvent->get_command_type()); + return ReturnValue(hEvent->getCommandType()); case UR_EVENT_INFO_REFERENCE_COUNT: - return ReturnValue(hEvent->get_reference_count()); + return ReturnValue(hEvent->getReferenceCount()); case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: - return ReturnValue(hEvent->get_execution_status()); + return ReturnValue(hEvent->getExecutionStatus()); case UR_EVENT_INFO_CONTEXT: - return ReturnValue(hEvent->get_context()); + return ReturnValue(hEvent->getContext()); default: sycl::detail::ur::die("Event info request not implemented"); } @@ -198,9 +196,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); - ur_queue_handle_t queue = hEvent->get_queue(); - if (queue == nullptr || - !(queue->ur_flags_ & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + ur_queue_handle_t Queue = hEvent->getQueue(); + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -208,11 +205,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: // Note: No user for this case - return ReturnValue(static_cast(hEvent->get_queued_time())); + return ReturnValue(static_cast(hEvent->getQueuedTime())); case UR_PROFILING_INFO_COMMAND_START: - return ReturnValue(static_cast(hEvent->get_start_time())); + return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: - return ReturnValue(static_cast(hEvent->get_end_time())); + return ReturnValue(static_cast(hEvent->getEndTime())); default: break; } @@ -234,19 +231,19 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); - auto context = phEventWaitList[0]->get_context(); - ScopedContext active(context); + auto Context = phEventWaitList[0]->getContext(); + ScopedContext Active(Context); - auto waitFunc = [context](ur_event_handle_t event) -> ur_result_t { - UR_ASSERT(event, UR_RESULT_ERROR_INVALID_EVENT); - UR_ASSERT(event->get_context() == context, + auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { + UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(Event->getContext() == Context, UR_RESULT_ERROR_INVALID_CONTEXT); - return event->wait(); + return Event->wait(); }; - return forLatestEvents(phEventWaitList, numEvents, waitFunc); - } catch (ur_result_t err) { - return err; + return forLatestEvents(phEventWaitList, numEvents, WaitFunc); + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -255,10 +252,10 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - const auto refCount = hEvent->increment_reference_count(); + const auto RefCount = hEvent->incrementReferenceCount(); sycl::detail::ur::assertion( - refCount != 0, "Reference count overflow detected in urEventRetain."); + RefCount != 0, "Reference count overflow detected in urEventRetain."); return UR_RESULT_SUCCESS; } @@ -269,20 +266,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hEvent->get_reference_count() != 0, + hEvent->getReferenceCount() != 0, "Reference count overflow detected in urEventRelease."); // decrement ref count. If it is 0, delete the event. - if (hEvent->decrement_reference_count() == 0) { + if (hEvent->decrementReferenceCount() == 0) { std::unique_ptr event_ptr{hEvent}; - ur_result_t result = UR_RESULT_ERROR_INVALID_EVENT; + ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - ScopedContext active(hEvent->get_context()); - result = hEvent->release(); + ScopedContext Active(hEvent->getContext()); + Result = hEvent->release(); } catch (...) { - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -298,11 +295,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, const ur_event_native_properties_t *pProperties, ur_event_handle_t *phEvent) { - (void)pProperties; + std::ignore = pProperties; - std::unique_ptr event_ptr{nullptr}; + std::unique_ptr EventPtr{nullptr}; - *phEvent = ur_event_handle_t_::make_with_native( + *phEvent = ur_event_handle_t_::makeWithNative( hContext, reinterpret_cast(hNativeEvent)); return UR_RESULT_SUCCESS; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index b0f10b33a5822..b1e0f939940ca 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -24,68 +24,68 @@ struct ur_event_handle_t_ { ur_result_t start(); - native_type get() const noexcept { return evEnd_; }; + native_type get() const noexcept { return EvEnd; }; - ur_queue_handle_t get_queue() const noexcept { return queue_; } + ur_queue_handle_t getQueue() const noexcept { return Queue; } - CUstream get_stream() const noexcept { return stream_; } + CUstream getStream() const noexcept { return Stream; } - uint32_t get_compute_stream_token() const noexcept { return streamToken_; } + uint32_t getComputeStreamToken() const noexcept { return StreamToken; } - ur_command_t get_command_type() const noexcept { return commandType_; } + ur_command_t getCommandType() const noexcept { return CommandType; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - bool is_recorded() const noexcept { return isRecorded_; } + bool isRecorded() const noexcept { return IsRecorded; } - bool is_started() const noexcept { return isStarted_; } + bool isStarted() const noexcept { return IsStarted; } - bool is_completed() const noexcept; + bool isCompleted() const 
noexcept; - uint32_t get_execution_status() const noexcept { + uint32_t getExecutionStatus() const noexcept { - if (!is_recorded()) { + if (!isRecorded()) { return UR_EVENT_STATUS_SUBMITTED; } - if (!is_completed()) { + if (!isCompleted()) { return UR_EVENT_STATUS_RUNNING; } return UR_EVENT_STATUS_COMPLETE; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - uint32_t increment_reference_count() { return ++refCount_; } + uint32_t incrementReferenceCount() { return ++RefCount; } - uint32_t decrement_reference_count() { return --refCount_; } + uint32_t decrementReferenceCount() { return --RefCount; } - uint32_t get_event_id() const noexcept { return eventId_; } + uint32_t getEventID() const noexcept { return EventID; } - bool backend_has_ownership() const noexcept { return has_ownership_; } + bool backendHasOwnership() const noexcept { return HasOwnership; } // Returns the counter time when the associated command(s) were enqueued // - uint64_t get_queued_time() const; + uint64_t getQueuedTime() const; // Returns the counter time when the associated command(s) started execution // - uint64_t get_start_time() const; + uint64_t getStartTime() const; // Returns the counter time when the associated command(s) completed // - uint64_t get_end_time() const; + uint64_t getEndTime() const; // construct a native CUDA. This maps closely to the underlying CUDA event. static ur_event_handle_t - make_native(ur_command_t type, ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token = std::numeric_limits::max()) { - return new ur_event_handle_t_(type, queue->get_context(), queue, stream, - stream_token); + makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken = std::numeric_limits::max()) { + return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream, + StreamToken); } - static ur_event_handle_t make_with_native(ur_context_handle_t context, - CUevent eventNative) { + static ur_event_handle_t makeWithNative(ur_context_handle_t context, + CUevent eventNative) { return new ur_event_handle_t_(context, eventNative); } @@ -94,95 +94,94 @@ struct ur_event_handle_t_ { ~ur_event_handle_t_(); private: - // This constructor is private to force programmers to use the make_native / + // This constructor is private to force programmers to use the makeNative / // make_user static members in order to create a pi_event for CUDA. - ur_event_handle_t_(ur_command_t type, ur_context_handle_t context, - ur_queue_handle_t queue, CUstream stream, - uint32_t stream_token); + ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken); // This constructor is private to force programmers to use the - // make_with_native for event introp - ur_event_handle_t_(ur_context_handle_t context, CUevent eventNative); + // makeWithNative for event introp + ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative); - ur_command_t commandType_; // The type of command associated with event. + ur_command_t CommandType; // The type of command associated with event. - std::atomic_uint32_t refCount_; // Event reference count. + std::atomic_uint32_t RefCount; // Event reference count. - bool has_ownership_; // Signifies if event owns the native type. + bool HasOwnership; // Signifies if event owns the native type. 
-  bool hasBeenWaitedOn_; // Signifies whether the event has been waited
-                         // on through a call to wait(), which implies
-                         // that it has completed.
+  bool HasBeenWaitedOn; // Signifies whether the event has been waited
+                        // on through a call to wait(), which implies
+                        // that it has completed.

-  bool isRecorded_; // Signifies wether a native CUDA event has been recorded
-                    // yet.
-  bool isStarted_; // Signifies wether the operation associated with the
-                   // PI event has started or not
-                   //
+  bool IsRecorded; // Signifies whether a native CUDA event has been recorded
+                   // yet.
+  bool IsStarted; // Signifies whether the operation associated with the
+                  // PI event has started or not

-  uint32_t streamToken_;
-  uint32_t eventId_; // Queue identifier of the event.
+  uint32_t StreamToken;
+  uint32_t EventID; // Queue identifier of the event.

-  native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
-                      // event, this will be nullptr.
+  native_type EvEnd; // CUDA event handle. If this _pi_event represents a user
+                     // event, this will be nullptr.

-  native_type evStart_; // CUDA event handle associated with the start
+  native_type EvStart; // CUDA event handle associated with the start

-  native_type evQueued_; // CUDA event handle associated with the time
-                         // the command was enqueued
+  native_type EvQueued; // CUDA event handle associated with the time
+                        // the command was enqueued

-  ur_queue_handle_t queue_; // pi_queue associated with the event. If this is a
-                            // user event, this will be nullptr.
+  ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a
+                           // user event, this will be nullptr.

-  CUstream stream_; // CUstream associated with the event. If this is a user
-                    // event, this will be uninitialized.
+  CUstream Stream; // CUstream associated with the event. If this is a user
+                   // event, this will be uninitialized.

-  ur_context_handle_t context_; // pi_context associated with the event. If this
-                                // is a native event, this will be the same
-                                // context associated with the queue_ member.
+  ur_context_handle_t Context; // pi_context associated with the event. If this
+                               // is a native event, this will be the same
+                               // context associated with the queue_ member.
};

// Iterates over the event wait list, returns correct ur_result_t error codes.
// Invokes the callback for the latest event of each queue in the wait list.
// The callback must take a single pi_event argument and return a ur_result_t.
template -ur_result_t forLatestEvents(const ur_event_handle_t *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { +ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, + std::size_t NumEventsInWaitList, Func &&F) { - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { + if (EventWaitList == nullptr || NumEventsInWaitList == 0) { return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; } // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); + if (NumEventsInWaitList == 1) { + return F(EventWaitList[0]); } - std::vector events{ - event_wait_list, event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), - [](ur_event_handle_t e0, ur_event_handle_t e1) { + std::vector Events{EventWaitList, + EventWaitList + NumEventsInWaitList}; + std::sort(Events.begin(), Events.end(), + [](ur_event_handle_t Event0, ur_event_handle_t Event1) { // Tiered sort creating sublists of streams (smallest value first) // in which the corresponding events are sorted into a sequence of // newest first. - return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); + return Event0->getStream() < Event1->getStream() || + (Event0->getStream() == Event1->getStream() && + Event0->getEventID() > Event1->getEventID()); }); - bool first = true; - CUstream lastSeenStream = 0; - for (ur_event_handle_t event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { + bool First = true; + CUstream LastSeenStream = 0; + for (ur_event_handle_t Event : Events) { + if (!Event || (!First && Event->getStream() == LastSeenStream)) { continue; } - first = false; - lastSeenStream = event->get_stream(); + First = false; + LastSeenStream = Event->getStream(); - auto result = f(event); - if (result != UR_RESULT_SUCCESS) { - return result; + auto Result = F(Event); + if (Result != UR_RESULT_SUCCESS) { + return Result; } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index 900b23dd84306..f3c05e016e441 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -17,46 +17,47 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t retErr = UR_RESULT_SUCCESS; - std::unique_ptr retKernel{nullptr}; + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr Kernel{nullptr}; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - CUfunction cuFunc; - CUresult functionResult = - cuModuleGetFunction(&cuFunc, hProgram->get(), pKernelName); + CUfunction CuFunc; + CUresult FunctionResult = + cuModuleGetFunction(&CuFunc, hProgram->get(), pKernelName); // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's // NOT_FOUND error applies to more than just functions. 
- if (functionResult == CUDA_ERROR_NOT_FOUND) { + if (FunctionResult == CUDA_ERROR_NOT_FOUND) { throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; } else { - retErr = UR_CHECK_ERROR(functionResult); + Result = UR_CHECK_ERROR(FunctionResult); } - std::string kernel_name_woffset = std::string(pKernelName) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, hProgram->get(), kernel_name_woffset.c_str()); + std::string KernelNameWithOffset = + std::string(pKernelName) + "_with_offset"; + CUfunction CuFuncWithOffsetParam; + CUresult OffsetRes = cuModuleGetFunction( + &CuFuncWithOffsetParam, hProgram->get(), KernelNameWithOffset.c_str()); // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; + if (OffsetRes == CUDA_ERROR_NOT_FOUND) { + CuFuncWithOffsetParam = nullptr; } else { - retErr = UR_CHECK_ERROR(offsetRes); + Result = UR_CHECK_ERROR(OffsetRes); } - retKernel = std::unique_ptr( - new ur_kernel_handle_t_{cuFunc, cuFuncWithOffsetParam, pKernelName, - hProgram, hProgram->get_context()}); - } catch (ur_result_t err) { - retErr = err; + Kernel = std::unique_ptr( + new ur_kernel_handle_t_{CuFunc, CuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->getContext()}); + } catch (ur_result_t Err) { + Result = Err; } catch (...) { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phKernel = retKernel.release(); - return retErr; + *phKernel = Kernel.release(); + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -70,82 +71,78 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, switch (propName) { case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; + size_t GlobalWorkSize[3] = {0, 0, 0}; - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, hDevice->get()) == CUDA_SUCCESS); - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, hDevice->get()) == CUDA_SUCCESS); sycl::detail::ur::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, hDevice->get()) == CUDA_SUCCESS); - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = 
max_block_dimZ * max_grid_dimZ; - return ReturnValue(global_work_size, 3); + GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX; + GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY; + GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ; + return ReturnValue(GlobalWorkSize, 3); } case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(size_t(max_threads)); + return ReturnValue(size_t(MaxThreads)); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - hKernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(hKernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = + hKernel->get_program()->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); } - return ReturnValue(group_size, 3); + return ReturnValue(GroupSize, 3); } case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // OpenCL LOCAL == CUDA SHARED - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { // Work groups should be multiples of the warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; + int Bytes = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(uint64_t(bytes)); + return ReturnValue(uint64_t(Bytes)); } default: break; @@ -156,10 +153,9 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->get_reference_count() > 0u, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); - hKernel->increment_reference_count(); + hKernel->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -169,11 +165,10 @@ urKernelRelease(ur_kernel_handle_t hKernel) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. 
- UR_ASSERT(hKernel->get_reference_count() != 0, - UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); // decrement ref count. If it is 0, delete the program. - if (hKernel->decrement_reference_count() == 0) { + if (hKernel->decrementReferenceCount() == 0) { // no internal cuda resources to clean up. Just delete it. delete hKernel; return UR_RESULT_SUCCESS; @@ -198,17 +193,17 @@ urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { if (pArgValue) { - hKernel->set_kernel_arg(argIndex, argSize, pArgValue); + hKernel->setKernelArg(argIndex, argSize, pArgValue); } else { - hKernel->set_kernel_local_arg(argIndex, argSize); + hKernel->setKernelLocalArg(argIndex, argSize); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, @@ -222,23 +217,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, switch (propName) { case UR_KERNEL_INFO_FUNCTION_NAME: - return ReturnValue(hKernel->get_name()); + return ReturnValue(hKernel->getName()); case UR_KERNEL_INFO_NUM_ARGS: - return ReturnValue(hKernel->get_num_args()); + return ReturnValue(hKernel->getNumArgs()); case UR_KERNEL_INFO_REFERENCE_COUNT: - return ReturnValue(hKernel->get_reference_count()); + return ReturnValue(hKernel->getReferenceCount()); case UR_KERNEL_INFO_CONTEXT: - return ReturnValue(hKernel->get_context()); + return ReturnValue(hKernel->getContext()); case UR_KERNEL_INFO_PROGRAM: return ReturnValue(hKernel->get_program()); case UR_KERNEL_INFO_ATTRIBUTES: return ReturnValue(""); case UR_KERNEL_INFO_NUM_REGS: { - int numRegs = 0; + int NumRegs = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, hKernel->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(numRegs)); + return ReturnValue(static_cast(NumRegs)); } default: break; @@ -257,25 +252,24 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, switch (propName) { case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { // Sub-group size is equivalent to warp size - int warpSize = 0; + int WarpSize = 0; sycl::detail::ur::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get()) == CUDA_SUCCESS); - return ReturnValue(static_cast(warpSize)); + return ReturnValue(static_cast(WarpSize)); } case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; + int MaxThreads = 0; sycl::detail::ur::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hKernel->get()) == CUDA_SUCCESS); - int warpSize = 0; + int WarpSize = 0; urKernelGetSubGroupInfo(hKernel, hDevice, UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, - sizeof(uint32_t), &warpSize, nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return ReturnValue(static_cast(maxWarps)); + sizeof(uint32_t), &WarpSize, nullptr); + int MaxWarps = 
(MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(static_cast(MaxWarps)); } case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { // Return value of 0 => not specified @@ -298,7 +292,7 @@ urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { - hKernel->set_kernel_arg(argIndex, sizeof(pArgValue), pArgValue); + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); return UR_RESULT_SUCCESS; } @@ -310,16 +304,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. if (hArgValue == nullptr) { - hKernel->set_kernel_arg(argIndex, 0, nullptr); + hKernel->setKernelArg(argIndex, 0, nullptr); return UR_RESULT_SUCCESS; } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - if (hArgValue->mem_type_ == ur_mem_handle_t_::mem_type::surface) { + if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, hArgValue->mem_.surface_mem_.get_array())); + &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_HALF && @@ -329,16 +323,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - CUsurfObject cuSurf = hArgValue->mem_.surface_mem_.get_surface(); - hKernel->set_kernel_arg(argIndex, sizeof(cuSurf), (void *)&cuSurf); + CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); } else { - CUdeviceptr cuPtr = hArgValue->mem_.buffer_mem_.get(); - hKernel->set_kernel_arg(argIndex, sizeof(CUdeviceptr), (void *)&cuPtr); + CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); + hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } // A NOP for the CUDA backend @@ -370,12 +364,12 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_sampler_handle_t hArgValue) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - uint32_t samplerProps = hArgValue->props_; - hKernel->set_kernel_arg(argIndex, sizeof(uint32_t), (void *)&samplerProps); - } catch (ur_result_t err) { - retErr = err; + uint32_t SamplerProps = hArgValue->Props; + hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps); + } catch (ur_result_t Err) { + Result = Err; } - return retErr; + return Result; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 9308b7b408b44..3707cab1d1e0f 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -35,38 +35,37 @@ struct ur_kernel_handle_t_ { using native_type = CUfunction; - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - ur_context_handle_t context_; - ur_program_handle_t program_; - std::atomic_uint32_t refCount_; + native_type Function; 
+ native_type FunctionWithOffsetParam; + std::string Name; + ur_context_handle_t Context; + ur_program_handle_t Program; + std::atomic_uint32_t RefCount; - static constexpr uint32_t REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; + static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; + size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes + /// Note each argument size is known, since it comes /// from the kernel signature. /// This is not something can be queried from the CUDA API /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) /// and a storage. - /// struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; + static constexpr size_t MaxParamBytes = 4000u; + using args_t = std::array; using args_size_t = std::vector; using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; + args_t Storage; + args_size_t ParamSizes; + args_index_t Indices; + args_size_t OffsetPerIndex; - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; + std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; arguments() { // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); + Indices.emplace_back(&ImplicitOffsetArgs); } /// Adds an argument to the kernel. @@ -74,126 +73,127 @@ struct ur_kernel_handle_t_ { /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. - void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { + void addArg(size_t Index, size_t Size, const void *Arg, + size_t LocalSize = 0) { + if (Index + 2 > Indices.size()) { // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); + Indices.resize(Index + 2, Indices.back()); // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); + ParamSizes.resize(Index + 1); + OffsetPerIndex.resize(Index + 1); } - paramSizes_[index] = size; + ParamSizes[Index] = Size; // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + OffsetPerIndex[Index] = LocalSize; } - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; + const size_t MaxAlignment = sizeof(double) * 16; // for arguments smaller than the maximum alignment simply align to the // size of the argument - const size_t alignment = std::min(max_alignment, size); + const size_t Alignment = std::min(MaxAlignment, Size); // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 
0) { - alignedLocalOffset += alignment - (localOffset % alignment); + size_t AlignedLocalOffset = LocalOffset; + if (LocalOffset % Alignment != 0) { + AlignedLocalOffset += Alignment - (LocalOffset % Alignment); } - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + Size + (AlignedLocalOffset - LocalOffset)); } - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { + assert(Size == sizeof(std::uint32_t) * 3); + std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); } - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); + void clearLocalSize() { + std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); } - const args_index_t &get_indices() const noexcept { return indices_; } + const args_index_t &getIndices() const noexcept { return Indices; } - uint32_t get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); + uint32_t getLocalSize() const { + return std::accumulate(std::begin(OffsetPerIndex), + std::end(OffsetPerIndex), 0); } - } args_; - - ur_kernel_handle_t_(CUfunction func, CUfunction funcWithOffsetParam, - const char *name, ur_program_handle_t program, - ur_context_handle_t ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - urProgramRetain(program_); - urContextRetain(context_); + } Args; + + ur_kernel_handle_t_(CUfunction Func, CUfunction FuncWithOffsetParam, + const char *Name, ur_program_handle_t Program, + ur_context_handle_t Context) + : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, + Name{Name}, Context{Context}, Program{Program}, RefCount{1} { + urProgramRetain(Program); + urContextRetain(Context); /// Note: this code assumes that there is only one device per context ur_result_t retError = urKernelGetGroupInfo( - this, ctxt->get_device(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); + this, Context->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); assert(retError == UR_RESULT_SUCCESS); } ~ur_kernel_handle_t_() { - urProgramRelease(program_); - urContextRelease(context_); + urProgramRelease(Program); + urContextRelease(Context); } - ur_program_handle_t get_program() const noexcept { return program_; } + ur_program_handle_t get_program() const noexcept { return Program; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - native_type get() const noexcept { return function_; }; + native_type get() const noexcept { return Function; }; native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; + return FunctionWithOffsetParam; }; bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != 
nullptr; + return FunctionWithOffsetParam != nullptr; } - ur_context_handle_t get_context() const noexcept { return context_; }; + ur_context_handle_t getContext() const noexcept { return Context; }; - const char *get_name() const noexcept { return name_.c_str(); } + const char *getName() const noexcept { return Name.c_str(); } /// Returns the number of arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API - size_t get_num_args() const noexcept { return args_.indices_.size() - 1; } + size_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); + void setKernelArg(int Index, size_t Size, const void *Arg) { + Args.addArg(Index, Size, Arg); } - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); + void setKernelLocalArg(int Index, size_t Size) { + Args.addLocalArg(Index, Size); } - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - return args_.set_implicit_offset(size, implicitOffset); + void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { + return Args.setImplicitOffset(Size, ImplicitOffset); } - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); + const arguments::args_index_t &getArgIndices() const { + return Args.getIndices(); } - uint32_t get_local_size() const noexcept { return args_.get_local_size(); } + uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } - void clear_local_size() { args_.clear_local_size(); } + void clearLocalSize() { Args.clearLocalSize(); } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index abca91b594e19..b88d5307f4711 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -35,76 +35,74 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( // Currently, USE_HOST_PTR is not implemented using host register // since this triggers a weird segfault after program ends. // Setting this constant to true enables testing that behavior. - const bool enableUseHostPtr = false; - const bool performInitialCopy = + const bool EnableUseHostPtr = false; + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || - ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !enableUseHostPtr); - ur_result_t retErr = UR_RESULT_SUCCESS; - ur_mem_handle_t retMemObj = nullptr; + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); + ur_result_t Result = UR_RESULT_SUCCESS; + ur_mem_handle_t MemObj = nullptr; try { - ScopedContext active(hContext); - CUdeviceptr ptr; - auto pHost = pProperties ? pProperties->pHost : nullptr; - - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && enableUseHostPtr) { - retErr = UR_CHECK_ERROR( - cuMemHostRegister(pHost, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr; + ScopedContext Active(hContext); + CUdeviceptr Ptr; + auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + Result = UR_CHECK_ERROR( + cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - retErr = UR_CHECK_ERROR(cuMemAllocHost(&pHost, size)); - retErr = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, pHost, 0)); - allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; + Result = UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; } else { - retErr = UR_CHECK_ERROR(cuMemAlloc(&ptr, size)); + Result = UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - allocMode = ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in; + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; } } - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { ur_mem_handle_t parentBuffer = nullptr; - auto piMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, allocMode, ptr, pHost, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + if (URMemObj != nullptr) { + MemObj = URMemObj.release(); + if (PerformInitialCopy) { // Operates on the default stream of the current CUDA context. - retErr = UR_CHECK_ERROR(cuMemcpyHtoD(ptr, pHost, size)); + Result = UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); // Synchronize with default stream implicitly used by cuMemcpyHtoD // to make buffer data available on device before any other UR call // uses it. - if (retErr == UR_RESULT_SUCCESS) { + if (Result == UR_RESULT_SUCCESS) { CUstream defaultStream = 0; - retErr = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + Result = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); } } } else { - retErr = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } } - } catch (ur_result_t err) { - retErr = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) 
{ - retErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - *phBuffer = retMemObj; + *phBuffer = MemObj; - return retErr; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hMem->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - hMem->increment_reference_count(); + UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -115,52 +113,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t ret = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { // Do nothing if there are other references - if (hMem->decrement_reference_count() > 0) { + if (hMem->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } - // make sure hMem is released in case check_error_ur throws - std::unique_ptr uniqueMemObj(hMem); + // make sure hMem is released in case checkErrorUR throws + std::unique_ptr MemObjPtr(hMem); - if (hMem->is_sub_buffer()) { + if (hMem->isSubBuffer()) { return UR_RESULT_SUCCESS; } - ScopedContext active(uniqueMemObj->get_context()); + ScopedContext Active(MemObjPtr->getContext()); - if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::copy_in: - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic: - ret = UR_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: + Result = UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = UR_CHECK_ERROR( - cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: + Result = UR_CHECK_ERROR( + cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); break; - case ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = UR_CHECK_ERROR( - cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: + Result = + UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); }; - } else if (hMem->mem_type_ == ur_mem_handle_t_::mem_type::surface) { - ret = UR_CHECK_ERROR( - cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); - ret = UR_CHECK_ERROR( - cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); + } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { + Result = UR_CHECK_ERROR( + cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); + Result = + UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); } - } catch (ur_result_t err) { - ret = err; + } catch (ur_result_t Err) { + Result = Err; } catch (...) 
{ - ret = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - if (ret != UR_RESULT_SUCCESS) { + if (Result != UR_RESULT_SUCCESS) { // A reported CUDA error is either an implementation or an asynchronous CUDA // error for which it is unclear if the function that reported it succeeded // or not. Either way, the state of the program is compromised and likely @@ -183,7 +181,7 @@ urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); *phNativeMem = - reinterpret_cast(hMem->mem_.buffer_mem_.get()); + reinterpret_cast(hMem->Mem.BufferMem.get()); return UR_RESULT_SUCCESS; } @@ -195,27 +193,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hMemory->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - ScopedContext active(hMemory->get_context()); + ScopedContext Active(hMemory->getContext()); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { - size_t allocSize = 0; - UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &allocSize, - hMemory->mem_.buffer_mem_.ptr_)); - return ReturnValue(allocSize); - } catch (ur_result_t err) { - return err; + size_t AllocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, + hMemory->Mem.BufferMem.Ptr)); + return ReturnValue(AllocSize); + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } } case UR_MEM_INFO_CONTEXT: { - return ReturnValue(hMemory->get_context()); + return ReturnValue(hMemory->getContext()); } default: @@ -251,7 +249,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) { UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR); } - const bool performInitialCopy = + const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || ((flags & UR_MEM_FLAG_USE_HOST_POINTER)); @@ -270,7 +268,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t retErr = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA @@ -280,58 +278,58 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // We have to use cuArray3DCreate, which has some caveats. The height and // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives // a minimum value of 1, so we need to convert the answer. 
- CUDA_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = pImageDesc->width; + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = pImageDesc->width; if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = 0; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = 0; } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - array_desc.Height = pImageDesc->height; - array_desc.Depth = pImageDesc->depth; + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; } // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; + size_t PixelTypeSizeBytes; switch (pImageFormat->channelType) { case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; break; case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = CU_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; + ArrayDesc.Format = CU_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; break; case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; break; case UR_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = CU_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; + ArrayDesc.Format = CU_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; break; default: sycl::detail::ur::die( @@ -339,51 +337,51 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( } // When a dimension isn't used pImageDesc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; + size_t PixelSizeBytes = + PixelTypeSizeBytes * 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; - ScopedContext active(hContext); - CUarray image_array = nullptr; + ScopedContext Active(hContext); + CUarray ImageArray = nullptr; try { - retErr = UR_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); - } catch 
(ur_result_t err) { - if (err == UR_RESULT_ERROR_INVALID_VALUE) { + Result = UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); + } catch (ur_result_t Err) { + if (Err == UR_RESULT_ERROR_INVALID_VALUE) { return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; } - return err; + return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } try { - if (performInitialCopy) { + if (PerformInitialCopy) { // We have to use a different copy function for each image dimensionality if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - retErr = UR_CHECK_ERROR( - cuMemcpyHtoA(image_array, 0, pHost, image_size_bytes)); + Result = + UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - retErr = UR_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + Result = UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = pHost; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * pImageDesc->width; - cpy_desc.Height = pImageDesc->height; - cpy_desc.Depth = pImageDesc->depth; - retErr = UR_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + CpyDesc.Depth = pImageDesc->depth; + Result = UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); } } @@ -396,35 +394,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // handle. 
// CUDA_RESOURCE_DESC::flags must be set to zero - CUDA_RESOURCE_DESC image_res_desc; - image_res_desc.res.array.hArray = image_array; - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.flags = 0; + CUDA_RESOURCE_DESC ImageResDesc; + ImageResDesc.res.array.hArray = ImageArray; + ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; - CUsurfObject surface; - retErr = UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); + CUsurfObject Surface; + Result = UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_( - hContext, image_array, surface, flags, pImageDesc->type, phMem)); + auto MemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); - if (urMemObj == nullptr) { + if (MemObj == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = urMemObj.release(); - } catch (ur_result_t err) { - if (image_array) { - cuArrayDestroy(image_array); + *phMem = MemObj.release(); + } catch (ur_result_t Err) { + if (ImageArray) { + cuArrayDestroy(ImageArray); } - return err; + return Err; } catch (...) { - if (image_array) { - cuArrayDestroy(image_array); + if (ImageArray) { + cuArrayDestroy(ImageArray); } return UR_RESULT_ERROR_UNKNOWN; } - return retErr; + return Result; } /// \TODO Not implemented @@ -445,8 +443,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(hBuffer->is_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(!hBuffer->is_sub_buffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); // Default value for flags means UR_MEM_FLAG_READ_WRITE. if (flags == 0) { @@ -457,11 +455,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), UR_RESULT_ERROR_INVALID_VALUE); - if (hBuffer->memFlags_ & UR_MEM_FLAG_WRITE_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } - if (hBuffer->memFlags_ & UR_MEM_FLAG_READ_ONLY) { + if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), UR_RESULT_ERROR_INVALID_VALUE); } @@ -474,38 +472,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); - UR_ASSERT(((pRegion->origin + pRegion->size) <= - hBuffer->mem_.buffer_mem_.get_size()), - UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + UR_ASSERT( + ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); // Retained indirectly due to retaining parent buffer below. 
- ur_context_handle_t context = hBuffer->context_; + ur_context_handle_t Context = hBuffer->Context; - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode allocMode = - ur_mem_handle_t_::mem_::buffer_mem_::alloc_mode::classic; + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; - assert(hBuffer->mem_.buffer_mem_.ptr_ != - ur_mem_handle_t_::mem_::buffer_mem_::native_type{0}); - ur_mem_handle_t_::mem_::buffer_mem_::native_type ptr = - hBuffer->mem_.buffer_mem_.ptr_ + pRegion->origin; + assert(hBuffer->Mem.BufferMem.Ptr != + ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); + ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = + hBuffer->Mem.BufferMem.Ptr + pRegion->origin; - void *hostPtr = nullptr; - if (hBuffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(hBuffer->mem_.buffer_mem_.hostPtr_) + - pRegion->origin; + void *HostPtr = nullptr; + if (hBuffer->Mem.BufferMem.HostPtr) { + HostPtr = + static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; } - std::unique_ptr retMemObj{nullptr}; + std::unique_ptr MemObj{nullptr}; try { - retMemObj = std::unique_ptr{new ur_mem_handle_t_{ - context, hBuffer, flags, allocMode, ptr, hostPtr, pRegion->size}}; - } catch (ur_result_t err) { + MemObj = std::unique_ptr{new ur_mem_handle_t_{ + Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + } catch (ur_result_t Err) { *phMem = nullptr; - return err; + return Err; } catch (...) { *phMem = nullptr; return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = retMemObj.release(); + *phMem = MemObj.release(); return UR_RESULT_SUCCESS; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index 5712218b06425..a1b484e3212bf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -18,178 +18,173 @@ /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. struct ur_mem_handle_t_ { - - // TODO: Move as much shared data up as possible - using ur_context = ur_context_handle_t_ *; - using ur_mem = ur_mem_handle_t_ *; - // Context where the memory object is accessibles - ur_context context_; + ur_context_handle_t Context; /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; + std::atomic_uint32_t RefCount; + enum class Type { Buffer, Surface } MemType; // Original mem flags passed - ur_mem_flags_t memFlags_; + ur_mem_flags_t MemFlags; /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces /// This union allows implementation to use either from the same handler. 
- union mem_ { + union MemImpl { // Handler for plain, pointer-based CUDA allocations - struct buffer_mem_ { + struct BufferMem { using native_type = CUdeviceptr; // If this allocation is a sub-buffer (i.e., a view on an existing // allocation), this is the pointer to the parent handler structure - ur_mem parent_; + ur_mem_handle_t Parent; // CUDA handler for the pointer - native_type ptr_; + native_type Ptr; /// Pointer associated with this device on the host - void *hostPtr_; + void *HostPtr; /// Size of the allocation in bytes - size_t size_; + size_t Size; /// Offset of the active mapped region. - size_t mapOffset_; + size_t MapOffset; /// Pointer to the active mapped region, if any - void *mapPtr_; + void *MapPtr; /// Original flags for the mapped region - ur_map_flags_t mapFlags_; + ur_map_flags_t MapFlags; - /** alloc_mode + /** AllocMode * classic: Just a normal buffer allocated on the device via cuda malloc * use_host_ptr: Use an address on the host for the device * copy_in: The data for the device comes from the host but the host pointer is not available later for re-use * alloc_host_ptr: Uses pinned-memory allocation */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr, + } MemAllocMode; - native_type get() const noexcept { return ptr_; } + native_type get() const noexcept { return Ptr; } - size_t get_size() const noexcept { return size_; } + size_t getSize() const noexcept { return Size; } - void *get_map_ptr() const noexcept { return mapPtr_; } + void *getMapPtr() const noexcept { return MapPtr; } - size_t get_map_offset(void *) const noexcept { return mapOffset_; } + size_t getMapOffset(void *) const noexcept { return MapOffset; } /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the CUDA allocation. /// - void *map_to_ptr(size_t offset, ur_map_flags_t flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; + void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; } else { // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); + MapPtr = static_cast(malloc(this->getSize())); } - return mapPtr_; + return MapPtr; } /// Detach the allocation from the host memory. void unmap(void *) noexcept { - assert(mapPtr_ != nullptr); + assert(MapPtr != nullptr); - if (mapPtr_ != hostPtr_) { - free(mapPtr_); + if (MapPtr != HostPtr) { + free(MapPtr); } - mapPtr_ = nullptr; - mapOffset_ = 0; + MapPtr = nullptr; + MapOffset = 0; } - ur_map_flags_t get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; } - } buffer_mem_; + } BufferMem; // Handler data for surface object (i.e. 
Images) - struct surface_mem_ { - CUarray array_; - CUsurfObject surfObj_; - ur_mem_type_t imageType_; + struct SurfaceMem { + CUarray Array; + CUsurfObject SurfObj; + ur_mem_type_t ImageType; - CUarray get_array() const noexcept { return array_; } + CUarray getArray() const noexcept { return Array; } - CUsurfObject get_surface() const noexcept { return surfObj_; } + CUsurfObject getSurface() const noexcept { return SurfObj; } - ur_mem_type_t get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; + ur_mem_type_t getImageType() const noexcept { return ImageType; } + } SurfaceMem; + } Mem; /// Constructs the UR mem handler for a non-typed allocation ("buffer") - ur_mem_handle_t_(ur_context ctxt, ur_mem parent, ur_mem_flags_t mem_flags, - mem_::buffer_mem_::alloc_mode mode, CUdeviceptr ptr, - void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer}, - memFlags_{mem_flags} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = UR_MAP_FLAG_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - urMemRetain(mem_.buffer_mem_.parent_); + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, + CUdeviceptr Ptr, void *HostPtr, size_t Size) + : Context{Context}, RefCount{1}, MemType{Type::Buffer}, + MemFlags{MemFlags} { + Mem.BufferMem.Ptr = Ptr; + Mem.BufferMem.Parent = Parent; + Mem.BufferMem.HostPtr = HostPtr; + Mem.BufferMem.Size = Size; + Mem.BufferMem.MapOffset = 0; + Mem.BufferMem.MapPtr = nullptr; + Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; + Mem.BufferMem.MemAllocMode = Mode; + if (isSubBuffer()) { + urMemRetain(Mem.BufferMem.Parent); } else { - urContextRetain(context_); + urContextRetain(Context); } }; /// Constructs the UR allocation for an Image object (surface in CUDA) - ur_mem_handle_t_(ur_context ctxt, CUarray array, CUsurfObject surf, - ur_mem_flags_t mem_flags, ur_mem_type_t image_type, - void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface}, - memFlags_{mem_flags} { + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, + CUsurfObject Surf, ur_mem_flags_t MemFlags, + ur_mem_type_t ImageType, void *HostPtr) + : Context{Context}, RefCount{1}, MemType{Type::Surface}, + MemFlags{MemFlags} { // Ignore unused parameter - (void)host_ptr; + (void)HostPtr; - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.surfObj_ = surf; - mem_.surface_mem_.imageType_ = image_type; - urContextRetain(context_); + Mem.SurfaceMem.Array = Array; + Mem.SurfaceMem.SurfObj = Surf; + Mem.SurfaceMem.ImageType = ImageType; + urContextRetain(Context); } ~ur_mem_handle_t_() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - urMemRelease(mem_.buffer_mem_.parent_); + if (MemType == Type::Buffer) { + if (isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); return; } } - urContextRelease(context_); + urContextRelease(Context); } // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } + bool isBuffer() const noexcept { return MemType == Type::Buffer; } - bool is_sub_buffer() const noexcept { - return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); + bool isSubBuffer() const noexcept { + return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); } 
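  // Usage sketch for the map helpers above (illustrative only, not part of this
  // patch; `Buf` is a hypothetical ur_mem_handle_t whose BufferMem has no
  // associated HostPtr, so mapToPtr falls back to a temporary host allocation
  // that unmap releases again):
  //
  //   void *MappedPtr =
  //       Buf->Mem.BufferMem.mapToPtr(/*Offset=*/0, UR_MAP_FLAG_WRITE);
  //   // ... fill MappedPtr on the host, then copy it to Buf's device pointer ...
  //   Buf->Mem.BufferMem.unmap(MappedPtr);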
- bool is_image() const noexcept { return mem_type_ == mem_type::surface; } + bool isImage() const noexcept { return MemType == Type::Surface; } - ur_context get_context() const noexcept { return context_; } + ur_context_handle_t getContext() const noexcept { return Context; } - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index fdf0f723e168f..f28f76c2a95df 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -33,8 +33,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( case UR_PLATFORM_INFO_PROFILE: return ReturnValue("FULL PROFILE"); case UR_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); - return ReturnValue(version.c_str()); + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); } case UR_PLATFORM_INFO_EXTENSIONS: { return ReturnValue(""); @@ -62,102 +62,103 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { try { - static std::once_flag initFlag; - static uint32_t numPlatforms = 1; - static std::vector platformIds; + static std::once_flag InitFlag; + static uint32_t NumPlatforms = 1; + static std::vector Platforms; UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t err = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; std::call_once( - initFlag, - [](ur_result_t &err) { + InitFlag, + [](ur_result_t &Result) { if (cuInit(0) != CUDA_SUCCESS) { - numPlatforms = 0; + NumPlatforms = 0; return; } - int numDevices = 0; - err = UR_CHECK_ERROR(cuDeviceGetCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; + int NumDevices = 0; + Result = UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); + if (NumDevices == 0) { + NumPlatforms = 0; return; } try { // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); + NumPlatforms = NumDevices; + Platforms.resize(NumDevices); - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = UR_CHECK_ERROR(cuDeviceGet(&device, i)); - CUcontext context; - err = UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); + for (int i = 0; i < NumDevices; ++i) { + CUdevice Device; + Result = UR_CHECK_ERROR(cuDeviceGet(&Device, i)); + CUcontext Context; + Result = + UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); - ScopedContext active(context); - CUevent evBase; - err = UR_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); + ScopedContext active(Context); + CUevent EvBase; + Result = UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); // Use default stream to record base event counter - err = UR_CHECK_ERROR(cuEventRecord(evBase, 0)); + Result = UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); - platformIds[i].devices_.emplace_back(new ur_device_handle_t_{ - device, context, evBase, &platformIds[i]}); + Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ + Device, Context, EvBase, 
&Platforms[i]}); { - const auto &dev = platformIds[i].devices_.back().get(); - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - ur_result_t retError = urDeviceGetInfo( - dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - if (retError != UR_RESULT_SUCCESS) { - throw retError; + const auto &Dev = Platforms[i].Devices.back().get(); + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + ur_result_t RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; } - retError = urDeviceGetInfo( - dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - if (retError != UR_RESULT_SUCCESS) { - throw retError; + RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; } - dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - dev->save_max_work_group_size(maxWorkGroupSize); + Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + Dev->saveMaxWorkGroupSize(MaxWorkGroupSize); } } } catch (const std::bad_alloc &) { // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); } - platformIds.clear(); - err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + Platforms.clear(); + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); } - platformIds.clear(); + Platforms.clear(); throw; } }, - err); + Result); if (pNumPlatforms != nullptr) { - *pNumPlatforms = numPlatforms; + *pNumPlatforms = NumPlatforms; } if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, numPlatforms); ++i) { - phPlatforms[i] = &platformIds[i]; + for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { + phPlatforms[i] = &Platforms[i]; } } - return err; - } catch (ur_result_t err) { - return err; + return Result; + } catch (ur_result_t Err) { + return Err; } catch (...) 
{ return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -189,7 +190,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( ur_platform_handle_t hPlatform, const char *pFrontendOption, const char **ppPlatformOption) { - (void)hPlatform; + std::ignore = hPlatform; using namespace std::literals; if (pFrontendOption == nullptr) return UR_RESULT_ERROR_INVALID_NULL_POINTER; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp index 5b2e79f49be8d..187290718aebf 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp @@ -11,5 +11,5 @@ #include struct ur_platform_handle_t_ { - std::vector> devices_; + std::vector> Devices; }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index 314a9a866c813..ce8d7c705ae83 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -8,48 +8,47 @@ #include "program.hpp" -bool getMaxRegistersJitOptionValue(const std::string &build_options, - unsigned int &value) { +bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, + unsigned int &Value) { using namespace std::string_view_literals; - const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); - if (optionPos == std::string::npos) { + const std::size_t OptionPos = BuildOptions.find_first_of("maxrregcount"sv); + if (OptionPos == std::string::npos) { return false; } - const std::size_t delimPos = build_options.find('=', optionPos + 1u); - if (delimPos == std::string::npos) { + const std::size_t DelimPos = BuildOptions.find('=', OptionPos + 1u); + if (DelimPos == std::string::npos) { return false; } - const std::size_t length = build_options.length(); - const std::size_t startPos = delimPos + 1u; - if (delimPos == std::string::npos || startPos >= length) { + const std::size_t Length = BuildOptions.length(); + const std::size_t StartPos = DelimPos + 1u; + if (DelimPos == std::string::npos || StartPos >= Length) { return false; } - std::size_t pos = startPos; - while (pos < length && - std::isdigit(static_cast(build_options[pos]))) { - pos++; + std::size_t Pos = StartPos; + while (Pos < Length && + std::isdigit(static_cast(BuildOptions[Pos]))) { + Pos++; } - const std::string valueString = - build_options.substr(startPos, pos - startPos); - if (valueString.empty()) { + const std::string ValueString = BuildOptions.substr(StartPos, Pos - StartPos); + if (ValueString.empty()) { return false; } - value = static_cast(std::stoi(valueString)); + Value = static_cast(std::stoi(ValueString)); return true; } -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - urContextRetain(context_); +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); } -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(context_); } +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } std::pair splitMetadataName(const std::string &metadataName) { @@ -61,18 +60,18 @@ splitMetadataName(const std::string &metadataName) 
{ } ur_result_t -ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, - size_t length) { - for (size_t i = 0; i < length; ++i) { - const ur_program_metadata_t metadataElement = metadata[i]; - std::string metadataElementName{metadataElement.pName}; +ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, + size_t Length) { + for (size_t i = 0; i < Length; ++i) { + const ur_program_metadata_t MetadataElement = Metadata[i]; + std::string MetadataElementName{MetadataElement.pName}; - auto [prefix, tag] = splitMetadataName(metadataElementName); + auto [Prefix, Tag] = splitMetadataName(MetadataElementName); - if (tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { // If metadata is reqd_work_group_size, record it for the corresponding // kernel name. - size_t MDElemsSize = metadataElement.size - sizeof(std::uint64_t); + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); // Expect between 1 and 3 32-bit integer values. UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && @@ -81,80 +80,79 @@ ur_program_handle_t_::set_metadata(const ur_program_metadata_t *metadata, // Get pointer to data, skipping 64-bit size at the start of the data. const char *ValuePtr = - reinterpret_cast(metadataElement.value.pData) + + reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); // Read values and pad with 1's for values not present. - std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); - kernelReqdWorkGroupSizeMD_[prefix] = - std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], - reqdWorkGroupElements[2]); - } else if (tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *metadataValPtr = - reinterpret_cast(metadataElement.value.pData) + + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + KernelReqdWorkGroupSizeMD[Prefix] = + std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], + ReqdWorkGroupElements[2]); + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *MetadataValPtr = + reinterpret_cast(MetadataElement.value.pData) + sizeof(std::uint64_t); - const char *metadataValPtrEnd = - metadataValPtr + metadataElement.size - sizeof(std::uint64_t); - globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; + const char *MetadataValPtrEnd = + MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); + GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; } } return UR_RESULT_SUCCESS; } -ur_result_t ur_program_handle_t_::set_binary(const char *source, - size_t length) { +ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { // Do not re-set program binary data which has already been set as that will // delete the old binary data. 
- UR_ASSERT(binary_ == nullptr && binarySizeInBytes_ == 0, + UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, UR_RESULT_ERROR_INVALID_OPERATION); - binary_ = source; - binarySizeInBytes_ = length; + Binary = Source; + BinarySizeInBytes = Length; return UR_RESULT_SUCCESS; } -ur_result_t ur_program_handle_t_::build_program(const char *build_options) { - if (build_options) { - this->buildOptions_ = build_options; +ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (BuildOptions) { + this->BuildOptions = BuildOptions; } - constexpr const unsigned int numberOfOptions = 4u; + constexpr const unsigned int NumberOfOptions = 4u; - std::vector options(numberOfOptions); - std::vector optionVals(numberOfOptions); + std::vector Options(NumberOfOptions); + std::vector OptionVals(NumberOfOptions); // Pass a buffer for info messages - options[0] = CU_JIT_INFO_LOG_BUFFER; - optionVals[0] = (void *)infoLog_; + Options[0] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[0] = (void *)InfoLog; // Pass the size of the info buffer - options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; + Options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[1] = (void *)(long)MaxLogSize; // Pass a buffer for error message - options[2] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[2] = (void *)errorLog_; + Options[2] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[2] = (void *)ErrorLog; // Pass the size of the error buffer - options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - if (!buildOptions_.empty()) { - unsigned int maxRegs; - bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); - if (valid) { - options.push_back(CU_JIT_MAX_REGISTERS); - optionVals.push_back(reinterpret_cast(maxRegs)); + Options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[3] = (void *)(long)MaxLogSize; + + if (!this->BuildOptions.empty()) { + unsigned int MaxRegs; + bool Valid = getMaxRegistersJitOptionValue(BuildOptions, MaxRegs); + if (Valid) { + Options.push_back(CU_JIT_MAX_REGISTERS); + OptionVals.push_back(reinterpret_cast(MaxRegs)); } } auto result = UR_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - options.size(), options.data(), optionVals.data())); + cuModuleLoadDataEx(&Module, static_cast(Binary), + Options.size(), Options.data(), OptionVals.data())); - const auto success = (result == UR_RESULT_SUCCESS); + const auto Success = (result == UR_RESULT_SUCCESS); - buildStatus_ = - success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + BuildStatus = + Success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; // If no exception, result is correct - return success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; + return Success ? 
UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; } /// Finds kernel names by searching for entry points in the PTX source, as the @@ -178,7 +176,7 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, ur_program_handle_t *phProgram) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_device_handle_t hDevice = hContext->get_device(); + ur_device_handle_t hDevice = hContext->getDevice(); auto pBinary = reinterpret_cast(pIL); return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, @@ -204,17 +202,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, const char *pOptions) { UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hProgram->get_context()); + ScopedContext Active(hProgram->getContext()); - hProgram->build_program(pOptions); + hProgram->buildProgram(pOptions); - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// Creates a new UR program object that is the outcome of linking all input @@ -230,44 +228,44 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, UR_ASSERT(phPrograms, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); + ScopedContext Active(hContext); - CUlinkState state; - std::unique_ptr retProgram{ + CUlinkState State; + std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext}}; - retError = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); + Result = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); try { for (size_t i = 0; i < count; ++i) { - ur_program_handle_t program = phPrograms[i]; - retError = UR_CHECK_ERROR(cuLinkAddData( - state, CU_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); + ur_program_handle_t Program = phPrograms[i]; + Result = UR_CHECK_ERROR(cuLinkAddData( + State, CU_JIT_INPUT_PTX, const_cast(Program->Binary), + Program->BinarySizeInBytes, nullptr, 0, nullptr, nullptr)); } - void *cubin = nullptr; - size_t cubinSize = 0; - retError = UR_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); + void *CuBin = nullptr; + size_t CuBinSize = 0; + Result = UR_CHECK_ERROR(cuLinkComplete(State, &CuBin, &CuBinSize)); - retError = - retProgram->set_binary(static_cast(cubin), cubinSize); + Result = + RetProgram->setBinary(static_cast(CuBin), CuBinSize); - retError = retProgram->build_program(pOptions); + Result = RetProgram->buildProgram(pOptions); } catch (...) { // Upon error attempt cleanup - UR_CHECK_ERROR(cuLinkDestroy(state)); + UR_CHECK_ERROR(cuLinkDestroy(State)); throw; } - retError = UR_CHECK_ERROR(cuLinkDestroy(state)); - *phProgram = retProgram.release(); + Result = UR_CHECK_ERROR(cuLinkDestroy(State)); + *phProgram = RetProgram.release(); - } catch (ur_result_t err) { - retError = err; + } catch (ur_result_t Err) { + Result = Err; } - return retError; + return Result; } /// Created a UR program object from a CUDA program handle. 
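For reference, a hedged caller-side sketch of the link path implemented above. Ctx, ProgA and ProgB are hypothetical handles (the programs assumed to already hold PTX binaries), and a real caller should check every result:

  ur_program_handle_t Inputs[] = {ProgA, ProgB};
  ur_program_handle_t Linked = nullptr;
  if (urProgramLink(Ctx, 2, Inputs, /*pOptions=*/nullptr, &Linked) ==
      UR_RESULT_SUCCESS) {
    // Linked wraps the CUBIN produced by cuLinkComplete and loaded via
    // cuModuleLoadDataEx inside buildProgram().
    urProgramRelease(Linked);
  }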
@@ -299,12 +297,12 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, switch (propName) { case UR_PROGRAM_BUILD_INFO_STATUS: { - return ReturnValue(hProgram->buildStatus_); + return ReturnValue(hProgram->BuildStatus); } case UR_PROGRAM_BUILD_INFO_OPTIONS: - return ReturnValue(hProgram->buildOptions_.c_str()); + return ReturnValue(hProgram->BuildOptions.c_str()); case UR_PROGRAM_BUILD_INFO_LOG: - return ReturnValue(hProgram->infoLog_, hProgram->MAX_LOG_SIZE); + return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize); default: break; } @@ -320,19 +318,19 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, switch (propName) { case UR_PROGRAM_INFO_REFERENCE_COUNT: - return ReturnValue(hProgram->get_reference_count()); + return ReturnValue(hProgram->getReferenceCount()); case UR_PROGRAM_INFO_CONTEXT: - return ReturnValue(hProgram->context_); + return ReturnValue(hProgram->Context); case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->context_->deviceId_, 1); + return ReturnValue(&hProgram->Context->DeviceID, 1); case UR_PROGRAM_INFO_SOURCE: - return ReturnValue(hProgram->binary_); + return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: - return ReturnValue(&hProgram->binarySizeInBytes_, 1); + return ReturnValue(&hProgram->BinarySizeInBytes, 1); case UR_PROGRAM_INFO_BINARIES: - return ReturnValue(&hProgram->binary_, 1); + return ReturnValue(&hProgram->Binary, 1); case UR_PROGRAM_INFO_KERNEL_NAMES: return getKernelNames(hProgram); default: @@ -344,9 +342,8 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain(ur_program_handle_t program) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(program->get_reference_count() > 0, - UR_RESULT_ERROR_INVALID_PROGRAM); - program->increment_reference_count(); + UR_ASSERT(program->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); + program->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -354,38 +351,38 @@ urProgramRetain(ur_program_handle_t program) { /// When the reference count reaches 0, it unloads the module from /// the context. UR_APIEXPORT ur_result_t UR_APICALL -urProgramRelease(ur_program_handle_t program) { - UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); +urProgramRelease(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // double delete or someone is messing with the ref count. // either way, cannot safely proceed. - UR_ASSERT(program->get_reference_count() != 0, + UR_ASSERT(hProgram->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_PROGRAM); // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { + if (hProgram->decrementReferenceCount() == 0) { - std::unique_ptr program_ptr{program}; + std::unique_ptr ProgramPtr{hProgram}; - ur_result_t result = UR_RESULT_ERROR_INVALID_PROGRAM; + ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); + ScopedContext Active(hProgram->getContext()); + auto cuModule = hProgram->get(); // "0" is a valid handle for a cuModule, so the best way to check if we // actually loaded a module and need to unload it is to look at the build // status. 
- if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_SUCCESS) { - result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); - } else if (program->buildStatus_ == UR_PROGRAM_BUILD_STATUS_NONE) { + if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + Result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } else if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_NONE) { // Nothing to free. - result = UR_RESULT_SUCCESS; + Result = UR_RESULT_SUCCESS; } } catch (...) { - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } return UR_RESULT_SUCCESS; @@ -419,13 +416,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - UR_ASSERT(hContext->get_device()->get() == hDevice->get(), + UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t retError = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; - std::unique_ptr retProgram{ + std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext}}; if (pProperties) { @@ -434,19 +431,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { return UR_RESULT_ERROR_INVALID_SIZE; } - retError = - retProgram->set_metadata(pProperties->pMetadatas, pProperties->count); + Result = + RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); } - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); auto pBinary_string = reinterpret_cast(pBinary); - retError = retProgram->set_binary(pBinary_string, size); - UR_ASSERT(retError == UR_RESULT_SUCCESS, retError); + Result = RetProgram->setBinary(pBinary_string, size); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); - *phProgram = retProgram.release(); + *phProgram = RetProgram.release(); - return retError; + return Result; } // This entry point is only used for native specialization constants (SPIR-V), @@ -462,22 +459,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( // Check if device passed is the same the device bound to the context UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hDevice == hProgram->get_context()->get_device(), + UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); UR_ASSERT(pFunctionName, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); - CUfunction func; - CUresult ret = cuModuleGetFunction(&func, hProgram->get(), pFunctionName); - *ppFunctionPointer = func; - ur_result_t retError = UR_RESULT_SUCCESS; + CUfunction Func; + CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); + *ppFunctionPointer = Func; + ur_result_t Result = UR_RESULT_SUCCESS; - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) - retError = UR_CHECK_ERROR(ret); - if (ret == CUDA_ERROR_NOT_FOUND) { + if (Ret != CUDA_SUCCESS && Ret != CUDA_ERROR_NOT_FOUND) + Result = UR_CHECK_ERROR(Ret); + if (Ret == CUDA_ERROR_NOT_FOUND) { *ppFunctionPointer = 0; - retError = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; } - return retError; + return Result; 
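  // Example of the mapping above (illustrative, not part of this patch):
  // requesting a name that is absent from the module, e.g.
  // cuModuleGetFunction(&Func, Module, "no_such_kernel"), returns
  // CUDA_ERROR_NOT_FOUND, which is reported as
  // UR_RESULT_ERROR_INVALID_FUNCTION_NAME with *ppFunctionPointer set to 0
  // rather than being treated as a hard driver error.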
} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp index 35ac6fb215ea0..6d47df5b78523 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp @@ -17,39 +17,38 @@ struct ur_program_handle_t_ { using native_type = CUmodule; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - ur_context_handle_t context_; + native_type Module; + const char *Binary; + size_t BinarySizeInBytes; + std::atomic_uint32_t RefCount; + ur_context_handle_t Context; // Metadata std::unordered_map> - kernelReqdWorkGroupSizeMD_; - std::unordered_map globalIDMD_; + KernelReqdWorkGroupSizeMD; + std::unordered_map GlobalIDMD; - constexpr static size_t MAX_LOG_SIZE = 8192u; + constexpr static size_t MaxLogSize = 8192u; - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - ur_program_build_status_t buildStatus_ = UR_PROGRAM_BUILD_STATUS_NONE; + char ErrorLog[MaxLogSize], InfoLog[MaxLogSize]; + std::string BuildOptions; + ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - ur_program_handle_t_(ur_context_handle_t ctxt); + ur_program_handle_t_(ur_context_handle_t Context); ~ur_program_handle_t_(); - ur_result_t set_metadata(const ur_program_metadata_t *metadata, - size_t length); + ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); - ur_result_t set_binary(const char *binary, size_t binarySizeInBytes); + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); - ur_result_t build_program(const char *build_options); - ur_context_handle_t get_context() const { return context_; }; + ur_result_t buildProgram(const char *BuildOptions); + ur_context_handle_t getContext() const { return Context; }; - native_type get() const noexcept { return module_; }; + native_type get() const noexcept { return Module; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 2c13c6ea29d14..82edf55612669 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -14,102 +14,101 @@ #include #include -void ur_queue_handle_t_::compute_stream_wait_for_barrier_if_needed( - CUstream stream, uint32_t stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded(CUstream Stream, + uint32_t StreamI) { + if (BarrierEvent && !ComputeAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + ComputeAppliedBarrier[StreamI] = true; } } -void ur_queue_handle_t_::transfer_stream_wait_for_barrier_if_needed( - CUstream stream, uint32_t stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - UR_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - 
transfer_applied_barrier_[stream_i] = true; +void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( + CUstream Stream, uint32_t StreamI) { + if (BarrierEvent && !TransferAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + TransferAppliedBarrier[StreamI] = true; } } -CUstream ur_queue_handle_t_::get_next_compute_stream(uint32_t *stream_token) { - uint32_t stream_i; - uint32_t token; +CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + uint32_t StreamI; + uint32_t Token; while (true) { - if (num_compute_streams_ < compute_streams_.size()) { + if (NumComputeStreams < ComputeStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); + std::lock_guard guard(ComputeStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { + // change NumComputeStreams after that + if (NumComputeStreams < ComputeStreams.size()) { UR_CHECK_ERROR( - cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); + cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags)); } } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); + Token = ComputeStreamIndex++; + StreamI = Token % ComputeStreams.size(); // if a stream has been reused before it was next selected round-robin // fashion, we want to delay its next use and instead select another one // that is more likely to have completed all the enqueued work. - if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; + if (DelayCompute[StreamI]) { + DelayCompute[StreamI] = false; } else { break; } } - if (stream_token) { - *stream_token = token; + if (StreamToken) { + *StreamToken = Token; } - CUstream res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); + CUstream res = ComputeStreams[StreamI]; + computeStreamWaitForBarrierIfNeeded(res, StreamI); return res; } -CUstream ur_queue_handle_t_::get_next_compute_stream( - uint32_t num_events_in_wait_list, const ur_event_handle_t *event_wait_list, - ur_stream_guard_ &guard, uint32_t *stream_token) { - for (uint32_t i = 0; i < num_events_in_wait_list; i++) { - uint32_t token = event_wait_list[i]->get_compute_stream_token(); - if (reinterpret_cast(event_wait_list[i]->get_queue()) == +CUstream ur_queue_handle_t_::getNextComputeStream( + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, uint32_t *StreamToken) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + uint32_t Token = EventWaitList[i]->getComputeStreamToken(); + if (reinterpret_cast(EventWaitList[i]->getQueue()) == this && - can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); + canReuseStream(Token)) { + std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - uint32_t stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; + // LastSyncComputeStreams + if (canReuseStream(Token)) { + uint32_t StreamI = Token % DelayCompute.size(); + DelayCompute[StreamI] = true; + if (StreamToken) { + *StreamToken = Token; } - guard = ur_stream_guard_{std::move(compute_sync_guard)}; - CUstream res = event_wait_list[i]->get_stream(); - 
compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + Guard = ur_stream_guard_{std::move(ComputeSyncGuard)}; + CUstream Result = EventWaitList[i]->getStream(); + computeStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; } } } - guard = {}; - return get_next_compute_stream(stream_token); + Guard = {}; + return getNextComputeStream(StreamToken); } -CUstream ur_queue_handle_t_::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); +CUstream ur_queue_handle_t_::getNextTransferStream() { + if (TransferStreams.empty()) { // for example in in-order queue + return getNextComputeStream(); } - if (num_transfer_streams_ < transfer_streams_.size()) { + if (NumTransferStreams < TransferStreams.size()) { // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); + std::lock_guard Guuard(TransferStreamMutex); // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { + // change NumTransferStreams after that + if (NumTransferStreams < TransferStreams.size()) { UR_CHECK_ERROR( - cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); + cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags)); } } - uint32_t stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - CUstream res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; + uint32_t StreamI = TransferStreamIndex++ % TransferStreams.size(); + CUstream Result = TransferStreams[StreamI]; + transferStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; } /// Creates a `ur_queue_handle_t` object on the CUDA backend. @@ -121,47 +120,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { try { - std::unique_ptr queueImpl{nullptr}; + std::unique_ptr Queue{nullptr}; UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hContext->get_device() != hDevice) { + if (hContext->getDevice() != hDevice) { *phQueue = nullptr; return UR_RESULT_ERROR_INVALID_DEVICE; } - unsigned int flags = CU_STREAM_NON_BLOCKING; - ur_queue_flags_t urFlags = 0; - bool is_out_of_order = false; + unsigned int Flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t URFlags = 0; + bool IsOutOfOrder = false; if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { - urFlags = pProps->flags; - if (urFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { - flags = CU_STREAM_DEFAULT; - } else if (urFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { - flags = 0; + URFlags = pProps->flags; + if (URFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { + Flags = CU_STREAM_DEFAULT; + } else if (URFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { + Flags = 0; } - if (urFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { - is_out_of_order = true; + if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + IsOutOfOrder = true; } } - std::vector computeCuStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_compute_streams : 1); - std::vector transferCuStreams( - is_out_of_order ? ur_queue_handle_t_::default_num_transfer_streams : 0); + std::vector ComputeCuStreams( + IsOutOfOrder ? 
ur_queue_handle_t_::DefaultNumComputeStreams : 1); + std::vector TransferCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); - queueImpl = std::unique_ptr(new ur_queue_handle_t_{ - std::move(computeCuStreams), std::move(transferCuStreams), hContext, - hDevice, flags, urFlags}); + Queue = std::unique_ptr(new ur_queue_handle_t_{ + std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, + hDevice, Flags, URFlags}); - *phQueue = queueImpl.release(); + *phQueue = Queue.release(); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { + } catch (ur_result_t Err) { - return err; + return Err; } catch (...) { @@ -171,61 +170,61 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - assert(hQueue->get_reference_count() > 0); + assert(hQueue->getReferenceCount() > 0); - hQueue->increment_reference_count(); + hQueue->incrementReferenceCount(); return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - if (hQueue->decrement_reference_count() > 0) { + if (hQueue->decrementReferenceCount() > 0) { return UR_RESULT_SUCCESS; } try { - std::unique_ptr queueImpl(hQueue); + std::unique_ptr Queue(hQueue); - if (!hQueue->backend_has_ownership()) + if (!hQueue->backendHasOwnership()) return UR_RESULT_SUCCESS; - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); - hQueue->for_each_stream([](CUstream s) { - UR_CHECK_ERROR(cuStreamSynchronize(s)); - UR_CHECK_ERROR(cuStreamDestroy(s)); + hQueue->forEachStream([](CUstream S) { + UR_CHECK_ERROR(cuStreamSynchronize(S)); + UR_CHECK_ERROR(cuStreamDestroy(S)); }); return UR_RESULT_SUCCESS; - } catch (ur_result_t err) { - return err; + } catch (ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } } UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - ScopedContext active(hQueue->get_context()); + ScopedContext active(hQueue->getContext()); - hQueue->sync_streams([&result](CUstream s) { - result = UR_CHECK_ERROR(cuStreamSynchronize(s)); + hQueue->syncStreams([&Result](CUstream s) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(s)); }); - } catch (ur_result_t err) { + } catch (ur_result_t Err) { - result = err; + Result = Err; } catch (...) 
{ - result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return result; + return Result; } // There is no CUDA counterpart for queue flushing and we don't run into the @@ -242,9 +241,9 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ScopedContext active(hQueue->get_context()); + ScopedContext Active(hQueue->getContext()); *phNativeQueue = - reinterpret_cast(hQueue->get_next_compute_stream()); + reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; } @@ -254,35 +253,35 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( ur_queue_handle_t *phQueue) { (void)pProperties; - unsigned int cuFlags; - CUstream cuStream = reinterpret_cast(hNativeQueue); - UR_ASSERT(hContext->get_device() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); + unsigned int CuFlags; + CUstream CuStream = reinterpret_cast(hNativeQueue); + UR_ASSERT(hContext->getDevice() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); - auto retErr = UR_CHECK_ERROR(cuStreamGetFlags(cuStream, &cuFlags)); + auto Return = UR_CHECK_ERROR(cuStreamGetFlags(CuStream, &CuFlags)); - ur_queue_flags_t flags = 0; - if (cuFlags == CU_STREAM_DEFAULT) - flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; - else if (cuFlags == CU_STREAM_NON_BLOCKING) - flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; + ur_queue_flags_t Flags = 0; + if (CuFlags == CU_STREAM_DEFAULT) + Flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; else sycl::detail::ur::die("Unknown cuda stream"); - std::vector computeCuStreams(1, cuStream); - std::vector transferCuStreams(0); + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); // Create queue and set num_compute_streams to 1, as computeCuStreams has // valid stream - *phQueue = new ur_queue_handle_t_{std::move(computeCuStreams), - std::move(transferCuStreams), + *phQueue = new ur_queue_handle_t_{std::move(ComputeCuStreams), + std::move(TransferCuStreams), hContext, hDevice, - cuFlags, - flags, + CuFlags, + Flags, /*backend_owns*/ false}; - (*phQueue)->num_compute_streams_ = 1; + (*phQueue)->NumComputeStreams = 1; - return retErr; + return Return; } UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, @@ -297,29 +296,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, switch (uint32_t{propName}) { case UR_QUEUE_INFO_CONTEXT: - return ReturnValue(hQueue->context_); + return ReturnValue(hQueue->Context); case UR_QUEUE_INFO_DEVICE: - return ReturnValue(hQueue->device_); + return ReturnValue(hQueue->Device); case UR_QUEUE_INFO_REFERENCE_COUNT: - return ReturnValue(hQueue->get_reference_count()); + return ReturnValue(hQueue->getReferenceCount()); case UR_QUEUE_INFO_FLAGS: - return ReturnValue(hQueue->ur_flags_); + return ReturnValue(hQueue->URFlags); case UR_QUEUE_INFO_EMPTY: { try { - bool IsReady = hQueue->all_of([](CUstream s) -> bool { - const CUresult ret = cuStreamQuery(s); - if (ret == CUDA_SUCCESS) + bool IsReady = hQueue->allOf([](CUstream S) -> bool { + const CUresult Ret = cuStreamQuery(S); + if (Ret == CUDA_SUCCESS) return true; - if (ret == CUDA_ERROR_NOT_READY) + if (Ret == CUDA_ERROR_NOT_READY) return false; - UR_CHECK_ERROR(ret); + UR_CHECK_ERROR(Ret); return false; }); return ReturnValue(IsReady); - } catch (ur_result_t err) { - return err; + } catch 
(ur_result_t Err) { + return Err; } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index daa1017d0f0aa..bfb8f6606b645 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -19,104 +19,100 @@ using ur_stream_guard_ = std::unique_lock; struct ur_queue_handle_t_ { using native_type = CUstream; - static constexpr int default_num_compute_streams = 128; - static constexpr int default_num_transfer_streams = 64; + static constexpr int DefaultNumComputeStreams = 128; + static constexpr int DefaultNumTransferStreams = 64; - std::vector compute_streams_; - std::vector transfer_streams_; + std::vector ComputeStreams; + std::vector TransferStreams; // delay_compute_ keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. When // skipped, its delay flag is cleared. - std::vector delay_compute_; + std::vector DelayCompute; // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - ur_context_handle_t_ *context_; - ur_device_handle_t_ *device_; - // ur_queue_properties_t properties_; - CUevent barrier_event_ = nullptr; - CUevent barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - ur_queue_flags_t ur_flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first + std::vector ComputeAppliedBarrier; + std::vector TransferAppliedBarrier; + ur_context_handle_t_ *Context; + ur_device_handle_t_ *Device; + CUevent BarrierEvent = nullptr; + CUevent BarrierTmpEvent = nullptr; + std::atomic_uint32_t RefCount; + std::atomic_uint32_t EventCount; + std::atomic_uint32_t ComputeStreamIndex; + std::atomic_uint32_t TransferStreamIndex; + unsigned int NumComputeStreams; + unsigned int NumTransferStreams; + unsigned int LastSyncComputeStreams; + unsigned int LastSyncTransferStreams; + unsigned int Flags; + ur_queue_flags_t URFlags; + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - bool has_ownership_; - - ur_queue_handle_t_(std::vector &&compute_streams, - std::vector &&transfer_streams, - ur_context_handle_t_ *context, ur_device_handle_t_ *device, - unsigned int flags, ur_queue_flags_t ur_flags, - bool backend_owns = true) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, - transfer_stream_idx_{0}, 
num_compute_streams_{0}, - num_transfer_streams_{0}, last_sync_compute_streams_{0}, - last_sync_transfer_streams_{0}, flags_(flags), ur_flags_(ur_flags), - has_ownership_{backend_owns} { - urContextRetain(context_); - urDeviceRetain(device_); + std::mutex ComputeStreamSyncMutex; + std::mutex ComputeStreamMutex; + std::mutex TransferStreamMutex; + std::mutex BarrierMutex; + bool HasOwnership; + + ur_queue_handle_t_(std::vector &&ComputeStreams, + std::vector &&TransferStreams, + ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, + unsigned int Flags, ur_queue_flags_t URFlags, + bool BackendOwns = true) + : ComputeStreams{std::move(ComputeStreams)}, + TransferStreams{std::move(TransferStreams)}, + DelayCompute(this->ComputeStreams.size(), false), + ComputeAppliedBarrier(this->ComputeStreams.size()), + TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, + Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, + TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, + LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), + URFlags(URFlags), HasOwnership{BackendOwns} { + urContextRetain(Context); + urDeviceRetain(Device); } ~ur_queue_handle_t_() { - urContextRelease(context_); - urDeviceRelease(device_); + urContextRelease(Context); + urDeviceRelease(Device); } - void compute_stream_wait_for_barrier_if_needed(CUstream stream, - uint32_t stream_i); - void transfer_stream_wait_for_barrier_if_needed(CUstream stream, - uint32_t stream_i); + void computeStreamWaitForBarrierIfNeeded(CUstream Strean, uint32_t StreamI); + void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); // get_next_compute/transfer_stream() functions return streams from // appropriate pools in round-robin fashion - native_type get_next_compute_stream(uint32_t *stream_token = nullptr); + native_type getNextComputeStream(uint32_t *StreamToken = nullptr); // this overload tries select a stream that was used by one of dependancies. // If that is not possible returns a new stream. 
If a stream is reused it // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(uint32_t num_events_in_wait_list, - const ur_event_handle_t *event_wait_list, - ur_stream_guard_ &guard, - uint32_t *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(uint32_t stream_token) { + native_type getNextComputeStream(uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, + uint32_t *StreamToken = nullptr); + native_type getNextTransferStream(); + native_type get() { return getNextComputeStream(); }; + + bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } - return last_sync_compute_streams_ >= stream_token; + return LastSyncComputeStreams >= StreamToken; } - bool can_reuse_stream(uint32_t stream_token) { + bool canReuseStream(uint32_t StreamToken) { // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { + if (StreamToken == std::numeric_limits::max()) { return false; } // If the command represented by the stream token was not the last command // enqueued to the stream we can not reuse the stream - we need to allow for // commands enqueued after it and the one we are about to enqueue to run // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); + bool IsLastCommand = + (ComputeStreamIndex - StreamToken) <= ComputeStreams.size(); // If there was a barrier enqueued to the queue after the command // represented by the stream token we should not reuse the stream, as we can // not take that stream into account for the bookkeeping for the next @@ -125,129 +121,124 @@ struct ur_queue_handle_t_ { // represented by the stream token is guaranteed to be complete by the // barrier before any work we are about to enqueue to the stream will start, // so the event does not need to be synchronized with. 
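    // Worked example (illustrative numbers, not taken from this patch): with
    // ComputeStreams.size() == 128, a token T == 40 and ComputeStreamIndex == 100,
    // 100 - 40 <= 128 holds, so T still denotes the last command enqueued to its
    // stream and the stream may be reused, provided LastSyncComputeStreams is
    // still below 40 (i.e. no barrier has synchronized that stream since).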
- return is_last_command && !has_been_synchronized(stream_token); + return IsLastCommand && !hasBeenSynchronized(StreamToken); } - template bool all_of(T &&f) { + template bool allOf(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) return false; } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, + F)) return false; } return true; } - template void for_each_stream(T &&f) { + template void forEachStream(T &&F) { { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); + std::lock_guard compute_guard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + for (unsigned int i = 0; i < End; i++) { + F(ComputeStreams[i]); } } { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); + std::lock_guard transfer_guard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + for (unsigned int i = 0; i < End; i++) { + F(TransferStreams[i]); } } } - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; + template void syncStreams(T &&F) { + auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( + unsigned int Start, unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); + Delay[i] = false; } }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); + auto SyncTransfer = [&F, &streams = TransferStreams](unsigned int Start, + unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(streams[i]); } }; { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard(compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? 
num_compute_streams_ - : compute_stream_idx_.load(); + unsigned int Size = static_cast(ComputeStreams.size()); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int Start = LastSyncComputeStreams; + unsigned int End = NumComputeStreams < Size ? NumComputeStreams + : ComputeStreamIndex.load(); if (ResetUsed) { - last_sync_compute_streams_ = end; + LastSyncComputeStreams = End; } - if (end - start >= size) { - sync_compute(0, size); + if (End - Start >= Size) { + SyncCompute(0, Size); } else { - start %= size; - end %= size; - if (start <= end) { - sync_compute(start, end); + Start %= Size; + End %= Size; + if (Start <= End) { + SyncCompute(Start, End); } else { - sync_compute(start, size); - sync_compute(0, end); + SyncCompute(Start, Size); + SyncCompute(0, End); } } } { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? num_transfer_streams_ - : transfer_stream_idx_.load(); + unsigned int Size = static_cast(TransferStreams.size()); + if (Size > 0) { + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size + ? NumTransferStreams + : TransferStreamIndex.load(); if (ResetUsed) { - last_sync_transfer_streams_ = end; + LastSyncTransferStreams = End; } - if (end - start >= size) { - sync_transfer(0, size); + if (End - Start >= Size) { + SyncTransfer(0, Size); } else { - start %= size; - end %= size; - if (start <= end) { - sync_transfer(start, end); + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); } else { - sync_transfer(start, size); - sync_transfer(0, end); + SyncTransfer(Start, Size); + SyncTransfer(0, End); } } } } } - ur_context_handle_t_ *get_context() const { return context_; }; + ur_context_handle_t_ *getContext() const { return Context; }; - ur_device_handle_t_ *get_device() const { return device_; }; + ur_device_handle_t_ *get_device() const { return Device; }; - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } - uint32_t get_next_event_id() noexcept { return ++eventCount_; } + uint32_t getNextEventID() noexcept { return ++EventCount; } - bool backend_has_ownership() const noexcept { return has_ownership_; } + bool backendHasOwnership() const noexcept { return HasOwnership; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp index c07f548c92a26..464bd783b4646 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -12,20 +12,20 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, ur_sampler_handle_t *phSampler) { - std::unique_ptr retImplSampl{ + std::unique_ptr Sampler{ new ur_sampler_handle_t_(hContext)}; if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - retImplSampl->props_ |= pDesc->normalizedCoords; - 
retImplSampl->props_ |= (pDesc->filterMode << 1); - retImplSampl->props_ |= (pDesc->addressingMode << 2); + Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= (pDesc->filterMode << 1); + Sampler->Props |= (pDesc->addressingMode << 2); } else { // Set default values - retImplSampl->props_ |= true; // Normalized Coords - retImplSampl->props_ |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + Sampler->Props |= true; // Normalized Coords + Sampler->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; } - *phSampler = retImplSampl.release(); + *phSampler = Sampler.release(); return UR_RESULT_SUCCESS; } @@ -37,22 +37,22 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, switch (propName) { case UR_SAMPLER_INFO_REFERENCE_COUNT: - return ReturnValue(hSampler->get_reference_count()); + return ReturnValue(hSampler->getReferenceCount()); case UR_SAMPLER_INFO_CONTEXT: - return ReturnValue(hSampler->context_); + return ReturnValue(hSampler->Context); case UR_SAMPLER_INFO_NORMALIZED_COORDS: { - bool norm_coords_prop = static_cast(hSampler->props_); - return ReturnValue(norm_coords_prop); + bool NormCoordsProp = static_cast(hSampler->Props); + return ReturnValue(NormCoordsProp); } case UR_SAMPLER_INFO_FILTER_MODE: { - auto filter_prop = - static_cast(((hSampler->props_ >> 1) & 0x1)); - return ReturnValue(filter_prop); + auto FilterProp = + static_cast(((hSampler->Props >> 1) & 0x1)); + return ReturnValue(FilterProp); } case UR_SAMPLER_INFO_ADDRESSING_MODE: { - auto addressing_prop = - static_cast(hSampler->props_ >> 2); - return ReturnValue(addressing_prop); + auto AddressingProp = + static_cast(hSampler->Props >> 2); + return ReturnValue(AddressingProp); } default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; @@ -62,7 +62,7 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) { UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - hSampler->increment_reference_count(); + hSampler->incrementReferenceCount(); return UR_RESULT_SUCCESS; } @@ -72,11 +72,11 @@ ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) { // double delete or someone is messing with the ref count. // either way, cannot safely proceed. sycl::detail::ur::assertion( - hSampler->get_reference_count() != 0, + hSampler->getReferenceCount() != 0, "Reference count overflow detected in urSamplerRelease."); // decrement ref count. If it is 0, delete the sampler. - if (hSampler->decrement_reference_count() == 0) { + if (hSampler->decrementReferenceCount() == 0) { delete hSampler; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp index 61ed98325a5ed..6dbbb124ffc3e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp @@ -14,16 +14,16 @@ /// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | /// | N/A | addressing mode | fiter mode | normalize coords | struct ur_sampler_handle_t_ { - std::atomic_uint32_t refCount_; - uint32_t props_; - ur_context_handle_t context_; + std::atomic_uint32_t RefCount; + uint32_t Props; + ur_context_handle_t Context; - ur_sampler_handle_t_(ur_context_handle_t context) - : refCount_(1), props_(0), context_(context) {} + ur_sampler_handle_t_(ur_context_handle_t Context) + : RefCount(1), Props(0), Context(Context) {} - uint32_t increment_reference_count() noexcept { return ++refCount_; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } - uint32_t decrement_reference_count() noexcept { return --refCount_; } + uint32_t decrementReferenceCount() noexcept { return --RefCount; } - uint32_t get_reference_count() const noexcept { return refCount_; } + uint32_t getReferenceCount() const noexcept { return RefCount; } }; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp index 0309d4a7b627a..67b98f5c30319 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -25,32 +25,32 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; - UR_ASSERT(urDeviceGetInfo(hContext->get_device(), + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM device allocations using a normal CUDA device pointer @@ -63,31 +63,31 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr 
*)ppMem, size)); - } catch (ur_result_t error) { - result = error; + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Implements USM Shared allocations using CUDA Managed Memory @@ -100,32 +100,32 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - size_t device_max_mem_alloc_size = 0; + size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), - static_cast(&device_max_mem_alloc_size), + static_cast(&DeviceMaxMemAllocSize), nullptr) == UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_DEVICE); - UR_ASSERT(size > 0 && size <= device_max_mem_alloc_size, + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, UR_RESULT_ERROR_INVALID_USM_SIZE); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - result = UR_CHECK_ERROR( + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR( cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), UR_RESULT_ERROR_INVALID_VALUE); - assert(result == UR_RESULT_SUCCESS && + assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); - return result; + return Result; } /// USM: Frees the given USM pointer associated with the context. 
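The three USM allocation entry points above (host, device, and shared) all validate the requested alignment with the expression (pUSMDesc->align & (pUSMDesc->align - 1)) == 0, which holds exactly when the value is zero or a power of two. Below is a minimal standalone sketch of that check; the helper name isValidUsmAlignment is illustrative only and is not part of this patch.

    // Returns true when Align is 0 (meaning "use the default alignment") or a
    // power of two, mirroring the UR_ASSERT alignment checks in the USM
    // allocation functions above.
    static bool isValidUsmAlignment(uint32_t Align) {
      return Align == 0 || (Align & (Align - 1)) == 0;
    }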
@@ -134,30 +134,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hContext); - bool is_managed; - unsigned int type; - void *attribute_values[2] = {&is_managed, &type}; - CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + ScopedContext Active(hContext); + bool IsManaged; + unsigned int Type; + void *AttributeValues[2] = {&IsManaged, &Type}; + CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - result = UR_CHECK_ERROR(cuPointerGetAttributes( - 2, attributes, attribute_values, (CUdeviceptr)pMem)); - UR_ASSERT(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST, + Result = UR_CHECK_ERROR(cuPointerGetAttributes( + 2, Attributes, AttributeValues, (CUdeviceptr)pMem)); + UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (is_managed || type == CU_MEMORYTYPE_DEVICE) { + if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) { // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed // with cuMemFree - result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); + Result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); } else { // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); + Result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } UR_APIEXPORT ur_result_t UR_APICALL @@ -167,36 +167,36 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); - ur_result_t result = UR_RESULT_SUCCESS; + ur_result_t Result = UR_RESULT_SUCCESS; UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { - ScopedContext active(hContext); + ScopedContext Active(hContext); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { - unsigned int value; + unsigned int Value; // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult ret = cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); - if (ret == CUDA_ERROR_INVALID_VALUE) { + CUresult Ret = cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (Ret == CUDA_ERROR_INVALID_VALUE) { // pointer not known to the CUDA subsystem return ReturnValue(UR_USM_TYPE_UNKNOWN); } - result = check_error_ur(ret, __func__, __LINE__ - 5, __FILE__); - if (value) { + Result = checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); + if (Value) { // pointer to managed memory return ReturnValue(UR_USM_TYPE_SHARED); } - result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); - UR_ASSERT(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST, + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(Value == CU_MEMORYTYPE_DEVICE || Value == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (value == CU_MEMORYTYPE_DEVICE) { + if (Value == CU_MEMORYTYPE_DEVICE) { // pointer to device memory return 
ReturnValue(UR_USM_TYPE_DEVICE); } - if (value == CU_MEMORYTYPE_HOST) { + if (Value == CU_MEMORYTYPE_HOST) { // pointer to host memory return ReturnValue(UR_USM_TYPE_HOST); } @@ -211,10 +211,10 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, case UR_USM_ALLOC_INFO_BASE_PTR: { #if __CUDA_API_VERSION >= 10020 // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - unsigned int value; + unsigned int Value; result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); - return ReturnValue(value); + &Value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(Value); #else return UR_RESULT_ERROR_INVALID_VALUE; #endif @@ -222,35 +222,36 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, case UR_USM_ALLOC_INFO_SIZE: { #if __CUDA_API_VERSION >= 10020 // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - unsigned int value; + unsigned int Value; result = UR_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); - return ReturnValue(value); + &Value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(Value); #else return UR_RESULT_ERROR_INVALID_VALUE; #endif } case UR_USM_ALLOC_INFO_DEVICE: { // get device index associated with this pointer - unsigned int device_idx; - result = UR_CHECK_ERROR(cuPointerGetAttribute( - &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)pMem)); + unsigned int DeviceIndex; + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &DeviceIndex, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + (CUdeviceptr)pMem)); // currently each device is in its own platform, so find the platform at // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = urPlatformGet(device_idx + 1, platforms.data(), nullptr); + std::vector Platforms; + Platforms.resize(DeviceIndex + 1); + Result = urPlatformGet(DeviceIndex + 1, Platforms.data(), nullptr); // get the device from the platform - ur_device_handle_t device = platforms[device_idx]->devices_[0].get(); - return ReturnValue(device); + ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); + return ReturnValue(Device); } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } - } catch (ur_result_t error) { - result = error; + } catch (ur_result_t Err) { + Result = Err; } - return result; + return Result; } From a0de2d72c877ae6c182f54def6817f214d8d56db Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Thu, 8 Jun 2023 09:22:28 +0100 Subject: [PATCH 42/45] [SYCL][CUDA][PI][UR] Fix PR review comments --- .../ur/adapters/cuda/context.hpp | 19 ++++----- .../ur/adapters/cuda/event.hpp | 32 +++++++-------- .../ur/adapters/cuda/kernel.cpp | 6 ++- .../ur/adapters/cuda/kernel.hpp | 12 +++--- .../ur/adapters/cuda/memory.cpp | 4 +- .../ur/adapters/cuda/memory.hpp | 15 +++---- .../ur/adapters/cuda/platform.cpp | 3 +- .../ur/adapters/cuda/program.cpp | 12 ++---- .../ur/adapters/cuda/queue.cpp | 5 +-- .../ur/adapters/cuda/queue.hpp | 40 +++++++++---------- .../ur/adapters/cuda/sampler.cpp | 6 +-- .../ur/adapters/cuda/ur_interface_loader.cpp | 6 +-- .../unified_runtime/ur/adapters/cuda/usm.cpp | 19 +++++---- 13 files changed, 85 insertions(+), 94 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp index 96103d4d52c14..e13c48fa003b9 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -21,8 +21,8 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// UR context mapping to a CUDA context object. /// -/// There is no direct mapping between a CUDA context and a UR context, -/// main differences described below: +/// There is no direct mapping between a CUDA context and a UR context. +/// The main differences are described below: /// /// CUDA context vs UR context /// @@ -32,21 +32,21 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// with a given device and control access to said device from the user side. /// UR API context are objects that are passed to functions, and not bound /// to threads. -/// The _ur_context object doesn't implement this behavior, only holds the -/// CUDA context data. The RAII object \ref ScopedContext implements the active -/// context behavior. +/// The ur_context_handle_t_ object doesn't implement this behavior. It only +/// holds the CUDA context data. The RAII object \ref ScopedContext implements +/// the active context behavior. /// /// Primary vs User-defined context /// /// CUDA has two different types of context, the Primary context, /// which is usable by all threads on a given process for a given device, and /// the aforementioned custom contexts. -/// CUDA documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the CUDA Runtime API. +/// The CUDA documentation, confirmed with performance analysis, suggests using +/// the Primary context whenever possible. +/// The Primary context is also used by the CUDA Runtime API. /// For UR applications to interop with CUDA Runtime API, they have to use /// the primary context - and make that active in the thread. -/// The `_ur_context` object can be constructed with a `kind` parameter +/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter /// that allows to construct a Primary or `user-defined` context, so that /// the UR object interface is always the same. /// @@ -56,6 +56,7 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// the PI Context can store a number of callback functions that will be /// called upon destruction of the UR Context. /// See proposal for details. +/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md /// struct ur_context_handle_t_ { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp index b1e0f939940ca..fe56c1e1ab501 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -101,7 +101,7 @@ struct ur_event_handle_t_ { uint32_t StreamToken); // This constructor is private to force programmers to use the - // makeWithNative for event introp + // makeWithNative for event interop ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative); ur_command_t CommandType; // The type of command associated with event. @@ -117,33 +117,34 @@ struct ur_event_handle_t_ { bool IsRecorded; // Signifies wether a native CUDA event has been recorded // yet. bool IsStarted; // Signifies wether the operation associated with the - // PI event has started or not + // UR event has started or not uint32_t StreamToken; uint32_t EventID; // Queue identifier of the event. - native_type EvEnd; // CUDA event handle. 
If this _pi_event represents a user - // event, this will be nullptr. + native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents + // a user event, this will be nullptr. native_type EvStart; // CUDA event handle associated with the start native_type EvQueued; // CUDA event handle associated with the time // the command was enqueued - ur_queue_handle_t Queue; // pi_queue associated with the event. If this is a - // user event, this will be nullptr. + ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If + // this is a user event, this will be nullptr. CUstream Stream; // CUstream associated with the event. If this is a user // event, this will be uninitialized. - ur_context_handle_t Context; // pi_context associated with the event. If this - // is a native event, this will be the same - // context associated with the queue_ member. + ur_context_handle_t Context; // ur_context_handle_t associated with the event. + // If this is a native event, this will be the + // same context associated with the queue member. }; -// Iterates over the event wait list, returns correct ur_result_t error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a ur_result_t. +// Iterate over `event_wait_list` and apply the given callback `f` to the +// latest event on each queue therein. The callback must take a single +// ur_event_handle_t argument and return a ur_result_t. If the callback returns +// an error, the iteration terminates and the error is returned. template ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, std::size_t NumEventsInWaitList, Func &&F) { @@ -169,14 +170,13 @@ ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList, Event0->getEventID() > Event1->getEventID()); }); - bool First = true; CUstream LastSeenStream = 0; - for (ur_event_handle_t Event : Events) { - if (!Event || (!First && Event->getStream() == LastSeenStream)) { + for (size_t i = 0; i < Events.size(); i++) { + auto Event = Events[i]; + if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { continue; } - First = false; LastSeenStream = Event->getStream(); auto Result = F(Event); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp index f3c05e016e441..69c02392fa522 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -66,7 +66,6 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - // Here we want to query about a kernel's cuda blocks! 
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { @@ -356,6 +355,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( ur_program_handle_t hProgram, const ur_kernel_native_properties_t *pProperties, ur_kernel_handle_t *phKernel) { + std::ignore = hNativeKernel; + std::ignore = hContext; + std::ignore = hProgram; + std::ignore = pProperties; + std::ignore = phKernel; return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp index 3707cab1d1e0f..040f74ba6b403 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp @@ -23,7 +23,7 @@ /// invocation. This is not the case of CUFunction objects, /// which are simply passed together with the arguments on the invocation. /// The UR Kernel implementation for CUDA stores the list of arguments, -/// argument sizes and offsets to emulate the interface of UR Kernel, +/// argument sizes, and offsets to emulate the interface of UR Kernel, /// saving the arguments for the later dispatch. /// Note that in UR API, the Local memory is specified as a size per /// individual argument, but in CUDA only the total usage of shared @@ -31,7 +31,6 @@ /// A compiler pass converts the UR API local memory model into the /// CUDA shared model. This object simply calculates the total of /// shared memory, and the initial offsets of each parameter. -/// struct ur_kernel_handle_t_ { using native_type = CUfunction; @@ -68,7 +67,7 @@ struct ur_kernel_handle_t_ { Indices.emplace_back(&ImplicitOffsetArgs); } - /// Adds an argument to the kernel. + /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. @@ -104,8 +103,9 @@ struct ur_kernel_handle_t_ { // align the argument size_t AlignedLocalOffset = LocalOffset; - if (LocalOffset % Alignment != 0) { - AlignedLocalOffset += Alignment - (LocalOffset % Alignment); + size_t Pad = LocalOffset % Alignment; + if (Pad != 0) { + AlignedLocalOffset += Alignment - Pad; } addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), @@ -171,7 +171,7 @@ struct ur_kernel_handle_t_ { const char *getName() const noexcept { return Name.c_str(); } - /// Returns the number of arguments, excluding the implicit global offset. + /// Get the number of kernel arguments, excluding the implicit global offset. /// Note this only returns the current known number of arguments, not the /// real one required by the kernel, since this cannot be queried from /// the CUDA Driver API diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp index b88d5307f4711..c8ecf9d5ddf12 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -14,7 +14,7 @@ /// Creates a UR Memory object using a CUDA memory allocation. /// Can trigger a manual copy depending on the mode. -/// \TODO Implement USE_HOST_PTR using cuHostRegister +/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789 /// UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, @@ -109,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { /// Decreases the reference count of the Mem object. 
/// If this is zero, calls the relevant CUDA Free function /// \return UR_RESULT_SUCCESS unless deallocation error -/// UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -435,7 +434,6 @@ urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType, /// Implements a buffer partition in the CUDA backend. /// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented /// as an offset over an existing CUDA allocation. -/// UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( ur_mem_handle_t hBuffer, ur_mem_flags_t flags, ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp index a1b484e3212bf..a986607a65d5e 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -18,7 +18,7 @@ /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. struct ur_mem_handle_t_ { - // Context where the memory object is accessibles + // Context where the memory object is accessible ur_context_handle_t Context; /// Reference counting of the handler @@ -31,7 +31,7 @@ struct ur_mem_handle_t_ { /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces + /// as pointer-like structs, "Images" are stored in Textures or Surfaces. /// This union allows implementation to use either from the same handler. union MemImpl { // Handler for plain, pointer-based CUDA allocations @@ -80,7 +80,6 @@ struct ur_mem_handle_t_ { /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the CUDA allocation. - /// void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { assert(MapPtr == nullptr); MapOffset = Offset; @@ -152,7 +151,6 @@ struct ur_mem_handle_t_ { ur_mem_type_t ImageType, void *HostPtr) : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags} { - // Ignore unused parameter (void)HostPtr; Mem.SurfaceMem.Array = Array; @@ -162,16 +160,13 @@ struct ur_mem_handle_t_ { } ~ur_mem_handle_t_() { - if (MemType == Type::Buffer) { - if (isSubBuffer()) { - urMemRelease(Mem.BufferMem.Parent); - return; - } + if (isBuffer() && isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); + return; } urContextRelease(Context); } - // TODO: Move as many shared funcs up as possible bool isBuffer() const noexcept { return MemType == Type::Buffer; } bool isSubBuffer() const noexcept { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp index f28f76c2a95df..c0150df284cc5 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -56,7 +56,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// /// However because multiple devices in a context is not currently supported, /// place each device in a separate platform. 
-/// UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -183,7 +182,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) { return UR_RESULT_SUCCESS; } -// Returns plugin specific backend option. +// Get CUDA plugin specific backend option. // Current support is only for optimization options. // Return empty string for cuda. // TODO: Determine correct string to be passed. diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp index ce8d7c705ae83..f359b24eb68b6 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -159,7 +159,7 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { /// CUDA driver API doesn't expose an operation for this. /// Note: This is currently only being used by the SYCL program class for the /// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use cuModuleGetFunction to check for a kernel. +/// query to UR and use cuModuleGetFunction to check for a kernel. /// Note: Another alternative is to add kernel names as metadata, like with /// reqd_work_group_size. ur_result_t getKernelNames(ur_program_handle_t) { @@ -169,7 +169,6 @@ ur_result_t getKernelNames(ur_program_handle_t) { /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. /// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in /// terms of CUDA adapter. See \ref urProgramCreateWithBinary. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, size_t length, const ur_program_properties_t *pProperties, @@ -186,7 +185,6 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, /// CUDA will handle the PTX/CUBIN binaries internally through a call to /// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent /// in terms of CUDA adapter. \TODO Implement asynchronous compilation -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { @@ -196,7 +194,6 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, /// Loads the images from a UR program into a CUmodule that can be /// used later on to extract functions (kernels). /// See \ref ur_program_handle_t for implementation details. -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { @@ -218,7 +215,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, /// Creates a new UR program object that is the outcome of linking all input /// programs. /// \TODO Implement linker options, requires mapping of OpenCL to CUDA -/// UR_APIEXPORT ur_result_t UR_APICALL urProgramLink(ur_context_handle_t hContext, uint32_t count, const ur_program_handle_t *phPrograms, const char *pOptions, @@ -390,10 +386,10 @@ urProgramRelease(ur_program_handle_t hProgram) { /// Gets the native CUDA handle of a UR program object /// -/// \param[in] program The PI program to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. +/// \param[in] program The UR program handle to get the native CUDA object of. 
+/// \param[out] nativeHandle Set to the native handle of the UR program object. /// -/// \return TBD +/// \return ur_result_t UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( ur_program_handle_t program, ur_native_handle_t *nativeHandle) { UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp index 82edf55612669..7eac0144f1e21 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -115,7 +115,6 @@ CUstream ur_queue_handle_t_::getNextTransferStream() { /// Valid properties /// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT /// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -/// UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { @@ -294,7 +293,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); - switch (uint32_t{propName}) { + switch (propName) { case UR_QUEUE_INFO_CONTEXT: return ReturnValue(hQueue->Context); case UR_QUEUE_INFO_DEVICE: @@ -324,7 +323,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, } } default: - break; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp index bfb8f6606b645..5b37f750cb520 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -84,7 +84,7 @@ struct ur_queue_handle_t_ { // get_next_compute/transfer_stream() functions return streams from // appropriate pools in round-robin fashion native_type getNextComputeStream(uint32_t *StreamToken = nullptr); - // this overload tries select a stream that was used by one of dependancies. + // this overload tries to select a stream that was used by one of dependencies. // If that is not possible returns a new stream. If a stream is reused it // returns a lock that needs to remain locked as long as the stream is in use native_type getNextComputeStream(uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, ur_stream_guard_ &Guard, uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; @@ -203,26 +203,26 @@ struct ur_queue_handle_t_ { } { unsigned int Size = static_cast(TransferStreams.size()); - if (Size > 0) { - std::lock_guard TransferGuard(TransferStreamMutex); - unsigned int Start = LastSyncTransferStreams; - unsigned int End = NumTransferStreams < Size - ? NumTransferStreams - : TransferStreamIndex.load(); - if (ResetUsed) { - LastSyncTransferStreams = End; - } - if (End - Start >= Size) { - SyncTransfer(0, Size); + if (!Size) { + return; + } + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size ? 
NumTransferStreams + : TransferStreamIndex.load(); + if (ResetUsed) { + LastSyncTransferStreams = End; + } + if (End - Start >= Size) { + SyncTransfer(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); } else { - Start %= Size; - End %= Size; - if (Start <= End) { - SyncTransfer(Start, End); - } else { - SyncTransfer(Start, Size); - SyncTransfer(0, End); - } + SyncTransfer(Start, Size); + SyncTransfer(0, End); } } } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp index 464bd783b4646..decb3c1fd519a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -17,8 +17,8 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext, if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { Sampler->Props |= pDesc->normalizedCoords; - Sampler->Props |= (pDesc->filterMode << 1); - Sampler->Props |= (pDesc->addressingMode << 2); + Sampler->Props |= pDesc->filterMode << 1; + Sampler->Props |= pDesc->addressingMode << 2; } else { // Set default values Sampler->Props |= true; // Normalized Coords @@ -46,7 +46,7 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler, } case UR_SAMPLER_INFO_FILTER_MODE: { auto FilterProp = - static_cast(((hSampler->Props >> 1) & 0x1)); + static_cast((hSampler->Props >> 1) & 0x1); return ReturnValue(FilterProp); } case UR_SAMPLER_INFO_ADDRESSING_MODE: { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index f0eb6008d8a36..c7258ad241373 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -14,11 +14,11 @@ namespace { // TODO - this is a duplicate of what is in the L0 plugin // We should move this to somewhere common ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { - if (nullptr == pDdiTable) { + if (pDdiTable == nullptr) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - // Pre 1.0 we enforce loader and adapter must have same version. - // Post 1.0 only major version match should be required. + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. 
if (version != UR_API_VERSION_CURRENT) { return UR_RESULT_ERROR_UNSUPPORTED_VERSION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp index 67b98f5c30319..7584e79a7c774 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -18,7 +18,7 @@ #include /// USM: Implements USM Host allocations using CUDA Pinned Memory -/// +/// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t size, void **ppMem) { @@ -62,6 +62,9 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, @@ -77,11 +80,8 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, ScopedContext Active(hContext); Result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); } catch (ur_result_t Err) { - Result = Err; + return Err; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || @@ -99,6 +99,9 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); size_t DeviceMaxMemAllocSize = 0; UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, @@ -115,11 +118,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, Result = UR_CHECK_ERROR( cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); } catch (ur_result_t Err) { - Result = Err; + return Err; } - UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || - ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), - UR_RESULT_ERROR_INVALID_VALUE); assert(Result == UR_RESULT_SUCCESS && (!pUSMDesc || pUSMDesc->align == 0 || @@ -206,7 +206,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, #else __builtin_unreachable(); #endif - return ReturnValue(UR_USM_TYPE_UNKNOWN); } case UR_USM_ALLOC_INFO_BASE_PTR: { #if __CUDA_API_VERSION >= 10020 From 2a50972775de4337226548e2392338c39030e08d Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 9 Jun 2023 14:44:42 +0100 Subject: [PATCH 43/45] [SYCL][CUDA] Tidy CMakeLists.txt --- sycl/plugins/cuda/CMakeLists.txt | 14 +++++++------- sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index 70e4e1a200e1a..2570b6f7e7348 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -64,21 +64,21 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/enqueue.cpp" 
"../unified_runtime/ur/adapters/cuda/event.cpp" "../unified_runtime/ur/adapters/cuda/event.hpp" + "../unified_runtime/ur/adapters/cuda/kernel.cpp" + "../unified_runtime/ur/adapters/cuda/kernel.hpp" + "../unified_runtime/ur/adapters/cuda/memory.cpp" + "../unified_runtime/ur/adapters/cuda/memory.hpp" "../unified_runtime/ur/adapters/cuda/platform.cpp" "../unified_runtime/ur/adapters/cuda/platform.hpp" "../unified_runtime/ur/adapters/cuda/program.cpp" "../unified_runtime/ur/adapters/cuda/program.hpp" - "../unified_runtime/ur/adapters/cuda/kernel.cpp" - "../unified_runtime/ur/adapters/cuda/kernel.hpp" - "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/queue.cpp" + "../unified_runtime/ur/adapters/cuda/queue.hpp" "../unified_runtime/ur/adapters/cuda/sampler.cpp" "../unified_runtime/ur/adapters/cuda/sampler.hpp" - "../unified_runtime/ur/adapters/cuda/usm.cpp" - "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" "../unified_runtime/ur/adapters/cuda/tracing.cpp" - "../unified_runtime/ur/adapters/cuda/memory.cpp" - "../unified_runtime/ur/adapters/cuda/memory.hpp" + "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + "../unified_runtime/ur/adapters/cuda/usm.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 9ceb01b670b98..372d7b5f82910 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -140,21 +140,21 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/cuda/enqueue.cpp" "ur/adapters/cuda/event.cpp" "ur/adapters/cuda/event.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" "ur/adapters/cuda/platform.cpp" "ur/adapters/cuda/platform.hpp" "ur/adapters/cuda/program.cpp" "ur/adapters/cuda/program.hpp" - "ur/adapters/cuda/kernel.cpp" - "ur/adapters/cuda/kernel.hpp" "ur/adapters/cuda/queue.cpp" "ur/adapters/cuda/queue.hpp" "ur/adapters/cuda/sampler.cpp" "ur/adapters/cuda/sampler.hpp" - "ur/adapters/cuda/memory.cpp" - "ur/adapters/cuda/memory.hpp" - "ur/adapters/cuda/usm.cpp" - "ur/adapters/cuda/ur_interface_loader.cpp" "ur/adapters/cuda/tracing.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/usm.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES From c39e7942f554b77f9a2c6b547ec659000bf63fb5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Mon, 12 Jun 2023 16:41:37 +0100 Subject: [PATCH 44/45] Fix various build warnings --- .../ur/adapters/cuda/context.cpp | 9 +++++++-- .../ur/adapters/cuda/device.cpp | 1 + .../ur/adapters/cuda/enqueue.cpp | 20 ++++++++++++------- .../ur/adapters/cuda/kernel.cpp | 1 + .../ur/adapters/cuda/kernel.hpp | 5 +++-- .../ur/adapters/cuda/memory.cpp | 16 +++++++-------- .../ur/adapters/cuda/program.cpp | 6 +++--- .../ur/adapters/cuda/queue.cpp | 1 + .../ur/adapters/cuda/queue.hpp | 2 +- 9 files changed, 38 insertions(+), 23 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp index c922e8a3ddad6..74a32bdac2748 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -22,6 +22,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) 
{ + std::ignore = DeviceCount; + std::ignore = pProperties; UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -132,8 +134,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, ur_context_handle_t *phContext) { - (void)hNativeContext; - (void)phContext; + std::ignore = hNativeContext; + std::ignore = numDevices; + std::ignore = phDevices; + std::ignore = pProperties; + std::ignore = phContext; return UR_RESULT_ERROR_INVALID_OPERATION; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp index c3028a58717c6..51ceab14db3d2 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -1098,6 +1098,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, const ur_device_native_properties_t *pProperties, ur_device_handle_t *phDevice) { + std::ignore = pProperties; UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index ef87dab96d2fa..242a419407030 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -561,7 +561,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( Result = commonEnqueueMemBufferCopyRect( CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, - hostRowPitch, bufferSlicePitch); + hostRowPitch, hostSlicePitch); if (phEvent) { Result = RetImplEvent->record(); @@ -905,8 +905,11 @@ static ur_result_t commonEnqueueMemImageNDCopy( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, - size_t phEventWaitListslicePitch, void *pDst, uint32_t numEventsInWaitList, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, @@ -972,6 +975,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = blockingWrite; + std::ignore = rowPitch; + std::ignore = slicePitch; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, @@ -1456,10 +1463,8 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, // TODO: Implement this. Remember to return true for // PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D(
-    ur_queue_handle_t hQueue, void *pMem, size_t pitch, size_t patternSize,
-    const void *pPattern, size_t width, size_t height,
-    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
-    ur_event_handle_t *phEvent) {
+    ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t,
+    uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
@@ -1484,7 +1489,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
 
     // Determine the direction of copy using cuPointerGetAttribute
     // for both the SrcPtr and DstPtr
-    CUDA_MEMCPY2D CpyDesc = {0};
+    CUDA_MEMCPY2D CpyDesc = {};
+    memset(&CpyDesc, 0, sizeof(CpyDesc));
 
     getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice,
                           &CpyDesc.srcHost);
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
index 69c02392fa522..e1d6f9f9a2cd3 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp
@@ -338,6 +338,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName,
                     size_t propSize, const void *pPropValue) {
+  std::ignore = propSize;
   UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(pPropValue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
   switch (propName) {
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
index 040f74ba6b403..8b6a617126b08 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
@@ -137,11 +137,12 @@ struct ur_kernel_handle_t_ {
     urProgramRetain(Program);
     urContextRetain(Context);
     /// Note: this code assumes that there is only one device per context
-    ur_result_t retError = urKernelGetGroupInfo(
+    ur_result_t RetError = urKernelGetGroupInfo(
         this, Context->getDevice(),
         UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
         sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
-    assert(retError == UR_RESULT_SUCCESS);
+    (void)RetError;
+    assert(RetError == UR_RESULT_SUCCESS);
   }
 
   ~ur_kernel_handle_t_() {
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
index c8ecf9d5ddf12..b19acea3159f2 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp
@@ -221,15 +221,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle(
-    ur_native_handle_t hNativeMem, ur_context_handle_t hContext,
-    const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) {
+    ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle(
-    ur_native_handle_t hNativeMem, ur_context_handle_t hContext,
-    const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
-    const ur_mem_native_properties_t *pProperties, ur_mem_handle_t *phMem) {
+    ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *,
+    const ur_image_desc_t *, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
@@ -425,9 +425,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
 }
 
 /// \TODO Not implemented
-UR_APIEXPORT ur_result_t UR_APICALL
-urMemImageGetInfo(ur_mem_handle_t hMemory, ur_image_info_t ImgInfoType,
-                  size_t propSize, void *pImgInfo, size_t *pPropSizeRet) {
+UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t,
+                                                      ur_image_info_t, size_t,
+                                                      void *, size_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
index f359b24eb68b6..e7467af0b8cbf 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp
@@ -197,6 +197,7 @@ urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramBuild(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                const char *pOptions) {
+  std::ignore = hContext;
   UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
@@ -274,9 +275,8 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
 ///
 /// \return TBD
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle(
-    ur_native_handle_t hNativeProgram, ur_context_handle_t hContext,
-    const ur_program_native_properties_t *pProperties,
-    ur_program_handle_t *phProgram) {
+    ur_native_handle_t, ur_context_handle_t,
+    const ur_program_native_properties_t *, ur_program_handle_t *) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
index 7eac0144f1e21..1aded75fb0741 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp
@@ -237,6 +237,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc,
                        ur_native_handle_t *phNativeQueue) {
+  std::ignore = pDesc;
   UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER);
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
index 5b37f750cb520..69232efcc77e6 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp
@@ -180,7 +180,7 @@ struct ur_queue_handle_t_ {
     };
     {
      unsigned int Size = static_cast<unsigned int>(ComputeStreams.size());
-      std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex);
+      std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex);
       std::lock_guard ComputeGuard(ComputeStreamMutex);
       unsigned int Start = LastSyncComputeStreams;
       unsigned int End = NumComputeStreams < Size ? NumComputeStreams

From b64fcbd5be136f2fb32d45f1e4d2adc8d983818d Mon Sep 17 00:00:00 2001
From: Callum Fare
Date: Wed, 14 Jun 2023 10:26:13 +0100
Subject: [PATCH 45/45] Address more review feedback

---
 sycl/plugins/unified_runtime/CMakeLists.txt |  5 +++++
 .../ur/adapters/cuda/device.cpp             |  2 +-
 .../ur/adapters/cuda/enqueue.cpp            |  6 +++---
 .../ur/adapters/cuda/platform.cpp           | 10 +++++-----
 .../ur/adapters/cuda/sampler.cpp            | 18 ++++++++++--------
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 372d7b5f82910..8cff5b2848b0f 100755
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -162,6 +162,11 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS)
       Threads::Threads
       cudadrv
   )
+
+  set_target_properties("ur_adapter_cuda" PROPERTIES
+    VERSION "0.0.0"
+    SOVERSION "0"
+  )
 endif()
 
 if (TARGET UnifiedRuntimeLoader)
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
index 51ceab14db3d2..c364c6f384a49 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp
@@ -1026,7 +1026,7 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *,
 
 /// \return UR_RESULT_SUCCESS always since CUDA devices are always root
 ///         devices.
-UR_DLLEXPORT ur_result_t UR_APICALL
+UR_APIEXPORT ur_result_t UR_APICALL
 urDeviceRelease(ur_device_handle_t hDevice) {
   UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
index 242a419407030..8dbd6ee2a27fe 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
@@ -223,7 +223,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
 /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued
 /// wait will wait on all previous events in the queue.
 ///
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
     ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   // This function makes one stream work on the previous work (or work
@@ -303,14 +303,14 @@ UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
 /// TODO: Add support for multiple streams once the Event class is properly
 /// refactored.
 ///
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
     ur_queue_handle_t hQueue, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
                                         phEventWaitList, phEvent);
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
     const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
index c0150df284cc5..600512d0b01c7 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp
@@ -18,7 +18,7 @@
 void enableCUDATracing();
 void disableCUDATracing();
 
-UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
     ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType,
     size_t Size, void *pPlatformInfo, size_t *pSizeRet) {
 
@@ -56,7 +56,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
 ///
 /// However because multiple devices in a context is not currently supported,
 /// place each device in a separate platform.
-UR_DLLEXPORT ur_result_t UR_APICALL
+UR_APIEXPORT ur_result_t UR_APICALL
 urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
               uint32_t *pNumPlatforms) {
 
@@ -163,7 +163,7 @@ urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms,
   }
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
+UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
     ur_platform_handle_t hDriver, ur_api_version_t *pVersion) {
   UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER);
@@ -172,12 +172,12 @@ UR_DLLEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion(
   return UR_RESULT_SUCCESS;
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) {
+UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) {
   enableCUDATracing();
   return UR_RESULT_SUCCESS;
 }
 
-UR_DLLEXPORT ur_result_t UR_APICALL urTearDown(void *) {
+UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) {
   disableCUDATracing();
   return UR_RESULT_SUCCESS;
 }
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
index decb3c1fd519a..36ec89fb9da3c 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp
@@ -9,9 +9,9 @@
 #include "sampler.hpp"
 #include "common.hpp"
 
-ur_result_t urSamplerCreate(ur_context_handle_t hContext,
-                            const ur_sampler_desc_t *pDesc,
-                            ur_sampler_handle_t *phSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc,
+                ur_sampler_handle_t *phSampler) {
   std::unique_ptr<ur_sampler_handle_t_> Sampler{
       new ur_sampler_handle_t_(hContext)};
 
@@ -29,9 +29,9 @@ ur_result_t urSamplerCreate(ur_context_handle_t hContext,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler,
-                             ur_sampler_info_t propName, size_t propValueSize,
-                             void *pPropValue, size_t *pPropSizeRet) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName,
+                 size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet);
@@ -60,13 +60,15 @@ ur_result_t urSamplerGetInfo(ur_sampler_handle_t hSampler,
   return {};
 }
 
-ur_result_t urSamplerRetain(ur_sampler_handle_t hSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerRetain(ur_sampler_handle_t hSampler) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
   hSampler->incrementReferenceCount();
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t urSamplerRelease(ur_sampler_handle_t hSampler) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urSamplerRelease(ur_sampler_handle_t hSampler) {
   UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
 
   // double delete or someone is messing with the ref count.