From 444338ea55ba15057bb2bb5f295841ec78c0a055 Mon Sep 17 00:00:00 2001
From: Nicolas Miller <nicolas.miller@codeplay.com>
Date: Thu, 10 Feb 2022 11:28:21 +0000
Subject: [PATCH 1/9] [SYCL][CUDA] Add context interop specialization

---
 sycl/include/CL/sycl/backend.hpp              |   4 +
 .../backend/backend_traits_cuda.hpp           | 144 ++++++++++++++++++
 .../ext/oneapi/experimental/backend/cuda.hpp  |  36 +++++
 3 files changed, 184 insertions(+)
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
 create mode 100644 sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
diff --git a/sycl/include/CL/sycl/backend.hpp b/sycl/include/CL/sycl/backend.hpp
index 2954217f1ca64..2a228b66805c8 100644
--- a/sycl/include/CL/sycl/backend.hpp
+++ b/sycl/include/CL/sycl/backend.hpp
@@ -18,8 +18,12 @@
 #include <CL/sycl/detail/backend_traits_opencl.hpp>
 #endif
 #if SYCL_EXT_ONEAPI_BACKEND_CUDA
+#ifdef SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL
+#include <sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp>
+#else
 #include <CL/sycl/detail/backend_traits_cuda.hpp>
 #endif
+#endif
 #if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
 #include <CL/sycl/detail/backend_traits_level_zero.hpp>
 #endif
diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
new file mode 100644
index 0000000000000..3d30043a3aa1e
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
@@ -0,0 +1,144 @@
+//===------- backend_traits_cuda.hpp - Backend traits for CUDA ---*-C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the specializations of the sycl::detail::interop,
+// sycl::detail::BackendInput and sycl::detail::BackendReturn class templates
+// for the CUDA backend but there is no sycl::detail::InteropFeatureSupportMap
+// specialization for the CUDA backend.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/accessor.hpp>
+#include <CL/sycl/context.hpp>
+#include <CL/sycl/detail/backend_traits.hpp>
+#include <CL/sycl/device.hpp>
+#include <CL/sycl/event.hpp>
+#include <CL/sycl/kernel_bundle.hpp>
+#include <CL/sycl/queue.hpp>
+
+#include <vector>
+
+typedef int CUdevice;
+typedef struct CUctx_st *CUcontext;
+typedef struct CUstream_st *CUstream;
+typedef struct CUevent_st *CUevent;
+typedef struct CUmod_st *CUmodule;
+
+// As defined in the CUDA 10.1 header file. This requires CUDA version > 3.2
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr;
+#else
+typedef unsigned int CUdeviceptr;
+#endif
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+
+// TODO the interops for context, device, event, platform and program
+// may be removed after removing the deprecated 'get_native()' methods
+// from the corresponding classes. The interop<backend, queue> specialization
+// is also used in the get_queue() method of the deprecated class
+// interop_handler and also can be removed after API cleanup.
+template <> struct interop<backend::ext_oneapi_cuda, context> {
+  using type = CUcontext;
+};
+
+template <> struct interop<backend::ext_oneapi_cuda, device> {
+  using type = CUdevice;
+};
+
+template <> struct interop<backend::ext_oneapi_cuda, event> {
+  using type = CUevent;
+};
+
+template <> struct interop<backend::ext_oneapi_cuda, queue> {
+  using type = CUstream;
+};
+
+#ifdef __SYCL_INTERNAL_API
+template <> struct interop<backend::ext_oneapi_cuda, program> {
+  using type = CUmodule;
+};
+#endif
+
+// TODO the interops for accessor is used in the already deprecated class
+// interop_handler and can be removed after API cleanup.
+template <typename DataT, int Dimensions, access::mode AccessMode>
+struct interop<backend::ext_oneapi_cuda,
+               accessor<DataT, Dimensions, AccessMode, access::target::device,
+                        access::placeholder::false_t>> {
+  using type = CUdeviceptr;
+};
+
+template <typename DataT, int Dimensions, access::mode AccessMode>
+struct interop<
+    backend::ext_oneapi_cuda,
+    accessor<DataT, Dimensions, AccessMode, access::target::constant_buffer,
+             access::placeholder::false_t>> {
+  using type = CUdeviceptr;
+};
+
+template <typename DataT, int Dimensions, typename AllocatorT>
+struct BackendInput<backend::ext_oneapi_cuda,
+                    buffer<DataT, Dimensions, AllocatorT>> {
+  using type = CUdeviceptr;
+};
+
+template <typename DataT, int Dimensions, typename AllocatorT>
+struct BackendReturn<backend::ext_oneapi_cuda,
+                     buffer<DataT, Dimensions, AllocatorT>> {
+  using type = CUdeviceptr;
+};
+
+template <> struct BackendInput<backend::ext_oneapi_cuda, context> {
+  using type = CUcontext;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, context> {
+  using type = std::vector<CUcontext>;
+};
+
+template <> struct BackendInput<backend::ext_oneapi_cuda, device> {
+  using type = CUdevice;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, device> {
+  using type = CUdevice;
+};
+
+template <> struct BackendInput<backend::ext_oneapi_cuda, event> {
+  using type = CUevent;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, event> {
+  using type = CUevent;
+};
+
+template <> struct BackendInput<backend::ext_oneapi_cuda, queue> {
+  using type = CUstream;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, queue> {
+  using type = CUstream;
+};
+
+#ifdef __SYCL_INTERNAL_API
+template <> struct BackendInput<backend::ext_oneapi_cuda, program> {
+  using type = CUmodule;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, program> {
+  using type = CUmodule;
+};
+#endif
+
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
new file mode 100644
index 0000000000000..519a0f21714ad
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
@@ -0,0 +1,36 @@
+//==--------- cuda.hpp - SYCL CUDA backend ---------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/backend.hpp>
+#include <CL/sycl/context.hpp>
+
+#include <vector>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+
+// CUDA context specialization
+template <>
+auto get_native<backend::ext_oneapi_cuda, context>(const context &C)
+    -> backend_return_t<backend::ext_oneapi_cuda, context> {
+  // create a vector to be returned
+  backend_return_t<backend::ext_oneapi_cuda, context> ret;
+
+  // get the native CUDA context from the SYCL object
+  auto native = reinterpret_cast<
+      backend_return_t<backend::ext_oneapi_cuda, context>::value_type>(
+      C.getNative());
+  ret.push_back(native);
+
+  return ret;
+}
+
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)

From 195b96916ce3ce02e88ab4cb41f7317591047185 Mon Sep 17 00:00:00 2001
From: Nicolas Miller <nicolas.miller@codeplay.com>
Date: Thu, 17 Feb 2022 16:45:49 +0000
Subject: [PATCH 2/9] [SYCL] Unify interop implementations

---
 sycl/include/CL/sycl/context.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/context.hpp b/sycl/include/CL/sycl/context.hpp
index a78e30f385f16..87cbc21808e03 100644
--- a/sycl/include/CL/sycl/context.hpp
+++ b/sycl/include/CL/sycl/context.hpp
@@ -27,6 +27,8 @@ class platform;
 namespace detail {
 class context_impl;
 }
+template <backend Backend, class SyclT>
+auto get_native(const SyclT &Obj) -> backend_return_t<Backend, SyclT>;
 
 /// The context class represents a SYCL context on which kernel functions may
 /// be executed.
@@ -220,7 +222,7 @@ class __SYCL_EXPORT context {
   template <backend Backend>
   __SYCL_DEPRECATED("Use SYCL 2020 sycl::get_native free function")
   backend_return_t<Backend, context> get_native() const {
-    return reinterpret_cast<backend_return_t<Backend, context>>(getNative());
+    return sycl::get_native<Backend, context>(*this);
   }
 
 private:
@@ -230,6 +232,10 @@ class __SYCL_EXPORT context {
   pi_native_handle getNative() const;
 
   std::shared_ptr<detail::context_impl> impl;
+
+  template <backend Backend, class SyclT>
+  friend auto get_native(const SyclT &Obj) -> backend_return_t<Backend, SyclT>;
+
   template <class Obj>
   friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject);
 

From efec76c569099228c4167dfa25baa45d0adf3de0 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Fri, 18 Feb 2022 11:55:19 +0000
Subject: [PATCH 3/9] add experimental platform and device interop

---
 .../backend/backend_traits_cuda.hpp           | 23 +++++
 .../ext/oneapi/experimental/backend/cuda.hpp  | 49 ++++++++++
 sycl/plugins/cuda/pi_cuda.cpp                 | 98 ++++++++++++++++++-
 sycl/source/backend.cpp                       |  2 +
 sycl/source/detail/pi.cpp                     |  1 +
 5 files changed, 168 insertions(+), 5 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
index 3d30043a3aa1e..c460d2f854f67 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
@@ -62,6 +62,10 @@ template <> struct interop<backend::ext_oneapi_cuda, queue> {
   using type = CUstream;
 };
 
+template <> struct interop<backend::ext_oneapi_cuda, platform> {
+  using type = std::vector<CUdevice>;
+};
+
 #ifdef __SYCL_INTERNAL_API
 template <> struct interop<backend::ext_oneapi_cuda, program> {
   using type = CUmodule;
@@ -129,6 +133,14 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, queue> {
   using type = CUstream;
 };
 
+template <> struct BackendInput<backend::ext_oneapi_cuda, platform> {
+  using type = std::vector<CUdevice>;
+};
+
+template <> struct BackendReturn<backend::ext_oneapi_cuda, platform> {
+  using type = std::vector<CUdevice>;
+};
+
 #ifdef __SYCL_INTERNAL_API
 template <> struct BackendInput<backend::ext_oneapi_cuda, program> {
   using type = CUmodule;
@@ -139,6 +151,17 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, program> {
 };
 #endif
 
+template <> struct InteropFeatureSupportMap<backend::ext_oneapi_cuda> {
+  static constexpr bool MakePlatform = true;
+  static constexpr bool MakeDevice = true;
+  static constexpr bool MakeContext = true;
+  static constexpr bool MakeQueue = true;
+  static constexpr bool MakeEvent = true;
+  static constexpr bool MakeBuffer = true;
+  static constexpr bool MakeKernel = true;
+  static constexpr bool MakeKernelBundle = true;
+};
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
index 519a0f21714ad..76d92018aaf5d 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
@@ -15,6 +15,23 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace cuda {
+
+// Implementation of cuda::make<device>
+__SYCL_EXPORT device make_device(pi_native_handle NativeHandle) {
+  return detail::make_device(NativeHandle, backend::cuda);
+}
+
+// Implementation of cuda::make<platform>
+__SYCL_EXPORT platform make_platform(pi_native_handle NativeHandle) {
+  return detail::make_platform(NativeHandle, backend::cuda);
+}
+
+} // namespace cuda
+} // namespace oneapi
+} // namespace ext
 
 // CUDA context specialization
 template <>
@@ -32,5 +49,37 @@ auto get_native<backend::ext_oneapi_cuda, context>(const context &C)
   return ret;
 }
 
+// CUDA device specialization
+template <>
+device make_device<backend::ext_oneapi_cuda>(
+    const backend_input_t<backend::ext_oneapi_cuda, device> &BackendObject) {
+  pi_native_handle NativeHandle = static_cast<pi_native_handle>(BackendObject);
+  return ext::oneapi::cuda::make_device(NativeHandle);
+}
+
+// CUDA platform specialization
+template <>
+auto get_native<backend::ext_oneapi_cuda, platform>(const platform &C)
+    -> backend_return_t<backend::ext_oneapi_cuda, platform> {
+  // get list of platform devices, and transfer to native platform type
+  std::vector<device> platform_devices = C.get_devices();
+  std::vector<CUdevice> native_devices(platform_devices.size());
+
+  // Get the native CUdevice type for each device in platform
+  for (unsigned int i = 0; i < platform_devices.size(); ++i)
+    native_devices[i] =
+        get_native<backend::ext_oneapi_cuda>(platform_devices[i]);
+
+  return native_devices;
+}
+
+template <>
+platform make_platform<backend::ext_oneapi_cuda>(
+    const backend_input_t<backend::ext_oneapi_cuda, platform> &BackendObject) {
+  pi_native_handle NativeHandle =
+      detail::pi::cast<pi_native_handle>(&BackendObject);
+  return ext::oneapi::cuda::make_platform(NativeHandle);
+}
+
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 4b05475f86cc8..1f228aa1a194c 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -834,6 +834,53 @@ pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type,
   }
 }
 
+pi_result cuda_piextPlatformGetNativeHandle(pi_platform platform,
+                                            pi_native_handle *nativeHandle) {
+  cl::sycl::detail::pi::die(
+      "cuda_piextPlatformGetNativeHandle not implemented");
+  return {};
+}
+
+pi_result
+cuda_piextPlatformCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                         pi_platform *platform) {
+  assert(platform);
+  assert(nativeHandle);
+
+  auto native_platform =
+      reinterpret_cast<std::vector<CUdevice> *>(nativeHandle);
+
+  // Get list of platforms
+  pi_uint32 num_platforms;
+  pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms);
+
+  pi_platform *plat =
+      static_cast<pi_platform *>(malloc(num_platforms * sizeof(pi_platform)));
+  result = cuda_piPlatformsGet(num_platforms, plat, nullptr);
+
+  // Iterate through platforms to find device that matches nativeHandle
+  bool found_match = false;
+  for (pi_uint32 j = 0; j < num_platforms; ++j) {
+    bool is_same = true;
+    for (auto &dev : plat[j]->devices_) {
+      auto it =
+          find(native_platform->begin(), native_platform->end(), dev->get());
+      if (it == native_platform->end())
+        is_same = false;
+    }
+    if (is_same) {
+      found_match = true;
+      *platform = plat[j];
+    }
+  }
+
+  if (!found_match) {
+    return PI_INVALID_VALUE;
+  }
+
+  return result;
+}
+
 /// \return PI_SUCCESS if the function is executed successfully
 /// CUDA devices are always root devices so retain always returns success.
 pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; }
@@ -1772,11 +1819,49 @@ pi_result cuda_piextDeviceGetNativeHandle(pi_device device,
 /// \param[out] device Set to the PI device object created from native handle.
 ///
 /// \return TBD
-pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle, pi_platform,
-                                                 pi_device *) {
-  cl::sycl::detail::pi::die(
-      "Creation of PI device from native handle not implemented");
-  return {};
+pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                                 pi_platform platform,
+                                                 pi_device *piDevice) {
+  assert(piDevice != nullptr);
+
+  // If a platform is provided just check if the device is in it
+  if (platform) {
+    bool found_match = false;
+    for (auto &dev : platform->devices_) {
+      if (dev->get() == static_cast<CUdevice>(nativeHandle)) {
+        *piDevice = dev.get();
+        found_match = true;
+      }
+    }
+    if (!found_match)
+      return PI_INVALID_VALUE;
+    return PI_SUCCESS;
+  }
+
+  // Get list of platforms
+  pi_uint32 num_platforms;
+  pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms);
+
+  pi_platform *plat =
+      static_cast<pi_platform *>(malloc(num_platforms * sizeof(pi_platform)));
+  result = cuda_piPlatformsGet(num_platforms, plat, nullptr);
+
+  // Iterate through platforms to find device that matches nativeHandle
+  bool found_match = false;
+  for (pi_uint32 j = 0; j < num_platforms; ++j) {
+    for (auto &dev : plat[j]->devices_) {
+      if (dev->get() == static_cast<CUdevice>(nativeHandle)) {
+        *piDevice = dev.get();
+        found_match = true;
+      }
+    }
+  }
+
+  // If the provided nativeHandle cannot be matched to an
+  // existing device return error
+  if (!found_match)
+    return PI_INVALID_VALUE;
+  return result;
 }
 
 /* Context APIs */
@@ -4937,6 +5022,9 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   // Platform
   _PI_CL(piPlatformsGet, cuda_piPlatformsGet)
   _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo)
+  _PI_CL(piextPlatformGetNativeHandle, cuda_piextPlatformGetNativeHandle)
+  _PI_CL(piextPlatformCreateWithNativeHandle,
+         cuda_piextPlatformCreateWithNativeHandle)
   // Device
   _PI_CL(piDevicesGet, cuda_piDevicesGet)
   _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo)
diff --git a/sycl/source/backend.cpp b/sycl/source/backend.cpp
index 691bdbf0ab91c..f5cbf0b5d9088 100644
--- a/sycl/source/backend.cpp
+++ b/sycl/source/backend.cpp
@@ -35,6 +35,8 @@ static const plugin &getPlugin(backend Backend) {
     return pi::getPlugin<backend::opencl>();
   case backend::ext_oneapi_level_zero:
     return pi::getPlugin<backend::ext_oneapi_level_zero>();
+  case backend::ext_oneapi_cuda:
+    return pi::getPlugin<backend::ext_oneapi_cuda>();
   default:
     throw sycl::runtime_error{"Unsupported backend", PI_INVALID_OPERATION};
   }
diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp
index 34f830de3cacc..4136d6b82cf51 100644
--- a/sycl/source/detail/pi.cpp
+++ b/sycl/source/detail/pi.cpp
@@ -529,6 +529,7 @@ template __SYCL_EXPORT const plugin &
 getPlugin<backend::ext_oneapi_level_zero>();
 template __SYCL_EXPORT const plugin &
 getPlugin<backend::ext_intel_esimd_emulator>();
+template __SYCL_EXPORT const plugin &getPlugin<backend::ext_oneapi_cuda>();
 
 // Report error and no return (keeps compiler from printing warnings).
 // TODO: Probably change that to throw a catchable exception,

From 06afe8bd2757d0470e8427c0dd627474087cc878 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Fri, 18 Feb 2022 14:54:06 +0000
Subject: [PATCH 4/9] Change get_native_mem type

---
 .../ext/oneapi/experimental/backend/backend_traits_cuda.hpp     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
index c460d2f854f67..d6f3555757b91 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
@@ -98,7 +98,7 @@ struct BackendInput<backend::ext_oneapi_cuda,
 template <typename DataT, int Dimensions, typename AllocatorT>
 struct BackendReturn<backend::ext_oneapi_cuda,
                      buffer<DataT, Dimensions, AllocatorT>> {
-  using type = CUdeviceptr;
+  using type = void *;
 };
 
 template <> struct BackendInput<backend::ext_oneapi_cuda, context> {

From d9daac27135f78954204b1b3f52817792188abc4 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Mon, 21 Feb 2022 11:13:02 +0000
Subject: [PATCH 5/9] add cuda experimental interop testing

---
 .../basic_tests/interop-backend-traits.cpp    |  13 ++
 .../basic_tests/interop-cuda-experimental.cpp | 111 ++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 sycl/test/basic_tests/interop-cuda-experimental.cpp

diff --git a/sycl/test/basic_tests/interop-backend-traits.cpp b/sycl/test/basic_tests/interop-backend-traits.cpp
index 996b896a024e7..f3c04a05c641c 100644
--- a/sycl/test/basic_tests/interop-backend-traits.cpp
+++ b/sycl/test/basic_tests/interop-backend-traits.cpp
@@ -1,6 +1,7 @@
 // RUN: %clangxx -fsycl -DUSE_OPENCL %s
 // RUN: %clangxx -fsycl -DUSE_L0 %s
 // RUN: %clangxx -fsycl -DUSE_CUDA %s
+// RUN: %clangxx -fsycl -DUSE_CUDA_EXPERIMENTAL %s
 
 #ifdef USE_OPENCL
 #include <CL/cl.h>
@@ -24,6 +25,14 @@ constexpr auto Backend = sycl::backend::ext_oneapi_level_zero;
 constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
 #endif
 
+#ifdef USE_CUDA_EXPERIMENTAL
+#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+
+constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
+#endif
+
 #include <sycl/sycl.hpp>
 
 int main() {
@@ -55,9 +64,13 @@ int main() {
   static_assert(
       std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::device>,
                      sycl::detail::interop<Backend, sycl::device>::type>);
+
+// CUDA experimental return type is different to inpt type
+#ifndef USE_CUDA_EXPERIMENTAL
   static_assert(
       std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::context>,
                      sycl::detail::interop<Backend, sycl::context>::type>);
+#endif
   static_assert(
       std::is_same_v<sycl::backend_traits<Backend>::return_type<sycl::queue>,
                      sycl::detail::interop<Backend, sycl::queue>::type>);
diff --git a/sycl/test/basic_tests/interop-cuda-experimental.cpp b/sycl/test/basic_tests/interop-cuda-experimental.cpp
new file mode 100644
index 0000000000000..b04a92f9cbfad
--- /dev/null
+++ b/sycl/test/basic_tests/interop-cuda-experimental.cpp
@@ -0,0 +1,111 @@
+// REQUIRES: cuda
+// RUN: %clangxx %fsycl-host-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note %s -o %t.out
+// RUN: %clangxx %fsycl-host-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note -D__SYCL_INTERNAL_API %s -o %t.out
+
+// Test for experimental CUDA interop API
+
+#define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+
+#include <cuda.h>
+
+using namespace sycl;
+
+//
+// 4.5.1 SYCL application interoperability may be provided for
+// platform,
+// device,
+// context,
+// queue,
+// event,
+// TODO:
+// buffer,
+// device_image,
+// sampled_image,
+// unsampled_image.
+
+int main() {
+
+  // Create SYCL objects
+  device Device;
+  platform Platform = Device.get_info<info::device::platform>();
+  context Context(Device);
+  queue Queue(Device);
+  event Event;
+
+  // 4.5.1.1 For each SYCL runtime class T which supports SYCL application
+  // interoperability with the SYCL backend, a specialization of return_type
+  // must be defined as the type of SYCL application interoperability native
+  // backend object associated with T for the SYCL backend, specified in the
+  // SYCL backend specification.
+  //
+  // return_type is used when retrieving the backend specific native object from
+  // a SYCL object. See the relevant backend specification for details.
+
+  backend_traits<backend::ext_oneapi_cuda>::return_type<platform> cu_platform;
+  backend_traits<backend::ext_oneapi_cuda>::return_type<device> cu_device;
+  backend_traits<backend::ext_oneapi_cuda>::return_type<context> cu_context;
+  backend_traits<backend::ext_oneapi_cuda>::return_type<queue> cu_queue;
+  backend_traits<backend::ext_oneapi_cuda>::return_type<event> cu_event;
+
+  // 4.5.1.2 For each SYCL runtime class T which supports SYCL application
+  // interoperability, a specialization of get_native must be defined, which
+  // takes an instance of T and returns a SYCL application interoperability
+  // native backend object associated with syclObject which can be used for SYCL
+  // application interoperability. The lifetime of the object returned are
+  // backend-defined and specified in the backend specification.
+
+  cu_platform = get_native<backend::ext_oneapi_cuda>(Platform);
+  cu_device = get_native<backend::ext_oneapi_cuda>(Device);
+  cu_context = get_native<backend::ext_oneapi_cuda>(Context);
+  cu_queue = get_native<backend::ext_oneapi_cuda>(Queue);
+  cu_event = get_native<backend::ext_oneapi_cuda>(Event);
+
+  // Check deprecated
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_platform = Platform.get_native<backend::ext_oneapi_cuda>();
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_device = Device.get_native<backend::ext_oneapi_cuda>();
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_context = Context.get_native<backend::ext_oneapi_cuda>();
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_queue = Queue.get_native<backend::ext_oneapi_cuda>();
+  // expected-warning@+2 {{'get_native' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  // expected-warning@+1 {{'get_native<sycl::backend::ext_oneapi_cuda>' is deprecated: Use SYCL 2020 sycl::get_native free function}}
+  cu_event = Event.get_native<backend::ext_oneapi_cuda>();
+
+  // 4.5.1.1 For each SYCL runtime class T which supports SYCL application
+  // interoperability with the SYCL backend, a specialization of input_type must
+  // be defined as the type of SYCL application interoperability native backend
+  // object associated with T for the SYCL backend, specified in the SYCL
+  // backend specification. input_type is used when constructing SYCL objects
+  // from backend specific native objects. See the relevant backend
+  // specification for details.
+
+  // 4.5.1.3 For each SYCL runtime class T which supports SYCL application
+  // interoperability, a specialization of the appropriate template function
+  // make_{sycl_class} where {sycl_class} is the class name of T, must be
+  // defined, which takes a SYCL application interoperability native backend
+  // object and constructs and returns an instance of T. The availability and
+  // behavior of these template functions is defined by the SYCL backend
+  // specification document.
+
+  backend_input_t<backend::ext_oneapi_cuda, platform> InteropPlatformInput{cu_platform};
+  platform InteropPlatform = make_platform<backend::ext_oneapi_cuda>(InteropPlatformInput);
+
+  backend_input_t<backend::ext_oneapi_cuda, device> InteropDeviceInput{cu_device};
+  device InteropDevice = make_device<backend::ext_oneapi_cuda>(InteropDeviceInput);
+
+  backend_input_t<backend::ext_oneapi_cuda, context> InteropContextInput{cu_context[0]};
+  context InteropContext = make_context<backend::ext_oneapi_cuda>(InteropContextInput);
+
+  queue InteropQueue = make_queue<backend::ext_oneapi_cuda>(cu_queue, Context);
+  event InteropEvent = make_event<backend::ext_oneapi_cuda>(cu_event, Context);
+
+  return 0;
+}

From 88429feaa64604f5d8dabe2aa1d51b9f0241da64 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Mon, 21 Feb 2022 11:13:50 +0000
Subject: [PATCH 6/9] update platform

---
 sycl/include/CL/sycl/platform.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sycl/include/CL/sycl/platform.hpp b/sycl/include/CL/sycl/platform.hpp
index e6249a4883bd5..aed37a00389e9 100644
--- a/sycl/include/CL/sycl/platform.hpp
+++ b/sycl/include/CL/sycl/platform.hpp
@@ -27,6 +27,8 @@ class device;
 namespace detail {
 class platform_impl;
 }
+template <backend Backend, class SyclT>
+auto get_native(const SyclT &Obj) -> backend_return_t<Backend, SyclT>;
 
 /// Encapsulates a SYCL platform on which kernels may be executed.
 ///
@@ -123,7 +125,7 @@ class __SYCL_EXPORT platform {
   template <backend Backend>
   __SYCL_DEPRECATED("Use SYCL 2020 sycl::get_native free function")
   backend_return_t<Backend, platform> get_native() const {
-    return reinterpret_cast<backend_return_t<Backend, platform>>(getNative());
+    return sycl::get_native<Backend, platform>(*this);
   }
 
   /// Indicates if all of the SYCL devices on this platform have the
@@ -147,6 +149,9 @@ class __SYCL_EXPORT platform {
   std::shared_ptr<detail::platform_impl> impl;
   platform(std::shared_ptr<detail::platform_impl> impl) : impl(impl) {}
 
+  template <backend Backend, class SyclT>
+  friend auto get_native(const SyclT &Obj) -> backend_return_t<Backend, SyclT>;
+
   template <class T>
   friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj);
   template <class Obj>

From 589ec40f1cbfe4396df1f31ae2d26faa0ab177d4 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Wed, 16 Mar 2022 09:39:46 +0000
Subject: [PATCH 7/9] add specialisation for interop_handle get_native_context

---
 .../ext/oneapi/experimental/backend/cuda.hpp  | 13 ++++++++++++
 .../basic_tests/interop-backend-traits.cpp    |  2 +-
 .../basic_tests/interop-cuda-experimental.cpp | 20 ++++++++++++-------
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
index 76d92018aaf5d..333ee5ba18b83 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
@@ -10,6 +10,7 @@
 
 #include <CL/sycl/backend.hpp>
 #include <CL/sycl/context.hpp>
+#include <CL/sycl/interop_handle.hpp>
 
 #include <vector>
 
@@ -81,5 +82,17 @@ platform make_platform<backend::ext_oneapi_cuda>(
   return ext::oneapi::cuda::make_platform(NativeHandle);
 }
 
+// Specialisation of interop_handles get_native_context
+template <>
+backend_return_t<backend::ext_oneapi_cuda, context>
+interop_handle::get_native_context<backend::ext_oneapi_cuda>() const {
+#ifndef __SYCL_DEVICE_ONLY__
+  return std::vector{reinterpret_cast<CUcontext>(getNativeContext())};
+#else
+  // we believe this won't be ever called on device side
+  return {};
+#endif
+}
+
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/test/basic_tests/interop-backend-traits.cpp b/sycl/test/basic_tests/interop-backend-traits.cpp
index f3c04a05c641c..d4384cde80ad0 100644
--- a/sycl/test/basic_tests/interop-backend-traits.cpp
+++ b/sycl/test/basic_tests/interop-backend-traits.cpp
@@ -27,8 +27,8 @@ constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
 
 #ifdef USE_CUDA_EXPERIMENTAL
 #define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
-#include <sycl/sycl.hpp>
 #include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 constexpr auto Backend = sycl::backend::ext_oneapi_cuda;
 #endif
diff --git a/sycl/test/basic_tests/interop-cuda-experimental.cpp b/sycl/test/basic_tests/interop-cuda-experimental.cpp
index b04a92f9cbfad..8dcdb1de6c410 100644
--- a/sycl/test/basic_tests/interop-cuda-experimental.cpp
+++ b/sycl/test/basic_tests/interop-cuda-experimental.cpp
@@ -5,8 +5,8 @@
 // Test for experimental CUDA interop API
 
 #define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
-#include <sycl/sycl.hpp>
 #include <sycl/ext/oneapi/experimental/backend/cuda.hpp>
+#include <sycl/sycl.hpp>
 
 #include <cuda.h>
 
@@ -95,14 +95,20 @@ int main() {
   // behavior of these template functions is defined by the SYCL backend
   // specification document.
 
-  backend_input_t<backend::ext_oneapi_cuda, platform> InteropPlatformInput{cu_platform};
-  platform InteropPlatform = make_platform<backend::ext_oneapi_cuda>(InteropPlatformInput);
+  backend_input_t<backend::ext_oneapi_cuda, platform> InteropPlatformInput{
+      cu_platform};
+  platform InteropPlatform =
+      make_platform<backend::ext_oneapi_cuda>(InteropPlatformInput);
 
-  backend_input_t<backend::ext_oneapi_cuda, device> InteropDeviceInput{cu_device};
-  device InteropDevice = make_device<backend::ext_oneapi_cuda>(InteropDeviceInput);
+  backend_input_t<backend::ext_oneapi_cuda, device> InteropDeviceInput{
+      cu_device};
+  device InteropDevice =
+      make_device<backend::ext_oneapi_cuda>(InteropDeviceInput);
 
-  backend_input_t<backend::ext_oneapi_cuda, context> InteropContextInput{cu_context[0]};
-  context InteropContext = make_context<backend::ext_oneapi_cuda>(InteropContextInput);
+  backend_input_t<backend::ext_oneapi_cuda, context> InteropContextInput{
+      cu_context[0]};
+  context InteropContext =
+      make_context<backend::ext_oneapi_cuda>(InteropContextInput);
 
   queue InteropQueue = make_queue<backend::ext_oneapi_cuda>(cu_queue, Context);
   event InteropEvent = make_event<backend::ext_oneapi_cuda>(cu_event, Context);

From d17840dfe449cab90405699a1c1e8999730fd7bb Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Thu, 14 Apr 2022 10:58:47 +0100
Subject: [PATCH 8/9] Add support for kernel function interop

---
 sycl/include/CL/sycl/accessor.hpp             |  6 +++++
 sycl/include/CL/sycl/backend.hpp              | 18 ++++++++++----
 .../backend/backend_traits_cuda.hpp           | 24 +++++++++----------
 .../ext/oneapi/experimental/backend/cuda.hpp  | 14 +++++------
 4 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp
index cf13264b4fca0..d21015c086f05 100644
--- a/sycl/include/CL/sycl/accessor.hpp
+++ b/sycl/include/CL/sycl/accessor.hpp
@@ -2134,6 +2134,12 @@ class __SYCL_SPECIAL_CLASS accessor<DataT, Dimensions, AccessMode,
   bool operator!=(const accessor &Rhs) const { return !(*this == Rhs); }
 };
 
+// local_accessor
+// Same semantics and restrictions as accessor with target::local
+template <typename DataT, int Dimensions = 1>
+using local_accessor =
+    accessor<DataT, Dimensions, access_mode::read_write, access::target::local>;
+
 /// Image accessors.
 ///
 /// Available only when accessTarget == access::target::image.
diff --git a/sycl/include/CL/sycl/backend.hpp b/sycl/include/CL/sycl/backend.hpp
index 2a228b66805c8..79009c217404e 100644
--- a/sycl/include/CL/sycl/backend.hpp
+++ b/sycl/include/CL/sycl/backend.hpp
@@ -115,15 +115,25 @@ inline backend_return_t<backend::opencl, event> get_native<
 }
 #endif
 
-// Native handle of an accessor should be accessed through interop_handler
+// Native accessor handle for kernel function interop
 template <backend BackendName, typename DataT, int Dimensions,
           access::mode AccessMode, access::target AccessTarget,
           access::placeholder IsPlaceholder>
 auto get_native(const accessor<DataT, Dimensions, AccessMode, AccessTarget,
                                IsPlaceholder> &Obj) ->
-    typename detail::interop<
-        BackendName, accessor<DataT, Dimensions, AccessMode, AccessTarget,
-                              IsPlaceholder>>::type = delete;
+    typename detail::interop<BackendName,
+                             accessor<DataT, Dimensions, AccessMode,
+                                      AccessTarget, IsPlaceholder>>::type {
+#ifdef __SYCL_DEVICE_ONLY__
+  return reinterpret_cast<typename detail::interop<
+      BackendName, accessor<DataT, Dimensions, AccessMode, AccessTarget,
+                            IsPlaceholder>>::type>(Obj.get_pointer().get());
+
+#else
+  throw runtime_error("Get native accessor is not support on host.",
+                      PI_INVALID_VALUE);
+#endif
+}
 
 namespace detail {
 // Forward declaration
diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
index d6f3555757b91..8c658106e1536 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp
@@ -30,13 +30,6 @@ typedef struct CUstream_st *CUstream;
 typedef struct CUevent_st *CUevent;
 typedef struct CUmod_st *CUmodule;
 
-// As defined in the CUDA 10.1 header file. This requires CUDA version > 3.2
-#if defined(_WIN64) || defined(__LP64__)
-typedef unsigned long long CUdeviceptr;
-#else
-typedef unsigned int CUdeviceptr;
-#endif
-
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
@@ -72,13 +65,11 @@ template <> struct interop<backend::ext_oneapi_cuda, program> {
 };
 #endif
 
-// TODO the interops for accessor is used in the already deprecated class
-// interop_handler and can be removed after API cleanup.
 template <typename DataT, int Dimensions, access::mode AccessMode>
 struct interop<backend::ext_oneapi_cuda,
                accessor<DataT, Dimensions, AccessMode, access::target::device,
                         access::placeholder::false_t>> {
-  using type = CUdeviceptr;
+  using type = DataT *;
 };
 
 template <typename DataT, int Dimensions, access::mode AccessMode>
@@ -86,19 +77,26 @@ struct interop<
     backend::ext_oneapi_cuda,
     accessor<DataT, Dimensions, AccessMode, access::target::constant_buffer,
              access::placeholder::false_t>> {
-  using type = CUdeviceptr;
+  using type = DataT *;
+};
+
+template <typename DataT, int Dimensions, access::mode AccessMode>
+struct interop<backend::ext_oneapi_cuda,
+               accessor<DataT, Dimensions, AccessMode, access::target::local,
+                        access::placeholder::false_t>> {
+  using type = DataT *;
 };
 
 template <typename DataT, int Dimensions, typename AllocatorT>
 struct BackendInput<backend::ext_oneapi_cuda,
                     buffer<DataT, Dimensions, AllocatorT>> {
-  using type = CUdeviceptr;
+  using type = DataT *;
 };
 
 template <typename DataT, int Dimensions, typename AllocatorT>
 struct BackendReturn<backend::ext_oneapi_cuda,
                      buffer<DataT, Dimensions, AllocatorT>> {
-  using type = void *;
+  using type = DataT *;
 };
 
 template <> struct BackendInput<backend::ext_oneapi_cuda, context> {
diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
index 333ee5ba18b83..d3aa8c074ece5 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
@@ -21,12 +21,12 @@ namespace oneapi {
 namespace cuda {
 
 // Implementation of cuda::make<device>
-__SYCL_EXPORT device make_device(pi_native_handle NativeHandle) {
+inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) {
   return detail::make_device(NativeHandle, backend::cuda);
 }
 
 // Implementation of cuda::make<platform>
-__SYCL_EXPORT platform make_platform(pi_native_handle NativeHandle) {
+inline __SYCL_EXPORT platform make_platform(pi_native_handle NativeHandle) {
   return detail::make_platform(NativeHandle, backend::cuda);
 }
 
@@ -36,7 +36,7 @@ __SYCL_EXPORT platform make_platform(pi_native_handle NativeHandle) {
 
 // CUDA context specialization
 template <>
-auto get_native<backend::ext_oneapi_cuda, context>(const context &C)
+inline auto get_native<backend::ext_oneapi_cuda, context>(const context &C)
     -> backend_return_t<backend::ext_oneapi_cuda, context> {
   // create a vector to be returned
   backend_return_t<backend::ext_oneapi_cuda, context> ret;
@@ -52,7 +52,7 @@ auto get_native<backend::ext_oneapi_cuda, context>(const context &C)
 
 // CUDA device specialization
 template <>
-device make_device<backend::ext_oneapi_cuda>(
+inline device make_device<backend::ext_oneapi_cuda>(
     const backend_input_t<backend::ext_oneapi_cuda, device> &BackendObject) {
   pi_native_handle NativeHandle = static_cast<pi_native_handle>(BackendObject);
   return ext::oneapi::cuda::make_device(NativeHandle);
@@ -60,7 +60,7 @@ device make_device<backend::ext_oneapi_cuda>(
 
 // CUDA platform specialization
 template <>
-auto get_native<backend::ext_oneapi_cuda, platform>(const platform &C)
+inline auto get_native<backend::ext_oneapi_cuda, platform>(const platform &C)
     -> backend_return_t<backend::ext_oneapi_cuda, platform> {
   // get list of platform devices, and transfer to native platform type
   std::vector<device> platform_devices = C.get_devices();
@@ -75,7 +75,7 @@ auto get_native<backend::ext_oneapi_cuda, platform>(const platform &C)
 }
 
 template <>
-platform make_platform<backend::ext_oneapi_cuda>(
+inline platform make_platform<backend::ext_oneapi_cuda>(
     const backend_input_t<backend::ext_oneapi_cuda, platform> &BackendObject) {
   pi_native_handle NativeHandle =
       detail::pi::cast<pi_native_handle>(&BackendObject);
@@ -84,7 +84,7 @@ platform make_platform<backend::ext_oneapi_cuda>(
 
 // Specialisation of interop_handles get_native_context
 template <>
-backend_return_t<backend::ext_oneapi_cuda, context>
+inline backend_return_t<backend::ext_oneapi_cuda, context>
 interop_handle::get_native_context<backend::ext_oneapi_cuda>() const {
 #ifndef __SYCL_DEVICE_ONLY__
   return std::vector{reinterpret_cast<CUcontext>(getNativeContext())};

From 80568c6d2c060039cc3bdc41f900403a8511a6c1 Mon Sep 17 00:00:00 2001
From: "aidan.belton" <aidan.belton@codeplay.com>
Date: Thu, 5 May 2022 10:17:57 +0100
Subject: [PATCH 9/9] add cuda interop context, queue, event

---
 .../ext/oneapi/experimental/backend/cuda.hpp  |  4 +-
 sycl/plugins/cuda/pi_cuda.cpp                 | 97 +++++++++++++++----
 sycl/plugins/cuda/pi_cuda.hpp                 |  8 ++
 3 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
index d3aa8c074ece5..184a109a321a9 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/backend/cuda.hpp
@@ -22,12 +22,12 @@ namespace cuda {
 
 // Implementation of cuda::make<device>
 inline __SYCL_EXPORT device make_device(pi_native_handle NativeHandle) {
-  return detail::make_device(NativeHandle, backend::cuda);
+  return sycl::detail::make_device(NativeHandle, backend::cuda);
 }
 
 // Implementation of cuda::make<platform>
 inline __SYCL_EXPORT platform make_platform(pi_native_handle NativeHandle) {
-  return detail::make_platform(NativeHandle, backend::cuda);
+  return sycl::detail::make_platform(NativeHandle, backend::cuda);
 }
 
 } // namespace cuda
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 1f228aa1a194c..173bbee37e05b 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -344,6 +344,12 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
   cuda_piContextRetain(context_);
 }
 
+_pi_event::_pi_event(pi_context context, CUevent eventNative)
+    : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, hasBeenWaitedOn_{false},
+      isRecorded_{false}, isStarted_{false}, evEnd_{eventNative},
+      evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, context_{
+                                                                  context} {}
+
 _pi_event::~_pi_event() {
   if (queue_ != nullptr) {
     cuda_piQueueRelease(queue_);
@@ -1977,8 +1983,6 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
 
   std::unique_ptr<_pi_context> context{ctxt};
 
-  PI_CHECK_ERROR(cuEventDestroy(context->evBase_));
-
   if (!ctxt->is_primary()) {
     CUcontext cuCtxt = ctxt->get();
     CUcontext current = nullptr;
@@ -1986,6 +1990,7 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
     if (cuCtxt != current) {
       PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
     }
+    PI_CHECK_ERROR(cuEventDestroy(context->evBase_));
     PI_CHECK_ERROR(cuCtxSynchronize());
     cuCtxGetCurrent(&current);
     if (cuCtxt == current) {
@@ -1994,6 +1999,7 @@ pi_result cuda_piContextRelease(pi_context ctxt) {
     return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
   } else {
     // Primary context is not destroyed, but released
+    PI_CHECK_ERROR(cuEventDestroy(context->evBase_));
     CUdevice cuDev = ctxt->get_device()->get();
     CUcontext current;
     cuCtxPopCurrent(&current);
@@ -2021,12 +2027,43 @@ pi_result cuda_piextContextGetNativeHandle(pi_context context,
 /// \param[out] context Set to the PI context object created from native handle.
 ///
 /// \return TBD
-pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle, pi_uint32,
-                                                  const pi_device *, bool,
-                                                  pi_context *) {
-  cl::sycl::detail::pi::die(
-      "Creation of PI context from native handle not implemented");
-  return {};
+pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                                  pi_uint32 num_devices,
+                                                  const pi_device *devices,
+                                                  bool ownNativeHandle,
+                                                  pi_context *piContext) {
+  (void)num_devices;
+  (void)devices;
+  (void)ownNativeHandle;
+  assert(piContext != nullptr);
+  assert(ownNativeHandle == false);
+
+  CUcontext newContext = reinterpret_cast<CUcontext>(nativeHandle);
+
+  // Push native context to thread
+  pi_result retErr = PI_CHECK_ERROR(cuCtxPushCurrent(newContext));
+
+  // Get context's native device
+  CUdevice cu_device;
+  retErr = PI_CHECK_ERROR(cuCtxGetDevice(&cu_device));
+
+  // Create a SYCL device from the ctx device
+  pi_device device = nullptr;
+  retErr = cuda_piextDeviceCreateWithNativeHandle(cu_device, nullptr, &device);
+
+  // Create sycl context
+  *piContext =
+      new _pi_context{_pi_context::kind::user_defined, newContext, device};
+
+  // Use default stream to record base event counter
+  retErr =
+      PI_CHECK_ERROR(cuEventCreate(&(*piContext)->evBase_, CU_EVENT_DEFAULT));
+  retErr = PI_CHECK_ERROR(cuEventRecord((*piContext)->evBase_, 0));
+
+  // Pop native context
+  retErr = PI_CHECK_ERROR(cuCtxPopCurrent(nullptr));
+
+  return retErr;
 }
 
 /// Creates a PI Memory object using a CUDA memory allocation.
@@ -2430,13 +2467,29 @@ pi_result cuda_piextQueueGetNativeHandle(pi_queue queue,
 ///        the native handle, if it can.
 ///
 /// \return TBD
-pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle, pi_context,
-                                                pi_queue *,
+pi_result cuda_piextQueueCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                                pi_context context,
+                                                pi_queue *queue,
                                                 bool ownNativeHandle) {
   (void)ownNativeHandle;
-  cl::sycl::detail::pi::die(
-      "Creation of PI queue from native handle not implemented");
-  return {};
+  assert(ownNativeHandle == 1);
+
+  unsigned int flags;
+  CUstream cuStream = reinterpret_cast<CUstream>(nativeHandle);
+
+  auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags));
+
+  pi_queue_properties properties = 0;
+  if (flags == CU_STREAM_DEFAULT)
+    properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM;
+  else if (flags == CU_STREAM_NON_BLOCKING)
+    properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT;
+  else
+    cl::sycl::detail::pi::die("Unknown cuda stream");
+
+  *queue = new _pi_queue{cuStream, context, context->get_device(), properties};
+
+  return retErr;
 }
 
 pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
@@ -3699,11 +3752,19 @@ pi_result cuda_piextEventGetNativeHandle(pi_event event,
 /// \param[out] event Set to the PI event object created from native handle.
 ///
 /// \return TBD
-pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle, pi_context,
-                                                bool, pi_event *) {
-  cl::sycl::detail::pi::die(
-      "Creation of PI event from native handle not implemented");
-  return {};
+pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle,
+                                                pi_context context,
+                                                bool ownNativeHandle,
+                                                pi_event *event) {
+  (void)ownNativeHandle;
+  assert(ownNativeHandle == true);
+
+  std::unique_ptr<_pi_event> event_ptr{nullptr};
+
+  *event = _pi_event::make_with_native(context,
+                                       reinterpret_cast<CUevent>(nativeHandle));
+
+  return PI_SUCCESS;
 }
 
 /// Creates a PI sampler object
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index aa8c8945265a2..a23a4f7ca9b9d 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -477,6 +477,10 @@ struct _pi_event {
     return new _pi_event(type, queue->get_context(), queue);
   }
 
+  static pi_event make_with_native(pi_context context, CUevent eventNative) {
+    return new _pi_event(context, eventNative);
+  }
+
   pi_result release();
 
   ~_pi_event();
@@ -486,6 +490,10 @@ struct _pi_event {
   // make_user static members in order to create a pi_event for CUDA.
   _pi_event(pi_command_type type, pi_context context, pi_queue queue);
 
+  // This constructor is private to force programmers to use the
+  // make_from_native / for event introp
+  _pi_event(pi_context context, CUevent eventNative);
+
   pi_command_type commandType_; // The type of command associated with event.
 
   std::atomic_uint32_t refCount_; // Event reference count.