[SYCL][CUDA] Decouple CUDA contexts from PI contexts
This patch moves the CUDA context from the PI context to the PI device,
and switches to always using the primary context.
CUDA contexts are different from SYCL contexts in that they are tied to a
single device, and they must be active on the calling thread for most
calls to the CUDA driver API.
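As a minimal illustration of that requirement, here is a rough sketch using the CUDA driver API directly (error handling omitted, not plugin code):

```cpp
// Sketch of the driver API's thread-local "current context" requirement.
#include <cuda.h>

int main() {
  cuInit(0);

  CUdevice dev;
  cuDeviceGet(&dev, 0);

  // The primary context is a per-device context shared with the CUDA runtime.
  CUcontext ctx;
  cuDevicePrimaryCtxRetain(&ctx, dev);

  // Most driver calls implicitly use whichever context is current on the
  // calling thread, so it must be made current before e.g. allocating memory.
  cuCtxSetCurrent(ctx);

  CUdeviceptr ptr;
  cuMemAlloc(&ptr, 1024);
  cuMemFree(ptr);

  cuCtxSetCurrent(nullptr);
  cuDevicePrimaryCtxRelease(dev);
  return 0;
}
```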
As shown in #8124 and #7526, the current mapping of CUDA context to PI
context causes issues for device-based entry points that still need to
call the CUDA APIs. We have workarounds for that, but they are somewhat
hacky, inefficient, and have a lot of edge-case issues.
The peer-to-peer interface proposal in #6104 is also device-based, but
enabling peer-to-peer access in CUDA is done on the CUDA contexts, so the
current mapping would make it difficult to implement.
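To make that concrete, here is a hypothetical sketch of how peer access could be enabled once each device owns its primary context; the function and parameter names are illustrative, not the actual plugin API:

```cpp
// With one primary context per device, enabling P2P between two devices
// reduces to activating one device's context and granting it access to
// the other's context.
#include <cuda.h>

void enable_p2p(CUdevice dev, CUdevice peer_dev,
                CUcontext dev_primary_ctx, CUcontext peer_primary_ctx) {
  int can_access = 0;
  cuDeviceCanAccessPeer(&can_access, dev, peer_dev);
  if (!can_access)
    return;

  // Peer access is a property of the contexts, not the devices.
  cuCtxSetCurrent(dev_primary_ctx);
  cuCtxEnablePeerAccess(peer_primary_ctx, 0 /* flags must be 0 */);
}
```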
This patch solves most of these issues by decoupling the CUDA context
from the SYCL context and simply managing the CUDA contexts in the
devices. It also changes the CUDA context management to always use the
primary context.
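As a rough sketch of what this management could look like, assuming the PI device retains its device's primary context and entry points use an RAII guard to activate it (the `_pi_device` and `ScopedContext` names below are illustrative, not the exact plugin types):

```cpp
#include <cuda.h>

struct _pi_device {
  CUdevice cu_device_;
  CUcontext cu_context_; // retained primary context, owned by the device

  explicit _pi_device(CUdevice dev) : cu_device_(dev) {
    // Retain the primary context once, at device creation.
    cuDevicePrimaryCtxRetain(&cu_context_, cu_device_);
  }
  ~_pi_device() { cuDevicePrimaryCtxRelease(cu_device_); }
};

// RAII guard used by device-based entry points: make the device's primary
// context current for the duration of the call, restoring the previous one.
class ScopedContext {
  CUcontext previous_ = nullptr;
  bool needs_restore_ = false;

public:
  explicit ScopedContext(const _pi_device &device) {
    cuCtxGetCurrent(&previous_);
    if (previous_ != device.cu_context_) {
      cuCtxSetCurrent(device.cu_context_);
      needs_restore_ = true;
    }
  }
  ~ScopedContext() {
    if (needs_restore_)
      cuCtxSetCurrent(previous_);
  }
};
```

Because all entry points for a device share the same primary context, the guard only switches contexts when a different one is current, which is where the reduction in accidental context switches comes from.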
This approach has a number of advantages:
* Use of the primary context is recommended by Nvidia
* Simplifies the CUDA context management in the plugin
* Available CUDA context in device based entry points
* Likely more efficient in the general case, with fewer opportunities to
accidentally cause costly CUDA context switches.
* Easier and likely more efficient interactions with CUDA runtime
applications.
* Easier to expose P2P capabilities
* Easier to support multiple devices in a SYCL context
It does have a few drawbacks compared to the previous approach:
* Drops support for `make_context` interop, as there is no sensible "native
  handle" to pass in (`get_native` is still fully supported).
* No opportunity for users to separate their work into different CUDA
  contexts. It's unclear whether there is an actual use case for this; it
  seems very uncommon in CUDA codebases to create multiple CUDA contexts
  for a single CUDA device in the same process.
Overall I believe this should be a net benefit, and we can revisit if we
run into an edge case that needs more fine-grained CUDA context
management.