From 5d788c609f1c5cfde37414af49fbc8996bf2a712 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 1 Feb 2023 18:11:15 +0000 Subject: [PATCH 1/2] [SYCL][CUDA] Decouple CUDA contexts from PI contexts This patch moves the CUDA context from the PI context to the PI device, and switches to always using the primary context. CUDA contexts are different from SYCL contexts in that they're tied to a single device, and that they are required to be active on a thread for most calls to the CUDA driver API. As shown in intel/llvm#8124 and intel/llvm#7526 the current mapping of CUDA context to PI context, causes issues for device based entry points that still need to call the CUDA APIs, we have workarounds to solve that but they're a bit hacky, inefficient, and have a lot of edge case issues. The peer to peer interface proposal in intel/llvm#6104, is also device based, but enabling peer to peer for CUDA is done on the CUDA contexts, so the current mapping would make it difficult to implement. So this patch solves most of these issues by decoupling the CUDA context from the SYCL context, and simply managing the CUDA contexts in the devices, it also changes the CUDA context management to always use the primary context. This approach as a number of advantages: * Use of the primary context is recommended by Nvidia * Simplifies the CUDA context management in the plugin * Available CUDA context in device based entry points * Likely more efficient in the general case, with less opportunities to accidentally cause costly CUDA context switches. * Easier and likely more efficient interactions with CUDA runtime applications. * Easier to expose P2P capabilities * Easier to support multiple devices in a SYCL context It does have a few drawbacks from the previous approach: * Drops support for `make_context` interop, no sensible "native handle" to pass in (`get_native` is still supported fine). * No opportunity for users to separate their work into different CUDA contexts. It's unclear if there's any actual use case for this, it seems very uncommon in CUDA codebases to have multiple CUDA contexts for a single CUDA device in the same process. So overall I believe this should be a net benefit in general, and we could revisit if we run into an edge case that would need more fine grained CUDA context management. --- .../include/sycl/detail/properties_traits.def | 4 +- .../backend/backend_traits_cuda.hpp | 2 +- .../sycl/properties/context_properties.hpp | 8 +- sycl/plugins/cuda/pi_cuda.cpp | 107 ++---------------- sycl/plugins/cuda/pi_cuda.hpp | 28 ++--- sycl/test/basic_tests/interop-cuda.cpp | 2 - sycl/unittests/pi/cuda/test_base_objects.cpp | 90 --------------- 7 files changed, 31 insertions(+), 210 deletions(-) diff --git a/sycl/include/sycl/detail/properties_traits.def b/sycl/include/sycl/detail/properties_traits.def index 61101b9985d04..581380cc51f05 100644 --- a/sycl/include/sycl/detail/properties_traits.def +++ b/sycl/include/sycl/detail/properties_traits.def @@ -11,9 +11,9 @@ __SYCL_PARAM_TRAITS_SPEC(sycl::property::no_init) __SYCL_PARAM_TRAITS_SPEC( sycl::property::context::cuda::use_primary_context) // Deprecated __SYCL_PARAM_TRAITS_SPEC( - sycl::ext::oneapi::cuda::property::context::use_primary_context) + sycl::ext::oneapi::cuda::property::context::use_primary_context) // Deprecated __SYCL_PARAM_TRAITS_SPEC(sycl::property::queue::in_order) __SYCL_PARAM_TRAITS_SPEC(sycl::property::reduction::initialize_to_identity) __SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_low) __SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_high) -__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_normal) \ No newline at end of file +__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_normal) diff --git a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp index 52d8cc81366dd..370866eb126d5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/backend/backend_traits_cuda.hpp @@ -114,7 +114,7 @@ template <> struct BackendReturn { template <> struct InteropFeatureSupportMap { static constexpr bool MakePlatform = false; static constexpr bool MakeDevice = true; - static constexpr bool MakeContext = true; + static constexpr bool MakeContext = false; static constexpr bool MakeQueue = true; static constexpr bool MakeEvent = true; static constexpr bool MakeBuffer = false; diff --git a/sycl/include/sycl/properties/context_properties.hpp b/sycl/include/sycl/properties/context_properties.hpp index 443448684e001..68b3d37dba5b4 100644 --- a/sycl/include/sycl/properties/context_properties.hpp +++ b/sycl/include/sycl/properties/context_properties.hpp @@ -14,10 +14,14 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::cuda::property::context { +namespace ext::oneapi::cuda::property { +namespace __SYCL_DEPRECATED( + "the primary contexts are now always used") context { class use_primary_context : public ::sycl::detail::DataLessProperty< ::sycl::detail::UsePrimaryContext> {}; -} // namespace ext::oneapi::cuda::property::context +} // namespace __SYCL_DEPRECATED("the primary contexts are now always + // used")context +} // namespace ext::oneapi::cuda::property namespace property::context { namespace __SYCL2020_DEPRECATED( diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index b7c64ef9f9a58..2a1168f0b2067 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -917,8 +917,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, for (int i = 0; i < numDevices; ++i) { CUdevice device; err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); + CUcontext context; + err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); + platformIds[i].devices_.emplace_back( - new _pi_device{device, &platformIds[i]}); + new _pi_device{device, context, &platformIds[i]}); { const auto &dev = platformIds[i].devices_.back().get(); @@ -1183,6 +1186,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, assert(device != nullptr); + ScopedContext active(device->get_context()); + switch (param_name) { case PI_DEVICE_INFO_TYPE: { return getInfo(param_value_size, param_value, param_value_size_ret, @@ -1961,7 +1966,6 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, } case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: { - ScopedContext active(device); size_t FreeMemory = 0; size_t TotalMemory = 0; sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == @@ -2121,50 +2125,10 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties, assert(retcontext != nullptr); pi_result errcode_ret = PI_SUCCESS; - // Parse properties. - bool property_cuda_primary = false; - while (properties && (0 != *properties)) { - // Consume property ID. - pi_context_properties id = *properties; - ++properties; - // Consume property value. - pi_context_properties value = *properties; - ++properties; - switch (id) { - case __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY: - assert(value == PI_FALSE || value == PI_TRUE); - property_cuda_primary = static_cast(value); - break; - default: - // Unknown property. - sycl::detail::pi::die( - "Unknown piContextCreate property in property list"); - return PI_ERROR_INVALID_VALUE; - } - } - std::unique_ptr<_pi_context> piContextPtr{nullptr}; try { - CUcontext current = nullptr; - - if (property_cuda_primary) { - // Use the CUDA primary context and assume that we want to use it - // immediately as we want to forge context switches. - CUcontext Ctxt; - errcode_ret = - PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->get())); - piContextPtr = std::unique_ptr<_pi_context>( - new _pi_context{_pi_context::kind::primary, Ctxt, *devices}); - errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt)); - } else { - // Create a scoped context. - CUcontext newContext; - PI_CHECK_ERROR(cuCtxGetCurrent(¤t)); - errcode_ret = PI_CHECK_ERROR( - cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->get())); - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{ - _pi_context::kind::user_defined, newContext, *devices}); - } + piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); + static std::once_flag initFlag; std::call_once( initFlag, @@ -2176,14 +2140,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties, }, errcode_ret); - // For non-primary scoped contexts keep the last active on top of the stack - // as `cuCtxCreate` replaces it implicitly otherwise. - // Primary contexts are kept on top of the stack, so the previous context - // is not queried and therefore not recovered. - if (current != nullptr) { - PI_CHECK_ERROR(cuCtxSetCurrent(current)); - } - *retcontext = piContextPtr.release(); } catch (pi_result err) { errcode_ret = err; @@ -2194,7 +2150,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties, } pi_result cuda_piContextRelease(pi_context ctxt) { - assert(ctxt != nullptr); if (ctxt->decrement_reference_count() > 0) { @@ -2204,29 +2159,7 @@ pi_result cuda_piContextRelease(pi_context ctxt) { std::unique_ptr<_pi_context> context{ctxt}; - if (!ctxt->backend_has_ownership()) - return PI_SUCCESS; - - if (!ctxt->is_primary()) { - CUcontext cuCtxt = ctxt->get(); - CUcontext current = nullptr; - cuCtxGetCurrent(¤t); - if (cuCtxt != current) { - PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt)); - } - PI_CHECK_ERROR(cuCtxSynchronize()); - cuCtxGetCurrent(¤t); - if (cuCtxt == current) { - PI_CHECK_ERROR(cuCtxPopCurrent(¤t)); - } - return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt)); - } - - // Primary context is not destroyed, but released - CUdevice cuDev = ctxt->get_device()->get(); - CUcontext current; - cuCtxPopCurrent(¤t); - return PI_CHECK_ERROR(cuDevicePrimaryCtxRelease(cuDev)); + return PI_SUCCESS; } /// Gets the native CUDA handle of a PI context object @@ -2253,29 +2186,15 @@ pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle, const pi_device *devices, bool ownNativeHandle, pi_context *piContext) { + (void)nativeHandle; (void)num_devices; (void)devices; (void)ownNativeHandle; + (void)piContext; assert(piContext != nullptr); assert(ownNativeHandle == false); - CUcontext newContext = reinterpret_cast(nativeHandle); - - ScopedContext active(newContext); - - // Get context's native device - CUdevice cu_device; - pi_result retErr = PI_CHECK_ERROR(cuCtxGetDevice(&cu_device)); - - // Create a SYCL device from the ctx device - pi_device device = nullptr; - retErr = cuda_piextDeviceCreateWithNativeHandle(cu_device, nullptr, &device); - - // Create sycl context - *piContext = new _pi_context{_pi_context::kind::user_defined, newContext, - device, /*backend_owns*/ false}; - - return retErr; + return PI_ERROR_INVALID_OPERATION; } /// Creates a PI Memory object using a CUDA memory allocation. @@ -2469,8 +2388,6 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags, std::unique_ptr<_pi_mem> retMemObj{nullptr}; try { - ScopedContext active(context); - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; } catch (pi_result err) { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index b4949b03ad046..a957b8df603c7 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -86,28 +86,29 @@ struct _pi_device { using native_type = CUdevice; native_type cuDevice_; + CUcontext cuContext_; std::atomic_uint32_t refCount_; pi_platform platform_; - pi_context context_; static constexpr pi_uint32 max_work_item_dimensions = 3u; size_t max_work_item_sizes[max_work_item_dimensions]; int max_work_group_size; public: - _pi_device(native_type cuDevice, pi_platform platform) - : cuDevice_(cuDevice), refCount_{1}, platform_(platform) {} + _pi_device(native_type cuDevice, CUcontext cuContext, pi_platform platform) + : cuDevice_(cuDevice), cuContext_(cuContext), refCount_{1}, + platform_(platform) {} + + ~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); } native_type get() const noexcept { return cuDevice_; }; + CUcontext get_context() const noexcept { return cuContext_; }; + pi_uint32 get_reference_count() const noexcept { return refCount_; } pi_platform get_platform() const noexcept { return platform_; }; - void set_context(pi_context ctx) { context_ = ctx; }; - - pi_context get_context() { return context_; }; - void save_max_work_item_sizes(size_t size, size_t *save_max_work_item_sizes) noexcept { memcpy(max_work_item_sizes, save_max_work_item_sizes, size); @@ -174,16 +175,12 @@ struct _pi_context { using native_type = CUcontext; - enum class kind { primary, user_defined } kind_; native_type cuContext_; _pi_device *deviceId_; std::atomic_uint32_t refCount_; - _pi_context(kind k, CUcontext ctxt, _pi_device *devId, - bool backend_owns = true) - : kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1}, - has_ownership{backend_owns} { - deviceId_->set_context(this); + _pi_context(_pi_device *devId) + : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { cuda_piDeviceRetain(deviceId_); }; @@ -206,20 +203,15 @@ struct _pi_context { native_type get() const noexcept { return cuContext_; } - bool is_primary() const noexcept { return kind_ == kind::primary; } - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } pi_uint32 decrement_reference_count() noexcept { return --refCount_; } pi_uint32 get_reference_count() const noexcept { return refCount_; } - bool backend_has_ownership() const noexcept { return has_ownership; } - private: std::mutex mutex_; std::vector extended_deleters_; - const bool has_ownership; }; /// PI Mem mapping to CUDA memory allocations, both data and texture/surface. diff --git a/sycl/test/basic_tests/interop-cuda.cpp b/sycl/test/basic_tests/interop-cuda.cpp index b2e1cb3d9026f..234f1ed9e1da0 100644 --- a/sycl/test/basic_tests/interop-cuda.cpp +++ b/sycl/test/basic_tests/interop-cuda.cpp @@ -87,8 +87,6 @@ int main() { backend_input_t InteropContextInput{ cu_context[0]}; - context InteropContext = - make_context(InteropContextInput); event InteropEvent = make_event(cu_event, Context); queue InteropQueue = make_queue(cu_queue, Context); diff --git a/sycl/unittests/pi/cuda/test_base_objects.cpp b/sycl/unittests/pi/cuda/test_base_objects.cpp index 15f7f7d2651b1..9bcc9e9f24d56 100644 --- a/sycl/unittests/pi/cuda/test_base_objects.cpp +++ b/sycl/unittests/pi/cuda/test_base_objects.cpp @@ -79,96 +79,6 @@ TEST_F(CudaBaseObjectsTest, piContextCreate) { cuCtxGetApiVersion(cudaContext, &version); EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); - CUresult cuErr = cuCtxDestroy(cudaContext); - ASSERT_EQ(cuErr, CUDA_SUCCESS); -} - -TEST_F(CudaBaseObjectsTest, piContextCreatePrimaryTrue) { - pi_uint32 numPlatforms = 0; - pi_platform platform; - pi_device device; - - ASSERT_EQ((plugin->call_nocheck( - 0, nullptr, &numPlatforms)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - numPlatforms, &platform, nullptr)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)), - PI_SUCCESS); - pi_context_properties properties[] = { - __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_TRUE, 0}; - - pi_context ctxt; - ASSERT_EQ((plugin->call_nocheck( - properties, 1, &device, nullptr, nullptr, &ctxt)), - PI_SUCCESS); - EXPECT_NE(ctxt, nullptr); - EXPECT_EQ(ctxt->get_device(), device); - EXPECT_TRUE(ctxt->is_primary()); - - // Retrieve the cuCtxt to check information is correct - CUcontext cudaContext = ctxt->get(); - unsigned int version = 0; - CUresult cuErr = cuCtxGetApiVersion(cudaContext, &version); - ASSERT_EQ(cuErr, CUDA_SUCCESS); - EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); - - // Current context in the stack? - CUcontext current; - cuErr = cuCtxGetCurrent(¤t); - ASSERT_EQ(cuErr, CUDA_SUCCESS); - ASSERT_EQ(current, cudaContext); - ASSERT_EQ((plugin->call_nocheck(ctxt)), - PI_SUCCESS); -} - -TEST_F(CudaBaseObjectsTest, piContextCreatePrimaryFalse) { - pi_uint32 numPlatforms = 0; - pi_platform platform; - pi_device device; - - ASSERT_EQ((plugin->call_nocheck( - 0, nullptr, &numPlatforms)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - numPlatforms, &platform, nullptr)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)), - PI_SUCCESS); - pi_context_properties properties[] = { - __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_FALSE, 0}; - - pi_context ctxt; - ASSERT_EQ((plugin->call_nocheck( - properties, 1, &device, nullptr, nullptr, &ctxt)), - PI_SUCCESS); - EXPECT_NE(ctxt, nullptr); - EXPECT_EQ(ctxt->get_device(), device); - EXPECT_FALSE(ctxt->is_primary()); - - // Retrieve the cuCtxt to check information is correct - CUcontext cudaContext = ctxt->get(); - unsigned int version = 0; - CUresult cuErr = cuCtxGetApiVersion(cudaContext, &version); - ASSERT_EQ(cuErr, CUDA_SUCCESS); - EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION); - - // Current context in the stack? - CUcontext current; - cuErr = cuCtxGetCurrent(¤t); - ASSERT_EQ(cuErr, CUDA_SUCCESS); - ASSERT_EQ(current, cudaContext); ASSERT_EQ((plugin->call_nocheck(ctxt)), PI_SUCCESS); } From 868547506fe323f69fc263c385b85012b08eff63 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 3 Feb 2023 14:51:46 +0000 Subject: [PATCH 2/2] [SYCL][CUDA] Move deprecation warning to class Older versions of gcc struggle with attributes on namespaces --- sycl/include/sycl/properties/context_properties.hpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/properties/context_properties.hpp b/sycl/include/sycl/properties/context_properties.hpp index 68b3d37dba5b4..d0764493fb063 100644 --- a/sycl/include/sycl/properties/context_properties.hpp +++ b/sycl/include/sycl/properties/context_properties.hpp @@ -14,14 +14,11 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace ext::oneapi::cuda::property { -namespace __SYCL_DEPRECATED( - "the primary contexts are now always used") context { -class use_primary_context : public ::sycl::detail::DataLessProperty< - ::sycl::detail::UsePrimaryContext> {}; -} // namespace __SYCL_DEPRECATED("the primary contexts are now always - // used")context -} // namespace ext::oneapi::cuda::property +namespace ext::oneapi::cuda::property::context { +class __SYCL_DEPRECATED("the primary contexts are now always used") + use_primary_context : public ::sycl::detail::DataLessProperty< + ::sycl::detail::UsePrimaryContext> {}; +} // namespace ext::oneapi::cuda::property::context namespace property::context { namespace __SYCL2020_DEPRECATED(