Skip to content

[SYCL][CUDA] Decouple CUDA contexts from PI contexts #8197

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions sycl/include/sycl/detail/properties_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ __SYCL_PARAM_TRAITS_SPEC(sycl::property::no_init)
__SYCL_PARAM_TRAITS_SPEC(
sycl::property::context::cuda::use_primary_context) // Deprecated
__SYCL_PARAM_TRAITS_SPEC(
sycl::ext::oneapi::cuda::property::context::use_primary_context)
sycl::ext::oneapi::cuda::property::context::use_primary_context) // Deprecated
__SYCL_PARAM_TRAITS_SPEC(sycl::property::queue::in_order)
__SYCL_PARAM_TRAITS_SPEC(sycl::property::reduction::initialize_to_identity)
__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_low)
__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_high)
__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_normal)
__SYCL_PARAM_TRAITS_SPEC(sycl::ext::oneapi::property::queue::priority_normal)
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ template <> struct BackendReturn<backend::ext_oneapi_cuda, platform> {
template <> struct InteropFeatureSupportMap<backend::ext_oneapi_cuda> {
static constexpr bool MakePlatform = false;
static constexpr bool MakeDevice = true;
static constexpr bool MakeContext = true;
static constexpr bool MakeContext = false;
static constexpr bool MakeQueue = true;
static constexpr bool MakeEvent = true;
static constexpr bool MakeBuffer = false;
Expand Down
5 changes: 3 additions & 2 deletions sycl/include/sycl/properties/context_properties.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace ext::oneapi::cuda::property::context {
class use_primary_context : public ::sycl::detail::DataLessProperty<
::sycl::detail::UsePrimaryContext> {};
class __SYCL_DEPRECATED("the primary contexts are now always used")
use_primary_context : public ::sycl::detail::DataLessProperty<
::sycl::detail::UsePrimaryContext> {};
} // namespace ext::oneapi::cuda::property::context

namespace property::context {
Expand Down
107 changes: 12 additions & 95 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -917,8 +917,11 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
for (int i = 0; i < numDevices; ++i) {
CUdevice device;
err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
CUcontext context;
err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device));

platformIds[i].devices_.emplace_back(
new _pi_device{device, &platformIds[i]});
new _pi_device{device, context, &platformIds[i]});

{
const auto &dev = platformIds[i].devices_.back().get();
Expand Down Expand Up @@ -1183,6 +1186,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,

assert(device != nullptr);

ScopedContext active(device->get_context());

switch (param_name) {
case PI_DEVICE_INFO_TYPE: {
return getInfo(param_value_size, param_value, param_value_size_ret,
Expand Down Expand Up @@ -1961,7 +1966,6 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
}

case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: {
ScopedContext active(device);
size_t FreeMemory = 0;
size_t TotalMemory = 0;
sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) ==
Expand Down Expand Up @@ -2121,50 +2125,10 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
assert(retcontext != nullptr);
pi_result errcode_ret = PI_SUCCESS;

// Parse properties.
bool property_cuda_primary = false;
while (properties && (0 != *properties)) {
// Consume property ID.
pi_context_properties id = *properties;
++properties;
// Consume property value.
pi_context_properties value = *properties;
++properties;
switch (id) {
case __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
assert(value == PI_FALSE || value == PI_TRUE);
property_cuda_primary = static_cast<bool>(value);
break;
default:
// Unknown property.
sycl::detail::pi::die(
"Unknown piContextCreate property in property list");
return PI_ERROR_INVALID_VALUE;
}
}

std::unique_ptr<_pi_context> piContextPtr{nullptr};
try {
CUcontext current = nullptr;

if (property_cuda_primary) {
// Use the CUDA primary context and assume that we want to use it
// immediately as we want to forge context switches.
CUcontext Ctxt;
errcode_ret =
PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->get()));
piContextPtr = std::unique_ptr<_pi_context>(
new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
} else {
// Create a scoped context.
CUcontext newContext;
PI_CHECK_ERROR(cuCtxGetCurrent(&current));
errcode_ret = PI_CHECK_ERROR(
cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->get()));
piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
_pi_context::kind::user_defined, newContext, *devices});
}
piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices});

static std::once_flag initFlag;
std::call_once(
initFlag,
Expand All @@ -2176,14 +2140,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
},
errcode_ret);

// For non-primary scoped contexts keep the last active on top of the stack
// as `cuCtxCreate` replaces it implicitly otherwise.
// Primary contexts are kept on top of the stack, so the previous context
// is not queried and therefore not recovered.
if (current != nullptr) {
PI_CHECK_ERROR(cuCtxSetCurrent(current));
}

*retcontext = piContextPtr.release();
} catch (pi_result err) {
errcode_ret = err;
Expand All @@ -2194,7 +2150,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
}

pi_result cuda_piContextRelease(pi_context ctxt) {

assert(ctxt != nullptr);

if (ctxt->decrement_reference_count() > 0) {
Expand All @@ -2204,29 +2159,7 @@ pi_result cuda_piContextRelease(pi_context ctxt) {

std::unique_ptr<_pi_context> context{ctxt};

if (!ctxt->backend_has_ownership())
return PI_SUCCESS;

if (!ctxt->is_primary()) {
CUcontext cuCtxt = ctxt->get();
CUcontext current = nullptr;
cuCtxGetCurrent(&current);
if (cuCtxt != current) {
PI_CHECK_ERROR(cuCtxPushCurrent(cuCtxt));
}
PI_CHECK_ERROR(cuCtxSynchronize());
cuCtxGetCurrent(&current);
if (cuCtxt == current) {
PI_CHECK_ERROR(cuCtxPopCurrent(&current));
}
return PI_CHECK_ERROR(cuCtxDestroy(cuCtxt));
}

// Primary context is not destroyed, but released
CUdevice cuDev = ctxt->get_device()->get();
CUcontext current;
cuCtxPopCurrent(&current);
return PI_CHECK_ERROR(cuDevicePrimaryCtxRelease(cuDev));
return PI_SUCCESS;
}

/// Gets the native CUDA handle of a PI context object
Expand All @@ -2253,29 +2186,15 @@ pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
const pi_device *devices,
bool ownNativeHandle,
pi_context *piContext) {
(void)nativeHandle;
(void)num_devices;
(void)devices;
(void)ownNativeHandle;
(void)piContext;
assert(piContext != nullptr);
assert(ownNativeHandle == false);

CUcontext newContext = reinterpret_cast<CUcontext>(nativeHandle);

ScopedContext active(newContext);

// Get context's native device
CUdevice cu_device;
pi_result retErr = PI_CHECK_ERROR(cuCtxGetDevice(&cu_device));

// Create a SYCL device from the ctx device
pi_device device = nullptr;
retErr = cuda_piextDeviceCreateWithNativeHandle(cu_device, nullptr, &device);

// Create sycl context
*piContext = new _pi_context{_pi_context::kind::user_defined, newContext,
device, /*backend_owns*/ false};

return retErr;
return PI_ERROR_INVALID_OPERATION;
}

/// Creates a PI Memory object using a CUDA memory allocation.
Expand Down Expand Up @@ -2469,8 +2388,6 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,

std::unique_ptr<_pi_mem> retMemObj{nullptr};
try {
ScopedContext active(context);

retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{
context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}};
} catch (pi_result err) {
Expand Down
28 changes: 10 additions & 18 deletions sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,28 +86,29 @@ struct _pi_device {
using native_type = CUdevice;

native_type cuDevice_;
CUcontext cuContext_;
std::atomic_uint32_t refCount_;
pi_platform platform_;
pi_context context_;

static constexpr pi_uint32 max_work_item_dimensions = 3u;
size_t max_work_item_sizes[max_work_item_dimensions];
int max_work_group_size;

public:
_pi_device(native_type cuDevice, pi_platform platform)
: cuDevice_(cuDevice), refCount_{1}, platform_(platform) {}
_pi_device(native_type cuDevice, CUcontext cuContext, pi_platform platform)
: cuDevice_(cuDevice), cuContext_(cuContext), refCount_{1},
platform_(platform) {}

~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); }

native_type get() const noexcept { return cuDevice_; };

CUcontext get_context() const noexcept { return cuContext_; };

pi_uint32 get_reference_count() const noexcept { return refCount_; }

pi_platform get_platform() const noexcept { return platform_; };

void set_context(pi_context ctx) { context_ = ctx; };

pi_context get_context() { return context_; };

void save_max_work_item_sizes(size_t size,
size_t *save_max_work_item_sizes) noexcept {
memcpy(max_work_item_sizes, save_max_work_item_sizes, size);
Expand Down Expand Up @@ -174,16 +175,12 @@ struct _pi_context {

using native_type = CUcontext;

enum class kind { primary, user_defined } kind_;
native_type cuContext_;
_pi_device *deviceId_;
std::atomic_uint32_t refCount_;

_pi_context(kind k, CUcontext ctxt, _pi_device *devId,
bool backend_owns = true)
: kind_{k}, cuContext_{ctxt}, deviceId_{devId}, refCount_{1},
has_ownership{backend_owns} {
deviceId_->set_context(this);
_pi_context(_pi_device *devId)
: cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} {
cuda_piDeviceRetain(deviceId_);
};

Expand All @@ -206,20 +203,15 @@ struct _pi_context {

native_type get() const noexcept { return cuContext_; }

bool is_primary() const noexcept { return kind_ == kind::primary; }

pi_uint32 increment_reference_count() noexcept { return ++refCount_; }

pi_uint32 decrement_reference_count() noexcept { return --refCount_; }

pi_uint32 get_reference_count() const noexcept { return refCount_; }

bool backend_has_ownership() const noexcept { return has_ownership; }

private:
std::mutex mutex_;
std::vector<deleter_data> extended_deleters_;
const bool has_ownership;
};

/// PI Mem mapping to CUDA memory allocations, both data and texture/surface.
Expand Down
2 changes: 0 additions & 2 deletions sycl/test/basic_tests/interop-cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,6 @@ int main() {

backend_input_t<backend::ext_oneapi_cuda, context> InteropContextInput{
cu_context[0]};
context InteropContext =
make_context<backend::ext_oneapi_cuda>(InteropContextInput);
event InteropEvent = make_event<backend::ext_oneapi_cuda>(cu_event, Context);

queue InteropQueue = make_queue<backend::ext_oneapi_cuda>(cu_queue, Context);
Expand Down
90 changes: 0 additions & 90 deletions sycl/unittests/pi/cuda/test_base_objects.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,96 +79,6 @@ TEST_F(CudaBaseObjectsTest, piContextCreate) {
cuCtxGetApiVersion(cudaContext, &version);
EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION);

CUresult cuErr = cuCtxDestroy(cudaContext);
ASSERT_EQ(cuErr, CUDA_SUCCESS);
}

TEST_F(CudaBaseObjectsTest, piContextCreatePrimaryTrue) {
pi_uint32 numPlatforms = 0;
pi_platform platform;
pi_device device;

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
0, nullptr, &numPlatforms)),
PI_SUCCESS)
<< "piPlatformsGet failed.\n";

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
numPlatforms, &platform, nullptr)),
PI_SUCCESS)
<< "piPlatformsGet failed.\n";

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
PI_SUCCESS);
pi_context_properties properties[] = {
__SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_TRUE, 0};

pi_context ctxt;
ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
properties, 1, &device, nullptr, nullptr, &ctxt)),
PI_SUCCESS);
EXPECT_NE(ctxt, nullptr);
EXPECT_EQ(ctxt->get_device(), device);
EXPECT_TRUE(ctxt->is_primary());

// Retrieve the cuCtxt to check information is correct
CUcontext cudaContext = ctxt->get();
unsigned int version = 0;
CUresult cuErr = cuCtxGetApiVersion(cudaContext, &version);
ASSERT_EQ(cuErr, CUDA_SUCCESS);
EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION);

// Current context in the stack?
CUcontext current;
cuErr = cuCtxGetCurrent(&current);
ASSERT_EQ(cuErr, CUDA_SUCCESS);
ASSERT_EQ(current, cudaContext);
ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
PI_SUCCESS);
}

TEST_F(CudaBaseObjectsTest, piContextCreatePrimaryFalse) {
pi_uint32 numPlatforms = 0;
pi_platform platform;
pi_device device;

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
0, nullptr, &numPlatforms)),
PI_SUCCESS)
<< "piPlatformsGet failed.\n";

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
numPlatforms, &platform, nullptr)),
PI_SUCCESS)
<< "piPlatformsGet failed.\n";

ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
platform, PI_DEVICE_TYPE_GPU, 1, &device, nullptr)),
PI_SUCCESS);
pi_context_properties properties[] = {
__SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY, PI_FALSE, 0};

pi_context ctxt;
ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
properties, 1, &device, nullptr, nullptr, &ctxt)),
PI_SUCCESS);
EXPECT_NE(ctxt, nullptr);
EXPECT_EQ(ctxt->get_device(), device);
EXPECT_FALSE(ctxt->is_primary());

// Retrieve the cuCtxt to check information is correct
CUcontext cudaContext = ctxt->get();
unsigned int version = 0;
CUresult cuErr = cuCtxGetApiVersion(cudaContext, &version);
ASSERT_EQ(cuErr, CUDA_SUCCESS);
EXPECT_EQ(version, LATEST_KNOWN_CUDA_DRIVER_API_VERSION);

// Current context in the stack?
CUcontext current;
cuErr = cuCtxGetCurrent(&current);
ASSERT_EQ(cuErr, CUDA_SUCCESS);
ASSERT_EQ(current, cudaContext);
ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextRelease>(ctxt)),
PI_SUCCESS);
}
Expand Down