Skip to content

Commit 6a76d50

Browse files
authored
[SYCL][CUDA] Move base event into the device (#8312)
Events are tied to a specific context: calling `cuEventElapsedTime` with two events created in separate contexts returns a `CUDA_ERROR_INVALID_HANDLE` error. So it really doesn't make sense to keep the base event on the platform, and now that we're using the primary context and have tied it to the device, it makes sense to also move the base event into the device class. This is technically not a problem right now because we only have one device per platform, but it's a bit cleaner and will help towards the multi-device context work.
1 parent 960a063 commit 6a76d50

File tree

2 files changed

+26
-38
lines changed

2 files changed

+26
-38
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 19 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -566,31 +566,27 @@ bool _pi_event::is_completed() const noexcept {
566566
return true;
567567
}
568568

569-
pi_uint64 _pi_event::get_queued_time() const {
569+
pi_uint64 _pi_device::get_elapsed_time(CUevent ev) const {
570570
float miliSeconds = 0.0f;
571-
assert(is_started());
572571

573-
PI_CHECK_ERROR(
574-
cuEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evQueued_));
572+
PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev));
573+
575574
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
576575
}
577576

578-
pi_uint64 _pi_event::get_start_time() const {
579-
float miliSeconds = 0.0f;
577+
pi_uint64 _pi_event::get_queued_time() const {
580578
assert(is_started());
579+
return queue_->get_device()->get_elapsed_time(evQueued_);
580+
}
581581

582-
PI_CHECK_ERROR(
583-
cuEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evStart_));
584-
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
582+
pi_uint64 _pi_event::get_start_time() const {
583+
assert(is_started());
584+
return queue_->get_device()->get_elapsed_time(evStart_);
585585
}
586586

587587
pi_uint64 _pi_event::get_end_time() const {
588-
float miliSeconds = 0.0f;
589588
assert(is_started() && is_recorded());
590-
591-
PI_CHECK_ERROR(
592-
cuEventElapsedTime(&miliSeconds, _pi_platform::evBase_, evEnd_));
593-
return static_cast<pi_uint64>(miliSeconds * 1.0e6);
589+
return queue_->get_device()->get_elapsed_time(evEnd_);
594590
}
595591

596592
pi_result _pi_event::record() {
@@ -830,8 +826,15 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
830826
CUcontext context;
831827
err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device));
832828

829+
ScopedContext active(context);
830+
CUevent evBase;
831+
err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT));
832+
833+
// Use default stream to record base event counter
834+
err = PI_CHECK_ERROR(cuEventRecord(evBase, 0));
835+
833836
platformIds[i].devices_.emplace_back(
834-
new _pi_device{device, context, &platformIds[i]});
837+
new _pi_device{device, context, evBase, &platformIds[i]});
835838

836839
{
837840
const auto &dev = platformIds[i].devices_.back().get();
@@ -2061,18 +2064,6 @@ pi_result cuda_piContextCreate(const pi_context_properties *properties,
20612064
std::unique_ptr<_pi_context> piContextPtr{nullptr};
20622065
try {
20632066
piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices});
2064-
2065-
static std::once_flag initFlag;
2066-
std::call_once(
2067-
initFlag,
2068-
[](pi_result &err) {
2069-
// Use default stream to record base event counter
2070-
PI_CHECK_ERROR(
2071-
cuEventCreate(&_pi_platform::evBase_, CU_EVENT_DEFAULT));
2072-
PI_CHECK_ERROR(cuEventRecord(_pi_platform::evBase_, 0));
2073-
},
2074-
errcode_ret);
2075-
20762067
*retcontext = piContextPtr.release();
20772068
} catch (pi_result err) {
20782069
errcode_ret = err;
@@ -5537,11 +5528,7 @@ pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
55375528

55385529
if (DeviceTime) {
55395530
PI_CHECK_ERROR(cuEventSynchronize(event));
5540-
5541-
float elapsedTime = 0.0f;
5542-
PI_CHECK_ERROR(
5543-
cuEventElapsedTime(&elapsedTime, _pi_platform::evBase_, event));
5544-
*DeviceTime = (uint64_t)(elapsedTime * (double)1e6);
5531+
*DeviceTime = Device->get_elapsed_time(event);
55455532
}
55465533

55475534
return PI_SUCCESS;
@@ -5708,5 +5695,3 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
57085695
}
57095696

57105697
} // extern "C"
5711-
5712-
CUevent _pi_platform::evBase_{nullptr};

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ using _pi_stream_guard = std::unique_lock<std::mutex>;
7272
/// when devices are used.
7373
///
7474
struct _pi_platform {
75-
static CUevent evBase_; // CUDA event used as base counter
7675
std::vector<std::unique_ptr<_pi_device>> devices_;
7776
};
7877

@@ -87,6 +86,7 @@ struct _pi_device {
8786

8887
native_type cuDevice_;
8988
CUcontext cuContext_;
89+
CUevent evBase_; // CUDA event used as base counter
9090
std::atomic_uint32_t refCount_;
9191
pi_platform platform_;
9292

@@ -95,9 +95,10 @@ struct _pi_device {
9595
int max_work_group_size;
9696

9797
public:
98-
_pi_device(native_type cuDevice, CUcontext cuContext, pi_platform platform)
99-
: cuDevice_(cuDevice), cuContext_(cuContext), refCount_{1},
100-
platform_(platform) {}
98+
_pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase,
99+
pi_platform platform)
100+
: cuDevice_(cuDevice), cuContext_(cuContext),
101+
evBase_(evBase), refCount_{1}, platform_(platform) {}
101102

102103
~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); }
103104

@@ -109,6 +110,8 @@ struct _pi_device {
109110

110111
pi_platform get_platform() const noexcept { return platform_; };
111112

113+
pi_uint64 get_elapsed_time(CUevent) const;
114+
112115
void save_max_work_item_sizes(size_t size,
113116
size_t *save_max_work_item_sizes) noexcept {
114117
memcpy(max_work_item_sizes, save_max_work_item_sizes, size);

0 commit comments

Comments
 (0)