Skip to content

Commit f0f916c

Browse files
author
Steffen Larsen
committed
[SYCL][CUDA] Event synchronization only done for latest events
CUDA streams operate in-order, so when waiting for a list of events we introduce unnecessary overhead. These changes makes the PI CUDA backend only wait for the latest event for each stream in a given list of events. Signed-off-by: Steffen Larsen <[email protected]>
1 parent fdf44d4 commit f0f916c

File tree

2 files changed

+50
-21
lines changed

2 files changed

+50
-21
lines changed

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,40 @@ inline void assign_result(pi_result *ptr, pi_result value) noexcept {
6767
// Invokes the callback for each event in the wait list. The callback must take
6868
// a single pi_event argument and return a pi_result.
6969
template <typename Func>
70-
pi_result forEachEvent(const pi_event *event_wait_list,
71-
std::size_t num_events_in_wait_list, Func &&f) {
70+
pi_result forLatestEvents(const pi_event *event_wait_list,
71+
std::size_t num_events_in_wait_list, Func &&f) {
7272

7373
if (event_wait_list == nullptr || num_events_in_wait_list == 0) {
7474
return PI_INVALID_EVENT_WAIT_LIST;
7575
}
7676

77-
for (size_t i = 0; i < num_events_in_wait_list; i++) {
78-
auto event = event_wait_list[i];
79-
if (event == nullptr) {
80-
return PI_INVALID_EVENT_WAIT_LIST;
77+
// Fast path if we only have a single event
78+
if (num_events_in_wait_list == 1) {
79+
return f(event_wait_list[0]);
80+
}
81+
82+
std::vector<pi_event> events{event_wait_list,
83+
event_wait_list + num_events_in_wait_list};
84+
std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) {
85+
// Tiered sort creating sublists of streams (smallest value first) in which
86+
// the corresponding events are sorted into a sequence of newest first.
87+
return e0->get_queue()->stream_ < e1->get_queue()->stream_ ||
88+
(e0->get_queue()->stream_ == e1->get_queue()->stream_ &&
89+
e0->get_event_id() > e1->get_event_id());
90+
});
91+
92+
bool first = true;
93+
CUstream lastSeenStream = 0;
94+
for (pi_event event : events) {
95+
CUstream stream = event->get_queue()->stream_;
96+
97+
if (!event || (!first && stream == lastSeenStream)) {
98+
continue;
8199
}
82100

101+
first = false;
102+
lastSeenStream = stream;
103+
83104
auto result = f(event);
84105
if (result != PI_SUCCESS) {
85106
return result;
@@ -354,6 +375,11 @@ pi_result _pi_event::record() {
354375
CUstream cuStream = queue_->get();
355376

356377
try {
378+
eventId_ = queue_->get_next_event_id();
379+
if (eventId_ == 0) {
380+
cl::sycl::detail::pi::die(
381+
"Unrecoverable program state reached in event identifier overflow");
382+
}
357383
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
358384
} catch (pi_result error) {
359385
result = error;
@@ -1958,8 +1984,8 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
19581984
pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19591985

19601986
try {
1961-
pi_result err = PI_SUCCESS;
1962-
1987+
assert(num_events != 0);
1988+
assert(event_list);
19631989
if (num_events == 0) {
19641990
return PI_INVALID_VALUE;
19651991
}
@@ -1971,11 +1997,7 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19711997
auto context = event_list[0]->get_context();
19721998
ScopedContext active(context);
19731999

1974-
for (pi_uint32 count = 0; count < num_events && (err == PI_SUCCESS);
1975-
count++) {
1976-
1977-
auto event = event_list[count];
1978-
2000+
auto waitFunc = [context](pi_event event) -> pi_result {
19792001
if (!event) {
19802002
return PI_INVALID_EVENT;
19812003
}
@@ -1984,9 +2006,9 @@ pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) {
19842006
return PI_INVALID_CONTEXT;
19852007
}
19862008

1987-
err = event->wait();
1988-
}
1989-
return err;
2009+
return event->wait();
2010+
};
2011+
return forLatestEvents(event_list, num_events, waitFunc);
19902012
} catch (pi_result err) {
19912013
return err;
19922014
} catch (...) {
@@ -2760,10 +2782,10 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
27602782

27612783
if (event_wait_list) {
27622784
auto result =
2763-
forEachEvent(event_wait_list, num_events_in_wait_list,
2764-
[command_queue](pi_event event) -> pi_result {
2765-
return enqueueEventWait(command_queue, event);
2766-
});
2785+
forLatestEvents(event_wait_list, num_events_in_wait_list,
2786+
[command_queue](pi_event event) -> pi_result {
2787+
return enqueueEventWait(command_queue, event);
2788+
});
27672789

27682790
if (result != PI_SUCCESS) {
27692791
return result;

sycl/plugins/cuda/pi_cuda.hpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,12 @@ struct _pi_queue {
281281
_pi_device *device_;
282282
pi_queue_properties properties_;
283283
std::atomic_uint32_t refCount_;
284+
std::atomic_uint32_t eventCount_;
284285

285286
_pi_queue(CUstream stream, _pi_context *context, _pi_device *device,
286287
pi_queue_properties properties)
287288
: stream_{stream}, context_{context}, device_{device},
288-
properties_{properties}, refCount_{1} {
289+
properties_{properties}, refCount_{1}, eventCount_{0} {
289290
cuda_piContextRetain(context_);
290291
cuda_piDeviceRetain(device_);
291292
}
@@ -304,6 +305,8 @@ struct _pi_queue {
304305
pi_uint32 decrement_reference_count() noexcept { return --refCount_; }
305306

306307
pi_uint32 get_reference_count() const noexcept { return refCount_; }
308+
309+
pi_uint32 get_next_event_id() noexcept { return ++eventCount_; }
307310
};
308311

309312
typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
@@ -352,6 +355,8 @@ class _pi_event {
352355

353356
pi_uint32 decrement_reference_count() { return --refCount_; }
354357

358+
pi_uint32 get_event_id() const noexcept { return eventId_; }
359+
355360
// Returns the counter time when the associated command(s) were enqueued
356361
//
357362
pi_uint64 get_queued_time() const;
@@ -389,6 +394,8 @@ class _pi_event {
389394
// PI event has started or not
390395
//
391396

397+
pi_uint32 eventId_; // Queue identifier of the event.
398+
392399
native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
393400
// event, this will be nullptr.
394401

0 commit comments

Comments
 (0)