Skip to content

[SYCL][CUDA] Remove pi Event Callback implementation #1735

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 29, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 29 additions & 125 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,13 +276,15 @@ _pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue)
isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, evQueued_{nullptr},
queue_{queue}, context_{context} {

if (is_native_event()) {
if (type != PI_COMMAND_TYPE_USER) {
PI_CHECK_ERROR(cuEventCreate(&evEnd_, CU_EVENT_DEFAULT));

if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT));
PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT));
}
} else {
cl::sycl::detail::pi::die("User-defined events not implemented");
Copy link
Contributor

@nyalloc nyalloc May 21, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to give more granular message so that the user knows this error is coming from the CUDA backend. Same comment on line 2709.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in adcf897

}

if (queue_ != nullptr) {
Expand All @@ -303,7 +305,7 @@ pi_result _pi_event::start() {
pi_result result;

try {
if (is_native_event() && queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) {
// NOTE: This relies on the default stream to be unused.
result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0));
result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get()));
Expand All @@ -313,8 +315,6 @@ pi_result _pi_event::start() {
}

isStarted_ = true;
// let observers know that the event is "submitted"
trigger_callback(get_execution_status());
return result;
}

Expand Down Expand Up @@ -351,37 +351,16 @@ pi_result _pi_event::record() {

pi_result result = PI_INVALID_OPERATION;

if (is_native_event()) {

if (!queue_) {
return PI_INVALID_QUEUE;
}
if (!queue_) {
return PI_INVALID_QUEUE;
}

CUstream cuStream = queue_->get();
CUstream cuStream = queue_->get();

try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));

result = cuda_piEventRetain(this);
try {
result = PI_CHECK_ERROR(cuLaunchHostFunc(
cuStream,
[](void *userData) {
pi_event event = reinterpret_cast<pi_event>(userData);
event->set_event_complete();
cuda_piEventRelease(event);
},
this));
} catch (...) {
// If host function fails to enqueue we must release the event here
result = cuda_piEventRelease(this);
throw;
}
} catch (pi_result error) {
result = error;
}
} else {
result = PI_SUCCESS;
try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
} catch (pi_result error) {
result = error;
}

if (result == PI_SUCCESS) {
Expand All @@ -392,65 +371,23 @@ pi_result _pi_event::record() {
}

pi_result _pi_event::wait() {

pi_result retErr;
if (is_native_event()) {
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}
} else {

while (!is_completed()) {
// wait for user event to complete
}
retErr = PI_SUCCESS;
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}

auto is_success = retErr == PI_SUCCESS;
auto status = is_success ? get_execution_status() : pi_int32(retErr);

trigger_callback(status);

return retErr;
}

// makes all future work submitted to queue wait for all work captured in event.
pi_result enqueueEventWait(pi_queue queue, pi_event event) {
if (event->is_native_event()) {

// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.

return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));

} else {

// for user events, we enqueue a callback. When invoked, the
// callback will block until the user event is marked as
// completed.

static auto user_wait_func = [](void *user_data) {
// The host function must not make any CUDA API calls.
auto event = static_cast<pi_event>(user_data);

// busy wait for user event to complete
event->wait();

// this function does not need the event to be kept alive
// anymore
cuda_piEventRelease(event);
};

// retain event to ensure it is still alive when the
// user_wait_func callback is invoked
cuda_piEventRetain(event);

return PI_CHECK_ERROR(cuLaunchHostFunc(queue->get(), user_wait_func, event));
}
// for native events, the cuStreamWaitEvent call is used.
// This makes all future work submitted to stream wait for all
// work captured in event.
return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0));
}

_pi_program::_pi_program(pi_context ctxt)
Expand Down Expand Up @@ -2763,37 +2700,13 @@ pi_result cuda_piEventSetCallback(pi_event event,
pi_int32 command_exec_callback_type,
pfn_notify notify, void *user_data) {

assert(event);
assert(notify);
assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
command_exec_callback_type == PI_EVENT_RUNNING ||
command_exec_callback_type == PI_EVENT_COMPLETE);
event_callback callback(pi_event_status(command_exec_callback_type), notify,
user_data);

event->set_event_callback(callback);

cl::sycl::detail::pi::die("Event Callback not implemented");
return PI_SUCCESS;
}

pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {

assert(execution_status >= PI_EVENT_COMPLETE &&
execution_status <= PI_EVENT_QUEUED);

if (!event || event->is_native_event()) {
return PI_INVALID_EVENT;
}

if (execution_status == PI_EVENT_COMPLETE) {
return event->set_event_complete();
} else if (execution_status < 0) {
// TODO: A negative integer value causes all enqueued commands that wait
// on this user event to be terminated.
cl::sycl::detail::pi::die("cuda_piEventSetStatus support for negative execution_status not "
"implemented.");
}

cl::sycl::detail::pi::die("Event Set Status not implemented");
return PI_INVALID_VALUE;
}

Expand Down Expand Up @@ -2821,19 +2734,13 @@ pi_result cuda_piEventRelease(pi_event event) {
if (event->decrement_reference_count() == 0) {
std::unique_ptr<_pi_event> event_ptr{event};
pi_result result = PI_INVALID_EVENT;

if (event->is_native_event()) {
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}
} else {
result = PI_SUCCESS;
try {
ScopedContext active(event->get_context());
auto cuEvent = event->get();
result = PI_CHECK_ERROR(cuEventDestroy(cuEvent));
} catch (...) {
result = PI_OUT_OF_RESOURCES;
}

return result;
}

Expand Down Expand Up @@ -2888,9 +2795,6 @@ pi_result cuda_piEnqueueEventsWait(pi_queue command_queue,
/// \return PI_SUCCESS on success. PI_INVALID_EVENT if given a user event.
pi_result cuda_piextEventGetNativeHandle(pi_event event,
pi_native_handle *nativeHandle) {
if (event->is_user_event()) {
return PI_INVALID_EVENT;
}
*nativeHandle = reinterpret_cast<pi_native_handle>(event->get());
return PI_SUCCESS;
}
Expand Down
101 changes: 7 additions & 94 deletions sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,37 +302,6 @@ struct _pi_queue {

typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
void *userData);

class event_callback {
public:
void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {

auto validParameters = callback_ && event;

// As a pi_event_status value approaches 0, it gets closer to completion.
// If the calling pi_event's status is less than or equal to the event
// status the user is interested in, invoke the callback anyway. The event
// will have passed through that state anyway.
auto validStatus = currentEventStatus <= observedEventStatus_;

if (validParameters && validStatus) {

callback_(event, currentEventStatus, userData_);
}
}

event_callback(pi_event_status status, pfn_notify callback, void *userData)
: observedEventStatus_{status}, callback_{callback}, userData_{userData} {
}

pi_event_status get_status() const noexcept { return observedEventStatus_; }

private:
pi_event_status observedEventStatus_;
pfn_notify callback_;
void *userData_;
};

/// PI Event mapping to CUevent
///
class _pi_event {
Expand All @@ -347,41 +316,6 @@ class _pi_event {

native_type get() const noexcept { return evEnd_; };

pi_result set_event_complete() noexcept {

if (isCompleted_) {
return PI_INVALID_OPERATION;
}

isRecorded_ = true;
isCompleted_ = true;

trigger_callback(get_execution_status());

return PI_SUCCESS;
}

void trigger_callback(pi_int32 status) {

std::vector<event_callback> callbacks;

// Here we move all callbacks into local variable before we call them.
// This is a defensive maneuver; if any of the callbacks attempt to
// add additional callbacks, we will end up in a bad spot. Our mutex
// will be locked twice and the vector will be modified as it is being
// iterated over! By moving everything locally, we can call all of these
// callbacks and let them modify the original vector without much worry.

{
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.swap(callbacks);
}

for (auto &event_callback : callbacks) {
event_callback.trigger_callback(this, status);
}
}

pi_queue get_queue() const noexcept { return queue_; }

pi_command_type get_command_type() const noexcept { return commandType_; }
Expand All @@ -390,10 +324,10 @@ class _pi_event {

bool is_recorded() const noexcept { return isRecorded_; }

bool is_completed() const noexcept { return isCompleted_; }

bool is_started() const noexcept { return isStarted_; }

bool is_completed() const noexcept { return isCompleted_; };

pi_int32 get_execution_status() const noexcept {

if (!is_recorded()) {
Expand All @@ -406,24 +340,8 @@ class _pi_event {
return PI_EVENT_COMPLETE;
}

void set_event_callback(const event_callback &callback) {
auto current_status = get_execution_status();
if (current_status <= callback.get_status()) {
callback.trigger_callback(this, current_status);
} else {
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.emplace_back(callback);
}
}

pi_context get_context() const noexcept { return context_; };

bool is_user_event() const noexcept {
return get_command_type() == PI_COMMAND_TYPE_USER;
}

bool is_native_event() const noexcept { return !is_user_event(); }

pi_uint32 increment_reference_count() { return ++refCount_; }

pi_uint32 decrement_reference_count() { return --refCount_; }
Expand Down Expand Up @@ -462,13 +380,14 @@ class _pi_event {

std::atomic_uint32_t refCount_; // Event reference count.

std::atomic_bool isCompleted_; // Atomic bool used by user events. Can be
// used to wait for a user event's completion.
bool isCompleted_; // Signifies whether the operations have completed
//

bool isRecorded_; // Signifies wether a native CUDA event has been recorded
// yet.
bool isStarted_; // Signifies wether the operation associated with the
// PI event has started or not
bool isStarted_; // Signifies wether the operation associated with the
// PI event has started or not
//

native_type evEnd_; // CUDA event handle. If this _pi_event represents a user
// event, this will be nullptr.
Expand All @@ -484,12 +403,6 @@ class _pi_event {
pi_context context_; // pi_context associated with the event. If this is a
// native event, this will be the same context associated
// with the queue_ member.

std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
// a lock-free data structure we can use here.
std::vector<event_callback>
event_callbacks_; // Callbacks that can be triggered when an event's state
// changes.
};

/// Implementation of PI Program on CUDA Module object
Expand Down