From 92b6d49ebd732b3459b0bad49092cb6b602900bb Mon Sep 17 00:00:00 2001
From: Martin Wehking
Date: Wed, 17 Apr 2024 15:38:04 +0100
Subject: [PATCH 1/8] Cache events instead of destroying them for CUDA

Don't destroy events that are no longer needed; instead, put them on a
stack owned by their associated queue (if there is one). Reuse events from
the stack before creating new ones.

This caching mechanism significantly reduces the number of CUDA API calls
made for event creation and destruction.
---
 source/adapters/cuda/event.cpp | 56 +++++++++++++++++++++++-----------
 source/adapters/cuda/event.hpp | 17 ++++++++++-
 source/adapters/cuda/queue.cpp | 10 ++++++
 source/adapters/cuda/queue.hpp | 19 +++++++++---
 4 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp
index 1e8f2dd384..3a74b80ece 100644
--- a/source/adapters/cuda/event.cpp
+++ b/source/adapters/cuda/event.cpp
@@ -42,11 +42,32 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context,
   urContextRetain(Context);
 }
 
+void ur_event_handle_t_::reset() {
+  RefCount = 0;
+  HasBeenWaitedOn = false;
+  IsRecorded = false;
+  IsStarted = false;
+  Queue = nullptr;
+  Context = nullptr;
+}
+
 ur_event_handle_t_::~ur_event_handle_t_() {
+  if (HasOwnership) {
+    if (EvEnd)
+      UR_CHECK_ERROR(cuEventDestroy(EvEnd));
+
+    if (EvQueued)
+      UR_CHECK_ERROR(cuEventDestroy(EvQueued));
+
+    if (EvStart)
+      UR_CHECK_ERROR(cuEventDestroy(EvStart));
+  }
   if (Queue != nullptr) {
     urQueueRelease(Queue);
   }
-  urContextRelease(Context);
+  if (Context != nullptr) {
+    urContextRelease(Context);
+  }
 }
 
 ur_result_t ur_event_handle_t_::start() {
@@ -141,22 +162,6 @@ ur_result_t ur_event_handle_t_::wait() {
   return Result;
 }
 
-ur_result_t ur_event_handle_t_::release() {
-  if (!backendHasOwnership())
-    return UR_RESULT_SUCCESS;
-
-  assert(Queue != nullptr);
-
-  UR_CHECK_ERROR(cuEventDestroy(EvEnd));
-
-  if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
-    UR_CHECK_ERROR(cuEventDestroy(EvQueued));
-    UR_CHECK_ERROR(cuEventDestroy(EvStart));
-  }
-
-  return UR_RESULT_SUCCESS;
-}
-
 UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent,
                                                    ur_event_info_t propName,
                                                    size_t propValueSize,
@@ -257,7 +262,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
     ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT;
     try {
       ScopedContext Active(hEvent->getContext());
-      Result = hEvent->release();
+      if (!hEvent->backendHasOwnership()) {
+        Result = UR_RESULT_SUCCESS;
+      }
+      else {
+        assert(hEvent->getQueue() != nullptr);
+        assert(hEvent->getContext() != nullptr);
+
+        auto queue = event_ptr->getQueue();
+        auto context = event_ptr->getContext();
+
+        event_ptr->reset();
+        queue->CachedEvents.emplace(event_ptr.release());
+        urQueueRelease(queue);
+        urContextRelease(context);
+        Result = UR_RESULT_SUCCESS;
+      }
     } catch (...) {
       Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
     }
diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp
index 5ed68f0f25..051f277475 100644
--- a/source/adapters/cuda/event.hpp
+++ b/source/adapters/cuda/event.hpp
@@ -90,6 +90,21 @@ struct ur_event_handle_t_ {
     const bool RequiresTimings =
         Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE ||
         Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+    if (Queue->has_cached_events()) {
+      auto retEvent = Queue->get_cached_event();
+
+      retEvent->Stream = Stream;
+      retEvent->StreamToken = StreamToken;
+      retEvent->CommandType = Type;
+      retEvent->Queue = Queue;
+      retEvent->Context = Queue->Context;
+      retEvent->RefCount = 1;
+
+      urQueueRetain(retEvent->Queue);
+      urContextRetain(retEvent->Context);
+
+      return retEvent;
+    }
     native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr;
     UR_CHECK_ERROR(cuEventCreate(
         &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
@@ -107,7 +122,7 @@ struct ur_event_handle_t_ {
     return new ur_event_handle_t_(context, eventNative);
   }
 
-  ur_result_t release();
+  void reset();
 
   ~ur_event_handle_t_();
 
diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp
index 120d665524..06d559fbec 100644
--- a/source/adapters/cuda/queue.cpp
+++ b/source/adapters/cuda/queue.cpp
@@ -32,6 +32,16 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
   }
 }
 
+ur_queue_handle_t_::~ur_queue_handle_t_() {
+  urContextRelease(Context);
+  urDeviceRelease(Device);
+
+  while (!CachedEvents.empty()) {
+    std::unique_ptr<ur_event_handle_t_> p{CachedEvents.top()};
+    CachedEvents.pop();
+  }
+}
+
 CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
   uint32_t StreamI;
   uint32_t Token;
diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
index c79ca18a9b..c1e489c0a6 100644
--- a/source/adapters/cuda/queue.hpp
+++ b/source/adapters/cuda/queue.hpp
@@ -13,6 +13,7 @@
 #include
 #include
+#include <stack>
 #include
 
 using ur_stream_guard_ = std::unique_lock<std::mutex>;
@@ -35,6 +36,9 @@ struct ur_queue_handle_t_ {
   // keep track of which streams have applied barrier
   std::vector ComputeAppliedBarrier;
   std::vector TransferAppliedBarrier;
+  // CachedEvents takes ownership of the events it stores.
+  // Events on the stack are destroyed when the queue is destroyed.
+  std::stack<ur_event_handle_t> CachedEvents;
   ur_context_handle_t_ *Context;
   ur_device_handle_t_ *Device;
   CUevent BarrierEvent = nullptr;
@@ -77,10 +81,7 @@ struct ur_queue_handle_t_ {
     urDeviceRetain(Device);
   }
 
-  ~ur_queue_handle_t_() {
-    urContextRelease(Context);
-    urDeviceRelease(Device);
-  }
+  ~ur_queue_handle_t_();
 
   void computeStreamWaitForBarrierIfNeeded(CUstream Strean, uint32_t StreamI);
   void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI);
@@ -245,4 +246,14 @@ struct ur_queue_handle_t_ {
   uint32_t getNextEventID() noexcept { return ++EventCount; }
 
   bool backendHasOwnership() const noexcept { return HasOwnership; }
+
+  bool has_cached_events() const { return !CachedEvents.empty(); }
+
+  // Returns and removes an event from the CachedEvents stack.
+  ur_event_handle_t get_cached_event() {
+    assert(has_cached_events());
+    auto retEv = CachedEvents.top();
+    CachedEvents.pop();
+    return retEv;
+  }
 };

From 4cb82d910a63bf0a3406d3e17d29ce2fb2c51784 Mon Sep 17 00:00:00 2001
From: Martin Wehking
Date: Fri, 19 Apr 2024 07:53:00 -0400
Subject: [PATCH 2/8] Cache events instead of destroying them for AMD

Follow the same approach as in #13f097c. Reduce the number of HIP API calls
made for event creation and destruction.
--- source/adapters/hip/event.cpp | 54 ++++++++++++++++++++++++----------- source/adapters/hip/event.hpp | 17 ++++++++++- source/adapters/hip/queue.cpp | 10 +++++++ source/adapters/hip/queue.hpp | 19 +++++++++--- 4 files changed, 78 insertions(+), 22 deletions(-) diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 5327c43a3b..d3dc483e39 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -47,11 +47,32 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, urContextRetain(Context); } +void ur_event_handle_t_::reset() { + RefCount = 0; + HasBeenWaitedOn = false; + IsRecorded = false; + IsStarted = false; + Queue = nullptr; + Context = nullptr; +} + ur_event_handle_t_::~ur_event_handle_t_() { + if (HasOwnership) { + if (EvEnd) + UR_CHECK_ERROR(hipEventDestroy(EvEnd)); + + if (EvQueued) + UR_CHECK_ERROR(hipEventDestroy(EvQueued)); + + if (EvStart) + UR_CHECK_ERROR(hipEventDestroy(EvStart)); + } if (Queue != nullptr) { urQueueRelease(Queue); } - urContextRelease(Context); + if (Context != nullptr) { + urContextRelease(Context); + } } ur_result_t ur_event_handle_t_::start() { @@ -171,21 +192,6 @@ ur_result_t ur_event_handle_t_::wait() { return Result; } -ur_result_t ur_event_handle_t_::release() { - if (!backendHasOwnership()) - return UR_RESULT_SUCCESS; - - assert(Queue != nullptr); - UR_CHECK_ERROR(hipEventDestroy(EvEnd)); - - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { - UR_CHECK_ERROR(hipEventDestroy(EvQueued)); - UR_CHECK_ERROR(hipEventDestroy(EvStart)); - } - - return UR_RESULT_SUCCESS; -} - UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); @@ -293,7 +299,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { std::unique_ptr event_ptr{hEvent}; ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - Result = hEvent->release(); + if (!hEvent->backendHasOwnership()) { + Result = UR_RESULT_SUCCESS; + } else { + assert(hEvent->getQueue() != nullptr); + assert(hEvent->getContext() != nullptr); + + auto queue = event_ptr->getQueue(); + auto context = event_ptr->getContext(); + + event_ptr->reset(); + queue->CachedEvents.emplace(event_ptr.release()); + urQueueRelease(queue); + urContextRelease(context); + Result = UR_RESULT_SUCCESS; + } } catch (...) 
{
       Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
     }
diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp
index 64e8b2d9c8..8126ba395a 100644
--- a/source/adapters/hip/event.hpp
+++ b/source/adapters/hip/event.hpp
@@ -82,6 +82,21 @@ struct ur_event_handle_t_ {
   static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue,
                                       hipStream_t Stream,
                                       uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
+    if (Queue->has_cached_events()) {
+      auto retEvent = Queue->get_cached_event();
+
+      retEvent->Stream = Stream;
+      retEvent->StreamToken = StreamToken;
+      retEvent->CommandType = Type;
+      retEvent->Queue = Queue;
+      retEvent->Context = Queue->Context;
+      retEvent->RefCount = 1;
+
+      urQueueRetain(retEvent->Queue);
+      urContextRetain(retEvent->Context);
+
+      return retEvent;
+    }
     return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream,
                                   StreamToken);
   }
@@ -91,7 +106,7 @@ struct ur_event_handle_t_ {
     return new ur_event_handle_t_(context, eventNative);
   }
 
-  ur_result_t release();
+  void reset();
 
   ~ur_event_handle_t_();
 
diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp
index 6e6496fec1..cb13226f9a 100644
--- a/source/adapters/hip/queue.cpp
+++ b/source/adapters/hip/queue.cpp
@@ -28,6 +28,16 @@ void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded(
   }
 }
 
+ur_queue_handle_t_::~ur_queue_handle_t_() {
+  urContextRelease(Context);
+  urDeviceRelease(Device);
+
+  while (!CachedEvents.empty()) {
+    std::unique_ptr<ur_event_handle_t_> p{CachedEvents.top()};
+    CachedEvents.pop();
+  }
+}
+
 hipStream_t ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) {
   uint32_t Stream_i;
   uint32_t Token;
diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp
index ad2f0f016e..d16716218c 100644
--- a/source/adapters/hip/queue.hpp
+++ b/source/adapters/hip/queue.hpp
@@ -10,6 +10,7 @@
 #pragma once
 
 #include "common.hpp"
+#include <stack>
 
 using ur_stream_quard = std::unique_lock<std::mutex>;
@@ -30,6 +31,9 @@ struct ur_queue_handle_t_ {
   // keep track of which streams have applied barrier
   std::vector ComputeAppliedBarrier;
   std::vector TransferAppliedBarrier;
+  // CachedEvents takes ownership of the events it stores.
+  // Events on the stack are destroyed when the queue is destroyed.
+  std::stack<ur_event_handle_t> CachedEvents;
   ur_context_handle_t Context;
   ur_device_handle_t Device;
   hipEvent_t BarrierEvent = nullptr;
@@ -72,10 +76,7 @@ struct ur_queue_handle_t_ {
     urDeviceRetain(Device);
   }
 
-  ~ur_queue_handle_t_() {
-    urContextRelease(Context);
-    urDeviceRelease(Device);
-  }
+  ~ur_queue_handle_t_();
 
   void computeStreamWaitForBarrierIfNeeded(hipStream_t Stream,
                                            uint32_t Stream_i);
@@ -242,4 +243,14 @@ struct ur_queue_handle_t_ {
   uint32_t getNextEventId() noexcept { return ++EventCount; }
 
   bool backendHasOwnership() const noexcept { return HasOwnership; }
+
+  bool has_cached_events() const { return !CachedEvents.empty(); }
+
+  // Returns and removes an event from the CachedEvents stack.
+  ur_event_handle_t get_cached_event() {
+    assert(has_cached_events());
+    auto retEv = CachedEvents.top();
+    CachedEvents.pop();
+    return retEv;
+  }
 };

From b883f80decc43f328ad7287a6372943bc21d1a5e Mon Sep 17 00:00:00 2001
From: Martin Wehking
Date: Fri, 10 May 2024 10:04:19 +0100
Subject: [PATCH 3/8] Use mutexes for storing and loading cached events

Make the retrieval and pushing of events onto the stack thread-safe by
guarding the stack with a mutex. Use this mutex inside the queue, which is
responsible for caching the events.
Apply further minor changes, such as returning the result status for event releases directly, usage of the proper UR assertions and proper styling for variable names. --- source/adapters/cuda/event.cpp | 25 ++++++++++++++----------- source/adapters/cuda/event.hpp | 2 ++ source/adapters/cuda/queue.hpp | 8 ++++++++ 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 3a74b80ece..2fdc0703e6 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -43,7 +43,9 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, } void ur_event_handle_t_::reset() { - RefCount = 0; + detail::ur::assertion(RefCount == 0, + "Attempting to reset an event that is still referenced"); + HasBeenWaitedOn = false; IsRecorded = false; IsStarted = false; @@ -263,20 +265,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { try { ScopedContext Active(hEvent->getContext()); if (!hEvent->backendHasOwnership()) { - Result = UR_RESULT_SUCCESS; + return UR_RESULT_SUCCESS; } else { - assert(hEvent->getQueue() != nullptr); - assert(hEvent->getContext() != nullptr); - - auto queue = event_ptr->getQueue(); - auto context = event_ptr->getContext(); + auto Queue = event_ptr->getQueue(); + auto Context = event_ptr->getContext(); event_ptr->reset(); - queue->CachedEvents.emplace(event_ptr.release()); - urQueueRelease(queue); - urContextRelease(context); - Result = UR_RESULT_SUCCESS; + if (Queue) { + Queue->cache_event(event_ptr.release()); + urQueueRelease(Queue); + } + if (Context) { + urContextRelease(Context); + } + return UR_RESULT_SUCCESS; } } catch (...) { Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 051f277475..802a29240e 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -122,6 +122,8 @@ struct ur_event_handle_t_ { return new ur_event_handle_t_(context, eventNative); } + // Resets attributes of an event. + // Throws an error if its RefCount is not 0. void reset(); ~ur_event_handle_t_(); diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index c1e489c0a6..7b8b1ad01a 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -61,6 +61,8 @@ struct ur_queue_handle_t_ { std::mutex ComputeStreamMutex; std::mutex TransferStreamMutex; std::mutex BarrierMutex; + // The event cache might be accessed in multiple threads. + std::mutex CacheMutex; bool HasOwnership; ur_queue_handle_t_(std::vector &&ComputeStreams, @@ -249,8 +251,14 @@ struct ur_queue_handle_t_ { bool has_cached_events() const { return !CachedEvents.empty(); } + void cache_event(ur_event_handle_t Event) { + std::lock_guard CacheGuard(CacheMutex); + CachedEvents.push(Event); + } + // Returns and removes an event from the CachedEvents stack. ur_event_handle_t get_cached_event() { + std::lock_guard CacheGuard(CacheMutex); assert(has_cached_events()); auto retEv = CachedEvents.top(); CachedEvents.pop(); From af8eed4de5804ea29fac90bc8925238c8598d4af Mon Sep 17 00:00:00 2001 From: Martin Wehking Date: Fri, 10 May 2024 15:42:18 +0100 Subject: [PATCH 4/8] Copy thread-safety event caching changes for HIP Copy thread-safety mechanism for event caching from CUDA to HIP. (#590897e9103c06ef1f1d86a6edf7c7eb525bd7d8) Use atomic deletion of cached events in queue destructor. Refactor minor styling/redundancy issues for the CUDA and HIP adapter. 
--- source/adapters/cuda/event.cpp | 43 +++++++++++++++------------------- source/adapters/cuda/queue.cpp | 5 ++-- source/adapters/cuda/queue.hpp | 9 ++++--- source/adapters/hip/event.cpp | 34 +++++++++++++-------------- source/adapters/hip/event.hpp | 2 ++ source/adapters/hip/queue.cpp | 5 ++-- source/adapters/hip/queue.hpp | 17 +++++++++++--- 7 files changed, 62 insertions(+), 53 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 2fdc0703e6..6ba14c0f53 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -43,8 +43,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, } void ur_event_handle_t_::reset() { - detail::ur::assertion(RefCount == 0, - "Attempting to reset an event that is still referenced"); + detail::ur::assertion( + RefCount == 0, "Attempting to reset an event that is still referenced"); HasBeenWaitedOn = false; IsRecorded = false; @@ -64,10 +64,10 @@ ur_event_handle_t_::~ur_event_handle_t_() { if (EvStart) UR_CHECK_ERROR(cuEventDestroy(EvStart)); } - if (Queue != nullptr) { + if (Queue) { urQueueRelease(Queue); } - if (Context != nullptr) { + if (Context) { urContextRelease(Context); } } @@ -261,33 +261,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // decrement ref count. If it is 0, delete the event. if (hEvent->decrementReferenceCount() == 0) { std::unique_ptr event_ptr{hEvent}; - ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { ScopedContext Active(hEvent->getContext()); - if (!hEvent->backendHasOwnership()) { - return UR_RESULT_SUCCESS; - } - else { - auto Queue = event_ptr->getQueue(); - auto Context = event_ptr->getContext(); - - event_ptr->reset(); - if (Queue) { - Queue->cache_event(event_ptr.release()); - urQueueRelease(Queue); - } - if (Context) { + if (!hEvent->backendHasOwnership()) { + return UR_RESULT_SUCCESS; + } else { + auto Queue = event_ptr->getQueue(); + auto Context = event_ptr->getContext(); + + event_ptr->reset(); + if (Queue) { + Queue->cache_event(event_ptr.release()); + urQueueRelease(Queue); + } urContextRelease(Context); + + return UR_RESULT_SUCCESS; } - return UR_RESULT_SUCCESS; - } } catch (...) 
{ - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return Result; } - - return UR_RESULT_SUCCESS; + return UR_RESULT_ERROR_INVALID_EVENT; } UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index 06d559fbec..f7da7cea16 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -36,9 +36,8 @@ ur_queue_handle_t_::~ur_queue_handle_t_() { urContextRelease(Context); urDeviceRelease(Device); - while (!CachedEvents.empty()) { - std::unique_ptr p{CachedEvents.top()}; - CachedEvents.pop(); + while (has_cached_events()) { + get_cached_event(); } } diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index 7b8b1ad01a..e18be86122 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -249,7 +249,10 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } - bool has_cached_events() const { return !CachedEvents.empty(); } + bool has_cached_events() { + std::lock_guard CacheGuard(CacheMutex); + return !CachedEvents.empty(); + } void cache_event(ur_event_handle_t Event) { std::lock_guard CacheGuard(CacheMutex); @@ -260,8 +263,8 @@ struct ur_queue_handle_t_ { ur_event_handle_t get_cached_event() { std::lock_guard CacheGuard(CacheMutex); assert(has_cached_events()); - auto retEv = CachedEvents.top(); + auto RetEv = CachedEvents.top(); CachedEvents.pop(); - return retEv; + return RetEv; } }; diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index d3dc483e39..70602c01eb 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -48,7 +48,9 @@ ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, } void ur_event_handle_t_::reset() { - RefCount = 0; + detail::ur::assertion( + RefCount == 0, "Attempting to reset an event that is still referenced"); + HasBeenWaitedOn = false; IsRecorded = false; IsStarted = false; @@ -67,10 +69,10 @@ ur_event_handle_t_::~ur_event_handle_t_() { if (EvStart) UR_CHECK_ERROR(hipEventDestroy(EvStart)); } - if (Queue != nullptr) { + if (Queue) { urQueueRelease(Queue); } - if (Context != nullptr) { + if (Context) { urContextRelease(Context); } } @@ -300,27 +302,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { if (!hEvent->backendHasOwnership()) { - Result = UR_RESULT_SUCCESS; + return UR_RESULT_SUCCESS; } else { - assert(hEvent->getQueue() != nullptr); - assert(hEvent->getContext() != nullptr); - - auto queue = event_ptr->getQueue(); - auto context = event_ptr->getContext(); + auto Queue = event_ptr->getQueue(); + auto Context = event_ptr->getContext(); event_ptr->reset(); - queue->CachedEvents.emplace(event_ptr.release()); - urQueueRelease(queue); - urContextRelease(context); - Result = UR_RESULT_SUCCESS; + if (Queue) { + Queue->cache_event(event_ptr.release()); + urQueueRelease(Queue); + } + urContextRelease(Context); + + return UR_RESULT_SUCCESS; } } catch (...) 
{ - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return Result; } - - return UR_RESULT_SUCCESS; + return UR_RESULT_ERROR_INVALID_EVENT; } /// Gets the native HIP handle of a UR event object diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 8126ba395a..2125f13f55 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -106,6 +106,8 @@ struct ur_event_handle_t_ { return new ur_event_handle_t_(context, eventNative); } + // Resets attributes of an event. + // Throws an error if its RefCount is not 0. void reset(); ~ur_event_handle_t_(); diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index cb13226f9a..98d431231f 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -32,9 +32,8 @@ ur_queue_handle_t_::~ur_queue_handle_t_() { urContextRelease(Context); urDeviceRelease(Device); - while (!CachedEvents.empty()) { - std::unique_ptr p{CachedEvents.top()}; - CachedEvents.pop(); + while (has_cached_events()) { + get_cached_event(); } } diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp index d16716218c..f6fe4cec0b 100644 --- a/source/adapters/hip/queue.hpp +++ b/source/adapters/hip/queue.hpp @@ -56,6 +56,8 @@ struct ur_queue_handle_t_ { std::mutex ComputeStreamMutex; std::mutex TransferStreamMutex; std::mutex BarrierMutex; + // The event cache might be accessed in multiple threads. + std::mutex CacheMutex; bool HasOwnership; ur_queue_handle_t_(std::vector &&ComputeStreams, @@ -244,13 +246,22 @@ struct ur_queue_handle_t_ { bool backendHasOwnership() const noexcept { return HasOwnership; } - bool has_cached_events() const { return !CachedEvents.empty(); } + bool has_cached_events() { + std::lock_guard CacheGuard(CacheMutex); + return !CachedEvents.empty(); + } + + void cache_event(ur_event_handle_t Event) { + std::lock_guard CacheGuard(CacheMutex); + CachedEvents.push(Event); + } // Returns and removes an event from the CachedEvents stack. ur_event_handle_t get_cached_event() { + std::lock_guard CacheGuard(CacheMutex); assert(has_cached_events()); - auto retEv = CachedEvents.top(); + auto RetEv = CachedEvents.top(); CachedEvents.pop(); - return retEv; + return RetEv; } }; From ffa30c7f22c750b2cc83b3b36b49074c52b09a97 Mon Sep 17 00:00:00 2001 From: Martin Wehking Date: Fri, 10 May 2024 15:53:51 +0100 Subject: [PATCH 5/8] Fix thread-safe destruction of cached events Use a mutex to make the emptiness check and retrieval of cached events atomic. --- source/adapters/cuda/queue.cpp | 6 ++++-- source/adapters/hip/event.cpp | 1 - source/adapters/hip/queue.cpp | 6 ++++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index f7da7cea16..4dce8eb534 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -36,8 +36,10 @@ ur_queue_handle_t_::~ur_queue_handle_t_() { urContextRelease(Context); urDeviceRelease(Device); - while (has_cached_events()) { - get_cached_event(); + std::lock_guard CacheGuard(CacheMutex); + while (!CachedEvents.empty()) { + std::unique_ptr Ev{CachedEvents.top()}; + CachedEvents.pop(); } } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 70602c01eb..bc9c8b6d64 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -299,7 +299,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { // decrement ref count. 
If it is 0, delete the event. if (hEvent->decrementReferenceCount() == 0) { std::unique_ptr event_ptr{hEvent}; - ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { if (!hEvent->backendHasOwnership()) { return UR_RESULT_SUCCESS; diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index 98d431231f..3758b40104 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -32,8 +32,10 @@ ur_queue_handle_t_::~ur_queue_handle_t_() { urContextRelease(Context); urDeviceRelease(Device); - while (has_cached_events()) { - get_cached_event(); + std::lock_guard CacheGuard(CacheMutex); + while (!CachedEvents.empty()) { + std::unique_ptr Ev{CachedEvents.top()}; + CachedEvents.pop(); } } From 4552eb0b58fa4e632c82021143c5bbf639200ff4 Mon Sep 17 00:00:00 2001 From: Martin Wehking Date: Mon, 13 May 2024 14:42:50 +0100 Subject: [PATCH 6/8] Fix return vals for event release Return UR_RESULT_ERROR_INVALID_EVENT only at the end of a try catch block and UR_RESULT_SUCCESS as default. --- source/adapters/cuda/event.cpp | 3 ++- source/adapters/hip/event.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 6ba14c0f53..1ef6c47deb 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -281,8 +281,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } + return UR_RESULT_ERROR_INVALID_EVENT; } - return UR_RESULT_ERROR_INVALID_EVENT; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index bc9c8b6d64..50b8b0d932 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -318,8 +318,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } + return UR_RESULT_ERROR_INVALID_EVENT; } - return UR_RESULT_ERROR_INVALID_EVENT; + return UR_RESULT_SUCCESS; } /// Gets the native HIP handle of a UR event object From 852d02fa049516ff388da39ba5cd006c4e17952e Mon Sep 17 00:00:00 2001 From: Martin Wehking Date: Mon, 13 May 2024 15:36:42 +0100 Subject: [PATCH 7/8] Return ur_result_t error directly in catch block Modify the return value inside urEventRelease. --- source/adapters/cuda/event.cpp | 7 ++----- source/adapters/hip/event.cpp | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 1ef6c47deb..11f9df37bc 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -275,13 +275,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { urQueueRelease(Queue); } urContextRelease(Context); - - return UR_RESULT_SUCCESS; } - } catch (...) { - return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } catch (ur_result_t Err) { + return Err; } - return UR_RESULT_ERROR_INVALID_EVENT; } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 50b8b0d932..9cd6e00df6 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -312,13 +312,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { urQueueRelease(Queue); } urContextRelease(Context); - - return UR_RESULT_SUCCESS; } - } catch (...) 
{
-      return UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    } catch (ur_result_t Err) {
+      return Err;
     }
-    return UR_RESULT_ERROR_INVALID_EVENT;
   }
   return UR_RESULT_SUCCESS;
 }
 
 /// Gets the native HIP handle of a UR event object

From 30a66024bc713df57c06518d09eff16c2215eada Mon Sep 17 00:00:00 2001
From: Martin Wehking
Date: Tue, 14 May 2024 10:43:22 +0100
Subject: [PATCH 8/8] Don't call mutex-locking function

Don't call has_cached_events, which locks CacheMutex, from inside
get_cached_event, which already holds the same (non-recursive) mutex.
This led to deadlocks in the CUDA and HIP adapters.
---
 source/adapters/cuda/queue.hpp | 2 +-
 source/adapters/hip/queue.hpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
index e18be86122..e3fb2d1d51 100644
--- a/source/adapters/cuda/queue.hpp
+++ b/source/adapters/cuda/queue.hpp
@@ -262,7 +262,7 @@ struct ur_queue_handle_t_ {
   // Returns and removes an event from the CachedEvents stack.
   ur_event_handle_t get_cached_event() {
     std::lock_guard CacheGuard(CacheMutex);
-    assert(has_cached_events());
+    assert(!CachedEvents.empty());
     auto RetEv = CachedEvents.top();
     CachedEvents.pop();
     return RetEv;
diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp
index f6fe4cec0b..99e96c752e 100644
--- a/source/adapters/hip/queue.hpp
+++ b/source/adapters/hip/queue.hpp
@@ -259,7 +259,7 @@ struct ur_queue_handle_t_ {
   // Returns and removes an event from the CachedEvents stack.
   ur_event_handle_t get_cached_event() {
     std::lock_guard CacheGuard(CacheMutex);
-    assert(has_cached_events());
+    assert(!CachedEvents.empty());
     auto RetEv = CachedEvents.top();
     CachedEvents.pop();
     return RetEv;
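Taken together, the series converges on one small pattern: a per-queue,
mutex-guarded stack of events that is consulted before any new event is
created, refilled (after a reset) when an event is released, and drained when
its owner is destroyed. The standalone sketch below illustrates that pattern
in isolation; the EventCache and Event names are hypothetical and do not
correspond to types in the adapters.

// Illustrative sketch only, not adapter code.
#include <memory>
#include <mutex>
#include <stack>

struct Event {
  // Clear per-use state so the object can be handed out again.
  void reset() {}
};

class EventCache {
  std::stack<Event *> Cached; // the cache owns the events it holds
  std::mutex CacheMutex;      // the cache may be used from several threads

public:
  // Reuse a cached event when possible; otherwise create a new one.
  // This stands in for the unconditional cuEventCreate/hipEventCreate call.
  Event *acquire() {
    {
      std::lock_guard<std::mutex> Guard(CacheMutex);
      if (!Cached.empty()) {
        Event *Ev = Cached.top();
        Cached.pop();
        return Ev;
      }
      // Note: do not call another helper that locks CacheMutex from here;
      // std::mutex is not recursive (the deadlock fixed by PATCH 8/8).
    }
    return new Event();
  }

  // Instead of destroying a finished event, reset it and push it back.
  void release(Event *Ev) {
    Ev->reset();
    std::lock_guard<std::mutex> Guard(CacheMutex);
    Cached.push(Ev);
  }

  // Drain the cache on destruction, mirroring the queue destructors above.
  ~EventCache() {
    std::lock_guard<std::mutex> Guard(CacheMutex);
    while (!Cached.empty()) {
      std::unique_ptr<Event> Ev{Cached.top()};
      Cached.pop();
    }
  }
};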