From a3624765877c2468ac5c3b07ad15e0693623582e Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Fri, 21 May 2021 07:49:53 -0700 Subject: [PATCH 1/2] [SYCL]Fixes a performance issue due to https://github.com/intel/llvm/pull/3612 This change does two things to fix the performance issue. The first is in the level zero plugin. Here the change is to only close and submit the batch if the event being queried is one that will be signalled one of the commands in the batch. The second change is in the sycl run-time itself. This change prevents the event cleanup code from querying every single event that is outstanding in the system. This is necessary to prevent the most recent events (which are likely to be in the plug-ins batch) from being queried. Both changes are required to fix the performance issue. --- sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++++-- sycl/source/detail/queue_impl.cpp | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index f3d9434673817..f3c9adc64c758 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4111,8 +4111,13 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, // Lock automatically releases when this goes out of scope. std::lock_guard lock(Event->Queue->PiQueueMutex); - if (auto Res = Event->Queue->executeOpenCommandList()) - return Res; + // Only do the execute of the open command list if the event that + // is being queried and event that is to be signalled by something + // currently in that open command list. + if (Event->Queue->ZeOpenCommandList == Event->ZeCommandList) { + if (auto Res = Event->Queue->executeOpenCommandList()) + return Res; + } } ze_result_t ZeResult; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bcbe28720ac56..52c49cf26f0ea 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -122,14 +122,21 @@ void queue_impl::addSharedEvent(const event &Event) { // of them can be released. const size_t EventThreshold = 128; if (MEventsShared.size() >= EventThreshold) { + // Generally, the vector is ordered so that the oldest events are in the + // front and the newer events are in the end. So, search to find the first + // event that isn't yet complete. All the events prior to that can be + // erased. This could leave some few events further on that have completed + // not yet erased, but that is OK. This cleanup doesn't have to be perfect. + // This also keeps the algorithm linear rather than quadratic because it + // doesn't continually recheck things towards the back of the list that + // really haven't had time to complete. MEventsShared.erase( - std::remove_if( - MEventsShared.begin(), MEventsShared.end(), - [](const event &E) { - return E.get_info() == + MEventsShared.begin(), + std::find_if( + MEventsShared.begin(), MEventsShared.end(), [](const event &E) { + return E.get_info() != info::event_command_status::complete; - }), - MEventsShared.end()); + })); } MEventsShared.push_back(Event); } From 83e06e3cce44612481065905e668808a7517792f Mon Sep 17 00:00:00 2001 From: Kevin B Smith Date: Fri, 21 May 2021 10:59:07 -0700 Subject: [PATCH 2/2] Updates sycl/unittests/queue/EventClear.cpp unit test --- sycl/unittests/queue/EventClear.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sycl/unittests/queue/EventClear.cpp b/sycl/unittests/queue/EventClear.cpp index 0a1dd61dae8f4..d5b42a187b292 100644 --- a/sycl/unittests/queue/EventClear.cpp +++ b/sycl/unittests/queue/EventClear.cpp @@ -23,6 +23,8 @@ struct TestCtx { std::unique_ptr TestContext; +const int ExpectedEventThreshold = 128; + pi_result redefinedUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, size_t count, pi_uint32 num_events_in_waitlist, @@ -44,10 +46,16 @@ pi_result redefinedEventGetInfo(pi_event event, pi_event_info param_name, size_t *param_value_size_ret) { EXPECT_EQ(param_name, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) << "Unexpected event info requested"; - // Report half of events as complete + // Report first half of events as complete. + // Report second half of events as running. + // This is important, because removal algorithm assumes that + // events are likely to be removed oldest first, and stops removing + // at the first non-completed event. static int Counter = 0; auto *Result = reinterpret_cast(param_value); - *Result = (++Counter % 2 == 0) ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING; + *Result = (Counter < (ExpectedEventThreshold / 2)) ? PI_EVENT_COMPLETE + : PI_EVENT_RUNNING; + Counter++; return PI_SUCCESS; } @@ -117,7 +125,6 @@ TEST(QueueEventClear, CleanupOnThreshold) { queue Q{Ctx, default_selector()}; unsigned char *HostAlloc = (unsigned char *)malloc_host(1, Ctx); - const int ExpectedEventThreshold = 128; TestContext->EventReferenceCount = ExpectedEventThreshold; for (size_t I = 0; I < ExpectedEventThreshold; ++I) { Q.memset(HostAlloc, 42, 1).wait();