From e3b65522365cdcfd2a0e365c0d330b767f67cae5 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 29 Sep 2023 15:04:21 +0100 Subject: [PATCH 1/7] [SYCL][Graph] Fix memset queue shortcut when queue is recorded (#329) Memset queue shortcut `queue::memset()` manages the memset direclty from the host (without going through the normal path, i.e. the handler). We added a specific case when the queue is recorded to use the normal path instead of the optimized path. --- sycl/source/detail/queue_impl.cpp | 11 +++-- .../RecordReplay/usm_memset_shortcut.cpp | 47 +++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b8785e12f0535..5d244010d494a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -99,10 +99,15 @@ event queue_impl::memset(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif + // If we have a command graph set we need to capture the memset through normal + // queue submission rather than execute the memset directly. if (MGraph.lock()) { - throw sycl::exception(make_error_code(errc::invalid), - "The memset feature is not yet available " - "for use with the SYCL Graph extension."); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.memset(Ptr, Value, Count); + }, + Self, {}); } return submitMemOpHelper( diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp new file mode 100644 index 0000000000000..d170c9607d821 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp @@ -0,0 +1,47 @@ +// REQUIRES: cuda || level_zero, gpu +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using ZE_DEBUG +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} +// +// CHECK-NOT: LEAK +// +// Tests adding a USM memset queue shortcut operation as a graph node. + +#include "../graph_common.hpp" + +int main() { + + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + unsigned char *Arr = malloc_device(N, Queue); + + int Value = 77; + Graph.begin_recording(Queue); + auto Init = Queue.memset(Arr, Value, N); + Queue.submit([&](handler &CGH) { + CGH.depends_on(Init); + CGH.single_task([=]() { + for (int i = 0; i < Size; i++) + Arr[i] = 2 * Arr[i]; + }); + }); + + Graph.end_recording(); + + auto ExecGraph = Graph.finalize(); + + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); }).wait(); + + std::vector Output(N); + Queue.memcpy(Output.data(), Arr, N).wait(); + for (int i = 0; i < N; i++) + assert(Output[i] == (Value * 2)); + + sycl::free(Arr, Queue); + + return 0; +} From 8fd8c38e1515e2b20ac76c389bc613d8b77f34bb Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 26 Jan 2024 10:32:53 +0000 Subject: [PATCH 2/7] Removes exception checking --- sycl/unittests/Extensions/CommandGraph.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp index b8d45404c627d..39f06f6a40a67 100644 --- a/sycl/unittests/Extensions/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -1846,25 +1846,6 @@ TEST_F(CommandGraphTest, FusionExtensionExceptionCheck) { ASSERT_EQ(ExceptionCode, sycl::errc::invalid); } -TEST_F(CommandGraphTest, USMMemsetShortcutExceptionCheck) { - - const size_t N = 10; - unsigned char *Arr = malloc_device(N, Queue); - int Value = 77; - - Graph.begin_recording(Queue); - - std::error_code ExceptionCode = make_error_code(sycl::errc::success); - try { - Queue.memset(Arr, Value, N); - } catch (exception &Exception) { - ExceptionCode = Exception.code(); - } - ASSERT_EQ(ExceptionCode, sycl::errc::invalid); - - Graph.end_recording(Queue); -} - TEST_F(CommandGraphTest, Memcpy2DExceptionCheck) { constexpr size_t RECT_WIDTH = 30; constexpr size_t RECT_HEIGHT = 21; From aaf0d5d49cdb93a1b472998a5c9ebbfc9564a677 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 26 Jan 2024 12:33:43 +0000 Subject: [PATCH 3/7] Updates test --- sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp index d170c9607d821..613f1c690c854 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp @@ -1,4 +1,3 @@ -// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG @@ -14,6 +13,10 @@ int main() { queue Queue; + if (!are_graphs_supported(Queue)) { + return 0; + } + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; const size_t N = 10; From 0c5efd213b083fce75c85f1fc8004d7de6defafa Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 26 Jan 2024 12:35:30 +0000 Subject: [PATCH 4/7] Updates test inline filecheck instruction --- sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp index 613f1c690c854..ca6c9576a9094 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp @@ -1,9 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG -// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} -// -// CHECK-NOT: LEAK +// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // // Tests adding a USM memset queue shortcut operation as a graph node. From b3ae58ff26a5f9e97078621a959631e6056f36bb Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 29 Jan 2024 09:55:49 +0000 Subject: [PATCH 5/7] Updates test --- sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp index ca6c9576a9094..d3936a2cfe74d 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset_shortcut.cpp @@ -1,7 +1,7 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Extra run to check for leaks in Level Zero using ZE_DEBUG -// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // // Tests adding a USM memset queue shortcut operation as a graph node. From d8d8db5d158fe70c0d81d309097ec16cea464d2c Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Wed, 7 Feb 2024 16:41:17 +0000 Subject: [PATCH 6/7] refactoring of the code moving graph shortcut management in submitMemOpHelper --- sycl/source/detail/queue_impl.cpp | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 3feee74a1343a..2365273861521 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -118,16 +118,6 @@ event queue_impl::memset(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif - // If we have a command graph set we need to capture the memset through normal - // queue submission rather than execute the memset directly. - if (MGraph.lock()) { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.memset(Ptr, Value, Count); - }, - Self, {}); - } return submitMemOpHelper( Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); }, @@ -174,11 +164,6 @@ event queue_impl::memcpy(const std::shared_ptr &Self, // Emit a begin/end scope for this call PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin); #endif - // If we have a command graph set we need to capture the copy through normal - // queue submission rather than execute the copy directly. - auto HandlerFunc = [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }; - if (MGraph.lock()) - return submitWithHandler(Self, DepEvents, HandlerFunc); if ((!Src || !Dest) && Count != 0) { report(CodeLoc); @@ -186,7 +171,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, PI_ERROR_INVALID_VALUE); } return submitMemOpHelper( - Self, DepEvents, HandlerFunc, + Self, DepEvents, [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); }, [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self, Count, Dest); } @@ -195,14 +180,9 @@ event queue_impl::mem_advise(const std::shared_ptr &Self, const void *Ptr, size_t Length, pi_mem_advice Advice, const std::vector &DepEvents) { - // If we have a command graph set we need to capture the advise through normal - // queue submission. - auto HandlerFunc = [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); }; - if (MGraph.lock()) - return submitWithHandler(Self, DepEvents, HandlerFunc); - return submitMemOpHelper( - Self, DepEvents, HandlerFunc, + Self, DepEvents, + [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); }, [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr, Self, Length, Advice); } @@ -358,7 +338,10 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, const std::vector &ExpandedDepEvents = getExtendDependencyList(DepEvents, MutableDepEvents, Lock); - if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) { + // If we have a command graph set we need to capture the op through the + // handler rather than by-passing the scheduler. + if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext) && + !MGraph.lock()) { if (MHasDiscardEventsSupport) { MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); From 19166bb0f39fb2797013e040d3aadb6ec34713c3 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 8 Feb 2024 11:23:31 +0000 Subject: [PATCH 7/7] swap condition tests --- sycl/source/detail/queue_impl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2365273861521..ba50f0562ff56 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -340,8 +340,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, // If we have a command graph set we need to capture the op through the // handler rather than by-passing the scheduler. - if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext) && - !MGraph.lock()) { + if (!MGraph.lock() && + areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) { if (MHasDiscardEventsSupport) { MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);