From 1c420b5c565ad6cc509dfd6a89cbeadcce48239c Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Wed, 21 May 2025 15:55:04 -0700 Subject: [PATCH 1/2] Revert "[L0] Refactor Copy Engine Usage checks for Performance" This reverts commit 781e5761928e82bba6de5f9172ed9fe70d7ea93e. --- .../adapters/level_zero/command_buffer.cpp | 15 ++++- .../source/adapters/level_zero/memory.cpp | 55 ++++++++++--------- .../source/adapters/level_zero/memory.hpp | 3 - 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 5cb85d2059e65..a2351f773f9c4 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1097,10 +1097,19 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_event_handle_t * /*Event*/, ur_exp_command_buffer_command_handle_t * /*Command*/) { + bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || + !IsDevicePointer(CommandBuffer->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (CommandBuffer->Device->isDG2() && + (IsSharedPointer(CommandBuffer->Context, Src) || + IsSharedPointer(CommandBuffer->Context, Dst))) { + PreferCopyEngine = false; + } + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyHelper( - UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, - PreferCopyEngineUsage(CommandBuffer->Device, CommandBuffer->Context, Src, - Dst), + UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } diff --git a/unified-runtime/source/adapters/level_zero/memory.cpp b/unified-runtime/source/adapters/level_zero/memory.cpp index edb055f08b202..0f6bb37dde904 100644 --- a/unified-runtime/source/adapters/level_zero/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/memory.cpp @@ -57,27 +57,6 @@ bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr) { return (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_SHARED); } -// Helper Function to check if the Copy Engine should be preferred given the -// types of memory used. -bool PreferCopyEngineUsage(ur_device_handle_t Device, - ur_context_handle_t Context, const void *Src, - void *Dst) { - bool PreferCopyEngine = false; - // Given Integrated Devices, Copy Engines are not preferred for any Copy - // operations. - if (!Device->isIntegrated()) { - // Given non D2D Copies, for better performance, Copy Engines are preferred - // only if one has both the Main and Link Copy Engines. - if (Device->hasLinkCopyEngine() && Device->hasMainCopyEngine() && - (!IsDevicePointer(Context, Src) || !IsDevicePointer(Context, Dst))) { - PreferCopyEngine = true; - } - } - // Temporary option added to use force engine for D2D copy - PreferCopyEngine |= UseCopyEngineForD2DCopy; - return PreferCopyEngine; -} - // Shared by all memory read/write/copy PI interfaces. // PI interfaces must have queue's and destination buffer's mutexes locked for // exclusive use and source buffer's mutex locked for shared use on entry. @@ -1259,10 +1238,23 @@ ur_result_t urEnqueueUSMMemcpy( ur_event_handle_t *OutEvent) { std::scoped_lock lock(Queue->Mutex); + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueMemCopyHelper( // TODO: do we need a new command type for this? UR_COMMAND_MEM_BUFFER_COPY, Queue, Dst, Blocking, Size, Src, - NumEventsInWaitList, EventWaitList, OutEvent, - PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst)); + NumEventsInWaitList, EventWaitList, OutEvent, PreferCopyEngine); } ur_result_t urEnqueueUSMPrefetch( @@ -1462,13 +1454,26 @@ ur_result_t urEnqueueUSMMemcpy2D( std::scoped_lock lock(Queue->Mutex); + // Device to Device copies are found to execute slower on copy engine + // (versus compute engine). + bool PreferCopyEngine = !IsDevicePointer(Queue->Context, Src) || + !IsDevicePointer(Queue->Context, Dst); + // For better performance, Copy Engines are not preferred given Shared + // pointers on DG2. + if (Queue->Device->isDG2() && (IsSharedPointer(Queue->Context, Src) || + IsSharedPointer(Queue->Context, Dst))) { + PreferCopyEngine = false; + } + + // Temporary option added to use copy engine for D2D copy + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueMemCopyRectHelper( // TODO: do we need a new command type for // this? UR_COMMAND_MEM_BUFFER_COPY_RECT, Queue, Src, Dst, ZeroOffset, ZeroOffset, Region, SrcPitch, DstPitch, 0, /*SrcSlicePitch=*/ 0, /*DstSlicePitch=*/ - Blocking, NumEventsInWaitList, EventWaitList, Event, - PreferCopyEngineUsage(Queue->Device, Queue->Context, Src, Dst)); + Blocking, NumEventsInWaitList, EventWaitList, Event, PreferCopyEngine); } ur_result_t urMemImageCreate( diff --git a/unified-runtime/source/adapters/level_zero/memory.hpp b/unified-runtime/source/adapters/level_zero/memory.hpp index 7cbc57f041231..715b5b51870c1 100644 --- a/unified-runtime/source/adapters/level_zero/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/memory.hpp @@ -29,9 +29,6 @@ struct ur_device_handle_t_; bool IsDevicePointer(ur_context_handle_t Context, const void *Ptr); bool IsSharedPointer(ur_context_handle_t Context, const void *Ptr); -bool PreferCopyEngineUsage(ur_device_handle_t Device, - ur_context_handle_t Context, const void *Src, - void *Dst); // This is an experimental option to test performance of device to device copy // operations on copy engines (versus compute engine) From a207347d14645794106635262d526457ee577b5c Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 29 May 2025 09:42:21 +0100 Subject: [PATCH 2/2] [SYCL][Graph] Disable DG2 copy engine Always disable copy engine usage on DG2 to as a workaround to CI fails, see CMPLRLLVM-68064 --- unified-runtime/source/adapters/level_zero/command_buffer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index a2351f773f9c4..020afb90564ff 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -742,7 +742,9 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // Create a list for copy commands. Note that to simplify the implementation, // the current implementation only uses the main copy engine and does not use // the link engine even if available. - if (Device->hasMainCopyEngine()) { + // + // Copy engine usage disabled for DG2, see CMPLRLLVM-68064 + if (Device->hasMainCopyEngine() && !Device->isDG2()) { UR_CALL(createMainCommandList(Context, Device, IsInOrder, false, true, ZeCopyCommandList)); }