diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 8b8cf3d2858e4..29a9192be6069 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -692,7 +692,8 @@ pi_result _pi_context::finalize() { std::scoped_lock Lock(ZeCommandListCacheMutex); for (auto &List : ZeComputeCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) { auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); // Gracefully handle the case that L0 was already unloaded. @@ -702,7 +703,8 @@ pi_result _pi_context::finalize() { } } for (auto &List : ZeCopyCommandListCache) { - for (ze_command_list_handle_t &ZeCommandList : List.second) { + for (auto &Item : List.second) { + ze_command_list_handle_t ZeCommandList = Item.first; if (ZeCommandList) { auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); // Gracefully handle the case that L0 was already unloaded. @@ -715,7 +717,7 @@ pi_result _pi_context::finalize() { } bool pi_command_list_info_t::isCopy(pi_queue Queue) const { - return ZeQueueGroupOrdinal != + return ZeQueueDesc.ordinal != (uint32_t)Queue->Device ->QueueGroup[_pi_device::queue_group_info_t::type::Compute] .ZeOrdinal; @@ -824,7 +826,8 @@ pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, UseCopyEngine ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice] : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice]; - ZeCommandListCache.push_back(CommandList->first); + ZeCommandListCache.push_back( + {CommandList->first, CommandList->second.ZeQueueDesc}); } return PI_SUCCESS; @@ -1273,7 +1276,7 @@ pi_result _pi_context::getAvailableCommandList( for (auto ZeCommandListIt = ZeCommandListCache.begin(); ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { - auto &ZeCommandList = *ZeCommandListIt; + auto &ZeCommandList = ZeCommandListIt->first; auto it = Queue->CommandListMap.find(ZeCommandList); if (it != Queue->CommandListMap.end()) { if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue) @@ -1297,12 +1300,14 @@ pi_result _pi_context::getAvailableCommandList( ze_fence_handle_t ZeFence; ZeStruct ZeFenceDesc; ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandList = Queue->CommandListMap - .emplace(ZeCommandList, - pi_command_list_info_t{ZeFence, true, false, - ZeCommandQueue, - QueueGroupOrdinal}) - .first; + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; + CommandList = + Queue->CommandListMap + .emplace(ZeCommandList, + pi_command_list_info_t{ZeFence, true, false, + ZeCommandQueue, ZeQueueDesc}) + .first; } ZeCommandListCache.erase(ZeCommandListIt); if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList)) @@ -1379,10 +1384,11 @@ _pi_queue::createCommandList(bool UseCopyEngine, &ZeCommandListDesc, &ZeCommandList)); ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + ZeStruct ZeQueueDesc; + ZeQueueDesc.ordinal = QueueGroupOrdinal; std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( - ZeCommandList, - {ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal})); + ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); PI_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); @@ -1778,7 +1784,6 @@ _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) { // This function will return one of possibly multiple available // immediate commandlists associated with this Queue. pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() { - uint32_t QueueIndex, QueueOrdinal; auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex); @@ -1797,35 +1802,56 @@ pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() { ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; Priority = "High"; } - // Evaluate performance of explicit usage for "0" index. if (QueueIndex != 0) { ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; } - urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " - "(round robin in [%d, %d]) priority = %s\n", - ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, - UpperIndex, Priority); + // Check if context's command list cache has an immediate command list with + // matching index. + ze_command_list_handle_t ZeCommandList = nullptr; + { + // Acquire lock to avoid race conditions. + std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); + // Under mutex since operator[] does insertion on the first usage for every + // unique ZeDevice. + auto &ZeCommandListCache = + isCopy() + ? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + for (auto ZeCommandListIt = ZeCommandListCache.begin(); + ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) { + const auto &Desc = (*ZeCommandListIt).second; + if (Desc.index == ZeCommandQueueDesc.index && + Desc.flags == ZeCommandQueueDesc.flags && + Desc.mode == ZeCommandQueueDesc.mode && + Desc.priority == ZeCommandQueueDesc.priority) { + ZeCommandList = (*ZeCommandListIt).first; + ZeCommandListCache.erase(ZeCommandListIt); + break; + } + } + } + + // If cache didn't contain a command list, create one. + if (!ZeCommandList) { + urPrint("[getZeQueue]: create queue ordinal = %d, index = %d " + "(round robin in [%d, %d]) priority = %s\n", + ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex, + UpperIndex, Priority); + + ZE_CALL_NOCHECK(zeCommandListCreateImmediate, + (Queue->Context->ZeContext, Queue->Device->ZeDevice, + &ZeCommandQueueDesc, &ZeCommandList)); + } - ze_command_list_handle_t ZeCommandList; - ZE_CALL_NOCHECK(zeCommandListCreateImmediate, - (Queue->Context->ZeContext, Queue->Device->ZeDevice, - &ZeCommandQueueDesc, &ZeCommandList)); ImmCmdLists[Index] = Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeCommandQueueDesc}}) .first; - // Add this commandlist to the cache so it can be destroyed as part of - // piQueueReleaseInternal - auto QueueType = Type; - std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex); - auto &ZeCommandListCache = - QueueType == queue_type::Compute - ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice] - : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]; - ZeCommandListCache.push_back(ZeCommandList); return ImmCmdLists[Index]; } @@ -2833,6 +2859,29 @@ pi_result piQueueRelease(pi_queue Queue) { if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return mapError(ZeResult); } + if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) { + std::scoped_lock Lock( + Queue->Context->ZeCommandListCacheMutex); + const pi_command_list_info_t &MapEntry = it->second; + if (MapEntry.CanReuse) { + // Add commandlist to the cache for future use. + // It will be deleted when the context is destroyed. + auto &ZeCommandListCache = + MapEntry.isCopy(Queue) + ? Queue->Context + ->ZeCopyCommandListCache[Queue->Device->ZeDevice] + : Queue->Context + ->ZeComputeCommandListCache[Queue->Device->ZeDevice]; + ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc}); + } else { + // A non-reusable comamnd list that came from a make_queue call is + // destroyed since it cannot be recycled. + ze_command_list_handle_t ZeCommandList = it->first; + if (ZeCommandList) { + ZE_CALL(zeCommandListDestroy, (ZeCommandList)); + } + } + } } Queue->CommandListMap.clear(); } @@ -2997,11 +3046,15 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue, void _pi_queue::pi_queue_group_t::setImmCmdList( ze_command_list_handle_t ZeCommandList) { + // An immediate command list was given to us but we don't have the queue + // descriptor information. Create a dummy and note that it is not recycleable. + ZeStruct ZeQueueDesc; ImmCmdLists = std::vector( 1, Queue->CommandListMap .insert(std::pair{ - ZeCommandList, {nullptr, true, false, nullptr, 0}}) + ZeCommandList, + {nullptr, true, false, nullptr, ZeQueueDesc, false}}) .first); } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index d09947bc6ff5a..2634e03cae595 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -153,8 +153,16 @@ struct pi_command_list_info_t { // Record the queue to which the command list will be submitted. ze_command_queue_handle_t ZeQueue{nullptr}; - // Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr - uint32_t ZeQueueGroupOrdinal{0}; + + // Record the queue descriptor fields used when creating the command list + // because we cannot recover these fields from the command list. Immediate + // command lists are recycled across queues and then all fields are used. For + // standard command lists only the ordinal is used. For queues created through + // the make_queue API the descriptor is unavailable so a dummy descriptor is + // used and then this entry is marked as not eligible for recycling. + ZeStruct ZeQueueDesc; + bool CanReuse{true}; + // Helper functions to tell if this is a copy command-list. bool isCopy(pi_queue Queue) const; @@ -244,9 +252,13 @@ struct _pi_context : _ur_object { // application must only use the command list for the device, or its // sub-devices, which was provided during creation." // - std::unordered_map> + std::unordered_map>>> ZeComputeCommandListCache; - std::unordered_map> + std::unordered_map>>> ZeCopyCommandListCache; // Retrieves a command list for executing on this device along with diff --git a/sycl/test-e2e/Plugin/level_zero_queue_priority.cpp b/sycl/test-e2e/Plugin/level_zero_queue_priority.cpp index a3bc361ed55e1..d89b83a67b418 100644 --- a/sycl/test-e2e/Plugin/level_zero_queue_priority.cpp +++ b/sycl/test-e2e/Plugin/level_zero_queue_priority.cpp @@ -1,10 +1,12 @@ // REQUIRES: gpu, level_zero, level_zero_dev_kit // RUN: %{build} %level_zero_options -o %t.out -// RUN: env ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck %s +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-STD +// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-IMM // // Check that queue priority is passed to Level Zero runtime // This is the last value in the ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC // +// With immediate command lists the command lists are recycled between queues. #include void test(sycl::property_list Props) { @@ -17,19 +19,25 @@ void test(sycl::property_list Props) { int main(int Argc, const char *Argv[]) { - // CHECK: [getZeQueue]: create queue {{.*}} priority = Normal + // CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Normal + // CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = Normal test(sycl::property_list{}); - // CHECK: [getZeQueue]: create queue {{.*}} priority = Normal + // CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Normal + // With immediate command list recycling, a new IMM is not created here. + // CHECK-IMM-NOT: [getZeQueue]: create queue {{.*}} priority = Normal test({sycl::ext::oneapi::property::queue::priority_normal{}}); - // CHECK: [getZeQueue]: create queue {{.*}} priority = Low + // CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Low + // CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = Low test({sycl::ext::oneapi::property::queue::priority_low{}}); - // CHECK: [getZeQueue]: create queue {{.*}} priority = High + // CHECK-STD: [getZeQueue]: create queue {{.*}} priority = High + // CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = High test({sycl::ext::oneapi::property::queue::priority_high{}}); - // CHECK: Queue cannot be constructed with different priorities. + // CHECK-STD: Queue cannot be constructed with different priorities. + // CHECK-IMM: Queue cannot be constructed with different priorities. try { test({sycl::ext::oneapi::property::queue::priority_low{}, sycl::ext::oneapi::property::queue::priority_high{}});