Skip to content

[SYCL] [L0] Recycle immediate command lists for queues in a context. #9409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 87 additions & 34 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -692,7 +692,8 @@ pi_result _pi_context::finalize() {

std::scoped_lock<ur_mutex> Lock(ZeCommandListCacheMutex);
for (auto &List : ZeComputeCommandListCache) {
for (ze_command_list_handle_t &ZeCommandList : List.second) {
for (auto &Item : List.second) {
ze_command_list_handle_t ZeCommandList = Item.first;
if (ZeCommandList) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
// Gracefully handle the case that L0 was already unloaded.
Expand All @@ -702,7 +703,8 @@ pi_result _pi_context::finalize() {
}
}
for (auto &List : ZeCopyCommandListCache) {
for (ze_command_list_handle_t &ZeCommandList : List.second) {
for (auto &Item : List.second) {
ze_command_list_handle_t ZeCommandList = Item.first;
if (ZeCommandList) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList));
// Gracefully handle the case that L0 was already unloaded.
Expand All @@ -715,7 +717,7 @@ pi_result _pi_context::finalize() {
}

bool pi_command_list_info_t::isCopy(pi_queue Queue) const {
return ZeQueueGroupOrdinal !=
return ZeQueueDesc.ordinal !=
(uint32_t)Queue->Device
->QueueGroup[_pi_device::queue_group_info_t::type::Compute]
.ZeOrdinal;
Expand Down Expand Up @@ -824,7 +826,8 @@ pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
UseCopyEngine
? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
: this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
ZeCommandListCache.push_back(CommandList->first);
ZeCommandListCache.push_back(
{CommandList->first, CommandList->second.ZeQueueDesc});
}

return PI_SUCCESS;
Expand Down Expand Up @@ -1273,7 +1276,7 @@ pi_result _pi_context::getAvailableCommandList(

for (auto ZeCommandListIt = ZeCommandListCache.begin();
ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
auto &ZeCommandList = *ZeCommandListIt;
auto &ZeCommandList = ZeCommandListIt->first;
auto it = Queue->CommandListMap.find(ZeCommandList);
if (it != Queue->CommandListMap.end()) {
if (ForcedCmdQueue && *ForcedCmdQueue != it->second.ZeQueue)
Expand All @@ -1297,12 +1300,14 @@ pi_result _pi_context::getAvailableCommandList(
ze_fence_handle_t ZeFence;
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
CommandList = Queue->CommandListMap
.emplace(ZeCommandList,
pi_command_list_info_t{ZeFence, true, false,
ZeCommandQueue,
QueueGroupOrdinal})
.first;
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
ZeQueueDesc.ordinal = QueueGroupOrdinal;
CommandList =
Queue->CommandListMap
.emplace(ZeCommandList,
pi_command_list_info_t{ZeFence, true, false,
ZeCommandQueue, ZeQueueDesc})
.first;
}
ZeCommandListCache.erase(ZeCommandListIt);
if (auto Res = Queue->insertStartBarrierIfDiscardEventsMode(CommandList))
Expand Down Expand Up @@ -1379,10 +1384,11 @@ _pi_queue::createCommandList(bool UseCopyEngine,
&ZeCommandListDesc, &ZeCommandList));

ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
ZeQueueDesc.ordinal = QueueGroupOrdinal;
std::tie(CommandList, std::ignore) = CommandListMap.insert(
std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
ZeCommandList,
{ZeFence, false, false, ZeCommandQueue, QueueGroupOrdinal}));
ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc}));

PI_CALL(insertStartBarrierIfDiscardEventsMode(CommandList));
PI_CALL(insertActiveBarriers(CommandList, UseCopyEngine));
Expand Down Expand Up @@ -1778,7 +1784,6 @@ _pi_queue::pi_queue_group_t::getZeQueue(uint32_t *QueueGroupOrdinal) {
// This function will return one of possibly multiple available
// immediate commandlists associated with this Queue.
pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() {

uint32_t QueueIndex, QueueOrdinal;
auto Index = getQueueIndex(&QueueOrdinal, &QueueIndex);

Expand All @@ -1797,35 +1802,56 @@ pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() {
ZeCommandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH;
Priority = "High";
}

// Evaluate performance of explicit usage for "0" index.
if (QueueIndex != 0) {
ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY;
}

urPrint("[getZeQueue]: create queue ordinal = %d, index = %d "
"(round robin in [%d, %d]) priority = %s\n",
ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
UpperIndex, Priority);
// Check if context's command list cache has an immediate command list with
// matching index.
ze_command_list_handle_t ZeCommandList = nullptr;
{
// Acquire lock to avoid race conditions.
std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
// Under mutex since operator[] does insertion on the first usage for every
// unique ZeDevice.
auto &ZeCommandListCache =
isCopy()
? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
: Queue->Context
->ZeComputeCommandListCache[Queue->Device->ZeDevice];
for (auto ZeCommandListIt = ZeCommandListCache.begin();
ZeCommandListIt != ZeCommandListCache.end(); ++ZeCommandListIt) {
const auto &Desc = (*ZeCommandListIt).second;
if (Desc.index == ZeCommandQueueDesc.index &&
Desc.flags == ZeCommandQueueDesc.flags &&
Desc.mode == ZeCommandQueueDesc.mode &&
Desc.priority == ZeCommandQueueDesc.priority) {
ZeCommandList = (*ZeCommandListIt).first;
ZeCommandListCache.erase(ZeCommandListIt);
break;
}
}
}

// If cache didn't contain a command list, create one.
if (!ZeCommandList) {
urPrint("[getZeQueue]: create queue ordinal = %d, index = %d "
"(round robin in [%d, %d]) priority = %s\n",
ZeCommandQueueDesc.ordinal, ZeCommandQueueDesc.index, LowerIndex,
UpperIndex, Priority);

ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
(Queue->Context->ZeContext, Queue->Device->ZeDevice,
&ZeCommandQueueDesc, &ZeCommandList));
}

ze_command_list_handle_t ZeCommandList;
ZE_CALL_NOCHECK(zeCommandListCreateImmediate,
(Queue->Context->ZeContext, Queue->Device->ZeDevice,
&ZeCommandQueueDesc, &ZeCommandList));
ImmCmdLists[Index] =
Queue->CommandListMap
.insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
ZeCommandList, {nullptr, true, false, nullptr, QueueOrdinal}})
ZeCommandList,
{nullptr, true, false, nullptr, ZeCommandQueueDesc}})
.first;
// Add this commandlist to the cache so it can be destroyed as part of
// piQueueReleaseInternal
auto QueueType = Type;
std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
auto &ZeCommandListCache =
QueueType == queue_type::Compute
? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
: Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
ZeCommandListCache.push_back(ZeCommandList);

return ImmCmdLists[Index];
}
Expand Down Expand Up @@ -2833,6 +2859,29 @@ pi_result piQueueRelease(pi_queue Queue) {
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return mapError(ZeResult);
}
if (Queue->UsingImmCmdLists && Queue->OwnZeCommandQueue) {
std::scoped_lock<ur_mutex> Lock(
Queue->Context->ZeCommandListCacheMutex);
const pi_command_list_info_t &MapEntry = it->second;
if (MapEntry.CanReuse) {
// Add commandlist to the cache for future use.
// It will be deleted when the context is destroyed.
auto &ZeCommandListCache =
MapEntry.isCopy(Queue)
? Queue->Context
->ZeCopyCommandListCache[Queue->Device->ZeDevice]
: Queue->Context
->ZeComputeCommandListCache[Queue->Device->ZeDevice];
ZeCommandListCache.push_back({it->first, it->second.ZeQueueDesc});
} else {
// A non-reusable comamnd list that came from a make_queue call is
// destroyed since it cannot be recycled.
ze_command_list_handle_t ZeCommandList = it->first;
if (ZeCommandList) {
ZE_CALL(zeCommandListDestroy, (ZeCommandList));
}
}
}
}
Queue->CommandListMap.clear();
}
Expand Down Expand Up @@ -2997,11 +3046,15 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue,

void _pi_queue::pi_queue_group_t::setImmCmdList(
ze_command_list_handle_t ZeCommandList) {
// An immediate command list was given to us but we don't have the queue
// descriptor information. Create a dummy and note that it is not recycleable.
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1,
Queue->CommandListMap
.insert(std::pair<ze_command_list_handle_t, pi_command_list_info_t>{
ZeCommandList, {nullptr, true, false, nullptr, 0}})
ZeCommandList,
{nullptr, true, false, nullptr, ZeQueueDesc, false}})
.first);
}

Expand Down
20 changes: 16 additions & 4 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,16 @@ struct pi_command_list_info_t {

// Record the queue to which the command list will be submitted.
ze_command_queue_handle_t ZeQueue{nullptr};
// Keeps the ordinal of the ZeQueue queue group. Invalid if ZeQueue==nullptr
uint32_t ZeQueueGroupOrdinal{0};

// Record the queue descriptor fields used when creating the command list
// because we cannot recover these fields from the command list. Immediate
// command lists are recycled across queues and then all fields are used. For
// standard command lists only the ordinal is used. For queues created through
// the make_queue API the descriptor is unavailable so a dummy descriptor is
// used and then this entry is marked as not eligible for recycling.
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
bool CanReuse{true};

// Helper functions to tell if this is a copy command-list.
bool isCopy(pi_queue Queue) const;

Expand Down Expand Up @@ -244,9 +252,13 @@ struct _pi_context : _ur_object {
// application must only use the command list for the device, or its
// sub-devices, which was provided during creation."
//
std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
std::unordered_map<ze_device_handle_t,
std::list<std::pair<ze_command_list_handle_t,
ZeStruct<ze_command_queue_desc_t>>>>
ZeComputeCommandListCache;
std::unordered_map<ze_device_handle_t, std::list<ze_command_list_handle_t>>
std::unordered_map<ze_device_handle_t,
std::list<std::pair<ze_command_list_handle_t,
ZeStruct<ze_command_queue_desc_t>>>>
ZeCopyCommandListCache;

// Retrieves a command list for executing on this device along with
Expand Down
20 changes: 14 additions & 6 deletions sycl/test-e2e/Plugin/level_zero_queue_priority.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
// REQUIRES: gpu, level_zero, level_zero_dev_kit
// RUN: %{build} %level_zero_options -o %t.out
// RUN: env ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck %s
// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-STD
// RUN: env ZE_DEBUG=1 SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-IMM
//
// Check that queue priority is passed to Level Zero runtime
// This is the last value in the ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC
//
// With immediate command lists the command lists are recycled between queues.
#include <sycl/sycl.hpp>

void test(sycl::property_list Props) {
Expand All @@ -17,19 +19,25 @@ void test(sycl::property_list Props) {

int main(int Argc, const char *Argv[]) {

// CHECK: [getZeQueue]: create queue {{.*}} priority = Normal
// CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Normal
// CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = Normal
test(sycl::property_list{});

// CHECK: [getZeQueue]: create queue {{.*}} priority = Normal
// CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Normal
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please either add a CHECK-IMM or a comment why it shouldn't be here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CHECK-IMM is not needed here precisely because of IMM recycling.
I've added a comment saying so.

// With immediate command list recycling, a new IMM is not created here.
// CHECK-IMM-NOT: [getZeQueue]: create queue {{.*}} priority = Normal
test({sycl::ext::oneapi::property::queue::priority_normal{}});

// CHECK: [getZeQueue]: create queue {{.*}} priority = Low
// CHECK-STD: [getZeQueue]: create queue {{.*}} priority = Low
// CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = Low
test({sycl::ext::oneapi::property::queue::priority_low{}});

// CHECK: [getZeQueue]: create queue {{.*}} priority = High
// CHECK-STD: [getZeQueue]: create queue {{.*}} priority = High
// CHECK-IMM: [getZeQueue]: create queue {{.*}} priority = High
test({sycl::ext::oneapi::property::queue::priority_high{}});

// CHECK: Queue cannot be constructed with different priorities.
// CHECK-STD: Queue cannot be constructed with different priorities.
// CHECK-IMM: Queue cannot be constructed with different priorities.
try {
test({sycl::ext::oneapi::property::queue::priority_low{},
sycl::ext::oneapi::property::queue::priority_high{}});
Expand Down