From cf3085b02a19763f4f6630ba706aaffe10e24cc5 Mon Sep 17 00:00:00 2001 From: smaslov Date: Fri, 12 May 2023 12:13:02 -0700 Subject: [PATCH 1/3] [SYCL[L0] Change the SYCL_PI_LEVEL_ZERO_USM_RESIDENT default to force device allocations only Signed-off-by: smaslov --- sycl/plugins/level_zero/pi_level_zero.cpp | 56 +++++++++++++++++------ 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index fde97657ff641..4f3f7d938967e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -7231,21 +7231,31 @@ enum class USMAllocationForceResidencyType { // Force memory resident on the device of allocation at allocation time. // For host allocation force residency on all devices in a context. Device = 1, - // [Default] Force memory resident on all devices in the context with P2P + // Force memory resident on all devices in the context with P2P // access to the device of allocation. // For host allocation force residency on all devices in a context. P2PDevices = 2 }; // Returns the desired USM residency setting -static USMAllocationForceResidencyType USMAllocationForceResidency = [] { +// Input value is of the form 0xHSD, where: +// 4-bits of D control device allocations +// 4-bits of S control shared allocations +// 4-bits of H control host allocations +// Each 4-bit value is holding a USMAllocationForceResidencyType enum value. +// The default is 0x2, i.e. force full residency for device allocations only. +// +static uint32_t USMAllocationForceResidency = [] { const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); + return Str ? std::atoi(Str) : 0x2; +}(); - if (!Str) - return USMAllocationForceResidencyType::P2PDevices; - switch (std::atoi(Str)) { +// Convert from an integer value to USMAllocationForceResidencyType enum value +static USMAllocationForceResidencyType +USMAllocationForceResidencyConvert(uint32_t Val) { + switch (Val) { case 1: return USMAllocationForceResidencyType::Device; case 2: @@ -7253,26 +7263,39 @@ static USMAllocationForceResidencyType USMAllocationForceResidency = [] { default: return USMAllocationForceResidencyType::None; }; +} + +static USMAllocationForceResidencyType USMHostAllocationForceResidency = [] { + return USMAllocationForceResidencyConvert( + (USMAllocationForceResidency & 0xf00) >> 8); +}(); +static USMAllocationForceResidencyType USMSharedAllocationForceResidency = [] { + return USMAllocationForceResidencyConvert( + (USMAllocationForceResidency & 0x0f0) >> 4); +}(); +static USMAllocationForceResidencyType USMDeviceAllocationForceResidency = [] { + return USMAllocationForceResidencyConvert( + (USMAllocationForceResidency & 0x00f)); }(); // Make USM allocation resident as requested static pi_result -USMAllocationMakeResident(pi_context Context, +USMAllocationMakeResident(USMAllocationForceResidencyType ForceResidency, + pi_context Context, pi_device Device, // nullptr for host allocation void *Ptr, size_t Size) { - std::list Devices; - - if (USMAllocationForceResidency == USMAllocationForceResidencyType::None) + if (ForceResidency == USMAllocationForceResidencyType::None) return PI_SUCCESS; - else if (!Device) { + + std::list Devices; + if (!Device) { // Host allocation, make it resident on all devices in the context Devices.insert(Devices.end(), Context->Devices.begin(), Context->Devices.end()); } else { Devices.push_back(Device); - if (USMAllocationForceResidency == - USMAllocationForceResidencyType::P2PDevices) { + if (ForceResidency == USMAllocationForceResidencyType::P2PDevices) { ze_bool_t P2P; for (const auto &D : Context->Devices) { if (D == Device) @@ -7322,7 +7345,8 @@ static pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context, reinterpret_cast(*ResultPtr) % Alignment == 0, PI_ERROR_INVALID_VALUE); - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + USMAllocationMakeResident(USMDeviceAllocationForceResidency, Context, Device, + *ResultPtr, Size); return PI_SUCCESS; } @@ -7353,7 +7377,8 @@ static pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context, reinterpret_cast(*ResultPtr) % Alignment == 0, PI_ERROR_INVALID_VALUE); - USMAllocationMakeResident(Context, Device, *ResultPtr, Size); + USMAllocationMakeResident(USMSharedAllocationForceResidency, Context, Device, + *ResultPtr, Size); // TODO: Handle PI_MEM_ALLOC_DEVICE_READ_ONLY. return PI_SUCCESS; @@ -7379,7 +7404,8 @@ static pi_result USMHostAllocImpl(void **ResultPtr, pi_context Context, reinterpret_cast(*ResultPtr) % Alignment == 0, PI_ERROR_INVALID_VALUE); - USMAllocationMakeResident(Context, nullptr, *ResultPtr, Size); + USMAllocationMakeResident(USMHostAllocationForceResidency, Context, nullptr, + *ResultPtr, Size); return PI_SUCCESS; } From 076f82d649580dd00526b21e91770fe37cd5cb8d Mon Sep 17 00:00:00 2001 From: smaslov Date: Mon, 15 May 2023 13:59:05 -0700 Subject: [PATCH 2/3] [SYCL][NFC] Update doc Signed-off-by: smaslov --- sycl/doc/EnvironmentVariables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index c6838ba2c0999..45b32720221d7 100755 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -259,7 +259,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | | `SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of command lists in a queue exceeds this threshold, an attempt is made to cleanup completed command lists for their subsequent reuse. The default is 20. | | `SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of events associated with an immediate command list exceeds this threshold, a check is made for signaled events and these events are recycled. Setting this threshold low causes events to be checked more often, which could result in unneeded events being recycled sooner. However, more frequent event status checks may cost time. The default is 1000. | -| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Controls if/where to make USM allocations resident at the time of allocation. If set to 0 then no special residency is forced. If set to 1 then allocation (device or shared) is made resident at the device of allocation. If set to 2 then allocation (device or shared) is made resident on all devices in the context of allocation that have P2P access to the device of allocation. For host allocation, any non-0 setting forces the allocation resident on all devices in the context. Default is 2. | +| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Controls if/where to make USM allocations resident at the time of allocation. Input value is of the form 0xHSD, where 4-bits of D control device allocations, 4-bits of S control shared allocations, and 4-bits of H control host allocations. Each 4-bit componenet is holding one of the following values: "0" - then no special residency is forced, "1" - then allocation is made resident at the device of allocation, or "2" - then allocation is made resident on all devices in the context of allocation that have P2P access to the device of allocation. Default is 0x002, i.e. force full residency for device allocations only. | | `SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D` | Integer | When set to a positive value enables the use of Level Zero USM 2D memory copy operations. Default is 0. | ## Debugging variables for CUDA Plugin From 7e64203aaa0bbab37faf64f88540b4ba9a81dea4 Mon Sep 17 00:00:00 2001 From: smaslov Date: Mon, 15 May 2023 15:45:58 -0700 Subject: [PATCH 3/3] support hex Signed-off-by: smaslov --- sycl/doc/EnvironmentVariables.md | 2 +- sycl/plugins/level_zero/pi_level_zero.cpp | 10 ++++- .../Plugin/level_zero_usm_residency.cpp | 40 +++++++++++++++++++ 3 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/Plugin/level_zero_usm_residency.cpp diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 45b32720221d7..3970086ff3c45 100755 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -259,7 +259,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | | `SYCL_PI_LEVEL_ZERO_COMMANDLISTS_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of command lists in a queue exceeds this threshold, an attempt is made to cleanup completed command lists for their subsequent reuse. The default is 20. | | `SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD` | Integer | If non-negative then the threshold is set to this value. If negative, the threshold is set to INT_MAX. Whenever the number of events associated with an immediate command list exceeds this threshold, a check is made for signaled events and these events are recycled. Setting this threshold low causes events to be checked more often, which could result in unneeded events being recycled sooner. However, more frequent event status checks may cost time. The default is 1000. | -| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Controls if/where to make USM allocations resident at the time of allocation. Input value is of the form 0xHSD, where 4-bits of D control device allocations, 4-bits of S control shared allocations, and 4-bits of H control host allocations. Each 4-bit componenet is holding one of the following values: "0" - then no special residency is forced, "1" - then allocation is made resident at the device of allocation, or "2" - then allocation is made resident on all devices in the context of allocation that have P2P access to the device of allocation. Default is 0x002, i.e. force full residency for device allocations only. | +| `SYCL_PI_LEVEL_ZERO_USM_RESIDENT` | Integer | Bit-mask controls if/where to make USM allocations resident at the time of allocation. Input value is of the form 0xHSD, where 4-bits of D control device allocations, 4-bits of S control shared allocations, and 4-bits of H control host allocations. Each 4-bit componenet is holding one of the following values: "0" - then no special residency is forced, "1" - then allocation is made resident at the device of allocation, or "2" - then allocation is made resident on all devices in the context of allocation that have P2P access to the device of allocation. Default is 0x002, i.e. force full residency for device allocations only. | | `SYCL_PI_LEVEL_ZERO_USE_NATIVE_USM_MEMCPY2D` | Integer | When set to a positive value enables the use of Level Zero USM 2D memory copy operations. Default is 0. | ## Debugging variables for CUDA Plugin diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 4f3f7d938967e..b56e5dd47079d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -7249,7 +7249,14 @@ static uint32_t USMAllocationForceResidency = [] { const char *UrRet = std::getenv("UR_L0_USM_RESIDENT"); const char *PiRet = std::getenv("SYCL_PI_LEVEL_ZERO_USM_RESIDENT"); const char *Str = UrRet ? UrRet : (PiRet ? PiRet : nullptr); - return Str ? std::atoi(Str) : 0x2; + try { + if (Str) { + // Auto-detect radix to allow more convinient hex base + return std::stoi(Str, nullptr, 0); + } + } catch (...) { + } + return 0x2; }(); // Convert from an integer value to USMAllocationForceResidencyType enum value @@ -7284,7 +7291,6 @@ USMAllocationMakeResident(USMAllocationForceResidencyType ForceResidency, pi_context Context, pi_device Device, // nullptr for host allocation void *Ptr, size_t Size) { - if (ForceResidency == USMAllocationForceResidencyType::None) return PI_SUCCESS; diff --git a/sycl/test-e2e/Plugin/level_zero_usm_residency.cpp b/sycl/test-e2e/Plugin/level_zero_usm_residency.cpp new file mode 100644 index 0000000000000..340a557068e2e --- /dev/null +++ b/sycl/test-e2e/Plugin/level_zero_usm_residency.cpp @@ -0,0 +1,40 @@ +// REQUIRES: gpu, level_zero + +// RUN: %{build} %level_zero_options -o %t.out +// RUN: env SYCL_PI_TRACE=-1 ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck --check-prefixes=DEVICE %s +// RUN: env SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0x001 SYCL_PI_TRACE=-1 ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck --check-prefixes=DEVICE %s +// RUN: env SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0x010 SYCL_PI_TRACE=-1 ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck --check-prefixes=SHARED %s +// RUN: env SYCL_PI_LEVEL_ZERO_USM_RESIDENT=0x100 SYCL_PI_TRACE=-1 ZE_DEBUG=-1 %{run} %t.out 2>&1 | FileCheck --check-prefixes=HOST %s + +// Test that USM is made resident at allocation as requested. + +#include + +using namespace std; +using namespace sycl; + +int main(int argc, char *argv[]) { + queue Q; + + auto ptr1 = malloc_device(1, Q); + // DEVICE: ---> piextUSMDeviceAlloc + // DEVICE: ZE ---> zeMemAllocDevice + // DEVICE: ZE ---> zeContextMakeMemoryResident + // DEVICE-NOT: ZE ---> zeContextMakeMemoryResident + + auto ptr2 = malloc_shared(1, Q); + // SHARED: ---> piextUSMSharedAlloc + // SHARED: ZE ---> zeMemAllocShared + // SHARED: ZE ---> zeContextMakeMemoryResident + // SHARED-NOT: ZE ---> zeContextMakeMemoryResident + + auto ptr3 = malloc_host(1, Q); + // HOST: ---> piextUSMHostAlloc + // HOST: ZE ---> zeMemAllocHost + // HOST: ZE ---> zeContextMakeMemoryResident + + free(ptr1, Q); + free(ptr2, Q); + free(ptr3, Q); + return 0; +}