From ce087ff9351b9e57f88ef86be656926ad2f88298 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 2 Dec 2022 13:41:05 -0800 Subject: [PATCH 01/13] [SYCL][Level Zero] Implement sycl_ext_intel_cslice extension With this change, on PVC sub-sub-devices now require info::partition_property::ext_intel_partition_by_cslice instead of info::partition_property::partition_by_affinity_domain that wasn't quite accurately describing the actual scheme. On other devices, CSlice-based partitioning is now disabled because that's not how the actual H/W works. If precise manual access to individual CCS is required than sycl_ext_intel_queue_index extension should be used instead. Extension specification is being added in https://github.com/intel/llvm/pull/7513. --- sycl/include/sycl/detail/pi.h | 2 + sycl/include/sycl/device.hpp | 13 +++ sycl/include/sycl/info/info_desc.hpp | 3 +- sycl/plugins/level_zero/pi_level_zero.cpp | 121 +++++++++++++++------- sycl/source/detail/device_impl.cpp | 22 ++++ sycl/source/detail/device_impl.hpp | 10 ++ sycl/source/detail/device_info.hpp | 1 + sycl/source/device.cpp | 8 ++ 8 files changed, 141 insertions(+), 39 deletions(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index e1818f5c50143..2830f7c8e182a 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -625,6 +625,8 @@ static constexpr pi_device_partition_property PI_DEVICE_PARTITION_BY_COUNTS_LIST_END = 0x0; static constexpr pi_device_partition_property PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN = 0x1088; +static constexpr pi_device_partition_property + PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE = 0x1089; // For compatibility with OpenCL define this not as enum. using pi_device_affinity_domain = pi_bitfield; diff --git a/sycl/include/sycl/device.hpp b/sycl/include/sycl/device.hpp index 686d7e730edc4..97c9641e6b626 100644 --- a/sycl/include/sycl/device.hpp +++ b/sycl/include/sycl/device.hpp @@ -176,6 +176,19 @@ class __SYCL_EXPORT device { std::vector create_sub_devices(info::partition_affinity_domain AffinityDomain) const; + /// Partition device into sub devices + /// + /// Available only when prop is + /// info::partition_property::ext_intel_partition_by_cslice. If this SYCL + /// device does not support + /// info::partition_property::ext_intel_partition_by_cslice a + /// feature_not_supported exception must be thrown. + /// + /// \return a vector class of sub devices partitioned from this SYCL + /// device at a granularity of "cslice" (compute slice). + template + std::vector create_sub_devices() const; + /// Queries this SYCL device for information requested by the template /// parameter param /// diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp index 20d6d9b376edb..62e0f56388d59 100644 --- a/sycl/include/sycl/info/info_desc.hpp +++ b/sycl/include/sycl/info/info_desc.hpp @@ -55,7 +55,8 @@ enum class partition_property : pi_device_partition_property { no_partition = 0, partition_equally = PI_DEVICE_PARTITION_EQUALLY, partition_by_counts = PI_DEVICE_PARTITION_BY_COUNTS, - partition_by_affinity_domain = PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN + partition_by_affinity_domain = PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + ext_intel_partition_by_cslice = PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE }; enum class partition_affinity_domain : pi_device_affinity_domain { diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 8659a42a058dc..a34db9c47793e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2603,29 +2603,36 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() { Ordinals.push_back(i); } } - - // Create PI sub-sub-devices with the sub-device for all the ordinals. - // Each {ordinal, index} points to a specific CCS which constructs - // a sub-sub-device at this point. - // FIXME: Level Zero creates multiple PiDevices for a single physical - // device when sub-device is partitioned into sub-sub-devices. - // Sub-sub-device is technically a command queue and we should not build - // program for each command queue. PiDevice is probably not the right - // abstraction for a Level Zero command queue. - for (uint32_t J = 0; J < Ordinals.size(); ++J) { - for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues; - ++K) { - std::unique_ptr<_pi_device> PiSubSubDevice( - new _pi_device(ZeSubdevices[I], this, PiSubDevice.get())); - pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K); - if (Result != PI_SUCCESS) { - return Result; + bool IsPVC = + (PiSubDevice->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; + + // If isn't PVC, then submissions to different CCS can be executed on + // the same EUs still, so we cannot treat them as sub-sub-devices. + if (IsPVC) { + // Create PI sub-sub-devices with the sub-device for all the ordinals. + // Each {ordinal, index} points to a specific CCS which constructs + // a sub-sub-device at this point. + // + // FIXME: Level Zero creates multiple PiDevices for a single physical + // device when sub-device is partitioned into sub-sub-devices. + // Sub-sub-device is technically a command queue and we should not + // build program for each command queue. PiDevice is probably not the + // right abstraction for a Level Zero command queue. + for (uint32_t J = 0; J < Ordinals.size(); ++J) { + for (uint32_t K = 0; + K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { + std::unique_ptr<_pi_device> PiSubSubDevice( + new _pi_device(ZeSubdevices[I], this, PiSubDevice.get())); + pi_result Result = PiSubSubDevice->initialize(Ordinals[J], K); + if (Result != PI_SUCCESS) { + return Result; + } + + // save pointers to sub-sub-devices for quick retrieval in the + // future. + PiSubDevice->SubDevices.push_back(PiSubSubDevice.get()); + PiDevicesCache.push_back(std::move(PiSubSubDevice)); } - - // save pointers to sub-sub-devices for quick retrieval in the - // future. - PiSubDevice->SubDevices.push_back(PiSubSubDevice.get()); - PiDevicesCache.push_back(std::move(PiSubSubDevice)); } } @@ -2860,14 +2867,21 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ZeSubDeviceCount < 2) { return ReturnValue(pi_device_partition_property{0}); } + bool PartitionedByCSlice = Device->SubDevices[0] + ->QueueGroup[_pi_queue::queue_type::Compute] + .ZeIndex >= 0; + // It is debatable if SYCL sub-device and partitioning APIs sufficient to // expose Level Zero sub-devices? We start with support of // "partition_by_affinity_domain" and "next_partitionable" but if that - // doesn't seem to be a good fit we could look at adding a more descriptive - // partitioning type. + // doesn't seem to be a good fit we could look at adding a more + // descriptive partitioning type. struct { pi_device_partition_property Arr[2]; - } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, 0}}; + } PartitionProperties = {{PartitionedByCSlice + ? PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE + : PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + 0}}; return ReturnValue(PartitionProperties); } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: @@ -2875,16 +2889,22 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, PI_DEVICE_AFFINITY_DOMAIN_NUMA | PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE}); case PI_DEVICE_INFO_PARTITION_TYPE: { - if (Device->isSubDevice()) { + // For root-device there is no partitioning to report. + if (!Device->isSubDevice()) + return ReturnValue(pi_device_partition_property{0}); + + if (Device->QueueGroup[_pi_queue::queue_type::Compute].ZeIndex >= 0) { struct { - pi_device_partition_property Arr[3]; - } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, - 0}}; + pi_device_partition_property Arr[2]; + } PartitionProperties = {{PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, 0}}; return ReturnValue(PartitionProperties); } - // For root-device there is no partitioning to report. - return ReturnValue(pi_device_partition_property{0}); + + struct { + pi_device_partition_property Arr[3]; + } PartitionProperties = {{PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE, 0}}; + return ReturnValue(PartitionProperties); } // Everything under here is not supported yet @@ -3256,14 +3276,19 @@ pi_result piDevicePartition(pi_device Device, const pi_device_partition_property *Properties, pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) { + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); // Other partitioning ways are not supported by Level Zero - if (Properties[0] != PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN || - (Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE && - Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) { + if (Properties[0] == PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + if ((Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE && + Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) + return PI_ERROR_INVALID_VALUE; + } else if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { + if (Properties[1] != 0) + return PI_ERROR_INVALID_VALUE; + } else { return PI_ERROR_INVALID_VALUE; } - PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); // Devices cache is normally created in piDevicesGet but still make // sure that cache is populated. @@ -3273,8 +3298,28 @@ pi_result piDevicePartition(pi_device Device, return Res; } + auto EffectiveSize = [&]() -> decltype(Device->SubDevices.size()) { + if (Device->SubDevices.size() == 0) + return 0; + + // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. + if (Properties[0] == PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + if (Device->isSubDevice()) + return 0; + } + if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { + // Not a CSlice-based partitioning. + if (Device->SubDevices[0] + ->QueueGroup[_pi_queue::queue_type::Compute] + .ZeIndex < 0) + return 0; + } + + return Device->SubDevices.size(); + }(); + if (OutNumDevices) { - *OutNumDevices = Device->SubDevices.size(); + *OutNumDevices = EffectiveSize; } if (OutDevices) { @@ -3282,7 +3327,7 @@ pi_result piDevicePartition(pi_device Device, // Currently supported partitioning (by affinity domain/numa) would always // partition to all sub-devices. // - PI_ASSERT(NumDevices == Device->SubDevices.size(), PI_ERROR_INVALID_VALUE); + PI_ASSERT(NumDevices == EffectiveSize, PI_ERROR_INVALID_VALUE); for (uint32_t I = 0; I < NumDevices; I++) { OutDevices[I] = Device->SubDevices[I]; diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index b4c33e7181531..15287a44c4c2b 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -250,6 +250,28 @@ std::vector device_impl::create_sub_devices( return create_sub_devices(Properties, SubDevicesCount); } +std::vector device_impl::create_sub_devices() const { + assert(!MIsHostDevice && "Partitioning is not supported on host."); + + if (!is_partition_supported( + info::partition_property::ext_intel_partition_by_cslice)) { + throw sycl::feature_not_supported( + "Device does not support " + "sycl::info::partition_property::ext_intel_partition_by_cslice.", + PI_ERROR_INVALID_OPERATION); + } + + const pi_device_partition_property Properties[2] = { + PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, 0}; + + pi_uint32 SubDevicesCount = 0; + const detail::plugin &Plugin = getPlugin(); + Plugin.call( + MDevice, Properties, 0, nullptr, &SubDevicesCount); + + return create_sub_devices(Properties, SubDevicesCount); +} + pi_native_handle device_impl::getNative() const { auto Plugin = getPlugin(); if (Plugin.getBackend() == backend::opencl) diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 24154f9b41ef7..17fdd6c746367 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -172,6 +172,16 @@ class device_impl { std::vector create_sub_devices(info::partition_affinity_domain AffinityDomain) const; + /// Partition device into sub devices + /// + /// If this SYCL device does not support + /// info::partition_property::ext_intel_partition_by_cslice a + /// feature_not_supported exception must be thrown. + /// + /// \return a vector class of sub devices partitioned from this SYCL + /// device at a granularity of "cslice" (compute slice). + std::vector create_sub_devices() const; + /// Check if desired partition property supported by device /// /// \param Prop is one of info::partition_property::(partition_equally, diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 9e5306fd27107..748e3399ac947 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -358,6 +358,7 @@ static bool is_sycl_partition_property(info::partition_property PP) { case info::partition_property::partition_equally: case info::partition_property::partition_by_counts: case info::partition_property::partition_by_affinity_domain: + case info::partition_property::ext_intel_partition_by_cslice: return true; } return false; diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index b170720aa8fcb..a39f35dbd5684 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -117,6 +117,14 @@ template __SYCL_EXPORT std::vector device::create_sub_devices< info::partition_property::partition_by_affinity_domain>( info::partition_affinity_domain AffinityDomain) const; +template +std::vector device::create_sub_devices() const { + return impl->create_sub_devices(); +} + +template __SYCL_EXPORT std::vector device::create_sub_devices< + info::partition_property::ext_intel_partition_by_cslice>() const; + bool device::has_extension(const std::string &extension_name) const { return impl->has_extension(extension_name); } From fdaad62a70da7ffda77aaf2526b19c952b6e1ca5 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 2 Dec 2022 14:09:44 -0800 Subject: [PATCH 02/13] Update Linux symbols --- sycl/test/abi/sycl_symbols_linux.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index d8304a7f85068..82698c08f2671 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4172,6 +4172,7 @@ _ZNK4sycl3_V16device14is_acceleratorEv _ZNK4sycl3_V16device18create_sub_devicesILNS0_4info18partition_propertyE4230EEESt6vectorIS1_SaIS1_EEm _ZNK4sycl3_V16device18create_sub_devicesILNS0_4info18partition_propertyE4231EEESt6vectorIS1_SaIS1_EERKS5_ImSaImEE _ZNK4sycl3_V16device18create_sub_devicesILNS0_4info18partition_propertyE4232EEESt6vectorIS1_SaIS1_EENS3_25partition_affinity_domainE +_ZNK4sycl3_V16device18create_sub_devicesILNS0_4info18partition_propertyE4233EEESt6vectorIS1_SaIS1_EEv _ZNK4sycl3_V16device3getEv _ZNK4sycl3_V16device3hasENS0_6aspectE _ZNK4sycl3_V16device6is_cpuEv From 02f7f006941c2bb7efa1f25772a830259ca118e3 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 5 Dec 2022 11:33:05 -0800 Subject: [PATCH 03/13] clang-format --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 0b9cdadc38d49..6aeb50e09bbb5 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3282,7 +3282,7 @@ pi_result piDevicePartition(pi_device Device, // Other partitioning ways are not supported by Level Zero if (Properties[0] == PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { if ((Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE && - Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) + Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) return PI_ERROR_INVALID_VALUE; } else if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { if (Properties[1] != 0) From c5195f91e5f1cf1457f5a63497eee63df1de95cd Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 5 Dec 2022 11:36:49 -0800 Subject: [PATCH 04/13] More clang-format (extra empty line) --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 6aeb50e09bbb5..56f17a0611507 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3291,7 +3291,6 @@ pi_result piDevicePartition(pi_device Device, return PI_ERROR_INVALID_VALUE; } - // Devices cache is normally created in piDevicesGet but still make // sure that cache is populated. // From 57c763304ffa853d941dc2a9761509d9d1195e38 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Mon, 5 Dec 2022 17:04:00 -0800 Subject: [PATCH 05/13] Bump minor version --- sycl/include/sycl/detail/pi.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index ec1da8f03660a..87671c563aedf 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -60,9 +60,11 @@ // PI_EXT_ONEAPI_QUEUE_PRIORITY_HIGH queue properties. // 11.18 Add new parameter name PI_EXT_ONEAPI_QUEUE_INFO_EMPTY to // _pi_queue_info. +// 11.19 Add new PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE piDevicePartition +// scheme. #define _PI_H_VERSION_MAJOR 11 -#define _PI_H_VERSION_MINOR 18 +#define _PI_H_VERSION_MINOR 19 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) From 72dac5b92bf247a57ceaadc5030051930edc6492 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Tue, 6 Dec 2022 11:27:21 -0800 Subject: [PATCH 06/13] Add SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING support Version bump have to be major actually. --- sycl/doc/EnvironmentVariables.md | 1 + sycl/include/sycl/detail/pi.h | 9 +++-- sycl/plugins/level_zero/pi_level_zero.cpp | 44 ++++++++++++++++------- 3 files changed, 38 insertions(+), 16 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 81fc992bb47e2..1a8e2f9cd41cb 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -249,6 +249,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL` | Integer | When set to a positive value enables use of a copy engine for memory fill operations. Default is 0. | | `SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION` | Integer | When set to "0" tells to use single root-device allocation for all devices in a context where all devices have same root. Otherwise performs regular buffer migration. Default is 1. | | `SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS` | Integer | When set to a positive value enables the mode when discarded Level Zero events are reset and reused in scope of the same in-order queue based on the dependency chain between commands. Default is 1. | +| `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. If the device doesn't support partitioning by compute slice this variable has no effect. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | ## Debugging variables for CUDA Plugin diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 87671c563aedf..3bdaca0bb05c5 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -60,10 +60,13 @@ // PI_EXT_ONEAPI_QUEUE_PRIORITY_HIGH queue properties. // 11.18 Add new parameter name PI_EXT_ONEAPI_QUEUE_INFO_EMPTY to // _pi_queue_info. -// 11.19 Add new PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE piDevicePartition -// scheme. +// 12.19 Add new PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE piDevicePartition +// scheme. Sub-sub-devices (representing compute slice) creation via +// partitioning by affinity domain is disabled by default and can be temporarily +// restored via SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING +// environment variable. -#define _PI_H_VERSION_MAJOR 11 +#define _PI_H_VERSION_MAJOR 12 #define _PI_H_VERSION_MINOR 19 #define _PI_STRING_HELPER(a) #a diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 56f17a0611507..5a4cc3f20ed36 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -170,6 +170,12 @@ static const int DeviceEventsSetting = [] { return AllHostVisible; }(); +static const bool ExposeCSliceInAffinityPartitioning = [] { + const char *Flag = + std::getenv("SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING"); + return Flag ? std::atoi(Flag) != 0 : false; +}(); + // Helper function to implement zeHostSynchronize. // The behavior is to avoid infinite wait during host sync under ZE_DEBUG. // This allows for a much more responsive debugging of hangs. @@ -2873,18 +2879,25 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, ->QueueGroup[_pi_queue::queue_type::Compute] .ZeIndex >= 0; - // It is debatable if SYCL sub-device and partitioning APIs sufficient to - // expose Level Zero sub-devices? We start with support of - // "partition_by_affinity_domain" and "next_partitionable" but if that - // doesn't seem to be a good fit we could look at adding a more - // descriptive partitioning type. - struct { - pi_device_partition_property Arr[2]; - } PartitionProperties = {{PartitionedByCSlice - ? PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE - : PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - 0}}; - return ReturnValue(PartitionProperties); + auto ReturnHelper = [&](auto... Partitions) { + struct { + pi_device_partition_property Arr[sizeof...(Partitions)+1]; + } PartitionProperties = {{Partitions..., 0}}; + return ReturnValue(PartitionProperties); + }; + + if (ExposeCSliceInAffinityPartitioning) { + if (PartitionedByCSlice) + return ReturnHelper(PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + + else + return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } else { + return ReturnHelper(PartitionedByCSlice + ? PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE + : PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); + } } case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: return ReturnValue(pi_device_affinity_domain{ @@ -3304,7 +3317,12 @@ pi_result piDevicePartition(pi_device Device, return 0; // Sub-Sub-Devices are partitioned by CSlices, not by affinity domain. - if (Properties[0] == PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN) { + // However, if + // SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING overrides that + // still expose CSlices in partitioning by affinity domain for compatibility + // reasons. + if (Properties[0] == PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN && + !ExposeCSliceInAffinityPartitioning) { if (Device->isSubDevice()) return 0; } From e33e82d244607a30aab9b39a3fb1dc44122c0a58 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Tue, 6 Dec 2022 11:32:02 -0800 Subject: [PATCH 07/13] clang-format --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 5a4cc3f20ed36..71c3b61f420da 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2881,7 +2881,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, auto ReturnHelper = [&](auto... Partitions) { struct { - pi_device_partition_property Arr[sizeof...(Partitions)+1]; + pi_device_partition_property Arr[sizeof...(Partitions) + 1]; } PartitionProperties = {{Partitions..., 0}}; return ReturnValue(PartitionProperties); }; From 73530047a24f7d3e19e88399e3d707a3ddf1ea2b Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 9 Dec 2022 13:40:27 -0800 Subject: [PATCH 08/13] Move extension to supported --- .../{proposed => supported}/sycl_ext_intel_cslice.asciidoc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) rename sycl/doc/extensions/{proposed => supported}/sycl_ext_intel_cslice.asciidoc (97%) diff --git a/sycl/doc/extensions/proposed/sycl_ext_intel_cslice.asciidoc b/sycl/doc/extensions/supported/sycl_ext_intel_cslice.asciidoc similarity index 97% rename from sycl/doc/extensions/proposed/sycl_ext_intel_cslice.asciidoc rename to sycl/doc/extensions/supported/sycl_ext_intel_cslice.asciidoc index 2e53585c710cf..26546b546ebc9 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_intel_cslice.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_intel_cslice.asciidoc @@ -43,11 +43,7 @@ SYCL specification refer to that revision. == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* +This extension is implemented and fully supported by DPC++. == Overview From 5552e86adcecf3bef069869b79e269d9108bfb21 Mon Sep 17 00:00:00 2001 From: "Elovikov, Andrei" Date: Fri, 9 Dec 2022 11:20:46 -0800 Subject: [PATCH 09/13] Update Windows symbols --- sycl/test/abi/sycl_symbols_windows.dump | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index c6c7c8b0fb9fd..2e5f68e24ba66 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -12,6 +12,7 @@ ??$create_sub_devices@$0BAIG@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@_K@Z ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z +??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ ??$getPlugin@$00@pi@detail@_V1@sycl@@YAAEBVplugin@123@XZ ??$getPlugin@$01@pi@detail@_V1@sycl@@YAAEBVplugin@123@XZ ??$getPlugin@$02@pi@detail@_V1@sycl@@YAAEBVplugin@123@XZ From a645e2a95a063b2f645e1abb0c57887a52acef4f Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 9 Dec 2022 15:31:11 -0800 Subject: [PATCH 10/13] Add helper member functions to _pi_device --- sycl/plugins/level_zero/pi_level_zero.cpp | 14 ++++---------- sycl/plugins/level_zero/pi_level_zero.hpp | 8 ++++++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 71c3b61f420da..9fe0f7088f8de 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2611,12 +2611,10 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() { Ordinals.push_back(i); } } - bool IsPVC = - (PiSubDevice->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; // If isn't PVC, then submissions to different CCS can be executed on // the same EUs still, so we cannot treat them as sub-sub-devices. - if (IsPVC) { + if (PiSubDevice->isPVC()) { // Create PI sub-sub-devices with the sub-device for all the ordinals. // Each {ordinal, index} points to a specific CCS which constructs // a sub-sub-device at this point. @@ -2875,9 +2873,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ZeSubDeviceCount < 2) { return ReturnValue(pi_device_partition_property{0}); } - bool PartitionedByCSlice = Device->SubDevices[0] - ->QueueGroup[_pi_queue::queue_type::Compute] - .ZeIndex >= 0; + bool PartitionedByCSlice = Device->SubDevices[0]->isCCS(); auto ReturnHelper = [&](auto... Partitions) { struct { @@ -2908,7 +2904,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (!Device->isSubDevice()) return ReturnValue(pi_device_partition_property{0}); - if (Device->QueueGroup[_pi_queue::queue_type::Compute].ZeIndex >= 0) { + if (Device->isCCS()) { struct { pi_device_partition_property Arr[2]; } PartitionProperties = {{PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, 0}}; @@ -3328,9 +3324,7 @@ pi_result piDevicePartition(pi_device Device, } if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { // Not a CSlice-based partitioning. - if (Device->SubDevices[0] - ->QueueGroup[_pi_queue::queue_type::Compute] - .ZeIndex < 0) + if (!Device->SubDevices[0]->isCCS()) return 0; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 2712120e2f940..2ce69970d2999 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -379,6 +379,14 @@ struct _pi_device : _pi_object { bool isSubDevice() { return RootDevice != nullptr; } + // Is this a Data Center GPU Max series (aka PVC). + bool isPVC() { return (ZeDeviceProperties->deviceId & 0xff0) == 0xbd0; } + + // Does this device represent a single compute slice? + bool isCCS() const { + return QueueGroup[_pi_device::queue_group_info_t::Compute].ZeIndex >= 0; + } + // Cache of the immutable device properties. ZeCache> ZeDeviceProperties; ZeCache> ZeDeviceComputeProperties; From f1d518bfc2e74934f3b9e7857ccbb8e7fc29bece Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 9 Dec 2022 15:31:46 -0800 Subject: [PATCH 11/13] Rename PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE->PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE --- sycl/include/sycl/detail/pi.h | 4 ++-- sycl/include/sycl/info/info_desc.hpp | 2 +- sycl/plugins/level_zero/pi_level_zero.cpp | 10 +++++----- sycl/source/detail/device_impl.cpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 3bdaca0bb05c5..33c41767c4c44 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -60,7 +60,7 @@ // PI_EXT_ONEAPI_QUEUE_PRIORITY_HIGH queue properties. // 11.18 Add new parameter name PI_EXT_ONEAPI_QUEUE_INFO_EMPTY to // _pi_queue_info. -// 12.19 Add new PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE piDevicePartition +// 12.19 Add new PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE piDevicePartition // scheme. Sub-sub-devices (representing compute slice) creation via // partitioning by affinity domain is disabled by default and can be temporarily // restored via SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING @@ -636,7 +636,7 @@ static constexpr pi_device_partition_property static constexpr pi_device_partition_property PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN = 0x1088; static constexpr pi_device_partition_property - PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE = 0x1089; + PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE = 0x1089; // For compatibility with OpenCL define this not as enum. using pi_device_affinity_domain = pi_bitfield; diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp index 62e0f56388d59..e033ca37e93be 100644 --- a/sycl/include/sycl/info/info_desc.hpp +++ b/sycl/include/sycl/info/info_desc.hpp @@ -56,7 +56,7 @@ enum class partition_property : pi_device_partition_property { partition_equally = PI_DEVICE_PARTITION_EQUALLY, partition_by_counts = PI_DEVICE_PARTITION_BY_COUNTS, partition_by_affinity_domain = PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, - ext_intel_partition_by_cslice = PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE + ext_intel_partition_by_cslice = PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE }; enum class partition_affinity_domain : pi_device_affinity_domain { diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 9fe0f7088f8de..ce68edcc3812b 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2884,14 +2884,14 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (ExposeCSliceInAffinityPartitioning) { if (PartitionedByCSlice) - return ReturnHelper(PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, + return ReturnHelper(PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); else return ReturnHelper(PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); } else { return ReturnHelper(PartitionedByCSlice - ? PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE + ? PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE : PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN); } } @@ -2907,7 +2907,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, if (Device->isCCS()) { struct { pi_device_partition_property Arr[2]; - } PartitionProperties = {{PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, 0}}; + } PartitionProperties = {{PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}}; return ReturnValue(PartitionProperties); } @@ -3293,7 +3293,7 @@ pi_result piDevicePartition(pi_device Device, if ((Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE && Properties[1] != PI_DEVICE_AFFINITY_DOMAIN_NUMA)) return PI_ERROR_INVALID_VALUE; - } else if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { + } else if (Properties[0] == PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE) { if (Properties[1] != 0) return PI_ERROR_INVALID_VALUE; } else { @@ -3322,7 +3322,7 @@ pi_result piDevicePartition(pi_device Device, if (Device->isSubDevice()) return 0; } - if (Properties[0] == PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE) { + if (Properties[0] == PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE) { // Not a CSlice-based partitioning. if (!Device->SubDevices[0]->isCCS()) return 0; diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 15287a44c4c2b..f039e99afd2a4 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -262,7 +262,7 @@ std::vector device_impl::create_sub_devices() const { } const pi_device_partition_property Properties[2] = { - PI_DEVICE_EXT_INTEL_PARTITION_BY_CSLICE, 0}; + PI_EXT_INTEL_DEVICE_PARTITION_BY_CSLICE, 0}; pi_uint32 SubDevicesCount = 0; const detail::plugin &Plugin = getPlugin(); From 19671136176ec2cdc8e2df63edad9ee6fa44b189 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 9 Dec 2022 16:03:43 -0800 Subject: [PATCH 12/13] Rename EffectiveSize -> EffectiveNumDevices --- sycl/plugins/level_zero/pi_level_zero.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index ce68edcc3812b..01acb1acf1c2d 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3308,7 +3308,7 @@ pi_result piDevicePartition(pi_device Device, return Res; } - auto EffectiveSize = [&]() -> decltype(Device->SubDevices.size()) { + auto EffectiveNumDevices = [&]() -> decltype(Device->SubDevices.size()) { if (Device->SubDevices.size() == 0) return 0; @@ -3332,7 +3332,7 @@ pi_result piDevicePartition(pi_device Device, }(); if (OutNumDevices) { - *OutNumDevices = EffectiveSize; + *OutNumDevices = EffectiveNumDevices; } if (OutDevices) { @@ -3340,7 +3340,7 @@ pi_result piDevicePartition(pi_device Device, // Currently supported partitioning (by affinity domain/numa) would always // partition to all sub-devices. // - PI_ASSERT(NumDevices == EffectiveSize, PI_ERROR_INVALID_VALUE); + PI_ASSERT(NumDevices == EffectiveNumDevices, PI_ERROR_INVALID_VALUE); for (uint32_t I = 0; I < NumDevices; I++) { OutDevices[I] = Device->SubDevices[I]; From a55f7f3e08ce702530ee2a4c47a7a00bfab8075f Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 9 Dec 2022 16:50:37 -0800 Subject: [PATCH 13/13] Expose CSlices as sub-sub-device on non-PVC under SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING For backward compatibility and should really be addressed by customers ASAP because the behavior might be different from what they expect. --- sycl/doc/EnvironmentVariables.md | 2 +- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md index 1a8e2f9cd41cb..28db4597f1bd3 100644 --- a/sycl/doc/EnvironmentVariables.md +++ b/sycl/doc/EnvironmentVariables.md @@ -249,7 +249,7 @@ variables in production code. | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL` | Integer | When set to a positive value enables use of a copy engine for memory fill operations. Default is 0. | | `SYCL_PI_LEVEL_ZERO_SINGLE_ROOT_DEVICE_BUFFER_MIGRATION` | Integer | When set to "0" tells to use single root-device allocation for all devices in a context where all devices have same root. Otherwise performs regular buffer migration. Default is 1. | | `SYCL_PI_LEVEL_ZERO_REUSE_DISCARDED_EVENTS` | Integer | When set to a positive value enables the mode when discarded Level Zero events are reset and reused in scope of the same in-order queue based on the dependency chain between commands. Default is 1. | -| `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. If the device doesn't support partitioning by compute slice this variable has no effect. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | +| `SYCL_PI_LEVEL_ZERO_EXPOSE_CSLICE_IN_AFFINITY_PARTITIONING` (Deprecated) | Integer | When set to non-zero value exposes compute slices as sub-sub-devices in `sycl::info::partition_property::partition_by_affinity_domain` partitioning scheme. Default is zero meaning that they are only exposed when partitioning by `sycl::info::partition_property::ext_intel_partition_by_cslice`. This option is introduced for compatibility reasons and is immediately deprecated. New code must not rely on this behavior. Also note that even if sub-sub-device was created using `partition_by_affinity_domain` it would still be reported as created via partitioning by compute slices. | ## Debugging variables for CUDA Plugin diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 01acb1acf1c2d..338bbecc2434c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2614,7 +2614,7 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() { // If isn't PVC, then submissions to different CCS can be executed on // the same EUs still, so we cannot treat them as sub-sub-devices. - if (PiSubDevice->isPVC()) { + if (PiSubDevice->isPVC() || ExposeCSliceInAffinityPartitioning) { // Create PI sub-sub-devices with the sub-device for all the ordinals. // Each {ordinal, index} points to a specific CCS which constructs // a sub-sub-device at this point.